# Import All Libraries 

In [28]:
import pandas as pd
import nltk
import spacy
import swifter 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import preprocess_string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words


  from .autonotebook import tqdm as notebook_tqdm


# Load the dataset


In [2]:
# Load the raw analyst-ratings CSV into the working DataFrame `df`.
raw_ratings_csv = '../data/raw_analyst_ratings.csv'
df = pd.read_csv(raw_ratings_csv)


# 1. Sentiment Analysis

In [6]:
# Download the VADER lexicon through NLTK, then create the analyzer instance
# used to score headlines.
vader_resource = 'vader_lexicon'
nltk.download(vader_resource)
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\habteyes.asfaw\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Apply VADER Sentiment Analysis


In [7]:
def _compound_score(headline):
    """Return the VADER 'compound' polarity score for a single headline."""
    return analyzer.polarity_scores(headline)['compound']


def _sentiment_label(score):
    """Map a compound score to 'positive', 'negative' or 'neutral' by its sign."""
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'


# Score every headline, then derive a categorical label from the score.
df['sentiment'] = df['headline'].apply(_compound_score)
df['sentiment_label'] = df['sentiment'].apply(_sentiment_label)


# 2. Text Preprocessing for Topic Modeling

In [29]:
# Build the combined stopword set: NLTK's English list merged with scikit-learn's.
nltk.download('stopwords')
nltk_english = set(stopwords.words('english'))
stop_words = nltk_english | set(sklearn_stop_words)

# Load the small English spaCy pipeline with the parser and NER components disabled.
spacy_disabled = ['parser', 'ner']
nlp = spacy.load('en_core_web_sm', disable=spacy_disabled)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\habteyes.asfaw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
def preprocess_text(text):
    """Lowercase, tokenize and lemmatize a headline, dropping noise tokens.

    Parameters
    ----------
    text : str
        Raw headline text. Any non-string value (for example ``None`` or the
        float NaN pandas uses for an empty cell) is treated as an empty
        document.

    Returns
    -------
    list of str
        Lemmas of the tokens that are not punctuation, not whitespace, and
        whose lemma is not in the module-level ``stop_words`` set. Empty for
        non-string input.
    """
    # Non-string values have no .lower(); return an empty token list rather
    # than letting one missing headline abort the whole apply with an
    # AttributeError.
    if not isinstance(text, str):
        return []

    doc = nlp(text.lower())
    return [
        token.lemma_
        for token in doc
        # Stopword filtering is done on the lemma, not the surface form.
        if token.lemma_ not in stop_words
        and not token.is_punct
        and not token.is_space
    ]


In [31]:
# Run preprocess_text over every headline via swifter's apply and store the
# resulting token lists in a new 'tokens' column.
headline_tokens = df['headline'].swifter.apply(preprocess_text)
df['tokens'] = headline_tokens


Pandas Apply:  36%|███▌      | 507928/1407328 [57:17<53:47, 278.69it/s]  

# 3. Topic Modeling using LDA


In [None]:
# Build the gensim vocabulary (token -> integer id) from the tokenized headlines.
# `Dictionary` is not imported by name in this notebook, so it is reached through
# the imported `corpora` module; the bare name would raise NameError.
dictionary = corpora.Dictionary(df['tokens'])
# Convert each headline's token list into its bag-of-words representation.
corpus = [dictionary.doc2bow(text) for text in df['tokens']]

### Build LDA Model


In [None]:
# Fit a 5-topic LDA model with 10 passes over the corpus. random_state is fixed
# so that re-running the notebook yields the same topics; without a seed the
# result varies from run to run.
lda_model = LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=10,
    random_state=42,
)


### Display the topics


In [None]:
# Fetch each topic's five highest-weighted words and print one line per topic.
topics = lda_model.print_topics(num_words=5)
for entry in topics:
    idx, topic = entry
    print(f"Topic {idx}: {topic}")