In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython.display import Markdown
from wordcloud import WordCloud

from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import os
import sys
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Root of the repository; every data and source path below is built from it.
repo_path = os.getenv('REPO_PATH')
if not repo_path:
    # Fail early with an actionable message instead of a TypeError on `None + str`.
    raise RuntimeError(
        "Environment variable REPO_PATH is not set; define it in the environment or in a .env file."
    )

# Make the project's helper package importable. os.path.join works whether or not
# REPO_PATH ends with a path separator.
sys.path.insert(0, os.path.join(repo_path, 'src_HF'))
from utils.main_utils import *
from utils.text_utils import *

### Import data

In [None]:
# Read the news corpus (JSON Lines: one record per line).
# os.path.join keeps the path portable across operating systems and does not
# depend on REPO_PATH ending with a separator.
news_path = os.path.join(repo_path, 'data', 'news_data', 'EIKON_CRU_NEWS_COMPLETE.json')
text_df = pd.read_json(news_path, lines=True, orient='records')

# clean_token_series comes from the project's utils; only its second return value
# is kept here — presumably the cleaned, tokenised text per story (TODO confirm).
_, text_df['cleaned_tokenized'] = clean_token_series(text_df['fullStory'])

display(text_df)

### Latent Dirichlet Allocation (LDA)

In [None]:
# Tokenised documents, materialised once and reused for the dictionary and the bag-of-words.
tokenized_docs = text_df['cleaned_tokenized'].to_list()

# Create the term dictionary of the corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(tokenized_docs)

# Convert the list of documents (corpus) into a document-term matrix using the dictionary above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Alias for the gensim LDA model class (kept so other cells referring to `Lda` keep working).
Lda = LdaModel

# Train the LDA model on the document-term matrix.
# random_state is fixed so the topics are reproducible across kernel restarts.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word=dictionary, passes=50, random_state=42)

print(ldamodel.print_topics(num_topics=5, num_words=5))

In [None]:
# Print the id and the ten highest-weighted words of every topic in the model.
# num_topics=-1 asks gensim for all topics, so the loop can never index past the
# number of topics the model was actually trained with.
topics = ldamodel.print_topics(num_topics=-1, num_words=10)

for topic_id, topic_terms in topics:
    print(topic_id)
    print(topic_terms)



In [None]:
def _top_terms_per_topic(model, vectorizer, top_n):
    """Yield ``(topic_id, frame)`` pairs with the ``top_n`` highest-scoring terms of each topic."""
    if hasattr(model, "components_"):
        # scikit-learn style model: topic-term weights live in `components_` and the
        # vocabulary comes from the fitted vectorizer.
        if vectorizer is None:
            raise ValueError("`vectorizer` is required for models that expose `components_`.")
        terms = vectorizer.get_feature_names_out()  # hoisted: identical for every topic
        for topic_id, weights in enumerate(model.components_):
            frame = pd.DataFrame(weights, index=terms, columns=["score"])
            yield topic_id, frame.sort_values(by="score", ascending=False).head(top_n)
    else:
        # gensim style model: show_topic returns (term, probability) pairs,
        # already ordered from most to least probable.
        for topic_id in range(model.num_topics):
            pairs = model.show_topic(topic_id, topn=top_n)
            yield topic_id, pd.DataFrame(pairs, columns=["term", "score"]).set_index("term")


def print_topics(model, vectorizer=None, top_n=10):
    """Display the highest-scoring terms of every topic of a fitted topic model.

    Parameters
    ----------
    model
        Either a model exposing ``components_`` (scikit-learn style; requires
        ``vectorizer``) or one exposing ``num_topics`` and ``show_topic``
        (gensim style, e.g. ``LdaModel``).
    vectorizer : optional
        Fitted vectorizer providing ``get_feature_names_out()``; only used for
        models that expose ``components_``.
    top_n : int, default 10
        Number of terms shown per topic.

    Raises
    ------
    ValueError
        If ``model`` exposes ``components_`` but no ``vectorizer`` is given.
    """
    if hasattr(model, "components_"):
        n_topics = len(model.components_)
    else:
        n_topics = model.num_topics

    # `total` lets tqdm render a real progress bar even though it wraps a generator.
    for topic_id, frame in tqdm(_top_terms_per_topic(model, vectorizer, top_n), total=n_topics):
        display(Markdown(f"### Topic {topic_id}"))
        display(frame)

# Print the topics of the gensim model trained above (no vectorizer is involved).
print_topics(ldamodel)