In [None]:
import json
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis.gensim
from dotenv import load_dotenv
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from tqdm import tqdm

# Read variables from a local .env file into the process environment.
load_dotenv()
# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()

# Root of the repository checkout; every data/source path in this notebook is
# derived from it. Fail early with a clear message instead of a TypeError on
# string concatenation when the variable is missing.
repo_path = os.getenv('REPO_PATH')
if not repo_path:
    raise EnvironmentError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )

# Make the project's helper package importable. os.path.join works whether or
# not REPO_PATH ends with a path separator.
sys.path.insert(0, os.path.join(repo_path, 'src_HF'))

# NOTE(review): star imports hide where names come from. Later cells use
# `clean_token_series` and `ignore_words`, which are not defined in this
# notebook and presumably come from these modules - consider importing them
# explicitly.
from utils.main_utils import *
from utils.text_utils import *

### Import data

In [None]:
# Topic code interpolated into the input (and later output) file names.
eikon_topic = 'CRU'

# Build the path portably instead of concatenating a Windows-only
# backslash literal onto repo_path.
news_path = os.path.join(
    repo_path, 'data', 'news_data', f'EIKON_{eikon_topic}_NEWS_COMPLETE.json'
)

# The file is read as JSON Lines: one record per line.
text_df = pd.read_json(news_path, lines=True, orient='records')

# clean_token_series returns a pair; only the second element is kept here and
# stored as the per-article token column.
_, text_df['cleaned_tokenized'] = clean_token_series(text_df['fullStory'])

display(text_df)

### Latent Dirichlet Allocation (LDA)

In [None]:
# --- LDA hyperparameters (passed straight through to gensim's LdaModel) ---
num_topics: int = 10     # number of latent topics to fit
chunksize: int = 500     # documents processed per training chunk
passes: int = 20         # full passes over the corpus
iterations: int = 400    # max inference iterations per document
# NOTE(review): per gensim's documentation, evaluating perplexity on every
# update (eval_every=1) slows training considerably - confirm it is needed.
eval_every: int = 1
# Fixed seed so repeated runs yield the same topics (and topic ids).
random_state: int = 42

doc_list: list = text_df['cleaned_tokenized'].to_list()

stop_words = ignore_words
# Set copy used only for O(1) membership tests while filtering.
stop_word_set = set(stop_words)

# remove stop words
doc_list = [[word for word in doc if word not in stop_word_set] for doc in doc_list]

dictionary = corpora.Dictionary(doc_list)
# Drop tokens that appear in fewer than 5 documents or in more than 50% of them.
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Build the id -> token mapping explicitly from token2id so it can be handed
# to the model as id2word.
dictionary.id2token = {id: token for token, id in dictionary.token2id.items()}

# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(doc) for doc in doc_list]

model = LdaModel(
    corpus=corpus,
    id2word=dictionary.id2token,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)


In [None]:
# Ask for every topic explicitly: show_topics defaults to num_topics=10, which
# would silently show only a subset if the model is trained with more topics.
topics = model.show_topics(num_topics=-1, formatted=False)

# One table of (word, probability) pairs per topic, labelled with the topic id
# so it can be matched against the `topic` assigned to each article.
for topic_id, word_probs in topics:
    print(f'Topic {topic_id}')
    topic_words = pd.DataFrame(word_probs, columns=['word', 'probability'])
    display(topic_words)

### Assign topics to each document

In [None]:
# classify the article
def classify_article(article, model, bow_dictionary=None):
    """Return the id of the most probable topic for one article.

    Parameters
    ----------
    article : mapping or pandas.Series
        Row-like object exposing the token list under 'cleaned_tokenized'.
    model : object with ``get_document_topics(bow, minimum_probability=...)``
        The trained topic model (a gensim ``LdaModel`` in this notebook).
    bow_dictionary : object with ``doc2bow(tokens)``, optional
        Dictionary used to convert tokens to bag-of-words. Defaults to the
        notebook-level ``dictionary`` the model was trained with.

    Returns
    -------
    The topic id with the highest probability; on ties the first one
    returned by the model wins.
    """
    if bow_dictionary is None:
        # Fall back to the notebook-global dictionary (original behaviour).
        bow_dictionary = dictionary

    bow = bow_dictionary.doc2bow(article['cleaned_tokenized'])
    # Ask for the full distribution: with the model's default threshold,
    # every topic can be filtered out when there are many topics, leaving
    # max() with an empty list.
    topic_distribution = model.get_document_topics(bow, minimum_probability=0.0)
    topic_nr = max(topic_distribution, key=lambda x: x[1])[0]
    return topic_nr

# Label every article (row) with its most probable topic id.
text_df['topic'] = text_df.apply(lambda x: classify_article(x, model), axis=1)

# storyId -> topic id lookup; a repeated storyId keeps its last topic.
topic_dict = dict(zip(text_df['storyId'], text_df['topic']))

# Build the output path portably and make sure the target directory exists
# before writing, rather than relying on a Windows-only backslash literal.
topics_dir = os.path.join(repo_path, 'data', 'topics')
os.makedirs(topics_dir, exist_ok=True)
topics_path = os.path.join(topics_dir, f'{eikon_topic}_TOPICS.json')

with open(topics_path, 'w', encoding='utf-8') as f:
    json.dump(topic_dict, f, indent=2)

In [None]:
# Prepare the interactive pyLDAvis view from the trained model, the
# bag-of-words corpus and the dictionary; leaving the result as the cell's
# last expression renders it inline.
lda_vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
lda_vis
