In [None]:
import json
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis.gensim
import ipywidgets as widgets
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Root of the repository. It is concatenated directly with sub-paths below
# (e.g. f'{REPO_PATH}src_HF'), so it must end with a path separator.
REPO_PATH = os.getenv('REPO_PATH')
if REPO_PATH is None:
    # Without this check the f-strings below would silently build paths
    # starting with the literal text 'None'.
    raise EnvironmentError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )

# Make the project's helper modules importable.
sys.path.insert(0, rf'{REPO_PATH}src_HF')
# NOTE(review): the star import hides which names come from utils; later cells
# use clean_token_series and IGNORE_WORDS, presumably defined there — confirm.
from utils import *

### Import data

In [None]:
TOPIC = 'CRU'

# Read the line-delimited JSON file of news stories for the selected topic.
news_path = rf'{REPO_PATH}data\news_data\EIKON_{TOPIC}_NEWS_COMPLETE.json'
text_df = pd.read_json(news_path, orient='records', lines=True)

# clean_token_series returns a pair; only the second element is stored here.
# Presumably it holds the cleaned token list per story — verify against utils.
_, cleaned_tokens = clean_token_series(text_df['fullStory'])
text_df['cleaned_tokenized'] = cleaned_tokens

display(text_df.head(2))

### Latent Dirichlet Allocation (LDA)

In [None]:
# --- LDA hyper-parameters -------------------------------------------------
NUM_TOPICS: int = 4      # number of latent topics to fit
CHUNKSIZE: int = 500     # documents processed per training chunk
PASSES: int = 20         # full passes over the corpus
ITERATIONS: int = 400    # max inference iterations per document
EVAL_EVERY: int = 1      # perplexity evaluation frequency (1 = every update; slows training)
RANDOM_STATE: int = 42   # fixed seed so the fitted topics are reproducible across runs

doc_list: list = text_df['cleaned_tokenized'].to_list()

# NOTE(review): IGNORE_WORDS is presumably provided by `from utils import *` — confirm.
stop_words: set[str] = IGNORE_WORDS

# Remove stop words from every tokenized document.
doc_list = [[word for word in doc if word not in stop_words] for doc in doc_list]

# Build the vocabulary, then drop tokens that appear in fewer than 5 documents
# or in more than 50% of all documents.
dictionary = corpora.Dictionary(doc_list)
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Build the id -> token mapping explicitly so it can be handed to LdaModel.
dictionary.id2token = {token_id: token for token, token_id in dictionary.token2id.items()}

# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(doc) for doc in doc_list]

model = LdaModel(
    corpus=corpus,
    id2word=dictionary.id2token,
    chunksize=CHUNKSIZE,
    alpha='auto',
    eta='auto',
    iterations=ITERATIONS,
    num_topics=NUM_TOPICS,
    passes=PASSES,
    eval_every=EVAL_EVERY,
    random_state=RANDOM_STATE,
)

In [None]:
topics = model.show_topics(formatted=False, num_topics=NUM_TOPICS, num_words=15)

# One DataFrame of (word, weight) rows per topic, keyed by a display label.
dataframes = {}

for i, topic in enumerate(topics):
    # topic is a (topic_id, [(word, weight), ...]) pair.
    data = {
        'word': [word for word, _ in topic[1]],
        'weight': [weight for _, weight in topic[1]]
    }
    dataframes[f'Topic {i}'] = pd.DataFrame(data)

topic_labels = list(dataframes.keys())

dropdown = widgets.Dropdown(
    options=topic_labels,
    # The initial value must be one of the options; passing the dict_keys view
    # itself is rejected by the widget.
    value=topic_labels[0],
    description='Select DataFrame:',
    disabled=False,
)

# Dedicated output area so a new selection replaces the previous table
# instead of stacking below it (or being lost to the log in JupyterLab).
table_output = widgets.Output()


def display_dataframe(change):
    """Show the DataFrame for the newly selected dropdown entry.

    Parameters
    ----------
    change : dict
        Change notification from the dropdown; ``change['new']`` holds the
        selected key of ``dataframes``.
    """
    table_output.clear_output(wait=True)
    with table_output:
        display(dataframes[change['new']])


# Call display_dataframe whenever the dropdown value changes.
dropdown.observe(display_dataframe, names='value')

# Show the dropdown together with its output area.
display(dropdown, table_output)

# Initially display the first DataFrame.
with table_output:
    display(dataframes[dropdown.value])

### LDAvis visualization of gensim LDA model

In [None]:
# Build the interactive LDAvis view from the fitted model, the bag-of-words
# corpus and the dictionary; the trailing expression renders it inline.
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
lda_vis = pyLDAvis.gensim.prepare(topic_model=model, corpus=corpus, dictionary=dictionary)
lda_vis

### Assign topics to each document

In [None]:
def classify_article(article, model):
    """Return the id of the most probable LDA topic for one article.

    Parameters
    ----------
    article : mapping (e.g. a DataFrame row)
        Must provide ``article['cleaned_tokenized']``, the token list that is
        converted to bag-of-words with the notebook-level ``dictionary``.
    model : LdaModel
        Fitted topic model used to infer the topic distribution.

    Returns
    -------
    int
        Id of the topic with the highest probability.
    """
    bow = dictionary.doc2bow(article['cleaned_tokenized'])
    # Request every topic: the default probability cut-off can drop all topics
    # when the distribution is flat over many topics, and max() would then
    # fail on an empty list.
    topic_distribution = model.get_document_topics(bow, minimum_probability=0.0)
    topic_nr = max(topic_distribution, key=lambda x: x[1])[0]
    return topic_nr

# Label every article with its dominant LDA topic.
text_df['topic'] = text_df.apply(lambda x: classify_article(x, model), axis=1)

# Map each storyId to its assigned topic number.
topic_dict = dict(zip(text_df['storyId'], text_df['topic']))

# Persist the mapping. Uses the REPO_PATH and TOPIC constants defined earlier
# in the notebook; the previous names repo_path / eikon_topic were never defined.
with open(rf'{REPO_PATH}data\topics\{TOPIC}_TOPICS.json', 'w') as f:
    json.dump(topic_dict, f, indent=2)