In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import json
import seaborn as sns

import pyLDAvis.gensim

import os
import sys
from dotenv import load_dotenv

# Read a local .env file (if present) into the process environment, then
# switch pyLDAvis to its notebook display mode.
load_dotenv()
pyLDAvis.enable_notebook()

# Repository root, taken from the environment. Fail early with a clear message
# rather than silently formatting None into a path and hitting an import error.
REPO_PATH = os.getenv('REPO_PATH')
if not REPO_PATH:
    raise EnvironmentError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )

# Put the project's source folder first on sys.path so the `utils` imports
# that follow resolve. os.path.join works with or without a trailing separator.
sys.path.insert(0, os.path.join(REPO_PATH, 'src_HF'))
from utils.text_utils import clean_token_series, IGNORE_WORDS
from utils.topic_utils import classify_article, LDAModelSetup
from utils.main_utils import combload_topic_dfs

### Import data

In [None]:
# Topic codes processed throughout the notebook.
TOPICS: list[str] = ['CRU', 'CWP', 'CEN']

# Hand the project loader one news-file path per topic. The path is built with
# os.path.join so it is valid on any OS, not only where "\" is the separator.
text_df = combload_topic_dfs(
    TOPICS,
    lambda topic: os.path.join(
        REPO_PATH, 'data', 'news_data', f'EIKON_{topic}_NEWS_COMPLETE.json'
    )
)

# Add the cleaned/tokenized version of each article's full text; the LDA cells
# below read this column.
text_df['cleaned_tokenized'] = clean_token_series(text_df['fullStory'])

display(text_df.head(2))

### Latent Dirichlet Allocation (LDA)

In [None]:
# LDA settings passed unchanged to every LDAModelSetup created below.
lda_params = dict(
    num_topics=3,
    chunksize=500,
    passes=20,
    iterations=100,
    eval_every=1,
)

# Result of create_model() for each topic, keyed by topic code.
models = {}
for topic in TOPICS:
    print(f'Creating model for {topic}...')

    # Only the tokenized articles belonging to the current topic.
    topic_docs = text_df.loc[text_df['topic'] == topic, 'cleaned_tokenized']

    setup = LDAModelSetup(
        topic_docs,
        name=topic,
        stopwords=IGNORE_WORDS,
        lda_params=lda_params,
    )
    model = setup.create_model()
    models[topic] = model


### LDAvis visualization of gensim LDA model

In [None]:
# Topic code whose `visfig` is displayed in this cell.
TOPIC = 'CRU'

selected_model = models[TOPIC]
display(selected_model.visfig)

In [None]:
# One 7x7-inch panel per topic, side by side. The panel count follows TOPICS
# instead of being hard-coded, so the figure stays valid if the list changes.
n_panels = len(TOPICS)
fig, axs = plt.subplots(1, n_panels, figsize=(7 * n_panels, 7), dpi=200)
# With a single panel plt.subplots returns a bare Axes; normalise to a 1-D array
# so the indexing below always works (no-op when there are several panels).
axs = np.atleast_1d(axs)

for i, topic in enumerate(TOPICS):
    # plot_pyLDAvis() is a project helper; its result is drawn as an image.
    LDA_fig = models[topic].plot_pyLDAvis()
    axs[i].imshow(LDA_fig)

# Build the output path portably (a literal "\" is not a separator on POSIX)
# and make sure the target folder exists before saving.
out_dir = 'images'
os.makedirs(out_dir, exist_ok=True)
fig.savefig(os.path.join(out_dir, 'pyLDAvis_topic_PC.png'), dpi=200)

### Assign topics to each document

In [None]:
# For every topic: classify each of its articles with that topic's model and
# write a {storyId: assigned topic} mapping to a JSON file.
for topic in TOPICS:
    topic_articles = text_df[text_df['topic'] == topic]
    lda = models[topic]

    # Keep the classification in its own Series rather than overwriting the
    # 'topic' column of a filtered frame: that assignment triggers pandas'
    # SettingWithCopyWarning and changes what the column means.
    assigned_topics = topic_articles.apply(
        lambda row: classify_article(row, lda.dictionary, lda.model),
        axis=1
    )
    topic_dict = dict(zip(topic_articles['storyId'], assigned_topics))

    # Portable output path; create the folder if it is missing.
    out_dir = os.path.join(REPO_PATH, 'data', 'topics')
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{topic}_TOPICS.json')

    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(topic_dict, f, indent=2)