In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import json

import pyLDAvis.gensim
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
from gensim import corpora

import os
import sys
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the environment.
load_dotenv()
# Turn on automatic notebook display of pyLDAvis visualizations.
pyLDAvis.enable_notebook()

# Repository root prefix. The paths in this notebook are built by appending
# directly to this value, so it is expected to end with a path separator.
REPO_PATH = os.getenv('REPO_PATH')
if not REPO_PATH:
    # Without this guard a missing variable would be formatted as the literal
    # text "None" into every path and only surface later as a confusing
    # import or file-not-found error.
    raise RuntimeError(
        "Environment variable REPO_PATH is not set; define it in the "
        "environment or in a .env file before running this notebook."
    )

# Put the project's source folder first on the module search path.
sys.path.insert(0, rf'{REPO_PATH}src_HF')
from utils.text_utils import clean_token_series, IGNORE_WORDS
from utils.topic_utils import classify_article, LDAModelSetup
from utils.main_utils import combload_topic_dfs

### Import data

In [None]:
# Topic codes handled by this notebook.
TOPICS: list[str] = ['CRU', 'CWP', 'CEN']


def _news_file_path(topic: str) -> str:
    """Return the path of the news JSON file for ``topic``.

    The path is assembled with ``os.path.join`` so that the separators are
    valid on every operating system, not only on Windows.
    """
    return os.path.join(
        REPO_PATH, 'data', 'news_data', f'EIKON_{topic}_NEWS_COMPLETE.json'
    )


# The loader receives the topic codes and a callable mapping a code to its file.
text_df = combload_topic_dfs(TOPICS, _news_file_path)

# Result of clean_token_series on the article text, stored for the models below.
text_df['cleaned_tokenized'] = clean_token_series(text_df['fullStory'])

display(text_df.head(2))

In [None]:
# Keyword arguments forwarded unchanged to LdaMulticore below.
lda_params: dict[str, int] = {
    'num_topics': 3,
    'chunksize': 500,
    'passes': 20,
    'iterations': 100,
    'eval_every': 1
}

# Token lists of the articles tagged 'CRU'.
tokenized_series = text_df.loc[text_df['topic'] == 'CRU', 'cleaned_tokenized']

# Work on a copy: IGNORE_WORDS is a shared object imported from the utils
# module and is handed to other models in this notebook, so extending it in
# place here would silently change what those models receive.
stop_words: set[str] = set(IGNORE_WORDS)

# Add the numbers 0-2999 (as strings) to the stop words.
stop_words.update(str(number) for number in range(3000))

# Remove stop words from every document.
doc_list: list = [
    [word for word in doc if word not in stop_words]
    for doc in tokenized_series.to_list()
]

dictionary = corpora.Dictionary(doc_list)
# Drop tokens found in fewer than 5 documents or in more than 50% of them.
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Build the id -> token mapping explicitly; it is passed to the model as id2word.
dictionary.id2token = {
    token_id: token for token, token_id in dictionary.token2id.items()
}

# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(doc) for doc in doc_list]

model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary.id2token,
    eta='auto',
    workers=6,
    **lda_params
)

### Latent Dirichlet Allocation (LDA) model setup for subtopics

In [None]:
# LDA settings shared by every per-topic model setup.
lda_params: dict[str, int] = {
    'num_topics': 3, 'chunksize': 500, 'passes': 20,
    'iterations': 100, 'eval_every': 1,
}

# One LDAModelSetup per topic code, keyed by that code.
models = {}
for topic in TOPICS:
    in_topic = text_df['topic'] == topic
    models[topic] = LDAModelSetup(
        text_df.loc[in_topic, 'cleaned_tokenized'],
        name=topic,
        stopwords=IGNORE_WORDS,
        lda_params=lda_params,
    )


### Model topics

In [None]:
# Call generate_model() and generate_pyLDAvis() on every per-topic setup,
# in the order the setups were added to `models`.
for topic_code in models:
    print(f'Creating model for {topic_code}...')
    topic_setup = models[topic_code]
    topic_setup.generate_model()
    topic_setup.generate_pyLDAvis()

### LDAvis visualization of gensim LDA model

In [None]:
# Topic code whose visualization is shown in this cell.
TOPIC = 'CEN'

selected_setup = models[TOPIC]
display(selected_setup.visfig)

In [None]:

# One panel per topic. squeeze=False keeps `axs` two-dimensional even when
# TOPICS holds a single entry, so the row can always be iterated.
fig, axs = plt.subplots(
    1, len(TOPICS), figsize=(6 * len(TOPICS), 6), dpi=200, squeeze=False
)

for panel, topic in zip(axs[0], TOPICS):
    models[topic].plot_pyLDAvis(panel)

fig.tight_layout(pad=1)

# Create the output folder if it is missing and build the file path with
# os.path.join so the save works on any operating system.
os.makedirs('images', exist_ok=True)
fig.savefig(os.path.join('images', 'pyLDAvis_topic_PC.png'), dpi=200)

### LDA setup for cross-topic analysis

In [None]:
# LDA settings for the single model built from the articles of all topics.
lda_params: dict[str, int] = {
    'num_topics': 5, 'chunksize': 500, 'passes': 20,
    'iterations': 500, 'eval_every': 1,
}

all_tokens = text_df['cleaned_tokenized']
full_model = LDAModelSetup(
    all_tokens, name='All topics', stopwords=IGNORE_WORDS, lda_params=lda_params
)

# Build the model first, then its pyLDAvis output.
full_model.generate_model()
full_model.generate_pyLDAvis()



In [None]:
# Show the visualization stored on the cross-topic model setup.
cross_topic_vis = full_model.visfig
display(cross_topic_vis)

In [None]:
def _assign_crosstopic(row):
    """Classify one article row with the cross-topic dictionary and model."""
    return classify_article(row, full_model.dictionary, full_model.model)


# Work on a copy so that text_df itself does not gain the extra column.
df = text_df.copy()
df['crosstopic'] = df.apply(_assign_crosstopic, axis=1)

In [None]:
import seaborn as sns

# Take the number of panels from the model itself. Counting the distinct
# values in df['crosstopic'] would mislabel or skip topics whenever some
# topic id has no article assigned to it.
# NOTE(review): assumes full_model.model is a gensim LDA model exposing
# `num_topics` - confirm against LDAModelSetup.
n_crosstopics = full_model.model.num_topics

# squeeze=False keeps the axes array two-dimensional for any panel count.
fig, ax = plt.subplots(
    1, n_crosstopics, figsize=(3 * n_crosstopics, 3), dpi=200, squeeze=False
)
ax = ax[0]

# One colour per original topic code.
colors = sns.color_palette('twilight', n_colors=len(TOPICS))
order = []

for topic in range(n_crosstopics):
    # Article counts per original topic code, in the fixed TOPICS order.
    values = (
        df.loc[df['crosstopic'] == topic, 'topic']
        .value_counts()
        .reindex(TOPICS, fill_value=0)
    )

    if values.sum() > 0:
        values.plot.pie(ax=ax[topic], colors=colors)
    else:
        # No article was assigned to this topic; a pie cannot be drawn from
        # all-zero counts, so leave the panel empty.
        ax[topic].axis('off')

    ax[topic].set_ylabel('')
    ax[topic].set_title(f'Topic {topic + 1}')

    # print top 10 words for each topic
    top_words = full_model.model.show_topic(topic, topn=10)
    words = [word for word, _ in top_words]
    order.append(words)
    print(f'Topic {topic + 1}: {words}')

fig.tight_layout(pad=0)

### PCA visualization of LDA model



In [None]:
# Square single-axes figure for the cross-topic model's plot.
fig = plt.figure(figsize=(7, 7), dpi=200)
ax = fig.add_subplot(1, 1, 1)

full_model.plot_pyLDAvis(ax)

### Assign topics to each document

In [None]:
# Folder receiving one JSON file per topic; created if it does not exist yet.
# os.path.join keeps the paths valid on every operating system.
topics_dir = os.path.join(REPO_PATH, 'data', 'topics')
os.makedirs(topics_dir, exist_ok=True)

for topic in TOPICS:
    topic_setup = models[topic]
    topic_articles = text_df.loc[text_df['topic'] == topic]

    # Classify into a separate Series instead of assigning into the filtered
    # frame: this avoids pandas' SettingWithCopyWarning, leaves the 'topic'
    # column untouched, and does not rebind the `df` built earlier.
    subtopics = topic_articles.apply(
        lambda row: classify_article(
            row,
            topic_setup.dictionary,
            topic_setup.model
        ), axis=1
    )

    # Map each story id to the value classify_article returned for it.
    topic_dict = dict(zip(topic_articles['storyId'], subtopics))

    with open(os.path.join(topics_dir, f'{topic}_TOPICS.json'), 'w') as f:
        json.dump(topic_dict, f, indent=2)