In [None]:
import matplotlib.pyplot as plt
import pyLDAvis.gensim
import seaborn as sns
from tqdm.notebook import tqdm

import warnings
import json
import os
import sys
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
REPO_PATH = os.getenv('REPO_PATH')
if REPO_PATH is None:
    # Fail fast: otherwise the literal string 'None' would be spliced into
    # every path built from REPO_PATH and the errors would surface much later.
    raise EnvironmentError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )

# Put the repository's `src` directory first on sys.path so the `utils`
# imports below resolve. os.path.join works whether or not REPO_PATH ends
# with a path separator.
sys.path.insert(0, os.path.join(REPO_PATH, 'src'))

from utils.text_utils import clean_token_series, IGNORE_WORDS
from utils.topic_utils import classify_article, LDAModelSetup
from utils.main_utils import combload_topic_dfs, apply_nb_style

# Project helper from utils.main_utils; presumably applies shared plot/notebook
# styling -- confirm in the helper itself.
apply_nb_style()
# Make pyLDAvis visualizations render inline in notebook output.
pyLDAvis.enable_notebook()
# NOTE(review): this silences every warning for the rest of the kernel session,
# including pandas SettingWithCopy and library deprecation warnings.
warnings.filterwarnings("ignore")

### Import data

In [None]:
# Topic codes; each one has its own EIKON_<code>_NEWS_COMPLETE.json file and,
# later in the notebook, its own LDA model.
TOPICS = ['CRU', 'CWP', 'CEN']

text_df = combload_topic_dfs(
    TOPICS,
    # Build the path with os.path.join so it resolves on any OS, not only on
    # Windows (a hard-coded backslash is not a separator on POSIX systems).
    lambda topic: os.path.join(
        REPO_PATH, 'data', 'news_data', f'EIKON_{topic}_NEWS_COMPLETE.json'
    )
)

# Tokenized/cleaned version of the article body; this column is the input to
# every LDA model built below.
text_df['cleaned_tokenized'] = clean_token_series(text_df['fullStory'])


### Latent Dirichlet Allocation (LDA) model setup for subtopics


In [None]:
# Hyperparameters handed to every per-topic LDAModelSetup below.
# NOTE(review): no random seed is visible here; if LDAModelSetup does not fix
# one internally, topic indices may change between runs and the hand-written
# topic labels later in the notebook would stop matching -- confirm.
LDA_PARAMS = dict(
    num_topics=3,
    chunksize=500,
    passes=20,
    iterations=100,
    eval_every=1,
)

# Step 1: instantiate one LDAModelSetup per topic, fed only that topic's
# cleaned tokens.
models = {}
for topic in TOPICS:
    topic_tokens = text_df.loc[text_df['topic'] == topic, 'cleaned_tokenized']
    model = LDAModelSetup(
        topic_tokens,
        name=topic,
        stopwords=IGNORE_WORDS,
        lda_params=LDA_PARAMS,
    )
    models[topic] = model

# Step 2: once every setup object exists, fit each model, build its pyLDAvis
# figure and print its 20 top words per topic.
for topic in models:
    model = models[topic]
    print(f'Creating model for {topic}...')
    model.generate_model()
    model.generate_pyLDAvis()
    model.print_top_words(20)



### LDAvis visualization of gensim LDA model

In [None]:
# Which per-topic model to inspect; must be one of the keys of `models`.
TOPIC = 'CRU'

# Show the figure stored on the selected model (presumably populated by
# generate_pyLDAvis() -- confirm in LDAModelSetup).
display(models[TOPIC].visfig)

In [None]:
# Hand-assigned names for the LDA topic indices of each per-topic model.
# NOTE(review): these dicts are not referenced anywhere else in the visible
# part of the notebook.
CRU_labels = {
    0: 'Crude Oil Production and Prices',
    1: 'Financial Markets and Economic Indicators',
    2: 'Financial Instruments and Regulations'
}

CWP_labels = {
    0: 'Middle East and Eastern Europe Conflicts',
    1: 'International Security and Diplomacy',
    2: 'Domestic Unrest and Government Actions'
}

CEN_labels = {
    0: 'US Federal Reserve and Monetary Policy',
    1: 'Economic Conditions and Government Policies',
    2: 'Financial Markets and Global Banking'
}


# One panel per entry in TOPICS (6 inches wide each) instead of a hard-coded
# 1x3 grid. squeeze=False guarantees a 2-D array even for a single topic;
# taking row 0 keeps `axs` a 1-D sequence indexed by panel position.
n_panels = len(TOPICS)
fig, axs_grid = plt.subplots(
    1, n_panels, figsize=(6 * n_panels, 6), dpi=200, squeeze=False
)
axs = axs_grid[0]

for i, topic in enumerate(TOPICS):
    models[topic].plot_pyLDAvis(axs[i])

fig.tight_layout(pad=1)

# Create the output folder if needed and build the path portably; a literal
# backslash is only a path separator on Windows.
os.makedirs('images', exist_ok=True)
fig.savefig(os.path.join('images', 'pyLDAvis_topic_PC.png'), dpi=200)

### LDA setup for cross-topic analysis

In [None]:
# Hyperparameters for the single model trained on all topics together.
# This rebinds LDA_PARAMS, replacing the per-topic settings defined earlier.
LDA_PARAMS = dict(
    num_topics=5,
    chunksize=1000,
    passes=30,
    iterations=500,
    eval_every=1,
)

# The cross-topic model sees every article's tokens, regardless of topic.
full_model = LDAModelSetup(
    text_df['cleaned_tokenized'],
    name='All topics',
    stopwords=IGNORE_WORDS,
    lda_params=LDA_PARAMS,
)

# Fit the model first, then build its pyLDAvis figure.
full_model.generate_model()
full_model.generate_pyLDAvis()

### Cross-domain pyLDAvis visualization

In [None]:
# Show the figure stored on the cross-topic model (presumably populated by
# generate_pyLDAvis() -- confirm in LDAModelSetup).
display(full_model.visfig)

In [None]:
# Work on a copy so the new column never touches text_df.
df = text_df.copy()

# Register progress_apply on pandas objects with a descriptive bar label.
tqdm.pandas(desc='Classifying articles with cross-domain topics')

# Row-wise classification of every article against the cross-topic model.
df['crosstopic'] = df.progress_apply(
    lambda row: classify_article(row, full_model.dictionary, full_model.model),
    axis=1,
)

In [None]:
# Hand-assigned names keyed by the cross-topic model's topic id.
topic_names = {
    0: 'Securities and Commodity Markets',
    1: 'Interest Rates and Economic Policy',
    2: 'Geopolitical Conflicts',
    3: 'Banking and Finance',
    4: 'Oil and Gas Production'
}

fig = plt.figure(figsize=(12, 6), dpi=200)

# Five panels on a 2x6 grid, each two columns wide: three on the top row and
# two on the bottom row, offset by one column so they sit centred.
locs = [(0,0), (0,2), (0,4), (1,1), (1,3)]
ax = [plt.subplot2grid((2,6), loc, colspan=2, fig=fig) for loc in locs]

# One colour per source topic (slices are reindexed to TOPICS order below, so
# a given topic gets the same colour in every pie).
colors = sns.color_palette('twilight', n_colors=len(TOPICS))
order = []

# Article count per cross-domain topic, ordered by topic id.
size_df = df['crosstopic'].value_counts().sort_index()
total_articles = size_df.sum()

for i, topic in enumerate(size_df.index.values):
    # `i` is only the panel position; `topic_id` is the model's topic id.
    # Names and numbering are looked up by id so they stay correct even when
    # some topic id has no articles and is therefore missing from size_df.
    topic_id = int(topic)

    # Breakdown of this cross-domain topic by source topic.
    values = (
        df.loc[df['crosstopic'] == topic, 'topic']
        .value_counts()
        .reindex(TOPICS)
    )
    values.plot.pie(ax=ax[i], colors=colors)

    # Share of all articles assigned to this cross-domain topic.
    topic_size = size_df.loc[topic] / total_articles
    ax[i].set_ylabel('')
    ax[i].set_title(
        f'$\\mathbf{{Topic\\ {topic_id + 1}}}$ | {topic_size:.0%}\n{topic_names[topic_id]}',
        fontsize=11
    )

    # Print the top 20 words for each topic.
    top_words = full_model.model.show_topic(topic_id, topn=20)
    words = [word for word, _ in top_words]
    order.append(words)
    print(f'Topic {topic_id + 1}: {words}')

fig.tight_layout(h_pad=0)

# Create the output folder if needed and build the path portably; a literal
# backslash is only a path separator on Windows.
os.makedirs('images', exist_ok=True)
fig.savefig(os.path.join('images', 'pyLDAvis_all_crosstopics.png'))

### PCA visualization of LDA model



In [None]:

# Single square panel for the cross-topic model's plot.
fig, ax = plt.subplots(figsize=(7,7), dpi=200)

full_model.plot_pyLDAvis(ax)

# Create the output folder if needed and build the path portably; a literal
# backslash is only a path separator on Windows.
os.makedirs('images', exist_ok=True)
fig.savefig(os.path.join('images', 'pyLDAvis_crosstopic_PCA.png'))

### Assign topics to each document

In [None]:
# All topic assignments are written as JSON files into <REPO_PATH>/data/topic_data.
# The path is built with os.path.join so it resolves on any OS, and the folder
# is created up front so the writes below cannot fail on a fresh checkout.
topic_data_dir = os.path.join(REPO_PATH, 'data', 'topic_data')
os.makedirs(topic_data_dir, exist_ok=True)

for topic in TOPICS:
    df_temp = text_df[text_df['topic'] == topic]
    tqdm.pandas(desc=f"Adding subtopics to {topic}")
    # Keep the predicted subtopics in their own Series rather than assigning
    # them back into the filtered slice: that would overwrite the 'topic'
    # column and trigger pandas' SettingWithCopy behaviour.
    subtopics = df_temp.progress_apply(
        lambda x: classify_article(
            x,
            models[topic].dictionary,
            models[topic].model
        ), axis=1
    )
    # storyId -> subtopic assigned by this topic's own LDA model.
    topic_dict = dict(zip(df_temp['storyId'], subtopics))

    topic_file = os.path.join(topic_data_dir, f'{topic}_TOPICS.json')
    with open(topic_file, 'w', encoding='utf-8') as f:
        json.dump(topic_dict, f, indent=2)


# storyId -> cross-domain topic assigned by the all-topics model.
cross_topic_dict = dict(zip(df['storyId'], df['crosstopic']))

cross_topic_file = os.path.join(topic_data_dir, 'CROSS_TOPICS.json')
with open(cross_topic_file, 'w', encoding='utf-8') as f:
    json.dump(cross_topic_dict, f, indent=2)
