In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython.display import Markdown

from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import sys

sys.path.insert(0, r'c:\Users\joneh\master_thesis\src_HF')
from utils.main_utils import *

### Import text data

In [None]:
# Read the raw news stories export.
raw_stories = pd.read_csv(r'C:\Users\joneh\master_thesis\data\raw_news_stories\EIKON_CRU_NEWS_FULL.csv')

# Keep only rows whose fullStory is an actual string (this drops float/NaN
# cells) and is not the literal placeholder 'error'.
story_col = raw_stories['fullStory']
is_text = story_col.apply(lambda value: isinstance(value, str))
is_not_error = story_col != 'error'

# Renumber the surviving rows 0..n-1 so label-based lookups are contiguous.
text_df = raw_stories[is_text & is_not_error].reset_index(drop=True)

display(text_df)

### Demonstration

In [None]:
# Pick one story by row label and walk it through each preprocessing step.
text_str: str = text_df.loc[555, 'fullStory']
display(text_str)

# Step 1: split the raw text into tokens.
tokens: list = word_tokenize(text_str)
print('Tokens:', len(tokens))

# Step 2: keep only purely alphanumeric tokens, which discards punctuation.
tokens_wo_punct = list(filter(str.isalnum, tokens))
print('Tokens without punctuation:', len(tokens_wo_punct))

# Step 3: drop every token whose lowercase form is an English stopword.
stop_words = set(stopwords.words('english'))
tokens_wo_sw = list(
    filter(lambda token: token.lower() not in stop_words, tokens_wo_punct)
)
display(tokens_wo_sw)
print('Tokens without stopwords:', len(tokens_wo_sw))


### Text cleaning

In [None]:
# English stopwords from NLTK, extended with boilerplate terms seen in the stories.
stop_words = set(stopwords.words('english'))

# The token filter compares the *lowercased* token against this set, so every
# custom entry has to be lowercase too; the capitalised spellings used before
# ('Reuters', 'Thomson', ...) could never match and were silently ignored.
# NOTE: the multi-word entries cannot match a single token in that filter;
# they are kept (lowercased) so that no term from the original list is dropped.
other_words = {
    phrase.lower()
    for phrase in [
        'Full', 'Story', 'Reuters', 'copyright', 'c', 'Thomson', 'Click', 'Restrictions',
        'Thomson Reuters', 'Full Story', 'Click Restrictions', 'c Copyright', 'Copyright Thomson',
        'Restrictions https', 'Reuters Click', 'Final Terms'
    ]
}
stop_words = stop_words.union(other_words)

def clean_tokens(tokens: list[str]) -> list[str]:
    """Return the tokens that are alphanumeric and not stopwords.

    A token is kept when ``str.isalnum()`` is true for it and its lowercase
    form is absent from the module-level ``stop_words`` set. The original
    order and casing of the surviving tokens are preserved.
    """
    return [
        token
        for token in tokens
        if token.isalnum() and token.lower() not in stop_words
    ]

def clean_series(series: pd.Series) -> tuple[pd.Series, pd.Series]:
    """Tokenize every text in ``series`` and derive its cleaned token list.

    Args:
        series: Series whose elements are passed one by one to ``word_tokenize``.

    Returns:
        A ``(tokenized, cleaned)`` pair of Series aligned with ``series``:
        ``tokenized`` holds the ``word_tokenize`` output for each element and
        ``cleaned`` holds the result of ``clean_tokens`` applied to it.
    """
    tokenized: pd.Series = series.apply(word_tokenize)
    cleaned: pd.Series = tokenized.apply(clean_tokens)
    return tokenized, cleaned

# Tokenize the stories once, then attach the raw and the cleaned token lists
# as two new columns.
tokenized_col, cleaned_col = clean_series(text_df['fullStory'])
text_df['tokenized'] = tokenized_col
text_df['cleaned'] = cleaned_col

display(text_df)

### Word Cloud

In [None]:
from wordcloud import WordCloud

# Flatten all cleaned token lists into one space-separated text.
# A generator is used instead of `text_df['cleaned'].sum()`: summing a column
# of lists concatenates them one after another, re-copying the growing result
# each time, which becomes quadratic in the number of tokens.
corpus_text = ' '.join(token for doc in text_df['cleaned'] for token in doc)

Wordcloud = WordCloud(
    width=800,
    height=400,
    max_font_size=100,
    colormap='twilight',
    background_color='white'
).generate(corpus_text)

# Render the cloud without axes or padding and save it next to the notebook.
fig = plt.figure(figsize=(8, 4), facecolor=None)
plt.imshow(Wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

fig.savefig(r'C:\Users\joneh\master_thesis\src_HF\5 Topic Modelling\wordcloud.png')

### Latent Dirichlet Allocation (LDA)

In [None]:
# Materialise the cleaned documents once; both steps below iterate over them.
documents = text_df['cleaned'].to_list()

# Create the term dictionary of the corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(documents)

# Convert the list of documents (corpus) into a document-term matrix using the dictionary above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in documents]

# Alias for the gensim LDA model class (kept so the name `Lda` stays defined).
Lda = LdaModel

# Train the LDA model on the document-term matrix.
# random_state pins the model's random initialisation so that re-running the
# notebook on the same data is intended to give the same topics.
ldamodel = Lda(
    doc_term_matrix,
    num_topics=10,
    id2word=dictionary,
    passes=50,
    random_state=42,
)

print(ldamodel.print_topics(num_topics=10, num_words=3))

In [None]:
# Fetch the topic summaries (num_topics=10, num_words=10) and print each one
# as two lines: the topic id, then its term string.
topics = ldamodel.print_topics(num_topics=10, num_words=10)

for topic in topics:
    print(topic[0], topic[1], sep='\n')



In [None]:
def print_topics(model, vectorizer, top_n=10):
    """Display the highest-scoring terms of every topic in ``model``.

    For each entry of ``model.components_`` a one-column ("score") DataFrame
    indexed by ``vectorizer.get_feature_names_out()`` is built, sorted by
    descending score, and its first ``top_n`` rows are displayed under a
    Markdown heading that names the topic index.

    Args:
        model: Object exposing ``components_``, an iterable yielding one
            sequence of per-term scores per topic.
        vectorizer: Object exposing ``get_feature_names_out()``; its result
            labels the scores of each topic.
        top_n: Number of rows to display per topic.

    Returns:
        None. Output is rendered through ``display``.
    """
    for idx, topic in tqdm(enumerate(model.components_)):

        # One row per term, labelled with the vectorizer's feature names.
        scores = pd.DataFrame(topic, vectorizer.get_feature_names_out(), columns=["score"])

        # Highest scores first; a new frame is produced instead of sorting in place.
        ranked = scores.sort_values(by="score", ascending=False)

        # Honour top_n: a fixed 8 rows used to be shown whatever was passed in.
        display(Markdown(f"### Topic {idx}"))
        display(ranked.head(top_n))

# Print the topics
# NOTE(review): `vectorizer` is not assigned in any cell visible here (unless
# the star import from utils.main_utils provides it — TODO confirm), and
# `ldamodel` is the gensim LdaModel trained above, whereas print_topics reads
# `model.components_` and `vectorizer.get_feature_names_out()`. Presumably this
# call was written for a different model/vectorizer pair — verify that it runs
# on a fresh kernel.
print_topics(ldamodel, vectorizer)