# Using LDA for Topic Modeling

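Latent Dirichlet Allocation (LDA) is an unsupervised, generative topic model. It treats every document as a mixture of a fixed number of topics and every topic as a probability distribution over words, and it infers both from the word counts of the corpus. In this notebook we prune the vocabulary, choose the number of topics by comparing coherence scores, fit the final LDA model, and inspect the resulting topics.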

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer


## Load Data

In [19]:
# Read the cleaned, preprocessed dataset from disk and preview its first rows.
data = pd.read_csv(filepath_or_buffer='cleaned_processed_data.csv')
data.head(n=5)

Unnamed: 0,Content,ID
0,"['congressional', 'republican', 'new', 'fear',...",0
1,"['bullet', 'shell', 'get', 'counted', 'blood',...",1
2,"['walt', 'disney', 'bambi', 'opened', 'critic'...",2
3,"['death', 'may', 'great', 'equalizer', 'isnt',...",3
4,"['north', 'korea', 'leader', 'kim', 'said', 's...",4


## Additional Cleaning

The data needs some further cleaning before topic modeling so that the resulting topics are more meaningful.

In [20]:
# Drop rows whose 'Content' is NaN, then renumber both the index and the 'ID' column.

# Boolean mask of missing entries; its sum is the number of rows about to be dropped.
nan_in_context = data['Content'].isna()
num_nans = nan_in_context.sum()
print(num_nans)

data = data.dropna(subset=['Content']).reset_index(drop=True)
data['ID'] = range(len(data))

data

0


Unnamed: 0,Content,ID
0,"['congressional', 'republican', 'new', 'fear',...",0
1,"['bullet', 'shell', 'get', 'counted', 'blood',...",1
2,"['walt', 'disney', 'bambi', 'opened', 'critic'...",2
3,"['death', 'may', 'great', 'equalizer', 'isnt',...",3
4,"['north', 'korea', 'leader', 'kim', 'said', 's...",4
...,...,...
148866,"['program', 'beat', 'dialler', 'scam', 'introd...",148866
148867,"['spam', 'email', 'tempt', 'net', 'shopper', '...",148867
148868,"['careful', 'code', 'new', 'european', 'direct...",148868
148869,"['cyber', 'security', 'chief', 'resigns', 'man...",148869


In [21]:
# observed that '_' was not removed. Further cleaning process
import re

# Strip punctuation and underscores from one document's text; the result
# replaces the 'Content' column through the apply() call below.
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text) # remove punctuations
    # '_' is a word character (\w), so the pattern above keeps it; remove it explicitly.
    text = re.sub(r'_', '', text)
    ## Experiment
    # NOTE(review): `text` is still a str at this point, so this comprehension
    # walks it one character at a time and the function ends up returning a
    # list instead of a string -- presumably `text.split()` (and a re-join)
    # was intended; confirm before relying on this filter.
    text = [word for word in text if word != "trump" and word != "Clinton" and wo
    return text

data['Content'] = data['Content'].apply(clean_text)


## Using TF-IDF scores and Gensim's dictionary to further preprocess the data

In [22]:
import pyLDAvis.gensim_models
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

In [23]:
# Vocabulary pruning bounds, passed to Dictionary.filter_extremes() further down:
#   noAbove - upper bound given as a share of the corpus (0.4 = 40% of documents)
#   noBelow - lower bound given as a document count (0.1% of the number of rows)
noAbove = 0.4
noBelow = 0.001 * len(data)

In [25]:
# Prune the vocabulary in two stages: first by TF-IDF score, then by document
# frequency using gensim's Dictionary.

corpus = data['Content'].to_list()
tokenized_docs = [text.split() for text in corpus]

# Fit TF-IDF on the raw strings. The feature names come back in the same
# order as the columns of the TF-IDF matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Score every word by the highest TF-IDF value it reaches in any single document.
tfidf_scores = tfidf_matrix.max(axis=0).toarray()[0]

# Experimental cut-off: the mean of those per-word maxima.
threshold = tfidf_scores.mean()

word_tfidf_dict = {name: value for name, value in zip(feature_names, tfidf_scores)}

# Keep only the words whose score reaches the threshold; a word missing from
# the TF-IDF vocabulary is scored as 0.
filtered_tokenized_docs = [
    [token for token in tokens if word_tfidf_dict.get(token, 0) >= threshold]
    for tokens in tokenized_docs
]

# Build the gensim vocabulary from the filtered documents, then let
# filter_extremes() drop words outside the noBelow / noAbove document-frequency
# bounds, keeping at most 100000 words.
tokenized_docs = filtered_tokenized_docs
dictionary = Dictionary(tokenized_docs)
dictionary.filter_extremes(no_below=noBelow, no_above=noAbove, keep_n=100000)

# Bag-of-words representation of every document.
doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

## Finding the optimal number of topics using coherence score

In [None]:
# Choose the number of topics by fitting one model per candidate (1 through 10)
# and recording its c_v coherence score.
# c_v is only one of several coherence measures; it was picked as the most
# general one, and a score of about 0.5 is treated here as good.
# `workers` matches the number of cores on the author's machine.

topic = []
score = []

for n_topics in tqdm(range(1, 11)):
    lda_model = LdaMulticore(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=n_topics,
        iterations=10,
        passes=10,
        chunksize=1000,
        workers=8,
        random_state=100,
    )
    cm = CoherenceModel(
        model=lda_model,
        texts=tokenized_docs,
        corpus=doc_term_matrix,
        coherence='c_v',
    )
    topic.append(n_topics)
    score.append(cm.get_coherence())


 20%|██        | 2/10 [04:56<19:56, 149.51s/it]

In [None]:
# Coherence score as a function of the number of topics.
fig, ax = plt.subplots()
ax.plot(topic, score)
ax.set_xlabel('Number of Topics')
ax.set_ylabel('Coherence Score')
plt.show()

## LDA Model

In [29]:
# Train the final model with 9 topics, using more passes and iterations than
# the coherence sweep did.
lda_model_opt = LdaMulticore(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=9,
    iterations=50,
    passes=50,
    workers=8,
    random_state=100,
)

# List every topic (-1 selects all of them) with its highest-weighted words.
lda_model_opt.print_topics(num_topics=-1)

[(0,
  '0.016*"police" + 0.012*"court" + 0.009*"law" + 0.008*"officer" + 0.007*"case" + 0.005*"told" + 0.005*"gun" + 0.005*"justice" + 0.004*"judge" + 0.004*"right"'),
 (1,
  '0.011*"country" + 0.008*"united" + 0.007*"government" + 0.007*"china" + 0.007*"world" + 0.005*"minister" + 0.005*"european" + 0.005*"russia" + 0.005*"north" + 0.005*"leader"'),
 (2,
  '0.004*"woman" + 0.004*"get" + 0.004*"know" + 0.004*"game" + 0.004*"thing" + 0.004*"way" + 0.004*"think" + 0.004*"life" + 0.003*"show" + 0.003*"back"'),
 (3,
  '0.013*"attack" + 0.008*"war" + 0.008*"military" + 0.007*"muslim" + 0.007*"group" + 0.007*"force" + 0.007*"islamic" + 0.007*"syria" + 0.005*"isi" + 0.005*"country"'),
 (4,
  '0.010*"news" + 0.007*"house" + 0.006*"email" + 0.006*"official" + 0.006*"campaign" + 0.005*"medium" + 0.005*"white" + 0.005*"former" + 0.005*"investigation" + 0.005*"report"'),
 (5,
  '0.007*"health" + 0.006*"school" + 0.005*"student" + 0.004*"study" + 0.004*"university" + 0.004*"work" + 0.004*"woman" + 

In [27]:
# Persist the trained 9-topic model to disk.
lda_model_opt.save(fname="lda_model_9")

## Visualize Topics

In [30]:
# Prepare the pyLDAvis data for the fitted model and render it inline.
lda_display = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model_opt,
    corpus=doc_term_matrix,
    dictionary=dictionary,
)
pyLDAvis.display(lda_display)

## Observing Results

In [17]:
# Spot-check a single document: print its topic distribution next to its text
# to see whether the assigned topics make sense.
n = 3
print(lda_model_opt[doc_term_matrix[n]])
print(data['Content'].iloc[n])

[(0, 0.07449697), (1, 0.70184094), (3, 0.15960659), (5, 0.029452505), (6, 0.029310867)]
death may great equalizer isnt necessarily evenhanded field endeavor suffered mortal loss consider muhammad ali arnold palmer sport hollywood death carrie fisher debbie reynolds pop music world had hand down bleakest year start david bowie whose stage persona androgynous glam rocker dance pop star electronic experimentalist music year day old news came died cancer hinted time short lyric final album released two day death otherwise gone great length hide illness public wish privacy ensured death would appear come blue came another shock three month later prince accidentally overdosed painkiller collapsed elevator sprawling home studio near minneapolis death came indication one including prince rogers nelson seen coming energetic onstage ever holding otherwise healthy regimen successfully defied age sixth decade death too leonard cohen hand year undoubtedly see coming shoulder went hesitate say merry