# Topic Modeling using LDA

In [None]:
import pandas as pd
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer

import gensim
from gensim.models import TfidfModel
from gensim import corpora
from gensim.models import CoherenceModel

import numpy as np
import re
import os

import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
# Download the NLTK data packages used by this notebook, one call per package name.
# Presumably these back the tokenizer, the WordNet lemmatizer and the stopword
# list imported at the top of the notebook — confirm if the set of imports changes.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

## Step 1: Load Data

In [None]:
# Resolve the current working directory to a real path and use its parent
# directory as the base path for the project.
working_dir = os.path.realpath('.')
project_base = os.path.dirname(working_dir)
print(f'Project base path: {project_base}')

In [None]:
# Read the training CSV from <project_base>/data into a DataFrame and report
# its columns and shape.
# could possibly combine the test set to get an even larger pool of words
data_dir = os.path.join(project_base, 'data')
data_path = os.path.join(data_dir, 'WikiLarge_Train.csv')
full_df = pd.read_csv(filepath_or_buffer=data_path)
print(f'full_df column names: {list(full_df)}')
print(f'full training data df shape: {full_df.shape}')

In [None]:
# Keep only the raw text column. Take an explicit copy: the preprocessing cells
# below assign back into text_df, and assigning into a selection of full_df
# triggers pandas' SettingWithCopyWarning and has ambiguous semantics.
text_df = full_df[['original_text']].copy()

In [None]:
# report how many rows (documents) were loaded
print(text_df.shape[0])

In [None]:
# preview the first five rows
text_df.head(n=5)

## Step 2: Data Preprocessing

In [None]:
# lowercase every document before any further cleaning
lowercased_text = text_df['original_text'].str.lower()
text_df['original_text'] = lowercased_text

In [None]:
# replace hyphens with spaces
# A single literal character needs no regular expression. The previous pattern
# '[-]' relied on str.replace defaulting to regex mode; pandas >= 2.0 defaults
# to regex=False, which would search for the literal text "[-]" and change nothing.
text_df['original_text'] = text_df['original_text'].str.replace('-', ' ', regex=False)

In [None]:
# remove punctuation: drop every character that is neither a word character nor whitespace
# regex=True is required explicitly (pandas >= 2.0 defaults to literal matching),
# and the raw string avoids invalid-escape warnings for \w and \s.
text_df['original_text'] = text_df['original_text'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# collapse every run of whitespace into a single space
# regex=True is required explicitly (pandas >= 2.0 defaults to literal matching),
# and the raw string avoids an invalid-escape warning for \s.
text_df['original_text'] = text_df['original_text'].str.replace(r'\s+', ' ', regex=True)

In [None]:
# stem and lemmatize words
# Snowball stemmer is a bit more aggressive than the Porter stemmer
# TODO: Try getting bigrams and trigrams

stop_words = set(stopwords.words('english'))

# Build the lemmatizer and stemmer once and reuse them: lemmatize() and stem()
# are called for every token of every document, so constructing a new object
# per call is pure overhead.
_lemmatizer = WordNetLemmatizer()
_stemmer = SnowballStemmer('english')


def lemmatize(text):
    """Return the WordNet lemma of ``text``, lemmatized as a verb (``pos='v'``)."""
    return _lemmatizer.lemmatize(text, pos='v')


def stem(text):
    """Return the English Snowball stem of ``text``."""
    return _stemmer.stem(text)


def tokenize_and_preprocess(text):
    """Tokenize ``text`` and return its cleaned, normalized tokens.

    Tokens that are English stopwords or that have 3 or fewer characters are
    dropped; every remaining token is lemmatized and then stemmed.

    Args:
        text: the document to process, as a single string.

    Returns:
        A list of processed token strings, in their original order.
    """
    return [
        stem(lemmatize(token))
        for token in word_tokenize(text)
        if token not in stop_words and len(token) > 3
    ]

In [None]:
# sanity check for document 500
# Select the text column by name and the row by position; the previous
# text_df.iloc[500][0] relied on integer keys being treated as positions on a
# label-indexed Series, which pandas has deprecated.
sample = text_df['original_text'].iloc[500]

# str.split already returns a list, so no copying comprehension is needed
words = sample.split(' ')


print('Original Document:')
print(words)
print('Document After Tokenization, Stemming, and Lemmatization: ')
print(tokenize_and_preprocess(sample))

In [None]:
%%time
#map function to text and examine at processed sentences
processed_sentences = text_df['original_text'].map(tokenize_and_preprocess)

## Step 3: Create Bag of Words

In [None]:
# build a gensim Dictionary from the tokenized documents
gensim_dictionary = corpora.Dictionary(processed_sentences)

In [None]:
# prune very rare and very common words from the vocabulary
#   no_below: minimum number of documents a word must appear in to be kept
#   no_above: maximum fraction of documents a word may appear in to be kept
gensim_dictionary.filter_extremes(no_above=0.5, no_below=5)

In [None]:
# convert every tokenized document into its bag-of-words representation
corpus = list(map(gensim_dictionary.doc2bow, processed_sentences))

In [None]:
# show the first document with each id looked up in the dictionary,
# i.e. as (word, frequency) pairs instead of (id, frequency) pairs
sample = [
    [(gensim_dictionary[token_id], count) for token_id, count in doc_bow]
    for doc_bow in corpus[:1]
]
sample

## Step 4: Build Model

In [None]:
# function to find the optimum number of topics where coherence score is the highest
# this takes a really long time to run, so its call is kept commented out in this notebook


def find_optimum_no_topics(dic, corpus, text, limit, start=2, step=2,
                           passes=2, workers=2, random_state=17):
    """Train one LDA model per candidate topic count and score each model.

    Candidate topic counts are ``range(start, limit, step)``.

    Args:
        dic: gensim dictionary, passed to the model as ``id2word`` and to the
            coherence model as ``dictionary``.
        corpus: bag-of-words corpus used for training and for the perplexity bound.
        text: tokenized documents, passed to ``CoherenceModel`` as ``texts``.
        limit: exclusive upper bound of the topic counts to try.
        start: first topic count to try.
        step: increment between consecutive topic counts.
        passes: forwarded to ``LdaMulticore`` (default keeps the previous value).
        workers: forwarded to ``LdaMulticore`` (default keeps the previous value).
        random_state: forwarded to ``LdaMulticore`` (default keeps the previous value).

    Returns:
        A tuple ``(models, model_coherence_scores, perplexity_scores)`` of three
        parallel lists with one entry per candidate topic count.
        ``perplexity_scores`` holds the raw values returned by
        ``lda.log_perplexity(corpus)``; no exponential transformation is
        applied here.
    """
    model_coherence_scores = []
    models = []
    perplexity_scores = []
    for num_topics in tqdm(range(start, limit, step)):

        # Build LDA model. Multicore is faster
        lda = gensim.models.LdaMulticore(corpus,
                                         id2word=dic,
                                         num_topics=num_topics,
                                         passes=passes,
                                         workers=workers,
                                         random_state=random_state)
        models.append(lda)
        coherence_model = CoherenceModel(model=lda, texts=text, dictionary=dic, coherence='c_v')
        model_coherence_scores.append(coherence_model.get_coherence())
        # store the raw bound; any conversion to a perplexity value is left to the caller
        perplexity_scores.append(lda.log_perplexity(corpus))
    return models, model_coherence_scores, perplexity_scores

In [None]:
# # this takes about 20 mins to run using an i9-10900k CPU, times may be significantly longer with other CPUs.
# # Therefore, it is commented out. Take my word for it that the optimum number of topics is 22. LDAmulticore models are not 100% reproducible even with random_state, so the coherence scores may vary but are generally close.
# models, coherence_scores, perplexity_scores = find_optimum_no_topics(gensim_dictionary, corpus, processed_sentences, 50, 2, 5)

In [None]:
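# apply np.exp2 transformation to the values collected in perplexity_scores
# NOTE(review): np.exp2(x) below computes 2**bound, while the comment in
# find_optimum_no_topics mentions np.exp2(-bound). The hard-coded pscores are
# all below 1, which matches 2**bound — confirm which form is intended.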
# def calc_perplexity(x):
#     return np.exp2(x)
# pscores = list(map(calc_perplexity, perplexity_scores))

In [None]:
# transformed perplexity values recorded from an earlier run, hard-coded so the
# plot can be drawn without re-running the search; one value per candidate topic count
pscores = [
    0.002863704297799762,
    0.002811022226120487,
    0.0014841978570930149,
    0.0011133278299366141,
    0.0008427106743769833,
    0.0006268748525414524,
    0.0004645755247013632,
    0.00034550270074056255,
    0.0002564365410782232,
    0.00018780669786111197,
]

In [None]:
# coherence scores recorded from an earlier run of find_optimum_no_topics,
# hard-coded so the plot can be drawn without re-running the search;
# one value per candidate topic count
coherence_scores = [
    0.16895017893660008,
    0.4037807434997028,
    0.4140849990233262,
    0.4406477190626243,
    0.474506627250172,
    0.4464699825189897,
    0.4606479110595841,
    0.43374942861540317,
    0.42031969686544524,
    0.397537490213966,
]

In [None]:
# plot coherence scores against the number of topics
# x must produce one value per entry of coherence_scores
limit = 50
start = 2
step = 5
x = range(start, limit, step)
# Label the line itself and let legend() pick it up. The previous
# plt.legend(("Coherenec Score")) passed a plain string (the parentheses do not
# make a tuple), which matplotlib iterates character by character, so the
# legend showed only "C".
plt.plot(x, coherence_scores, label="Coherence Score")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
# plot perplexity scores against the number of topics
# x must produce one value per entry of pscores
limit = 50
start = 2
step = 5
x = range(start, limit, step)
# Label the line itself and let legend() pick it up. The previous
# plt.legend(("Perplexity Score")) passed a plain string (the parentheses do not
# make a tuple), which matplotlib iterates character by character, so the
# legend showed only "P".
plt.plot(x, pscores, label="Perplexity Score")
plt.xlabel("Num Topics")
plt.ylabel("Perplexity score")
plt.legend(loc='best')
plt.show()

In [None]:
# normally we could just take the index for the model since each model is saved in the "models" list. Like this:

# optimal_model = models[4]

# However, that would require you to run find_optimum_no_topics(). We will just run the model once below using the
# optimum number of topics that was originally extracted with find_optimum_no_topics() to get a model to display.

In [None]:
%%time
# takes aboiut 1 minute

num_topics = 22
# Build LDA model. Multicore is faster
lda = gensim.models.LdaMulticore(corpus,
                                 id2word=gensim_dictionary,
                                 num_topics=num_topics,
                                 passes=2, 
                                 workers=2,
                                 random_state=17)
# Print the Keywords
print(lda.print_topics())

## Step 5: Explore

In [None]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models

In [None]:
# print every topic with its weighted keywords, separated by blank lines
all_topics = lda.print_topics(-1)
for idx, topic in all_topics:
    print(f'Topic: {idx} \nWords: {topic}')
    print('\n')

In [None]:
# show the model's topic assignments for example 500
example_bow = corpus[500]
lda[example_bow]

In [None]:
%%time
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, gensim_dictionary)
vis

## Step 6: Add topic number as a feature

In [None]:
# function to extract the topic # with the highest percentage
def extract_topic(idx, model=None, bow_corpus=None):
    """Return the id of the highest-weighted topic for document ``idx``.

    Args:
        idx: position of the document in ``bow_corpus``.
        model: object indexed with a bag-of-words document to obtain an
            iterable of ``(topic_id, weight)`` pairs. Defaults to the
            notebook-level ``lda`` model.
        bow_corpus: indexable collection of bag-of-words documents. Defaults
            to the notebook-level ``corpus``.

    Returns:
        The topic id of the pair with the largest weight; on ties the first
        such pair wins.

    Raises:
        ValueError: if the model returns no topics for the document.
    """
    # Fall back to the notebook globals only when no explicit object is given,
    # so existing one-argument calls (e.g. Series.apply) keep working.
    if model is None:
        model = lda
    if bow_corpus is None:
        bow_corpus = corpus
    topic_distribution = model[bow_corpus[idx]]
    return max(topic_distribution, key=lambda pair: pair[1])[0]

In [None]:
# dominant topic of example 500
extract_topic(idx=500)

In [None]:
# expose the row index as a regular column so it can be mapped over
row_labels = text_df.index
text_df['index'] = row_labels

In [None]:
# look up the dominant topic for every row and store it on full_df
# so it can be used as another feature in supervised learning
document_positions = text_df['index']
full_df['topic'] = document_positions.apply(extract_topic)

In [None]:
# write the training data with the new topic column to <project_base>/data/unsupervised_data
output_dir = os.path.join(project_base, 'data', 'unsupervised_data')
# to_csv does not create missing directories, so make sure the target folder exists
os.makedirs(output_dir, exist_ok=True)
full_df.to_csv(os.path.join(output_dir, 'WikiLarge_Train_With_Topics.csv'), index=False)

## Sources

Demonstration of topic modeling with Gensim's LDA Multicore model came from [this lecture](https://www.youtube.com/watch?v=JznDBeqS1lg&ab_channel=GradientGroup) by Carlos Lara. This lecture helped us to understand the best hyperparameters for a Gensim LDA model and how to use it effectively with the given dataset.
