# Topic Modeling in Gensim

In [None]:
#Imports
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import LdaModel
from gensim.models import CoherenceModel

from pathlib import Path  
import glob

import re

from pprint import pprint

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

## Preprocessing steps

In these initial steps we're preparing our corpus in order to create topic models using Gensim. 

We follow some preprocessing steps already familiar to us by now (tokenizing, removing stopwords, creating bigrams and trigrams), and we also structure the data in the way Gensim requires in order to train our LDA models: as a dictionary assigning a unique ID to each word, and as vector representations (i.e. each document is represented by a vector in which each element is the frequency count of a particular word in that document, and each frequency count can be mapped to that word's unique ID in the dictionary).

**Tokenize your text either using gensim built-in tokenizing or using your own tokenizing function**

In [None]:
# Tokenize using gensim built-in tokenization

# Read every .txt file in the corpus directory and tokenize it with gensim's
# simple_preprocess, collecting one list of tokens per document
directory_path = 'soderberg-corpus/'
all_docs = [
    gensim.utils.simple_preprocess(corpus_file.read_text(encoding='utf-8'))
    for corpus_file in Path(directory_path).glob("*.txt")
]

# Show the first document as a tokenized list of words
all_docs[0]

In [None]:
# Tokenize using a custom tokenizing function
# (an alternative to the gensim tokenizer: this cell resets all_docs and fills it again)

# Directory that holds the corpus; the loop further down reads every .txt file in it
# and tokenizes it with the custom tokenize() function defined below
from pathlib import Path
directory_path = 'soderberg-corpus/'
# Start from an empty list: one list of tokens will be appended per text
all_docs = []

def tokenize(text):
    """Lowercase text, split it on runs of non-word characters and keep
    only the pieces made up entirely of letters.

    Pieces containing digits or underscores (and empty pieces produced by
    leading/trailing separators) are dropped.
    """
    candidates = re.split(r'\W+', text.lower())
    return list(filter(str.isalpha, candidates))

# Read each corpus file and run its text through the custom tokenizer,
# producing one list of tokens per document
all_docs = [
    tokenize(corpus_file.read_text(encoding='utf-8'))
    for corpus_file in Path(directory_path).glob("*.txt")
]

# Show the first document as a tokenized list of words
all_docs[0]

**Remove stopwords**

In [None]:
# Load the custom stopword list (one stopword per line) into a Python list.
# The file is read explicitly as UTF-8, the same encoding used for the corpus
# texts, so stopwords with non-ASCII letters are decoded identically on every
# platform and can actually match the corpus tokens.
with open("custom-stopwords.txt", "r", encoding="utf-8") as file_object:
    # Strip surrounding whitespace and skip blank lines, so a stray space or a
    # trailing empty line never ends up as a stopword that cannot match.
    custom_stopwords = [
        word for word in (line.strip() for line in file_object) if word
    ]

custom_stopwords

In [None]:
def remove_stopwords(list_of_tokens, stopwords):
    """Return the tokens of list_of_tokens that are not stopwords.

    Parameters:
        list_of_tokens: iterable of tokens (strings) for one document.
        stopwords: any iterable of stopwords (list, set, tuple, ...).

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # Build a set once so each membership test is O(1) instead of scanning the
    # whole stopword list for every token; an existing set is reused as is.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)
    return [token for token in list_of_tokens if token not in stopword_set]

# Filter every tokenized document against the custom stopword list
all_docs_no_stop = [
    remove_stopwords(doc_tokens, custom_stopwords) for doc_tokens in all_docs
]

# Inspect the first document after stopword removal
all_docs_no_stop[0]

**Creating Bigrams and Trigrams**

Bigrams are two words frequently occurring together that need to be grouped together to make sense (e.g. "black hole", "European Union"). Trigrams are 3 words frequently occurring together that need to be grouped together to make sense. Identifying bigrams and trigrams in our corpus will improve the quality of the models.

In [None]:
# Identify bigrams and trigrams
# min_count: minimum number of times words must occur together to be considered a phrase
# threshold: the higher the number, the fewer n-grams will be identified
bigram = gensim.models.Phrases(all_docs_no_stop, min_count=5, threshold=100)
# The trigram model is trained on the bigram-merged documents, so it can join
# an already merged bigram with a neighbouring word
trigram = gensim.models.Phrases(bigram[all_docs_no_stop], threshold=100)

# Phraser objects built from the trained models; these are what we use to
# transform documents below
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    """Return every tokenized document in texts with detected bigrams merged."""
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    """Return every tokenized document in texts with bigrams, then trigrams, merged.

    The bigram model is applied inside this function, so texts should be the
    plain tokenized documents (not documents that already went through
    make_bigrams).
    """
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

data_bigrams = make_bigrams(all_docs_no_stop)
# make_trigrams applies the bigram model itself, so it receives the un-merged
# documents; passing data_bigrams would run the bigram model a second time
data_bigrams_trigrams = make_trigrams(all_docs_no_stop)

# You can find the n-grams by searching for words linked with an underscore
# (command + F and search for underscore)
# If you're not satisfied with the n-grams you're getting (capturing too many
# or too few), modify the min_count and threshold parameters
print(data_bigrams_trigrams[0])

**Creating a dictionary representation of the documents**

In [None]:
# Create Dictionary
# We assign a unique integer ID (key) to every word in the vocabulary of the corpus.
# The vocabulary is built from the documents after bigram/trigram merging,
# so merged phrases get their own IDs.
id2word = corpora.Dictionary(data_bigrams_trigrams)

In [None]:
# We can use token2id to see the mapping between words and their IDs
# (this prints the mapping for the whole vocabulary, so the output can be long)
print(id2word.token2id)

In [None]:
# Create a corpus by vectorizing the texts:
# doc2bow counts the occurrences of each word in a text and pairs every
# frequency count with that word's ID in the dictionary,
# giving one bag-of-words vector per document

corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in data_bigrams_trigrams]

In [None]:
# Print the corpus entry (vector representation) for the first document
# You will see a list of (unique word ID, frequency) pairs
print (corpus[0])

In [None]:
# Human-readable view of the corpus: each word ID is looked up in the dictionary,
# giving (term, frequency) pairs for the first document
[
    [(id2word[token_id], count) for token_id, count in doc_bow]
    for doc_bow in corpus[:1]
]

## Training an LDA Model

In [None]:
"""
Parameters: 
We train our model on the corpus and dictionary we created above

num_topics: the number of topics you want
(you can increase and decrease it until you find topics useful for analysis)

passes: total number of training passes
(ie. how often we train the model on the entire corpus)
iterations: controls how often we repeat a particular loop over each document.
It is important to set the number of “passes” and "iterations" high enough 
so that by the final passes, most of the documents have converged.

chunksize: the number of documents to be used in each training chunk.
I’ve set chunksize = 100, which is more than the amount of documents, 
so I process all the data in one go.

alpha = 'auto' and eta = 'auto': essentially we are automatically learning 
two parameters in the model that we usually would have to specify explicitly.

Experiment with changing the parameters (especially num_topics) until you get meaningful results
Link to documentation: https://radimrehurek.com/gensim/models/ldamodel.html
"""

# Train LDA model
# random_state fixes the seed of the model's random number generator, so that
# restarting the kernel and re-running the notebook reproduces the same topics
# (change the value to explore a different random initialization)
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=8,
    chunksize=100,
    passes=200,
    iterations=500,
    alpha='auto',
    eta='auto',
    random_state=42,
    per_word_topics=True
)

In [None]:
# Topics (probability distributions of words across the corpus)
# This lists the topics (Topic 0, 1, 2, etc.)
# and prints the words most characteristic of each topic,
# each preceded by its probability score (how strongly it is characteristic of the topic)
# Change num_words to get more or fewer words for each topic
pprint(lda_model.print_topics(num_words = 10))

In [None]:
# Distributions of topics over documents:
# which topics are associated with each document.
# For every document in the corpus we collect the list of (topic, probability)
# pairs reported by the model, i.e. the topics most characteristic of that
# document together with their weight of association with it

topics_per_document = []
for doc_bow in corpus:
    doc_topics = lda_model.get_document_topics(doc_bow, minimum_probability=None)
    topics_per_document.append(doc_topics)

topics_per_document

## Visualizing the models

**Visualizing using pyLDAvis**

Each bubble on the left-hand side of the plot represents a topic. The larger the bubble, the more prevalent that topic is and the more documents are associated with it.

A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant. Although slightly overlapping topics is not a bad thing: they reveal connections between topics.

A model with too many topics will typically have many overlapping, small bubbles clustered in one region of the chart.

If you move the cursor over the bubbles, the words and bars on the right-hand side will update. These words are the words characteristic of that topic.

This visualization can give you a sense of how to tune your models by adjusting your parameters (e.g. increasing or decreasing the number of topics). If some of the words are not meaningful, you can also add them to your custom stopwords list.

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [None]:
# Visualize the topics
# Set pyLDAvis up for display inside the notebook
pyLDAvis.enable_notebook()
# Prepare the visualization data from the trained model, the bag-of-words corpus
# and the dictionary; mds selects the scaling method used to place the topic
# bubbles on the chart (see the pyLDAvis documentation for the available options)
lda_display = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds')
pyLDAvis.display(lda_display)

**Visualize which topics are associated with each document in a heatmap**

In [None]:
# Create an empty matrix to hold the topics_per_document results from above.
# The number of topics is read from the trained model instead of repeating the
# literal used at training time, so the matrix always has one column per topic
# even after the model's num_topics is changed.
num_topics = lda_model.num_topics

num_texts = len(topics_per_document)

# One row per document, one column per topic; an entry stays 0 when the model
# did not report that topic for that document
document_topic_matrix = np.zeros((num_texts, num_topics))

In [None]:
# Copy every reported (topic, probability) pair into the row of its document
for row, doc_topics in enumerate(topics_per_document):
    for topic_id, weight in doc_topics:
        document_topic_matrix[row, topic_id] = weight

document_topic_matrix

In [None]:
# Create column labels that match the topics
topic_names = ["Topic {}".format(i) for i in range(num_topics)]

# Get the file names to use as row labels.
# The files are listed with the same Path(...).glob("*.txt") call that was used
# to load the corpus, and .name keeps only the final path component.
# (str.lstrip must not be used to drop the directory: it removes any leading
# characters found in its argument, not a prefix, and would eat the first
# letters of many file names.)
filepath = 'soderberg-corpus/'
text_files = [corpus_file.name for corpus_file in Path(filepath).glob('*.txt')]

# Make a dataframe: one row per document, one column per topic
df = pd.DataFrame(document_topic_matrix, columns=topic_names, index=text_files)
df

In [None]:
# Heatmap of topic distributions for each document
# (rows are documents, columns are topics; annot=True writes the value in each cell)
fig, ax = plt.subplots(figsize=(10,10))
figure = sns.heatmap(df, annot=True, cmap="magma_r", ax = ax)

# Uncomment the line below to save the figure
#plt.savefig("Topics-heatmap.pdf")

**Visualize how topics change across time**

In [None]:
# Sort the rows by their index, i.e. by document file name
# NOTE(review): this gives chronological order only if every file name starts
# with a sortable date (e.g. the year) — confirm against the corpus file names
df_sorted = df.sort_index()
df_sorted

In [None]:
# Heatmap of topic distributions for each document, with the rows in the
# sorted order of df_sorted (sorted by document name)
fig, ax = plt.subplots(figsize=(10,10))
figure = sns.heatmap(df_sorted, annot=True, cmap="magma_r", ax = ax)

_Acknowledgements_: This notebook is inspired by William Mattingly's ["Topic Modeling and Text Classification with Python" tutorial](https://www.youtube.com/watch?v=N0crN8YnF8Y&list=PL2VXyKi-KpYttggRATQVmgFcQst3z6OlX).