# Imports

In [21]:
#import nltk
import re
#import spacy # for lemmatization
#!python -m spacy download en_core_web_md

import numpy as np
import pandas as pd
from pprint import pprint

from pathlib import Path  
import glob

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


import pyLDAvis
import pyLDAvis.gensim_models as gensimvis


#import pyLDAvis.gensim as gensimvis
#import pyLDAvis

#import matplotlib.pyplot as plt
#%matplotlib inline

# 1-Setting up for Building the Models

There's some set-up involved in using Gensim to create topic models. 

If we follow some preprocessing steps (most of which should be familiar to us by now) this will increase the quality of the models. 

Gensim also expects the data to be structured in a certain way to generate models (as a dictionary and corpus created from that dictionary).

We'll go over these steps which involve: preprocessing then making a dictionary and corpus.

## Preprocessing

**Tokenize your text either using gensim's built-in tokenizer or using your own tokenizing function**

In [22]:
# Tokenize using gensim built-in tokenization

#Put all texts into a single list
#Loop through the texts and tokenize them with gensim's built-in tokenizer
#(it lowercases the text and, by default, drops one-letter tokens such as 'i' and 'a')
directory_path = 'kafka-corpus'
all_docs = []

# sorted() fixes the document order, so all_docs[0] and every document index
# used later refer to the same file on every run and every machine; the order
# in which glob() yields paths is otherwise left to the filesystem.
for filepath in sorted(Path(directory_path).glob("*.txt")):
    with open(filepath, 'r', encoding='utf-8') as file:
        text = file.read()
        tokenized_text = gensim.utils.simple_preprocess(text)
        all_docs.append(tokenized_text)

#See the first document as tokenized list of words
all_docs[0]

['was',
 'in',
 'great',
 'difficulty',
 'an',
 'urgent',
 'journey',
 'was',
 'facing',
 'me',
 'seriously',
 'ill',
 'man',
 'was',
 'waiting',
 'for',
 'me',
 'in',
 'village',
 'ten',
 'miles',
 'distant',
 'severe',
 'snowstorm',
 'filled',
 'the',
 'space',
 'between',
 'him',
 'and',
 'me',
 'had',
 'carriage',
 'light',
 'one',
 'with',
 'large',
 'wheels',
 'entirely',
 'suitable',
 'for',
 'our',
 'country',
 'roads',
 'wrapped',
 'up',
 'in',
 'furs',
 'with',
 'the',
 'bag',
 'of',
 'instruments',
 'in',
 'my',
 'hand',
 'was',
 'already',
 'standing',
 'in',
 'the',
 'courtyard',
 'ready',
 'for',
 'the',
 'journey',
 'but',
 'the',
 'horse',
 'was',
 'missing',
 'the',
 'horse',
 'my',
 'own',
 'horse',
 'had',
 'died',
 'the',
 'previous',
 'night',
 'as',
 'result',
 'of',
 'over',
 'exertion',
 'in',
 'this',
 'icy',
 'winter',
 'my',
 'servant',
 'girl',
 'was',
 'at',
 'that',
 'very',
 'moment',
 'running',
 'around',
 'the',
 'village',
 'to',
 'see',
 'if',
 'she'

In [23]:
# Tokenize using custom tokenizing function

#Put all texts into a single list
#Loop through the texts and tokenize them with custom tokenizing function
from pathlib import Path
directory_path = 'kafka-corpus'
all_docs = []

def tokenize(text):
    """Lowercase ``text`` and return its purely alphabetic words, in order.

    The text is split on runs of non-word characters; pieces that are empty
    or contain anything other than letters (digits, underscores) are dropped.
    """
    pieces = re.split(r'\W+', text.lower())
    return [piece for piece in pieces if piece.isalpha()]

# Tokenize every corpus file with the custom tokenize() function.
# sorted() fixes the document order, so all_docs[0] and every document index
# used later refer to the same file on every run and every machine; the order
# in which glob() yields paths is otherwise left to the filesystem.
for filepath in sorted(Path(directory_path).glob("*.txt")):
    with open(filepath, 'r', encoding='utf-8') as file:
        text = file.read()
        tokenized_text = tokenize(text)
        all_docs.append(tokenized_text)

#See the first document as tokenized list of words
all_docs[0]

['i',
 'was',
 'in',
 'great',
 'difficulty',
 'an',
 'urgent',
 'journey',
 'was',
 'facing',
 'me',
 'a',
 'seriously',
 'ill',
 'man',
 'was',
 'waiting',
 'for',
 'me',
 'in',
 'a',
 'village',
 'ten',
 'miles',
 'distant',
 'a',
 'severe',
 'snowstorm',
 'filled',
 'the',
 'space',
 'between',
 'him',
 'and',
 'me',
 'i',
 'had',
 'a',
 'carriage',
 'a',
 'light',
 'one',
 'with',
 'large',
 'wheels',
 'entirely',
 'suitable',
 'for',
 'our',
 'country',
 'roads',
 'wrapped',
 'up',
 'in',
 'furs',
 'with',
 'the',
 'bag',
 'of',
 'instruments',
 'in',
 'my',
 'hand',
 'i',
 'was',
 'already',
 'standing',
 'in',
 'the',
 'courtyard',
 'ready',
 'for',
 'the',
 'journey',
 'but',
 'the',
 'horse',
 'was',
 'missing',
 'the',
 'horse',
 'my',
 'own',
 'horse',
 'had',
 'died',
 'the',
 'previous',
 'night',
 'as',
 'a',
 'result',
 'of',
 'over',
 'exertion',
 'in',
 'this',
 'icy',
 'winter',
 'my',
 'servant',
 'girl',
 'was',
 'at',
 'that',
 'very',
 'moment',
 'running',
 'aro

**Remove stopwords**

In [24]:
#Stopwords: refer to "Preprocessing" notebook for more details on stopwords

#Load custom stopwords list (this is the default spacy list)
#open your txt file and convert to a Python list
# Read one stopword per line. The encoding is given explicitly, as for the
# corpus files, so the list decodes the same way on every platform instead of
# depending on the locale's default encoding.
with open("custom-stopwords.txt", "r", encoding="utf-8") as file_object:
    custom_stopwords = [line.rstrip('\n') for line in file_object]

custom_stopwords

["'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [27]:
def remove_stopwords(list_of_tokens, stopwords):
    """Return the tokens of one document that are not stopwords.

    Parameters
    ----------
    list_of_tokens : iterable of str
        Tokens of a single document, in reading order.
    stopwords : iterable of str
        Words to drop; any list, tuple, set or one-shot iterable of strings.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and with repeats kept.
    """
    # Build the lookup set once: testing membership against a list rescans
    # the whole stopword list for every token, and a one-shot iterable would
    # be used up by the first test.
    stopword_set = frozenset(stopwords)
    return [token for token in list_of_tokens if token not in stopword_set]

# Strip the custom stopwords from every tokenized document, keeping the
# documents in the same order as all_docs.
all_docs_no_stop = [
    remove_stopwords(doc_tokens, custom_stopwords) for doc_tokens in all_docs
]
    
all_docs_no_stop[0]

['great',
 'difficulty',
 'urgent',
 'journey',
 'facing',
 'seriously',
 'ill',
 'man',
 'waiting',
 'village',
 'miles',
 'distant',
 'severe',
 'snowstorm',
 'filled',
 'space',
 'carriage',
 'light',
 'large',
 'wheels',
 'entirely',
 'suitable',
 'country',
 'roads',
 'wrapped',
 'furs',
 'bag',
 'instruments',
 'hand',
 'standing',
 'courtyard',
 'ready',
 'journey',
 'horse',
 'missing',
 'horse',
 'horse',
 'died',
 'previous',
 'night',
 'result',
 'exertion',
 'icy',
 'winter',
 'servant',
 'girl',
 'moment',
 'running',
 'village',
 'borrow',
 'horse',
 'hopeless',
 'knew',
 'stood',
 'useless',
 'increasingly',
 'covered',
 'snow',
 'time',
 'immobile',
 'girl',
 'appeared',
 'gate',
 'swinging',
 'lantern',
 'course',
 'going',
 'lend',
 'horse',
 'journey',
 'walked',
 'courtyard',
 'couldn',
 't',
 'distracted',
 'tormented',
 'kicked',
 'foot',
 'cracked',
 'door',
 'pig',
 'sty',
 'years',
 'door',
 'opened',
 'banged',
 'fro',
 'hinges',
 'warmth',
 'smell',
 'horses'

## Creating Bigrams and Trigrams

Bigrams are two words that frequently occur together and need to be grouped to make sense (e.g. "black hole", "European Union"). Trigrams are three words that frequently occur together and need to be grouped to make sense. Identifying bigrams and trigrams in our corpus will improve the quality of the models.

In [None]:
# Build the bigram and trigram models
# min_count: ignore words and word pairs that occur fewer than this many times in the corpus
# threshold: minimum score a word pair needs to be joined into a phrase -
#            the higher the value, the fewer phrases are formed
bigram = gensim.models.Phrases(all_docs_no_stop, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[all_docs_no_stop], threshold=100)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example (first stopword-free document)
print(trigram_mod[bigram_mod[all_docs_no_stop[0]]])

def make_bigrams(texts):
    """Apply the bigram phraser to each tokenized document in ``texts``."""
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    """Apply the bigram phraser, then the trigram phraser, to each document."""
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

# The phrasers are applied to the same stopword-free documents they were
# trained on.
data_bigrams = make_bigrams(all_docs_no_stop)
data_bigrams_trigrams = make_trigrams(data_bigrams)

## search for words linked with underscore: those are the bigrams and trigrams
# If you're not satisfied with the bigrams you're getting (capturing too many
# or too few), modify the min_count and threshold parameters
print(data_bigrams_trigrams[0][0:20])

In [36]:
# Build the bigram and trigram models
# min_count: ignore words and word pairs that occur fewer than this many times in the corpus
# threshold: minimum score a word pair needs to be joined into a phrase - the higher the value, the fewer phrases are formed
bigram = gensim.models.Phrases(all_docs_no_stop, min_count=5, threshold=100)
# The trigram model is trained on the bigram-merged documents
trigram = gensim.models.Phrases(bigram[all_docs_no_stop], threshold=100)

# Phraser wraps each trained model for applying it to documents
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    """Apply the notebook-level ``bigram_mod`` phraser to each document.

    ``texts`` is an iterable of tokenized documents; the result is a list
    with one transformed document per input, in the same order.
    """
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram_mod[doc])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to each document.

    ``texts`` is an iterable of tokenized documents; the result is a list
    with one transformed document per input, in the same order.
    """
    merged_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

# Merge the detected phrases in every stopword-free document
data_bigrams = make_bigrams(all_docs_no_stop)
data_bigrams_trigrams = make_trigrams(data_bigrams)

## search for words linked with underscore: those are the bigrams and trigrams
# If you're not satisfied with the bigrams you're getting (capturing too many
# or too few), modify the min_count and threshold parameters
print(data_bigrams_trigrams[0])

['great', 'difficulty', 'urgent', 'journey', 'facing', 'seriously', 'ill', 'man', 'waiting', 'village', 'miles', 'distant', 'severe', 'snowstorm', 'filled', 'space', 'carriage', 'light', 'large', 'wheels', 'entirely', 'suitable', 'country', 'roads', 'wrapped', 'furs', 'bag', 'instruments', 'hand', 'standing', 'courtyard', 'ready', 'journey', 'horse', 'missing', 'horse', 'horse', 'died', 'previous', 'night', 'result', 'exertion', 'icy', 'winter', 'servant', 'girl', 'moment', 'running', 'village', 'borrow', 'horse', 'hopeless', 'knew', 'stood', 'useless', 'increasingly', 'covered', 'snow', 'time', 'immobile', 'girl', 'appeared', 'gate', 'swinging', 'lantern', 'course', 'going', 'lend', 'horse', 'journey', 'walked', 'courtyard', 'couldn', 't', 'distracted', 'tormented', 'kicked', 'foot', 'cracked', 'door', 'pig', 'sty', 'years', 'door', 'opened', 'banged', 'fro', 'hinges', 'warmth', 'smell', 'horses', 'came', 'dim', 'stall', 'lantern', 'rope', 'swayed', 'inside', 'man', 'huddled', 'stall'

In [None]:
#Let’s define the functions to remove the stopwords, make bigrams 
#and call them sequentially.

def remove_stopwords(texts):
    """Re-tokenize each document in ``texts`` and drop the custom stopwords.

    Each document is converted with ``str()``, tokenized with gensim's
    ``simple_preprocess`` and filtered against the notebook-level
    ``custom_stopwords`` list. Returns one token list per input document.

    NOTE(review): once this cell runs, this one-argument definition replaces
    the earlier ``remove_stopwords(list_of_tokens, stopwords)``.
    """
    # This used to call ``tokenized_text_files(str(doc))``, but that name is
    # passed around as a list of documents elsewhere in the notebook, not as
    # a function. simple_preprocess (imported in the imports cell) is the
    # string tokenizer.
    return [[word for word in simple_preprocess(str(doc)) if word not in custom_stopwords] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document (a sequence of tokens) is joined with spaces, run through
    the notebook-level ``nlp`` pipeline and reduced to the lemmas of the
    tokens whose ``pos_`` is in ``allowed_postags``. Returns one list of
    lemmas per document. Tag reference: https://spacy.io/api/annotation
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(sent) for sent in texts]


In [None]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize raw text documents, keeping only selected parts of speech.

    Loads the ``en_core_web_sm`` pipeline (parser and NER disabled) on every
    call, runs each item of ``texts`` through it and joins the lemmas of the
    tokens whose ``pos_`` is in ``allowed_postags`` with single spaces.
    Returns one string per document.

    NOTE(review): relies on ``spacy`` being imported; the import in the
    imports cell is currently commented out.
    """
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    for text in texts:
        kept_lemmas = [token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags]
        texts_out.append(" ".join(kept_lemmas))
    return texts_out


# NOTE(review): `data` is not defined in any cell shown in this notebook -
# presumably a list of raw document strings; confirm before running.
lemmatized_texts = lemmatization(data)
# Show the first 90 elements of the first result (characters, if it is a string)
print (lemmatized_texts[0][0:90])

In [None]:
# Remove Stop Words
# NOTE(review): `tokenized_text_files` is not defined in any cell shown in this
# notebook - presumably the list of tokenized documents; confirm before running.
text_files_nostops = remove_stopwords(tokenized_text_files)

# Form Bigrams
text_files_bigrams = make_bigrams(text_files_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# NOTE(review): other cells load 'en_core_web_md' / 'en_core_web_sm'; confirm
# that the 'en' shortcut resolves in the installed spaCy version.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
# Lemmatize the bigram output built just above; the cell used to pass
# `data_words_bigrams`, a name that nothing in the notebook defines.
data_lemmatized = lemmatization(text_files_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

In [None]:
#lemmatize: refer to preprocessing notebook to lemmatize
#need to do this file by file because spacy crashes the kernel
#Load language model (it needs to match the name above)
nlp = spacy.load('en_core_web_md')

#Open your text and create spaCy document
filepath = 'kafka_metamorphosis.txt'
# Read inside a with-block so the input file is closed as soon as it has been
# read, instead of leaving the handle open for the rest of the session.
with open(filepath, encoding='utf-8') as source:
    text = source.read()
document = nlp(text)

# Write the lowercased lemma of every token, each followed by one space, to
# '<name>-lemmatized.txt' next to the input file.
outname = filepath.replace('.txt', '-lemmatized.txt')
with open(outname, 'w', encoding='utf8') as out:
    for token in document:
        # Get the lemma for each token
        out.write(token.lemma_.lower())
        # Insert white space between each token
        out.write(' ')

## Create Corpus and Dictionary needed for Topic Modeling

Creating a dictionary from the corpus restructures the text in a way that prepares it for topic modeling. 
Gensim assigns a unique id to each unique word across the documents and records its frequency. 
The corpus produced by the cell below is a list of (word_id, word_frequency) pairs for each document. For example, (0, 1) means that word id 0 occurs once in the first document; likewise, (1, 2) would mean that word id 1 occurs twice, and so on. This is used as the input by the LDA model.

In [None]:
# Map every unique token in the lemmatized documents to an integer id
id2word = corpora.Dictionary(data_lemmatized)

# The documents that make up the corpus
texts = data_lemmatized

# Bag-of-words form of each document: (word_id, word_frequency) pairs
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document
print(corpus[:1])

In [None]:
# Alternative to the cell above: build the dictionary and corpus from `data_words`.
# NOTE(review): `data_words` is not defined in any cell shown in this notebook -
# presumably a list of token lists; confirm which variable is intended.
id2word = corpora.Dictionary(data_words)

# Convert each document to its bag-of-words form
corpus = []
for text in data_words:
    new = id2word.doc2bow(text)
    corpus.append(new)

# First 20 bag-of-words entries of the first document
print (corpus[0][0:20])

# Word stored under id 0 (the index expression [0][:1][0] evaluates to 0)
word = id2word[[0][:1][0]]
print (word)

In [None]:
# First document's bag-of-words with each id replaced by its word (term-frequency)
[
    [(id2word[token_id], count) for token_id, count in doc_bow]
    for doc_bow in corpus[:1]
]

# 2-Building the Topic Models

In [None]:
"""
Parameters: 
corpus and dictionary we created above

num_topics is the number of topics

chunksize: the number of documents to be used in each training chunk

passes: total number of training passes / number of passes through training data

update_every: determines how often the model parameters should be updated

"""

# Build LDA model
# random_state is fixed so that re-running the cell trains with the same seed.
# NOTE(review): alpha='auto' presumably lets gensim learn the document-topic
# prior from the corpus - confirm against the LdaModel documentation.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the top words of each topic (the model above was built with num_topics=20)
#words for each topic and the weightage(importance) of each word (how strongly characteristic of topic)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus
doc_lda = lda_model[corpus]

In [None]:
## Post-model 1: explore corpus-wide summary of topics
### getting the topics and top words; change num_words to retrieve a different number of top words
topics = lda_model.print_topics(num_words = 10)
# Print one topic per line
for topic in topics:
    print(topic)

In [None]:
## Post-model 2: explore topics associated with each document
### for each document in the corpus, get its list of topic probabilities
l=[lda_model.get_document_topics(item) for item in corpus]
### show the result for the first five documents
# A bare `text_raw_tokens[0:5]` expression used to sit here. That name is not
# defined anywhere in the notebook, and its value was never displayed because
# only a cell's last expression is shown.
l[0:5]

## Visualizing models

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
# Use the gensimvis alias from the imports cell (pyLDAvis.gensim_models);
# the notebook never imports a pyLDAvis.gensim submodule.
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

In [None]:
pyLDAvis.enable_notebook()
# Same visualization with two options passed through to pyLDAvis: the "mmds"
# scaling method for the topic map and R=30 terms in the bar chart.
# Use the gensimvis alias from the imports cell (pyLDAvis.gensim_models);
# the notebook never imports a pyLDAvis.gensim submodule.
vis = gensimvis.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

In [None]:
## Visualize - may not work on jhub yet
# Import the same pyLDAvis module as the imports cell (pyLDAvis.gensim_models).
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()
# Pass the model, corpus and dictionary built earlier in this notebook; the
# names used here before (ldamod, corpus_fromdict, text_raw_dict) are not
# defined in any cell.
lda_display = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.display(lda_display)

Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.

A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.

Alright, if you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.

If two bubbles overlap, the topics they represent are similar (they share many of their top words).
The larger the bubble, the more prevalent that topic is in the corpus.

The visualization can give you a sense of how to refine your models and how you might want to adjust your parameters. 
If there are a lot of big clusters (bubbles) all overlapping, then you might need to increase your number of topics.
If words are not meaningful, you can also add them to your custom stopwords list (cf. the Preprocessing notebook).

# Refining topic models

You can refine your models using the visualization above. 
If you want to get more in-depth you could use perplexity and coherence scores.

Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is. In my experience, topic coherence score, in particular, has been more helpful.

In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound (a negative number),
# not the perplexity itself. A higher bound (closer to zero) is better.
log_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', log_bound)
# Perplexity is 2 ** (-bound): a measure of how good the model is, lower is better.
print('Perplexity: ', 2 ** (-log_bound))

# Compute Coherence Score
#The higher the better
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

**How to find the optimal number of topics**

My approach to finding the optimal number of topics is to build many LDA models with different values of number of topics (k) and pick the one that gives the highest coherence value.

Choosing a ‘k’ that marks the end of a rapid growth of topic coherence usually offers meaningful and interpretable topics. Picking an even higher value can sometimes provide more granular sub-topics.

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Trains one LDA model for each topic count in range(start, limit, step)
    and scores it with gensim's CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound on the number of topics (exclusive)
    start : Smallest number of topics to try
    step : Increment between successive topic counts

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train with the same LdaModel class and settings as the main model in
        # this notebook. The Mallet wrapper used here before needed a
        # `mallet_path` that no cell defines and the gensim.models.wrappers
        # module, which gensim 4 no longer provides.
        # id2word is the `dictionary` argument, not the notebook-level global,
        # so the function scores exactly the data it was given.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto')
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
# Trains one model per topic count in range(2, 40, 6): 2, 8, 14, 20, 26, 32 and 38 topics.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=6)

In [None]:
# Show graph
# NOTE(review): needs `import matplotlib.pyplot as plt`, which is commented
# out in the imports cell.
# These must match the start/limit/step passed to compute_coherence_values.
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# One-element tuple: without the trailing comma ("coherence_values") is a
# plain string, and legend() would take its characters one by one as labels.
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# Print the coherence scores
# x holds the topic counts from the plotting cell; each is paired with its score
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

If the coherence score seems to keep increasing, it may make better sense to pick the model that gave the highest CV before flattening out. 

In [None]:
# Select the model and print the topics
# Index 3 is the fourth model trained; with start=2 and step=6 that is the
# 20-topic model. Change the index to match the best score printed above.
optimal_model = model_list[3]
# NOTE(review): model_topics is not used by any cell shown in this notebook.
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

## Topics distribution across documents

In [None]:
# NOTE(review): `df_topic_sents_keywords` is not defined in any cell shown in
# this notebook - presumably a DataFrame with 'Dominant_Topic' and
# 'Topic_Keywords' columns, one row per document; confirm before running.

# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Share of Documents for Each Topic (a fraction of the total, rounded to 4 places)
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

## Visualizing models

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
# Use the gensimvis alias from the imports cell (pyLDAvis.gensim_models);
# the notebook never imports a pyLDAvis.gensim submodule.
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

In [None]:
pyLDAvis.enable_notebook()
# Same visualization with two options passed through to pyLDAvis: the "mmds"
# scaling method for the topic map and R=30 terms in the bar chart.
# Use the gensimvis alias from the imports cell (pyLDAvis.gensim_models);
# the notebook never imports a pyLDAvis.gensim submodule.
vis = gensimvis.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

In [None]:
# The `import` keyword was missing from this line, which made the whole cell
# a syntax error.
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# feed the LDA model into the pyLDAvis instance
# Pass the model, corpus and dictionary built earlier in this notebook; the
# names `ldamodel` and `dictionary` are not defined in any cell.
lda_viz = gensimvis.prepare(lda_model, corpus, id2word)
# Display the prepared visualization as the cell's output
lda_viz

Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.

A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.

Alright, if you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.



If two bubbles overlap, the topics they represent are similar (they share many of their top words).
The larger the bubble, the more prevalent that topic is in the corpus.