# Topic Modeling with Latent Dirichlet Allocation Model
In this project extension I will explore applying an LDA model to the data. This model aims to uncover hidden structure in a collection of texts. This type of modeling can be compared to clustering (thus an interesting extension for this project) but with LDA it builds clusters of words rather than clusters of texts.  


> LDA is a generative probabilistic model that assumes each topic is a mixture over an underlying set of words, and each document is a mixture of over a set of topic probabilities.

# Libraries and Data

In [1]:
#custom functions 
from projectfunctions import * 

In [2]:
import pandas as pd  
import numpy as np   
np.random.seed(42)

import pickle   

%matplotlib inline
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors
import seaborn as sns

import gensim 
from gensim.utils import simple_preprocess 
from gensim.parsing.preprocessing import STOPWORDS 
import gensim.corpora as corpora  

import nltk 
from nltk.stem import PorterStemmer
from nltk.stem.porter import * 

from pprint import pprint  

import os 

from wordcloud import WordCloud, STOPWORDS   

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Prepare Data For LDA Analysis 

In [3]:
#load in question data 
classroom_questions_csv = pd.read_csv(r'PDFfiles/classroom_questions.csv')
cq_list = classroom_questions_csv['question'].values.tolist()

In [63]:
def lower_words(text): 
    #return list lowered
    return [t.lower() for t in text]  

def remove_punc(text):  
    #returns a list without punctuation 
    import re 
    return [re.sub(r'[^a-zA-Z0-9]', ' ', t) for t in text]  

def lemmatize(text):  
    from nltk.stem.wordnet import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    lemma_list_of_words = []
    for i in range(0, len(text)):
         l1 = text[i]
         l2 = ' '.join([lemmatizer.lemmatize(word) for word in l1])
         lemma_list_of_words.append(l2)
    return lemma_list_of_words 

def remove_stopwords(text):   
    #returns a list with stopwords removed 
    from gensim.parsing.preprocessing import remove_stopwords 
    return [remove_stopwords(word) for word in text]

def stemm(text): 
    ps = PorterStemmer()
    return [[ps.stem(token) for token in sentence.split(" ")] for sentence in text]

def preprocess(text): 
    lowered = lower_words(text) #lower all words 
    alphanumeric = remove_punc(lowered) #remove punctuation  
    stopped = remove_stopwords(alphanumeric) #remove stop words
    sentence_list = [text.split(",") for text in stopped] #create a list for each sentence
    lemmatized = lemmatize(sentence_list) #lemmatize list
    stemmed = stemm(stopped) # stem words
    return [" ".join(x).split() for x in stemmed] #remove any blanks

  and should_run_async(code)


In [65]:
#apply processing to document
corpi_list = preprocess(cq_list) 

  and should_run_async(code)


In [66]:
#filter out just the words that are greater that 3
for sentence in corpi_list: 
    for word in sentence: 
        if len(word) < 5: #the data here was a bit off on lenght so I needed to adjust it to 5 to actually filter for 3
            sentence.remove(word)

  and should_run_async(code)


In [67]:
corpi_list

  and should_run_async(code)


[['pound'],
 ['illustr', 'water'],
 ['knowledg', 'latitud', 'longitud', 'locat', 'greenland'],
 ['water', 'basement', 'use', 'water'],
 ['factor', 'rust'],
 ['mammal'],
 ['differ', 'breath', 'water'],
 ['construct', 'tower', 'tall', 'block'],
 ['think', 'benjamin', 'franklin', 'famou'],
 ['chang', 'earth', 'orbit'],
 ['direct', 'shadow', 'point', 'directli'],
 ['direct', 'observ', 'noontim'],
 ['direct', 'set'],
 ['s', 'chang', 'chang', 'march', 'septemb'],
 ['direct', 'observ', 'noontim'],
 ['happen', 'altitud', 'januari', 'decemb'],
 ['month', 'lowest', 'noon', 'highest', 'noon'],
 ['5', 'signific', 'number'],
 ['directli', 'overhead', '21st', 'equat'],
 ['briefli', 'explain', 'differ', 'weight'],
 ['identifi', 'similar', 'differ', 'inner', 'outer', 'planet'],
 ['planet', 'weigh', 'explain'],
 ['object',
  'solar',
  'greatest',
  'graviti',
  'happen',
  'gravit',
  'increas',
  'reduc'],
 ['planet', 'orbit'],
 ['eclips', 'occur', 'new', 'moon'],
 ['affect', 'phase'],
 ['locat', 'ti

# Train a Vanilla LDA Model 

In [68]:
#create a dictionary of words 
id2word = corpora.Dictionary(corpi_list) 

#create corpus 
texts = corpi_list

#TDF 
corpus = [id2word.doc2bow(text) for text in corpi_list]

print(corpus[:1][0][:30]) 

#sanity check 
[[(id2word[i], freq) for i, freq in doc] for doc in corpus[:1]]

[(0, 1)]


  and should_run_async(code)
  self._context.run(self._callback, *self._args)


[[('pound', 1)]]

In [70]:
#build model 
lda_model = gensim.models.LdaModel(corpus=corpus, 
                                      id2word=id2word, 
                                      num_topics=10,  
                                      random_state=42,  
                                      alpha='auto', 
                                      per_word_topics=True)

#print keywords in each topic 
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

  and should_run_async(code)


[(0,
  '0.036*"typic" + 0.035*"data" + 0.035*"hurrican" + 0.034*"contain" + '
  '0.023*"consid" + 0.022*"earth" + 0.020*"power" + 0.020*"determin" + '
  '0.020*"bacteria" + 0.019*"energi"'),
 (1,
  '0.066*"increas" + 0.045*"plant" + 0.036*"experi" + 0.035*"function" + '
  '0.034*"approxim" + 0.027*"popul" + 0.021*"temperatur" + 0.020*"identifi" + '
  '0.020*"explain" + 0.020*"extinct"'),
 (2,
  '0.075*"weight" + 0.057*"student" + 0.051*"organ" + 0.050*"cloud" + '
  '0.037*"follow" + 0.036*"certain" + 0.025*"spread" + 0.024*"form" + '
  '0.024*"reproduct" + 0.023*"end"'),
 (3,
  '0.035*"statement" + 0.030*"primari" + 0.030*"system" + 0.029*"base" + '
  '0.029*"summer" + 0.029*"commun" + 0.029*"conclud" + 0.029*"respons" + '
  '0.027*"daili" + 0.026*"fiber"'),
 (4,
  '0.037*"damag" + 0.037*"identifi" + 0.035*"fertil" + 0.032*"occur" + '
  '0.029*"day" + 0.026*"method" + 0.021*"result" + 0.021*"solut" + '
  '0.020*"appear" + 0.019*"air"'),
 (5,
  '0.049*"chang" + 0.044*"chemic" + 0.036*"p

# Model Analysis 

## Dominant Topic & Percentage Contribution 

In [71]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(5)

  and should_run_async(code)


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,1.0,0.5702,"increas, plant, experi, function, approxim, popul, temperatur, identifi, explain, extinct",[pound]
1,1,9.0,0.3753,"water, direct, chang, factor, state, dissolv, solid, earth, travel, rotat","[illustr, water]"
2,2,6.0,0.6082,"earth, layer, provid, reason, surfac, explain, like, chart, popul, gener","[knowledg, latitud, longitud, locat, greenland]"
3,3,9.0,0.8336,"water, direct, chang, factor, state, dissolv, solid, earth, travel, rotat","[water, basement, use, water]"
4,4,9.0,0.7195,"water, direct, chang, factor, state, dissolv, solid, earth, travel, rotat","[factor, rust]"


## The Most Representative Sentence for Each Topic

In [72]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib",
                                       "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(5)

  and should_run_async(code)


Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0.0,0.9081,"typic, data, hurrican, contain, consid, earth, power, determin, bacteria, energi","[determin, latitud, longitud, posit, center, hurrican, betsi, septemb]"
1,1.0,0.9311,"increas, plant, experi, function, approxim, popul, temperatur, identifi, explain, extinct","[happen, bitoic, factor, arctic, ecosystem, dramat, increas, temperatur, answer, complet, sentenc]"
2,2.0,0.9408,"weight, student, organ, cloud, follow, certain, spread, form, reproduct, end","[certain, speci, whiptail, lizard, femal, individu, lizard, reproduc, asexu, disadvantag, asexu,..."
3,3.0,0.9244,"statement, primari, system, base, summer, commun, conclud, respons, daili, fiber","[littl, place, 150cm, meter, stick, 50cm, 10, second, s, veloc]"
4,4.0,0.9543,"damag, identifi, fertil, occur, day, method, result, solut, appear, air","[scientist, identifi, problem, shape, build, paper, airplan, brainstorm, construct, model, plane..."


## PyLDA Visualization 

In [73]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

  and should_run_async(code)


# Resources: 
* [Topic Modeling in Python: Latent Dirichlet Allocation (LDA)](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0) 
* [Topic Modeling Visualization - How to present the results of LDA models?](https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/) 
* [Topic Modeling and Latent Dirichlet Allocation (LDA) in Python](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)