# Topic Modeling with Latent Dirichlet Allocation Model
In this project extension I will explore applying an LDA model to the data. This model aims to uncover hidden structure in a collection of texts. This type of modeling can be compared to clustering (thus an interesting extension for this project) but with LDA it builds clusters of words rather than clusters of texts.  


> LDA is a generative probabilistic model that assumes each topic is a mixture over an underlying set of words, and each document is a mixture of over a set of topic probabilities.

# Libraries and Data

In [None]:
#custom functions 
from projectfunctions import * 

In [71]:
import pandas as pd  
import numpy as np   
np.random.seed(42)

import pickle   

%matplotlib inline
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors
import seaborn as sns

import gensim 
from gensim.utils import simple_preprocess 
from gensim.parsing.preprocessing import STOPWORDS 
import gensim.corpora as corpora  

import nltk 
from nltk.stem import PorterStemmer
from nltk.stem.porter import * 

from pprint import pprint  

import os 

from wordcloud import WordCloud, STOPWORDS   

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Prepare Data For LDA Analysis 

In [None]:
#load in question data 
classroom_questions_csv = pd.read_csv(r'PDFfiles/classroom_questions.csv')
cq_list = classroom_questions_csv['question'].values.tolist()

In [115]:
def lower_words(text): 
    #return list lowered
    return [t.lower() for t in text]  

def remove_punc(text):  
    #returns a list without punctuation 
    import re 
    return [re.sub(r'[^a-zA-Z0-9]', ' ', t) for t in text]  

def lemmatize(text):  
    from nltk.stem.wordnet import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    lemma_list_of_words = []
    for i in range(0, len(text)):
         l1 = text[i]
         l2 = ' '.join([lemmatizer.lemmatize(word) for word in l1])
         lemma_list_of_words.append(l2)
    return lemma_list_of_words 

def remove_stopwords(text):   
    #returns a list with stopwords removed 
    import nltk 
    from nltk.corpus import stopwords
    stopword=stopwords.words('english') 
    #return list with stopwords removed 
    return [t for t in text if word not in stopword]   

def stemm(text): 
    ps = PorterStemmer()
    return [[ps.stem(token) for token in sentence.split(" ")] for sentence in text]

def preprocess(text): 
    lowered = lower_words(text) #lower all words
    alphanumeric = remove_punc(lowered) #remove punctuation  
    sentence_list = [text.split(",") for text in alphanumeric] #create a list for each sentence
    lemmatized = lemmatize(sentence_list) #lemmatize list
    stopped = remove_stopwords(lemmatized) #remove stop words
    stemmed = stemm(stopped) # stem words
    return [" ".join(x).split() for x in stemmed] #remove any blanks

In [116]:
corpi_list = preprocess(cq_list)

# Train a Vanilla LDA Model 

In [117]:
#create a dictionary of words 
id2word = corpora.Dictionary(corpi_list) 

#create corpus 
texts = corpi_list

#TDF 
corpus = [id2word.doc2bow(text) for text in corpi_list]

print(corpus[:1][0][:30]) 

#sanity check 
[[(id2word[i], freq) for i, freq in doc] for doc in corpus[:1]]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]


[[('a', 1), ('how', 1), ('in', 1), ('mani', 1), ('ounc', 1), ('pound', 1)]]

In [118]:
#build model 
lda_model = gensim.models.LdaModel(corpus=corpus, 
                                      id2word=id2word, 
                                      num_topics=10, 
                                      random_state=42, 
                                      chunksize=100, 
                                      alpha='auto', 
                                      per_word_topics=True)

#print keywords in each topic 
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.086*"describ" + 0.043*"use" + 0.039*"can" + 0.035*"and" + 0.034*"be" + '
  '0.031*"a" + 0.026*"to" + 0.024*"where" + 0.023*"found" + 0.022*"you"'),
 (1,
  '0.090*"for" + 0.053*"wa" + 0.051*"name" + 0.048*"organ" + 0.043*"the" + '
  '0.043*"will" + 0.032*"to" + 0.023*"locat" + 0.022*"thi" + 0.021*"plant"'),
 (2,
  '0.109*"the" + 0.044*"in" + 0.044*"most" + 0.038*"what" + 0.033*"on" + '
  '0.030*"and" + 0.029*"be" + 0.026*"one" + 0.025*"would" + 0.024*"a"'),
 (3,
  '0.151*"the" + 0.080*"of" + 0.058*"which" + 0.051*"by" + 0.040*"identifi" + '
  '0.033*"at" + 0.028*"explain" + 0.027*"best" + 0.021*"temperatur" + '
  '0.017*"and"'),
 (4,
  '0.047*"a" + 0.045*"call" + 0.043*"or" + 0.040*"plant" + 0.036*"do" + '
  '0.031*"work" + 0.025*"area" + 0.024*"function" + 0.021*"all" + 0.021*"and"'),
 (5,
  '0.068*"how" + 0.060*"doe" + 0.047*"than" + 0.041*"mani" + 0.035*"day" + '
  '0.034*"one" + 0.031*"cell" + 0.030*"take" + 0.027*"these" + 0.026*"earth"'),
 (6,
  '0.147*"the" + 0.086*"of"

# Model Analysis 

## Dominant Topic & Percentage Contribution 

In [119]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(5)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,5.0,0.4335,"how, doe, than, mani, day, one, cell, take, th...","[how, mani, ounc, in, a, pound]"
1,1,6.0,0.33,"the, of, is, what, to, a, in, that, which, s","[how, would, you, illustr, the, water, cycl]"
2,2,2.0,0.2617,"the, in, most, what, on, and, be, one, would, a","[how, would, you, use, your, knowledg, of, lat..."
3,3,7.0,0.3952,"in, type, of, and, food, are, produc, energi, ...","[if, you, had, eight, inch, of, water, in, you..."
4,4,9.0,0.4019,"the, are, of, state, gener, in, system, factor...","[what, are, some, of, the, factor, that, caus,..."


## The Most Representative Sentence for Each Topic

In [120]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib",
                                       "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0.0,0.6626,"describ, use, can, and, be, a, to, where, found, you","[what, do, you, want, to, be, abl, to, do]"
1,1.0,0.6209,"for, wa, name, organ, the, will, to, locat, thi, plant","[what, organ, is, respons, for, return, dead, plant, and, anim, back, to, the, soil]"
2,2.0,0.7636,"the, in, most, what, on, and, be, one, would, a","[if, you, are, sit, perfectli, still, in, a, move, car, are, you, in, motion]"
3,3.0,0.6639,"the, of, which, by, identifi, at, explain, best, temperatur, and","[the, solubl, of, which, substanc, is, least, affect, by, the, temperatur, of, the, water]"
4,4.0,0.7707,"a, call, or, plant, do, work, area, function, all, and","[true, or, fals, a, book, sit, on, a, shelf, ha, no, energi]"
5,5.0,0.7676,"how, doe, than, mani, day, one, cell, take, these, earth","[approxim, how, mani, day, doe, it, take, to, complet, 1, 4, revolut]"
6,6.0,0.8985,"the, of, is, what, to, a, in, that, which, s","[the, caus, of, the, sun, rise, and, set, in, the, sky, is, the, earth, s, rotat, what, is, the,..."
7,7.0,0.7312,"in, type, of, and, food, are, produc, energi, water, a","[whi, weren, t, time, zone, creat, until, the, late, 1800]"
8,8.0,0.4992,"follow, scientist, list, web, statement, electr, weight, bodi, perform, order","[scientist, hope, robot, similar, to, the, pr2, pancak, flip, bot, will, someday, perform]"
9,9.0,0.5796,"the, are, of, state, gener, in, system, factor, form, were","[what, are, the, 6, abiot, factor]"


## PyLDA Visualization 

In [123]:
import pyLDAvis.gensim


pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

  and should_run_async(code)


# Resources: 
* [Topic Modeling in Python: Latent Dirichlet Allocation (LDA)](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0) 
* [Topic Modeling Visualization - How to present the results of LDA models?](https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/) 
* [Topic Modeling and Latent Dirichlet Allocation (LDA) in Python](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24)