# Topic Modeling with Latent Dirichlet Allocation Model
In this project extension I will explore applying an LDA model to the data. This model aims to uncover hidden structure in a collection of texts. This type of modeling can be compared to clustering (thus an interesting extension for this project) but with LDA it builds clusters of words rather than clusters of texts.  


> LDA is a generative probabilistic model that assumes each topic is a mixture over an underlying set of words, and each document is a mixture of over a set of topic probabilities.

# Libraries and Data

In [1]:
#custom functions 
from projectfunctions import * 

In [2]:
import pandas as pd  
import numpy as np  
import pickle   

%matplotlib inline
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors
import seaborn as sns

import gensim.corpora as corpora 

from pprint import pprint  

import os 

from wordcloud import WordCloud, STOPWORDS

In [3]:
#import text data
df = pickle.load( open( "Pickles/standards_corpi.pkl", "rb" ) ) 
df.drop(df.tail(1).index,inplace=True)  
df

Unnamed: 0,level_0,state,corpus
0,0,TXTfiles/alabama,"'information', 'regarding', 'course', 'study',..."
1,1,TXTfiles/alaska,"'dept', 'education', 'early', 'development', '..."
2,2,TXTfiles/arizona,"'department', 'education', 'academic', 'introd..."
3,3,TXTfiles/colorado,"'review', 'revision', 'committee', 'chairperso..."
4,4,TXTfiles/flordia,"'specifications', 'florida', 'state', 'adoptio..."
5,5,TXTfiles/georgia,"'excellence', 'first', 'excellence', 'designed..."
6,6,TXTfiles/idaho,"'content', 'state', 'superintendent', 'public'..."
7,7,TXTfiles/indiana,"'physics', 'engineering', 'process', 'seps', '..."
8,8,TXTfiles/louisiana,"'shifts', 'following', 'key', 'shifts', 'calle..."
9,9,TXTfiles/mass,"'massachusetts', 'technology', 'framework', 'e..."


# Prepare Data For LDA Analysis 

In [4]:
#set features & labels to list
corpi_list = df['corpus'].values.tolist() 
state_list = df['state'].values.tolist()

In [5]:
#clean / from strings in list 
cleaned_list = []

for word in corpi_list:  
    cleaned_string = " "
    for character in word: 
        if character.isalnum(): 
            cleaned_string += character 
        else: 
            cleaned_string += " " 
    cleaned_string = cleaned_string.split()
    cleaned_list.append(cleaned_string)

In [6]:
print(cleaned_list[:1][0][:30])

['information', 'regarding', 'course', 'study', 'materials', 'contact', 'math', 'technology', 'initiative', 'amsti', 'section', 'state', 'department', 'education', 'gordon', 'persons', 'building', 'ripley', 'street', 'montgomery', 'mail', 'box', 'montgomery', 'telephone', 'thomas', 'bice', 'state', 'superintendent', 'education', 'state']


In [7]:
len(cleaned_list)

30

# Train a Vanilla LDA Model 

In [8]:
#create a dictionary of words 
id2word = corpora.Dictionary(cleaned_list) 

#create corpus 
texts = cleaned_list

#TDF 
corpus = [id2word.doc2bow(text) for text in cleaned_list]

print(corpus[:1][0][:30]) 

#sanity check 
[[(id2word[i], freq) for i, freq in doc] for doc in corpus[:1]]

[(0, 8), (1, 12), (2, 6), (3, 22), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 3), (12, 3), (13, 1), (14, 1), (15, 12), (16, 6), (17, 2), (18, 1), (19, 5), (20, 1), (21, 3), (22, 4), (23, 1), (24, 1), (25, 1), (26, 3), (27, 2), (28, 2), (29, 1)]


[[('abilities', 8),
  ('ability', 12),
  ('abiotic', 6),
  ('able', 22),
  ('abo', 1),
  ('absolute', 2),
  ('absorb', 1),
  ('absorbed', 1),
  ('absorbency', 1),
  ('absorbing', 1),
  ('absorbs', 1),
  ('absorption', 3),
  ('abstract', 3),
  ('abundance', 1),
  ('abundant', 1),
  ('academic', 12),
  ('academies', 6),
  ('accelerate', 2),
  ('accelerating', 1),
  ('acceleration', 5),
  ('acceptable', 1),
  ('access', 3),
  ('accessed', 4),
  ('accessible', 1),
  ('accessories', 1),
  ('accessory', 1),
  ('accidents', 3),
  ('accomplish', 2),
  ('accomplished', 2),
  ('accomplishments', 1),
  ('accordance', 3),
  ('according', 7),
  ('account', 1),
  ('accountability', 1),
  ('accounts', 2),
  ('accuracy', 4),
  ('accurate', 8),
  ('accurately', 1),
  ('achieve', 3),
  ('achievement', 3),
  ('achieving', 1),
  ('acid', 3),
  ('acidification', 1),
  ('acids', 5),
  ('acknowledge', 1),
  ('acknowledgments', 2),
  ('acquire', 1),
  ('acquired', 6),
  ('acquiring', 2),
  ('acquisition', 2),

In [9]:
#build model 
lda_model = gensim.models.LdaModel(corpus=corpus, 
                                      id2word=id2word, 
                                      num_topics=10, 
                                      random_state=42, 
                                      chunksize=100, 
                                      alpha='auto', 
                                      per_word_topics=True)

#print keywords in each topic 
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.011*"energy" + 0.009*"evidence" + 0.008*"use" + 0.007*"information" + '
  '0.007*"engineering" + 0.007*"earth" + 0.007*"data" + 0.006*"scientific" + '
  '0.006*"concepts" + 0.006*"using"'),
 (1,
  '0.011*"energy" + 0.008*"evidence" + 0.008*"earth" + 0.007*"use" + '
  '0.006*"include" + 0.006*"systems" + 0.006*"organisms" + 0.006*"scientific" '
  '+ 0.006*"data" + 0.006*"describe"'),
 (2,
  '0.013*"energy" + 0.008*"using" + 0.007*"evidence" + 0.007*"use" + '
  '0.007*"engineering" + 0.007*"include" + 0.007*"data" + 0.006*"information" '
  '+ 0.005*"design" + 0.005*"describe"'),
 (3,
  '0.008*"energy" + 0.008*"evidence" + 0.007*"information" + 0.007*"use" + '
  '0.007*"earth" + 0.006*"different" + 0.006*"include" + 0.005*"engineering" + '
  '0.005*"data" + 0.005*"using"'),
 (4,
  '0.010*"energy" + 0.009*"earth" + 0.009*"use" + 0.008*"evidence" + '
  '0.007*"scientific" + 0.006*"describe" + 0.006*"data" + 0.006*"information" '
  '+ 0.005*"changes" + 0.005*"natural"'),
 (5,
  '0.

# Model Analysis 

## Dominant Topic & Percentage Contribution 

In [10]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(5)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,4.0,0.32,"energy, earth, use, evidence, scientific, desc...","[information, regarding, course, study, materi..."
1,1,2.0,0.6228,"energy, using, evidence, use, engineering, inc...","[dept, education, early, development, board, e..."
2,2,0.0,0.334,"energy, evidence, use, information, engineerin...","[department, education, academic, introduction..."
3,3,2.0,0.3702,"energy, using, evidence, use, engineering, inc...","[review, revision, committee, chairpersons, jo..."
4,4,4.0,0.3614,"energy, earth, use, evidence, scientific, desc...","[specifications, florida, state, adoption, ins..."


## The Most Representative Sentence for Each Topic

In [11]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib",
                                       "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0.0,0.3953,"energy, evidence, use, information, engineering, earth, data, scientific, concepts, using","[content, february, department, public, instruction, department, public, instruction, kirsten, b..."
1,1.0,0.9828,"energy, evidence, earth, use, include, systems, organisms, scientific, data, describe","[designed, help, educators, teach, essential, course, study, ncdpi, staff, continually, updating..."
2,2.0,0.8648,"energy, using, evidence, use, engineering, include, data, information, design, describe","[excellence, first, excellence, designed, provide, foundational, knowledge, skills, develop, pro..."
3,3.0,0.3908,"energy, evidence, information, use, earth, different, include, engineering, data, using","[academic, introduction, introduction, prekindergarten, physical, kindergarten, chemistry, physi..."
4,4.0,0.9893,"energy, earth, use, evidence, scientific, describe, data, information, changes, natural","[elementary, essential, knowledge, skills, subchapter, elementary, statutory, authority, provisi..."
5,5.0,0.3379,"energy, earth, evidence, engineering, scientific, information, models, describe, use, data","[academic, performance, indicators, mick, zais, ph, state, superintendent, education, department..."
6,6.0,0.4255,"energy, earth, scientific, data, use, evidence, information, using, change, describe","[physics, engineering, process, seps, engineering, process, processes, skills, expected, learn, ..."
7,7.0,0.6375,"earth, energy, evidence, scientific, use, data, system, water, engineering, matter","[academic, value, statement, possesses, citizenry, known, intelligent, knowledgeable, hardworkin..."


## PyLDA Visualization 

In [15]:
import pyLDAvis.sklearn 
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

  and should_run_async(code)


# Resources: 
* [Topic Modeling in Python: Latent Dirichlet Allocation (LDA)](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0) 
* [Topic Modeling Visualization - How to present the results of LDA models?](https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/)