# Topic Modeling with a Latent Dirichlet Allocation (LDA) Model
In this project extension I explore applying an LDA model to the data. LDA aims to uncover hidden structure in a collection of texts. This type of modeling is comparable to clustering (which makes it an interesting extension for this project), but LDA builds clusters of words rather than clusters of texts.


> LDA is a generative probabilistic model that assumes each topic is a mixture over an underlying set of words, and each document is a mixture over a set of topic probabilities.

# Libraries and Data

In [92]:
#custom functions 
from projectfunctions import * 

  and should_run_async(code)


In [93]:
import warnings 
warnings.simplefilter('always')

import pandas as pd 
import pickle  
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np  

import gensim
import gensim.corpora as corpora 

from pprint import pprint 

import pyLDAvis.gensim 
import pyLDAvis 
import os

  and should_run_async(code)


In [94]:
#import text data
#NOTE(review): pickle.load can execute arbitrary code; only load pickles
#produced by this project.
#The context manager closes the file handle as soon as loading finishes.
with open("Pickles/standards_corpi.pkl", "rb") as f:
    df = pickle.load(f)

#drop the final row of the loaded frame (reassign instead of mutating in place)
df = df.drop(df.tail(1).index)
df

  and should_run_async(code)
  df = pickle.load( open( "Pickles/standards_corpi.pkl", "rb" ) )


Unnamed: 0,level_0,state,corpus
0,0,TXTfiles/alabama,"'information', 'regarding', 'course', 'study',..."
1,1,TXTfiles/alaska,"'dept', 'education', 'early', 'development', '..."
2,2,TXTfiles/arizona,"'department', 'education', 'academic', 'introd..."
3,3,TXTfiles/colorado,"'review', 'revision', 'committee', 'chairperso..."
4,4,TXTfiles/flordia,"'specifications', 'florida', 'state', 'adoptio..."
5,5,TXTfiles/georgia,"'excellence', 'first', 'excellence', 'designed..."
6,6,TXTfiles/idaho,"'content', 'state', 'superintendent', 'public'..."
7,7,TXTfiles/indiana,"'physics', 'engineering', 'process', 'seps', '..."
8,8,TXTfiles/louisiana,"'shifts', 'following', 'key', 'shifts', 'calle..."
9,9,TXTfiles/mass,"'massachusetts', 'technology', 'framework', 'e..."


# Prepare Data For LDA Analysis 

In [95]:
#features (corpus text) and labels (state) pulled out of the frame as plain lists
corpi_list, state_list = (
    df[column].values.tolist() for column in ('corpus', 'state')
)

  and should_run_async(code)


In [96]:
#tokenise each corpus string: every non-alphanumeric character is replaced
#by a space, then the text is split on whitespace into a list of tokens
cleaned_list = [
    "".join(ch if ch.isalnum() else " " for ch in document).split()
    for document in corpi_list
]

  and should_run_async(code)


In [97]:
#sanity check: first 30 tokens of the first document
print(cleaned_list[0][:30])

['information', 'regarding', 'course', 'study', 'materials', 'contact', 'math', 'technology', 'initiative', 'amsti', 'section', 'state', 'department', 'education', 'gordon', 'persons', 'building', 'ripley', 'street', 'montgomery', 'mail', 'box', 'montgomery', 'telephone', 'thomas', 'bice', 'state', 'superintendent', 'education', 'state']


  and should_run_async(code)


In [98]:
#build the gensim Dictionary from the tokenised documents
id2word = corpora.Dictionary(documents=cleaned_list)

#show the resulting object
id2word

  and should_run_async(code)


<gensim.corpora.dictionary.Dictionary at 0x7f886578daf0>

In [50]:
#tokenised documents used for both the dictionary and the corpus
texts = cleaned_list

#gensim Dictionary built from the tokenised documents
#(same construction as the previous cell)
id2word = corpora.Dictionary(texts)

#bag-of-words corpus: one doc2bow result per document
corpus = list(map(id2word.doc2bow, texts))

#sanity check: first 30 entries of the first document's bag-of-words
print(corpus[0][:30])

[(0, 8), (1, 12), (2, 6), (3, 22), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 3), (12, 3), (13, 1), (14, 1), (15, 12), (16, 6), (17, 2), (18, 1), (19, 5), (20, 1), (21, 3), (22, 4), (23, 1), (24, 1), (25, 1), (26, 3), (27, 2), (28, 2), (29, 1)]


# Train a Vanilla LDA Model 

In [90]:
#number of topics
num_topics = 12

#fixed seed so repeated runs are as repeatable as possible
#(worker scheduling in the multicore trainer may still introduce small variation)
random_seed = 42

#build model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=random_seed)

#print keywords in each topic
pprint(lda_model.print_topics())

#apply the trained model to the bag-of-words corpus
doc_lda = lda_model[corpus]

  and should_run_async(code)


[(0,
  '0.008*"evidence" + 0.008*"earth" + 0.008*"energy" + 0.008*"engineering" + '
  '0.008*"information" + 0.007*"data" + 0.006*"using" + 0.006*"ideas" + '
  '0.005*"scientific" + 0.005*"include"'),
 (1,
  '0.009*"energy" + 0.008*"scientific" + 0.006*"earth" + 0.006*"use" + '
  '0.006*"data" + 0.005*"evidence" + 0.005*"water" + 0.005*"examples" + '
  '0.005*"using" + 0.004*"different"'),
 (2,
  '0.010*"energy" + 0.008*"use" + 0.008*"information" + 0.008*"evidence" + '
  '0.007*"engineering" + 0.006*"describe" + 0.006*"scientific" + 0.006*"earth" '
  '+ 0.006*"using" + 0.006*"systems"'),
 (3,
  '0.013*"energy" + 0.009*"evidence" + 0.008*"use" + 0.007*"data" + '
  '0.007*"using" + 0.007*"earth" + 0.006*"scientific" + 0.006*"information" + '
  '0.006*"describe" + 0.005*"include"'),
 (4,
  '0.012*"energy" + 0.008*"earth" + 0.007*"scientific" + 0.007*"use" + '
  '0.006*"include" + 0.006*"evidence" + 0.006*"data" + 0.006*"engineering" + '
  '0.005*"concepts" + 0.005*"changes"'),
 (5,
  '0.

# Model Analysis 

In [91]:
#visualize topics: enable pyLDAvis rendering inside the notebook
pyLDAvis.enable_notebook()

#NOTE(review): this path is computed but not used by the dump below,
#which writes to 'Pickles/LDA_model' instead - confirm which is intended
LDAvis_data_filepath = os.path.join(f'./results/ldavis_prepared_{num_topics}')

#prepare the visualisation data from the model, corpus and dictionary
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

#persist the prepared visualisation data
with open('Pickles/LDA_model', 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

#display the prepared visualisation
LDAvis_prepared

  and should_run_async(code)


In [None]:
#reload the prepared visualisation from the pickle written by the
#visualisation cell ('Pickles/LDA_model'); the previously referenced
#LDA_data_filepath was never defined
with open('Pickles/LDA_model', 'rb') as f:
    LDAvis_prepared = pickle.load(f)

#make sure the output directory exists before writing the HTML export
os.makedirs('./results', exist_ok=True)

#export a standalone HTML copy of the visualisation
pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_prepared_' +
                   str(num_topics) + '.html')

Resources: 
* [Topic Modeling in Python: Latent Dirichlet Allocation (LDA)](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0)