In [9]:
# Latent Dirichlet Allocation (LDA) with Python for Elliston Archive
import matplotlib
%pylab inline
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
sp_stop = stopwords.words('spanish')
from nltk import tokenize
from nltk.stem.porter import PorterStemmer
from gensim import corpora
from gensim import models, corpora
import glob

Populating the interactive namespace from numpy and matplotlib


In [9]:
# collect documents and perform simple cleaning to imporve topic model generation

p_stemmer = PorterStemmer() #use stemming to reduce topically similar words to their root

lst_filenames = []
for filename in glob.glob('data/*.txt'):
    lst_filenames.append(filename)

In [35]:
import io
def doc_preprocessor(lst_filenames):
    files = []
    for filename in lst_filenames:
        f = io.open(filename, 'r+', encoding='utf-8')
        read_data = f.read() #read in raw data
        file = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',filename) #remove URLs
        file = re.sub(r'[^\x00-\x7F]+','', file)
        file = re.sub(r'[^\w\s]','', file) #remove elements that are not words
        file = file.lower() #turn all words lowercase
        file = word_tokenize(file) #tokenize the data into its atomic elements 
        file = [i for i in file if not i in en_stop] #remove stopwords
        file = [p_stemmer.stem(i) for i in file] #stem tokens
        files.append(file)
        f.close()
    return files

In [36]:
docs = doc_preprocessor(lst_filenames)

In [39]:
dictionary = corpora.Dictionary(docs) # turn tokenized documents into a id <-> term dictionary
dictionary.compactify() #Assign new word ids to all words, shrinking gaps. Eliminate potential imaninary number representations
# dictionary.save('./warfarinldamodel_dict.dict')  # store the dictionary, for future reference
corpus = [dictionary.doc2bow(text) for text in docs] # convert tokenized documents into a document-term matrix

In [40]:
# Set training parameters.
num_topics = 35
chunksize = 500 # size of the doc looked at every pass
passes = 100 # number of passes through documents
iterations = 100
eval_every = 1  # Don't evaluate model perplexity, takes too much time.


In [41]:
##### Hyperparamter options and default values for gensim LDA model
# gensim.models.ldamodel.LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, \
# passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, \
# gamma_threshold=0.001, minimum_probability=0.01, random_state=None, ns_conf=None, minimum_phi_value=0.01, \
# per_word_topics=False, callbacks=None, dtype=<type 'numpy.float32'>)

%time ldamodel = models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, chunksize=chunksize, passes=passes, eval_every = eval_every, iterations = iterations)

Wall time: 3d 1h 54min 17s


In [8]:
# keyword = ''
# print(ldamodel.get_term_topics('keyword'))
# print(ldamodel.get_topic_terms(ldamodel.get_term_topics('keyword')[0][0], topn=30))

[(10, 0.014984129), (11, 0.044843286), (12, 0.024069669), (15, 0.24911822), (20, 0.068382934), (23, 0.035111353)]
[(323, 0.057552308), (97, 0.053632706), (1042, 0.031103043), (3179, 0.029733561), (3178, 0.029733561), (3180, 0.029733561), (3176, 0.029733561), (3177, 0.029733561), (330, 0.023668991), (492, 0.022669429), (979, 0.021218568), (207, 0.019862682), (577, 0.016152158), (117, 0.01533929), (989, 0.012882323), (901, 0.0115640825), (41, 0.011399057), (1591, 0.011334097), (925, 0.011334097), (932, 0.010194605), (126, 0.010135128), (3100, 0.00996462), (820, 0.00996462), (3147, 0.00996462), (3164, 0.00996462), (3163, 0.00996462), (3162, 0.00996462), (3159, 0.00996462), (3160, 0.00996462), (3101, 0.00996462)]


In [42]:
# show the topics
topics = ldamodel.show_topics()
for topic in topics:
    print(topic)

(52, '0.289*"got" + 0.157*"that" + 0.071*"ur" + 0.040*"id" + 0.035*"boy" + 0.035*"stop" + 0.031*"left" + 0.019*"ago" + 0.016*"awww" + 0.016*"cuz"')
(38, '0.069*"hahaha" + 0.056*"forward" + 0.055*"kid" + 0.038*"almost" + 0.036*"woke" + 0.034*"breakfast" + 0.022*"quot" + 0.021*"chill" + 0.020*"save" + 0.015*"note"')
(43, '0.151*"ye" + 0.105*"read" + 0.079*"world" + 0.059*"book" + 0.048*"end" + 0.044*"true" + 0.040*"doesnt" + 0.019*"sent" + 0.016*"mood" + 0.015*"appl"')
(42, '0.119*"song" + 0.067*"two" + 0.053*"hot" + 0.040*"hang" + 0.039*"minut" + 0.033*"support" + 0.028*"tea" + 0.021*"addict" + 0.018*"awak" + 0.015*"sleepi"')
(36, '0.221*"watch" + 0.157*"home" + 0.053*"exam" + 0.033*"go" + 0.032*"im" + 0.025*"sing" + 0.025*"face" + 0.020*"suck" + 0.019*"award" + 0.019*"cook"')
(25, '0.065*"okay" + 0.062*"noth" + 0.052*"mom" + 0.047*"anyth" + 0.043*"worri" + 0.028*"lil" + 0.017*"past" + 0.016*"deserv" + 0.014*"color" + 0.013*"feet"')
(3, '0.220*"twitter" + 0.135*"right" + 0.089*"done" + 

In [43]:
#Visualize the LDA topics using pyLDAVis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [44]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [45]:
p = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.save_html(p, keyword)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
