In [1]:
import numpy
import scipy
import pandas
import spacy
import textacy

In [2]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
# Configure seaborn globally for every later plot: "whitegrid" style, color_codes enabled.
sns.set(style="whitegrid", color_codes=True)

In [3]:
# Load the previously processed, gzip-compressed textacy corpus of filed bills.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of the data.
corpus_dir = '/home/immersinn/gits/ncga/data/processed/CORPUS_bills_filed_pipe01/'
corpus_name = 'CORPUS_bills_filed_pipe01'
corpus = textacy.Corpus.load(path=corpus_dir, name=corpus_name, compression='gzip')

### Tokenize and Vectorize Corpus

In [6]:
# One terms list per document: unigrams plus named entities, rendered as strings.
# This is a one-shot generator, so it must be re-created before vectorizing again.
terms_lists = (
    doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
    for doc in corpus
)

# Vectorize with normalized, smoothed TF-IDF weights. Terms appearing in fewer than
# 3 documents or in more than 95% of documents are dropped, and the vocabulary is
# capped at 1000 terms.
doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
    terms_lists,
    weighting='tfidf',
    normalize=True,
    smooth_idf=True,
    min_df=3,
    max_df=0.95,
    max_n_terms=1000,
)
doc_term_matrix

<2098x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 247765 stored elements in Compressed Sparse Row format>

### Topic Model

In [83]:
# Fit one LDA topic model for every candidate topic count from 2 through 19 and
# keep each fitted model in `models`, keyed by its number of topics.
# Note: `model` is rebound on every iteration, so after this cell it refers to the
# last (19-topic) model until a later cell reassigns it.
models = {}
for n in range(2,20):
    model = textacy.tm.TopicModel('lda', n_topics=n)
    model.fit(doc_term_matrix)
    models[n] = model

In [94]:
# Perplexity of each fitted model, evaluated on the same matrix it was trained on
# and keyed by topic count.
# NOTE(review): the recorded values grow monotonically with the number of topics,
# and the input is TF-IDF weighted rather than raw counts -- confirm that this
# metric is meaningful here before using it for model selection.
perplex = {}
for topic_count, fitted in models.items():
    perplex[topic_count] = fitted.model.perplexity(doc_term_matrix)

In [96]:
# Display the perplexity recorded for each candidate topic count.
perplex

{2: 1430.1864400469474,
 3: 2827.8161827641475,
 4: 6272.5030497747548,
 5: 15934.908496879447,
 6: 36600.007525626272,
 7: 74907.573165759444,
 8: 146728.35641135511,
 9: 297812.59956085897,
 10: 619162.11230097502,
 11: 1304322.6565713866,
 12: 2662087.5674209734,
 13: 5244250.9785775589,
 14: 10552345.667645756,
 15: 22265191.540630952,
 16: 45055216.468628302,
 17: 87031707.79531166,
 18: 196917646.10981026,
 19: 357422111.80390805}

In [100]:
# Pick the 4-topic model out of the fitted candidates; `n_topics` and `model` are
# the names the following cells read.
n_topics = 4
model = models[n_topics]

#### Transform the corpus and Interpret the Model:

In [101]:
# Map every document's term vector onto the selected model's topics
# (one row per document, one column per topic).
doc_topic_matrix = model.transform(doc_term_matrix)

In [115]:
# Sanity check: expect (number of documents, n_topics).
doc_topic_matrix.shape

(2098, 4)

In [116]:
# Topic weights of the first ten documents; each row adds up to one.
doc_topic_matrix[:10]

array([[ 0.87789927,  0.04000221,  0.04185012,  0.0402484 ],
       [ 0.78584883,  0.07202284,  0.07105155,  0.07107679],
       [ 0.01644671,  0.01682715,  0.95049358,  0.01623256],
       [ 0.05354897,  0.05337645,  0.8403571 ,  0.05271748],
       [ 0.04846911,  0.04658629,  0.85820091,  0.04674369],
       [ 0.1851266 ,  0.0281551 ,  0.75838843,  0.02832987],
       [ 0.04757388,  0.04503149,  0.86168051,  0.04571412],
       [ 0.02975781,  0.03088225,  0.52167538,  0.41768456],
       [ 0.02572517,  0.02507037,  0.77626385,  0.17294062],
       [ 0.03307557,  0.19271051,  0.74182157,  0.03239234]])

#### View Top Terms, Top Keywords from Top Documents

In [103]:
# Pool every document-topic weight into one flat Series and summarize it; the
# extra percentiles zoom in on the upper tail of the distribution.
pandas.Series(doc_topic_matrix.ravel()).describe(
    percentiles=[0.75, 0.80, 0.90, 0.95, 0.975, 0.99]
)

count    8392.000000
mean        0.250000
std         0.315964
min         0.016233
50%         0.051009
75%         0.445956
80%         0.608934
90%         0.855764
95%         0.882560
97.5%       0.899533
99%         0.914462
max         0.950494
dtype: float64

In [104]:
# Per topic (column), count the documents whose weight for that topic exceeds 0.1.
# Summing along axis 0 is the vectorised, axis-explicit form of the built-in
# sum(), which would iterate over the rows one by one in Python.
(doc_topic_matrix > 0.1).sum(axis=0)

array([ 612,  319, 1717,  413])

In [105]:
# Build a table with one row per topic and one column per term rank (0 = strongest)
# from the model's top-15 terms for each topic.
topic_labels, ranked_terms = [], []
top_terms_by_topic = model.top_topic_terms(id2term, top_n = 15, topics=range(n_topics))
for topic_idx, top_terms in top_terms_by_topic:
    topic_labels.append('topic ' + str(topic_idx))
    # Key each term by its rank so the ranks become the DataFrame columns.
    ranked_terms.append(dict(enumerate(top_terms)))
top_term_table = pandas.DataFrame(data=ranked_terms, index=topic_labels)

In [106]:
# Show topics as columns and term rank as rows for easier side-by-side reading.
top_term_table.T

Unnamed: 0,topic 0,topic 1,topic 2,topic 3
0,district,school,shall,court
1,member,student,state,person
2,resolution,education,fund,offense
3,senate,board,service,enforcement
4,election,teacher,department,defendant
5,board,close,tax,shall
6,shall,opening,property,criminal
7,state,local,county,officer
8,local,college,health,judge
9,relate,year,public,violation


## Density-based Method for Adaptive LDA Model Selection

In [114]:
# Peek at the first four term columns of the underlying estimator's component
# matrix. The recorded values are not normalized to sum to one per row.
model.model.components_[:,:4]

array([[  30.05015004,    3.57920361,    8.17694721,   10.24565258],
       [  10.72089208,    1.22379688,    0.25878505,    0.25630825],
       [ 111.48732977,   12.13275677,    2.78352654,    1.70384573],
       [  14.4967847 ,    1.71215637,    0.33399709,    0.2757228 ]])

In [108]:
# Calculate correlations between Topics in a given Model via the Topic - Term Weighting Vectors
def calc_p_topic_given_word(model):
    """Placeholder for estimating p(topic | word) from a fitted topic model.

    Not implemented yet: ``model`` is ignored and ``None`` is returned.

    Planned steps:
      1. Convert the model components to a distribution, p(w|t).
      2. Estimate p(t) from the corpus.
      3. Apply Bayes' rule: p(t|w) = p(w|t) * p(t) / p(w).
    """
    return None


def topic_cosdists(model):
    """Return the pairwise cosine distances between a model's topic vectors.

    Parameters
    ----------
    model : object
        Wrapper exposing ``model.model.components_``, a 2-D array whose rows are
        compared with each other (one row per topic).

    Returns
    -------
    numpy.ndarray
        Condensed distance vector from ``pdist``: one entry per unordered pair of
        rows, in the order (0,1), (0,2), ..., (1,2), ...  Pass it to
        ``scipy.spatial.distance.squareform`` for the full square matrix.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.spatial` is loaded, so the attribute lookup could otherwise
    # depend on some other package having imported it first.
    from scipy.spatial.distance import pdist

    return pdist(model.model.components_, 'cosine')

In [109]:
# Condensed vector of pairwise cosine distances between the selected model's topics
# (one entry per topic pair).
cds = topic_cosdists(model)
cds

array([ 0.80333408,  0.54682641,  0.7489354 ,  0.7718644 ,  0.89350818,
        0.59276996])

In [110]:
# Expand the condensed distances into a symmetric topic-by-topic matrix
# (zeros on the diagonal) so individual topic pairs are easy to look up.
scipy.spatial.distance.squareform(cds)

array([[ 0.        ,  0.80333408,  0.54682641,  0.7489354 ],
       [ 0.80333408,  0.        ,  0.7718644 ,  0.89350818],
       [ 0.54682641,  0.7718644 ,  0.        ,  0.59276996],
       [ 0.7489354 ,  0.89350818,  0.59276996,  0.        ]])

In [92]:
# Check which underlying estimator class the textacy TopicModel wrapper holds.
type(model.model)

sklearn.decomposition.online_lda.LatentDirichletAllocation

In [54]:
# For every topic, gather the distinct metadata keywords attached to its ten
# highest-weighted documents.
top_keywords = {}
top_docs_by_topic = model.top_topic_docs(doc_topic_matrix, topics=range(n_topics), top_n=10)
for topic_idx, top_docs in top_docs_by_topic:
    distinct_keywords = set()
    for doc_idx in top_docs:
        distinct_keywords.update(corpus[doc_idx].metadata['keywords'])
    top_keywords['topic ' + str(topic_idx)] = distinct_keywords

In [55]:
# Print each topic's keyword set under its label, separated by blank lines.
# Sets are unordered, so the keyword order may differ between runs.
for label in ("topic " + str(idx) for idx in range(n_topics)):
    print("\n")
    print(label + ":")
    print(top_keywords[label])



topic 0:
{'BUILDING CODES', 'IMMIGRATION', 'PERSONNEL', 'NAME CHANGE', 'NOTIFICATION', 'LOCAL GOVERNMENT COMN.', 'AIRPORTS', 'DARE COUNTY', 'TEXTBOOK COMN.', 'ROADS & HIGHWAYS', 'OPTICIANS & OPTOMETRISTS', 'CIVIL PROCEDURE', 'AQUARIUMS', 'ENVIRONMENT', 'STATE EMPLOYEES', 'MORTGAGES', 'DISCRIMINATION', 'SMALL BUSINESSES', 'TEXTBOOKS', 'AIDS', 'SUBSTANCE ABUSE', 'PROPERTY', 'SALARIES & BENEFITS', 'BUILDING CODE COUNCIL', 'COLLEGES & UNIVERSITIES', 'EMERGENCY SERVICES', 'TRANSPORTATION', 'SHERIFFS', 'HIGHWAY FUND', 'INFORMATION TECHNOLOGY DEPT.', 'PSYCHOLOGY', 'EDUCATION BOARDS', 'INTERSTATE COOPERATION', 'ADAP', 'EXCISE', 'FISHERIES PRODUCTS', 'TAXES', 'COURTS', 'FEES', 'JUDICIAL DEPT.', 'INDUSTRIAL COMN.', 'CATAWBA COUNTY', 'ADVERTISING', 'SOLID WASTE', 'UNREGULATED VEHICLES', 'INDIAN AFFAIRS COMN.', 'COMMUNITY COLLEGES OFFICE', 'FOREST PRODUCTS', 'HUMAN RELATIONS COMN.', 'BUDGETING', 'REPORTS', 'INSURANCE DEPT.', 'TELEVISION', 'EROSION', 'ALIENS', 'PUBLIC HEALTH COMN.', 'STORAGE SYST