### Unsupervised Analysis of Congressional Laws

In [231]:
import numpy as np
import pandas as pd


#gensim
from gensim import corpora, models, similarities, matutils

#sklearn
from sklearn.feature_extraction.text import (CountVectorizer, 
                                             TfidfTransformer,
                                             TfidfVectorizer)
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer

__Import data__

In [232]:
laws_df = pd.read_csv('all_laws.csv',index_col=0)

In [233]:
laws_names = list(laws_df.index)
laws_list = laws_df['text'].tolist()

__Import Law Summary Data__

In [234]:
laws_summarys = pd.read_csv('laws_summary.csv')

__Count Vectorizer__

In [235]:
count_vectorizer = CountVectorizer(ngram_range=(0, 2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]{3,100}\\b",
                                   max_df=0.8,
                                   min_df = 50)
count_vectorizer.fit(laws_list)



CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=50,
        ngram_range=(0, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]{3,100}\\b',
        tokenizer=None, vocabulary=None)

In [236]:
# Create the term-document matrix
# Transpose it so the terms are the rows
counts = count_vectorizer.transform(laws_list).transpose()

In [237]:
np.shape(counts)

(17274, 3896)

__Convert to gensim__

We need to convert our sparse `scipy` matrix to a `gensim`-friendly object called a Corpus:

In [238]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(counts)

__Map matrix rows to words (tokens)__

We need to save a mapping (dict) of row id to word (token) for later use by gensim:

In [239]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [240]:
len(id2word)

17274

## LDA
At this point we can simply plow ahead in creating an LDA model.  It requires our corpus of word counts, mapping of row ids to words, and the number of topics (3).

In [241]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=5, id2word=id2word, passes=3)

In [242]:
lda.print_topics(num_words=50)

[(0,
  '0.009*"state" + 0.008*"secretary" + 0.007*"subsection" + 0.006*"federal" + 0.006*"paragraph" + 0.005*"agency" + 0.005*"title" + 0.005*"general" + 0.004*"inserting" + 0.004*"information" + 0.004*"including" + 0.004*"program" + 0.004*"education" + 0.004*"striking" + 0.004*"services" + 0.004*"local" + 0.004*"described" + 0.003*"term" + 0.003*"year" + 0.003*"programs" + 0.003*"national" + 0.003*"activities" + 0.003*"commission" + 0.003*"subparagraph" + 0.003*"following" + 0.003*"date" + 0.003*"provide" + 0.003*"amended" + 0.003*"educational" + 0.003*"funds" + 0.003*"appropriate" + 0.003*"fiscal" + 0.003*"assistance" + 0.003*"grant" + 0.002*"eligible" + 0.002*"means" + 0.002*"service" + 0.002*"report" + 0.002*"financial" + 0.002*"provided" + 0.002*"plan" + 0.002*"requirements" + 0.002*"board" + 0.002*"school" + 0.002*"agencies" + 0.002*"secretary shall" + 0.002*"entity" + 0.002*"children" + 0.002*"fiscal year" + 0.002*"based"'),
 (1,
  '0.013*"defense" + 0.010*"secretary" + 0.009*"s

In [243]:
# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus = lda[corpus]
lda_corpus

<gensim.interfaces.TransformedCorpus at 0x7f62c8279b38>

In [244]:
# Store the documents' topic vectors in a list so we can take a peak
lda_docs = [doc for doc in lda_corpus]

In [245]:
len(lda_docs)

3896

In [246]:
lda_docs[900]

[(2, 0.99110745781226373)]

## NMF

In [315]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]{3,100}\\b",
                                   max_df = 0.80,
                                   min_df = 50)

In [316]:
dtm = vectorizer.fit_transform(laws_list) 
#pd.DataFrame(dtm.toarray(), index=laws_names, columns=vectorizer.get_feature_names()).head(10)

In [317]:
dtm

<3896x17274 sparse matrix of type '<class 'numpy.float64'>'
	with 2505068 stored elements in Compressed Sparse Row format>

In [326]:
num_groups = 20
nmf_model = NMF(num_groups )
dtm_nmf = nmf_model.fit_transform(dtm)
# dtm_nmf = Normalizer(copy=False).fit_transform(dtm_nmf)

In [327]:
features = vectorizer.get_feature_names()
print("These are the documents extracted from the data and their respective topics:")
print('')
#nmf.components_


num_words = 20
for topicidx, topic in enumerate(nmf_model.components_):
    
    print('Topic: ' + str(topicidx))
    print("Keywords: " + " ".join([features[i] for i in topic.argsort()[:-1*num_words-1:-1]]) )# 
    print('')

These are the documents extracted from the data and their respective topics:

Topic: 0
Keywords: office building post office building post facility service located facility united states postal postal service postal reference located building references building approved building note service designation facility designate facility states facility facility referred

Topic: 1
Keywords: secretary health state program paragraph year fiscal services subsection care research fiscal year subparagraph secretary shall information plan report grant including drug

Topic: 2
Keywords: courthouse states courthouse courthouse located courthouse note courthouse references courthouse approved courthouse referred street designate united reference designation united states united infrastructure congressional section designation comm transportation located congress designate reference regulation paper record document paper

Topic: 3
Keywords: inserting striking amended amended striking paragraph title c

In [328]:
np.shape(dtm_nmf)

(3896, 20)

__Investigate whether Topics identified match title__

In [329]:
def rank_df(df,rank):
    coln = 'MAX' + str(rank) 
    sortID = np.argpartition(
        -df[list(np.arange(0,num_groups))].values,rank,axis=1)[:,rank-1]
    df[coln] = df.columns[sortID]

In [330]:
nmf_df = pd.DataFrame(dtm_nmf, index = laws_df.index)

In [331]:
rank_df(nmf_df,1)
rank_df(nmf_df,2)
rank_df(nmf_df,3)
nmf_df.reset_index(inplace=True)
nmf_df.rename(columns={'index':'id'},inplace=True)

In [332]:
nmf_df = nmf_df.merge(laws_summarys[['id','title']],on = 'id',how = 'left')

In [337]:
pd.options.display.max_colwidth = 100
nmf_df[['id','title','MAX1','MAX2','MAX3']].iloc[
    np.random.choice(np.arange(0,len(laws_list)),size = 20,replace = False)]

Unnamed: 0,id,title,MAX1,MAX2,MAX3
1344,c108publ288,H.R.1572 - To designate the United States courthouse located at 100 North Palafox Street in Pens...,2,6,19
323,c104publ90,"H.J.Res.153 - Making further continuing appropriations for the fiscal year 1996, and for other p...",5,9,12
3894,c115publ8,"H.J.Res.40 - Providing for congressional disapproval under chapter 8 of title 5, United States C...",5,12,7
3757,c114publ298,H.R.5591 - To designate the facility of the United States Postal Service located at 810 N US Hig...,6,8,16
1926,c109publ363,H.R.4957 - Tylersville Fish Hatchery Conveyance Act 109th Congress (2005-2006),19,18,4
2408,c110publ363,H.R.5872 - Boy Scouts of America Centennial Commemorative Coin Act 110th Congres...,11,10,9
655,c105publ39,H.R.1198 - To direct the Secretary of the Interior to convey certain land to the City of Grants ...,4,19,3
3640,c114publ192,H.R.4372 - To designate the facility of the United States Postal Service located at 15 Rochester...,6,13,10
3349,c113publ197,S.2583 - E-LABEL Act 113th Congress (2013-2014),10,7,16
1125,c107publ90,H.R.10 - Railroad Retirement and Survivors' Improvement Act of 2001 107th Congre...,3,7,9


## Try LSA too! ... but for now, try clustering on what I have