In [1]:
import pandas as pd
import numpy as np 

from pathlib import Path

In [2]:
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.utils import lemmatize

In [3]:
# Directory holding the per-document '*.body' text files, relative to the notebook.
docdir = Path('..') / 'data' / 'docs'

In [4]:
# Raw text of every document, one string per file; populated by the loading cell below.
raw_corpus = []

In [5]:
# Load every '*.body' file under `docdir` into `raw_corpus`, one string per file.
#
# - glob() yields paths in arbitrary, filesystem-dependent order; sorting keeps the
#   document indices (and therefore every index-based lookup further down) stable
#   from run to run and machine to machine.
# - `raw_corpus` is rebuilt from scratch so re-running this cell does not append
#   a second copy of every document.
# - The encoding is explicit instead of relying on the platform default.
#   NOTE(review): assumes the .body files are UTF-8 — confirm against the data.
files = sorted(docdir.glob('*.body'))
raw_corpus = []
for f in files:
    # the `with` block closes the file on exit; no explicit close() is needed
    with open(f, 'r', encoding='utf-8') as fp:
        raw_corpus.append(fp.read())

In [6]:
# Lemmatize each raw document into a list of strings.
# Each token produced by `lemmatize` is bytes; it is decoded as UTF-8 and only the
# text before the first '/' is kept (presumably tokens look like b'lemma/POS' —
# confirm against the gensim docs).
# Approach taken from this lemmatization tutorial:
# https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
# NOTE(review): gensim.utils.lemmatize appears to have been removed in gensim 4.0,
# so this cell likely needs gensim 3.x — confirm and pin the version.
corpus = [
    [token.decode('utf-8').split('/')[0] for token in lemmatize(text)]
    for text in raw_corpus
]


In [7]:
# strip short words and stop words from the lemmatized corpus
def strip_list(wordlist, min_length=4, stopwords=None):
    """Return the words of `wordlist` that are long enough and are not stop words.

    Parameters
    ----------
    wordlist : iterable of str
        Tokens of a single document.
    min_length : int, optional
        Minimum number of characters a word needs in order to be kept.
        The default of 4 matches the original hard-coded ``len(w) > 3``.
    stopwords : container of str, optional
        Words to drop. When omitted, gensim's built-in
        ``gensim.parsing.preprocessing.STOPWORDS`` is used, as before.

    Returns
    -------
    list of str
        The surviving words, in their original order. An empty input yields [].
    """
    if stopwords is None:
        # looked up lazily so callers supplying their own set never touch gensim
        stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [w for w in wordlist if len(w) >= min_length and w not in stopwords]

# Filter every document's token list through strip_list.
# `corpus` is overwritten in place; re-running is harmless because filtering an
# already-filtered list removes nothing further.
corpus = list(map(strip_list, corpus))

In [8]:
# Build the gensim Dictionary from the filtered token lists.
dictionary = corpora.Dictionary(corpus)

In [9]:
# Convert each document's token list to gensim bag-of-words form via the dictionary.
tokened_corpus = list(map(dictionary.doc2bow, corpus))

In [22]:
# dimensionality reduction
from gensim.models import TfidfModel, LsiModel

# first convert words to tfidf values
tfidf = TfidfModel(dictionary=dictionary)
vectored_corpus = [ tfidf[doc] for doc in tokened_corpus]

# next do dimensionality reduction: project every document onto 150 LSI topics.
# onepass=False selects gensim's multi-pass stochastic SVD, which starts from a
# random projection, so without a seed the topics (and every cluster derived from
# them) differ between runs.
# NOTE(review): gensim 3.x appears to draw that projection from NumPy's global RNG,
# which is what the seed below pins — confirm for the installed version. On
# gensim >= 4 pass `random_seed=` to LsiModel instead.
np.random.seed(0)
lsi = LsiModel(corpus=vectored_corpus, num_topics=150, id2word=dictionary, onepass=False, power_iters=3)

In [23]:
# Show the first five LSI topics with their highest-weighted words.
# Asking for five directly avoids formatting (and logging) the default 20 topics
# only to discard 15 of them.
lsi.print_topics(num_topics=5)

[(0,
  '0.175*"provide" + 0.161*"expense" + 0.144*"section" + 0.136*"assistance" + 0.133*"appropriation" + 0.132*"fund" + 0.130*"program" + 0.124*"united" + 0.124*"office" + 0.122*"states"'),
 (1,
  '0.471*"inspector" + 0.345*"general" + 0.258*"expense" + 0.254*"office" + 0.199*"necessary" + 0.165*"representation" + 0.163*"reception" + -0.140*"loan" + 0.134*"exceed" + 0.132*"official"'),
 (2,
  '-0.530*"inspector" + -0.377*"general" + 0.198*"representation" + 0.194*"reception" + -0.176*"loan" + 0.162*"official" + 0.134*"expense" + -0.129*"office" + 0.129*"code" + 0.125*"exceed"'),
 (3,
  '-0.538*"loan" + -0.271*"rural" + -0.262*"housing" + -0.194*"guarantee" + -0.147*"cost" + 0.138*"appropriation" + -0.135*"direct" + 0.129*"transfer" + 0.122*"committee" + -0.121*"development"'),
 (4,
  '0.340*"assistance" + 0.263*"foreign" + -0.214*"fiscal" + -0.205*"year" + -0.204*"transfer" + -0.186*"current" + -0.180*"appropriation" + -0.163*"obligation" + 0.159*"international" + -0.148*"expressly"'

In [24]:
# Project every TF-IDF document into LSI topic space.
lsi_corpus = [ lsi[doc] for doc in vectored_corpus ]

# Build the dense (n_documents, n_topics) matrix that the clustering step consumes.
# Each projected document is a list of (topic_id, weight) pairs. gensim may leave
# out topics whose weight is ~0 and returns an empty list for a document with no
# tokens left, so the lists are not guaranteed to all have `num_topics` entries.
# Stacking them with np.array(...)[:, :, 1] only works when every list is full;
# writing each weight into its topic_id column works in every case and gives the
# same matrix when all lists are full.
lsi_array = np.zeros((len(lsi_corpus), lsi.num_topics))
for row, doc_topics in enumerate(lsi_corpus):
    for topic_id, weight in doc_topics:
        lsi_array[row, topic_id] = weight
lsi_array.shape

(1248, 150)

In [36]:
# cluster with hdbscan
import hdbscan

# Fit HDBSCAN on the dense LSI matrix; only cluster_selection_method is changed
# from the library defaults. fit() hands back the estimator itself, so the
# construction and the fit can be chained.
hdb = hdbscan.HDBSCAN(cluster_selection_method='leaf').fit(lsi_array)
hdb

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
    approx_min_span_tree=True, cluster_selection_method='leaf',
    core_dist_n_jobs=4, gen_min_span_tree=False, leaf_size=40,
    match_reference_implementation=False, memory=Memory(location=None),
    metric='euclidean', min_cluster_size=5, min_samples=None, p=None,
    prediction_data=False)

In [37]:
# Cluster label per document from the fitted model; later cells index raw_corpus
# with the positions of this array.
# NOTE(review): -1 looks like hdbscan's "noise / unclustered" label — confirm in the hdbscan docs.
hdb.labels_

array([-1, -1,  8, ...,  3, -1,  8])

In [38]:
# Wrap the label array in a pandas Series so the clusters can be tallied.
# NOTE: the name `labels` is rebound to a plain list of cluster ids in a later cell.
labels = pd.Series(hdb.labels_)

In [39]:
# Number of documents carrying each cluster label, most frequent label first.
labels.value_counts()

-1    1043
 8     132
 4      13
 6      12
 3      10
 1       7
 9       7
 7       7
 5       6
 2       6
 0       5
dtype: int64

In [42]:
# Print the raw text of every document assigned to cluster 1, each preceded by a
# one-space separator line.
cluster_1_docs = [raw_corpus[idx] for idx, lab in enumerate(hdb.labels_) if lab == 1]
for text in cluster_1_docs:
    print(" ")
    print(text)

 
814.None of the Federal funds appropriated in this Act shall remain available for obligation beyond the current fiscal year, nor may any be transferred to other appropriations, unless expressly so provided herein.
 
501.No part of any appropriation contained in this Act shall remain available for obligation beyond the current fiscal year unless expressly so provided herein.
 
703.No part of any appropriation contained in this Act shall remain available for obligation beyond the current fiscal year unless expressly so provided herein.
 
502.No part of any appropriation contained in this Act shall remain available for obligation beyond the current fiscal year unless expressly so provided herein.
 
402.None of the funds appropriated in this Act shall remain available for obligation beyond the current fiscal year, nor may any be transferred to other appropriations, unless expressly so provided herein.
 
602.None of the funds appropriated in this Act shall remain available for obligation 

In [41]:
# Print the raw text of every document assigned to cluster 4, each preceded by a
# one-space separator line.
for idx, lab in enumerate(hdb.labels_):
    if lab != 4:
        continue  # not in the cluster being inspected
    print(" ", raw_corpus[idx], sep="\n")

 
For payments authorized under section 11–2604 and section 11–2605, D.C. Official Code (relating to representation provided under the District of Columbia Criminal Justice Act), payments for counsel appointed in proceedings in the Family Court of the Superior Court of the District of Columbia under chapter 23 of title 16, D.C. Official Code, or pursuant to contractual agreements to provide guardian ad litem representation, training, technical assistance, and such other services as are necessary to improve the quality of guardian ad litem representation, payments for counsel appointed in adoption proceedings under chapter 3 of title 16, D.C. Official Code, and payments authorized under section 21–2060, D.C. Official Code (relating to services provided under the District of Columbia Guardianship, Protective Proceedings, and Durable Power of Attorney Act of 1986), $46,005,000, to remain available until expended: Provided , That not more than $20,000,000 in unobligated funds provided in t

In [40]:
import random

# Print roughly one in five of the documents labelled -1 (the ones HDBSCAN did
# not place in any cluster, judging by the label counts above).
# A dedicated, seeded generator makes the sample identical on every run and
# leaves the global `random` state untouched.
noise_rng = random.Random(0)
for idx, lab in enumerate(hdb.labels_):
    # the generator is only consulted for -1 documents, so each one is kept
    # independently with probability ~0.2
    if lab == -1 and noise_rng.random() > .8:
        print(" " )
        print(raw_corpus[idx])

 
For necessary expenses related to Federal-State Partnership for State of Good Repair Grants as authorized by section 24911 of title 49, United States Code, $400,000,000, to remain available until expended: Provided , That the Secretary may withhold up to one percent of the amount provided under this heading for the costs of award and project management oversight of grants carried out under section 24911 of title 49, United States Code: Provided further , That the Secretary shall issue the Notice of Funding Opportunity that encompasses funds provided under this heading in this Act and previously unawarded funds provided under this heading in fiscal year 2017 by Public Law 115–31 and fiscal year 2018 by Public Law 115–141 , no later than 30 days after enactment of this Act: Provided further , That the Secretary shall announce the selection of projects to receive awards for the funds in the previous proviso no later than 180 days after enactment of this Act.
 
For activities authorized 

In [30]:
# Cluster ids whose member documents the next cell prints.
# NOTE: this rebinds `labels`, which an earlier cell set to the pandas Series of all cluster labels.
labels = [3,1,0]

In [31]:
# For each cluster id in `labels`, print a banner line followed by the raw text of
# every document assigned to that cluster.
for label in labels:
    print('  ')
    print('---------', label, '----------')
    # lazily walk the documents whose HDBSCAN label equals the current cluster id
    member_docs = (raw_corpus[ind] for ind, ilbl in enumerate(hdb.labels_) if ilbl == label)
    for text in member_docs:
        print("  ")
        print(text)

  
--------- 3 ----------
  
532.(a) None of the funds made available in this Act may be used to maintain or establish a computer network unless such network blocks the viewing, downloading, and exchanging of pornography.(b) Nothing in subsection (a) shall limit the use of funds necessary for any Federal, State, tribal, or local law enforcement agency or any other entity carrying out criminal investigations, prosecution, adjudication, or other law enforcement- or victim assistance-related activity.
  
713.(a) None of the funds made available in this Act may be used to maintain or establish a computer network unless such network blocks the viewing, downloading, and exchanging of pornography.(b) Nothing in subsection (a) shall limit the use of funds necessary for any Federal, State, tribal, or local law enforcement agency or any other entity carrying out criminal investigations, prosecution, or adjudication activities.
  
628.(a) None of the funds made available in this Act may be used t