In [1]:
import pandas as pd
import numpy as np 

from gensim import corpora
from gensim.models import TfidfModel
from gensim.models.ldamulticore import LdaMulticore

In [2]:
from budget_corpus import read_documents, read_raw_corpus

In [3]:
from pathlib import Path

In [4]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
raw_corpus = read_raw_corpus()
corpus = read_documents()

In [6]:
dictionary = corpora.Dictionary(tokens for tokens in corpus)

2019-02-28 13:18:30,584 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-02-28 13:18:30,656 : INFO : built Dictionary(4392 unique tokens: ['acquisition', 'aircraft', 'authorize', 'capital', 'derive']...) from 1248 documents (total 69715 corpus positions)


In [7]:
# thanks to the stemmer, butterfly is now misspelled
dictionary.token2id['butterfly']

2129

In [8]:
tokened_corpus = [ dictionary.doc2bow(tokens) for tokens in corpus ]

In [9]:
# Attempt LDA  - cranked the numer of passes up but the perplexity scores
# do not decrease

In [10]:
lda = LdaMulticore(tokened_corpus, id2word=dictionary, passes=20, num_topics=20)

2019-02-28 13:18:34,919 : INFO : using symmetric alpha at 0.05
2019-02-28 13:18:34,920 : INFO : using symmetric eta at 0.05
2019-02-28 13:18:34,922 : INFO : using serial LDA version on this node
2019-02-28 13:18:34,938 : INFO : running online LDA training, 20 topics, 20 passes over the supplied corpus of 1248 documents, updating every 14000 documents, evaluating every ~1248 documents, iterating 50x with a convergence threshold of 0.001000
2019-02-28 13:18:34,939 : INFO : training LDA model using 7 processes
2019-02-28 13:18:34,981 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1248/1248, outstanding queue size 1
2019-02-28 13:18:35,822 : INFO : topic #13 (0.050): 0.019*"necessary" + 0.018*"remain" + 0.017*"include" + 0.013*"law" + 0.012*"appropriate" + 0.012*"carry" + 0.011*"head" + 0.011*"public" + 0.011*"year" + 0.010*"authorize"
2019-02-28 13:18:35,823 : INFO : topic #7 (0.050): 0.015*"head" + 0.014*"law" + 0.009*"necessary" + 0.009*"project" + 0.009*"purpose" + 0

In [13]:
datapath =  Path('../data')
filepath = str(datapath / 'lda.model')

In [14]:
lda.save(filepath)

2019-02-28 13:19:28,687 : INFO : saving LdaState object under ../data/lda.model.state, separately None
2019-02-28 13:19:28,691 : INFO : saved ../data/lda.model.state
2019-02-28 13:19:28,695 : INFO : saving LdaMulticore object under ../data/lda.model, separately ['expElogbeta', 'sstats']
2019-02-28 13:19:28,696 : INFO : storing np array 'expElogbeta' to ../data/lda.model.expElogbeta.npy
2019-02-28 13:19:28,699 : INFO : not storing attribute state
2019-02-28 13:19:28,700 : INFO : not storing attribute id2word
2019-02-28 13:19:28,700 : INFO : not storing attribute dispatcher
2019-02-28 13:19:28,702 : INFO : saved ../data/lda.model


In [15]:
# When a term only appears once, LDA cannot assign it to a topic

In [16]:
dictionary.token2id['butterfly']

2129

In [17]:
lda.get_term_topics(2129)

[]

In [18]:
# But we can see what the model did with the butterfly document as whole

In [19]:
lda.get_document_topics(tokened_corpus[234] )

[(13, 0.9693548)]

In [21]:
for term, prob in lda.get_topic_terms(13):
    print( dictionary.id2token[term], prob ) 

necessary 0.033961415
remain 0.029824775
carry 0.021809978
include 0.021231653
law 0.01578866
public 0.01504455
inspector 0.014662895
authorize 0.014362977
exceed 0.013320975
year 0.012662586


In [22]:
# what else wound up in topic 13? 

In [23]:
all_topics = [ lda.get_document_topics(doc) for doc in tokened_corpus ] 

In [24]:
topic_docs = []
for idx, cat in enumerate(all_topics):
    for weights in cat:
        if weights[0] == 13 and weights[1] > .50:
            topic_docs.append((idx, cat))
            

In [25]:
len(topic_docs)

95

In [27]:
# What else got clustered with the butterflies? 
# print a sampling

In [30]:
import random

for entry in topic_docs:
    if random.random() < .15:
        index = entry[0]
        topics = entry[1]
        for t13 in topics:
            if t13[0] == 13:
                break
            
        print('---', t13[1], '---')
        print(raw_corpus[index][:500]) # first 500 chars in case its a long one
        print('')

--- 0.71291846 ---
For necessary expenses of the Office of Inspector General in carrying out the provisions of the Inspector General Reform Act of 2008, Public Law 110–409 , 122 Stat. 4302–16 (2008), and the Inspector General Act of 1978 (5 U.S.C. App.), and for the hire of passenger motor vehicles, $4,823,000.

--- 0.6044413 ---
For necessary expenses for protection, use, improvement, development, disposal, cadastral surveying, classification, acquisition of easements and other interests in lands, and performance of other functions, including maintenance of facilities, as authorized by law, in the management of lands and their resources under the jurisdiction of the Bureau of Land Management, including the general administration of the Bureau, and assessment of mineral potential of public lands pursuant to section 1010(

--- 0.6354518 ---
For necessary expenses of the Council of Economic Advisers in carrying out its functions under the Employment Act of 1946 ( 15 U.S.C. 1021 et seq.),

In [31]:
# Again, because of the high prevalance of unique words, this is 
# not going to be fixed by any sort of tuning.
# This is just to show an attempt at standard clustering.