In [32]:
import re

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.model_selection import RandomizedSearchCV, train_test_split
from nltk.corpus import stopwords
from scipy import stats

In [3]:
import joblib

In [11]:
# Load the bill texts from a CSV file located relative to the working directory.
bill_text_df = pd.read_csv('116bill_text.csv')

In [12]:
# Discard the 'Unnamed: 0' column (presumably a row index saved alongside the
# data -- confirm against how the CSV was produced).
# drop(..., errors='ignore') is used instead of pop() so the cell can be
# re-run: pop() raises KeyError once the column has already been removed.
bill_text_df = bill_text_df.drop(columns='Unnamed: 0', errors='ignore')
bill_text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17503 entries, 0 to 17502
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   bill_num  17503 non-null  object
 1   type      17503 non-null  object
 2   text      17503 non-null  object
dtypes: object(3)
memory usage: 410.4+ KB


In [9]:
# Preview the frame; pandas elides the middle rows in the rendered output.
bill_text_df

Unnamed: 0,bill_num,type,text
0,s1379,enr,html body pre ...
1,s1379,is,html body pre ltdocgt congre...
2,s1379,es,html body pre ltdocgt ...
3,s1977,is,html body pre ltdocgt congre...
4,s4439,is,html body pre ltdocgt congre...
...,...,...,...
17498,hjres107,enr,html body pre ...
17499,hjres107,eh,html body pre ltdocgt ...
17500,hjres107,rds,html body pre ltdocgt congress ...
17501,hjres9,ih,html body pre ltdocgt congre...


In [17]:
# Stop words: NLTK's English list plus corpus-specific tokens (presumably
# legislative boilerplate and markup residue -- confirm against the raw text).
stopwords_ = set(stopwords.words('english'))
additional_stopwords = ('congress', 'act', 'states', 'united',
                        'house', '116th', 'html', 'bill', 'introduced',
                        'title', 'gt')
stopwords_ = stopwords_.union(additional_stopwords)

# CountVectorizer documents stop_words as 'english', a list, or None, and newer
# scikit-learn releases validate the type and reject a set. Passing a sorted
# list keeps the same stop words and gives the persisted vectorizer a stable
# parameter value.
# max_df / min_df drop terms that are too common / too rare across documents;
# max_features caps the vocabulary size.
tf_vectorizer = CountVectorizer(
    stop_words=sorted(stopwords_),
    max_df=0.85,
    min_df=2,
    max_features=1000,
)

# Document-term count matrix used to fit the topic model.
word_vec = tf_vectorizer.fit_transform(bill_text_df.text)

In [18]:
# LatentDirichletAllocation is already imported in the notebook's import cell,
# so it is not imported a second time here.
# n_components is left at the estimator default; random_state pins the fit and
# n_jobs=-2 follows the joblib convention of "all CPUs but one".
lda = LatentDirichletAllocation(learning_method='online', n_jobs=-2, random_state=1659)

In [19]:
# Fit the topic model on the document-term count matrix.
lda.fit(word_vec)

LatentDirichletAllocation(learning_method='online', n_jobs=-2,
                          random_state=1659)

In [20]:
# Persist the fitted model and its vectorizer to the working directory.
joblib.dump(lda, 'lda_model.joblib')
joblib.dump(tf_vectorizer, 'tf_vec.joblib')

['tf_vec.joblib']

In [21]:
# Reload the persisted model and vectorizer, rebinding the in-memory names.
# joblib.load unpickles its input, so only load files from a trusted source.
lda = joblib.load('lda_model.joblib')
tf_vectorizer = joblib.load('tf_vec.joblib')

In [22]:
# Inspect the fitted topic-word weights (one row per topic, one column per
# vocabulary term).
lda.components_

array([[8.22400821e+02, 1.28948696e+03, 1.00271428e-01, ...,
        1.53179978e+04, 1.00081536e-01, 1.00141389e-01],
       [7.81430956e+02, 1.07736657e+03, 9.40515257e+02, ...,
        4.66940613e+03, 4.45692339e+01, 1.00029288e-01],
       [2.48178171e+02, 3.70953908e+02, 1.04984032e-01, ...,
        2.12976919e+03, 1.09146577e-01, 1.00005226e-01],
       ...,
       [1.00011518e-01, 2.97704969e+02, 5.59001560e+02, ...,
        2.37459098e+04, 1.00240071e-01, 2.56129263e+02],
       [1.23700166e+02, 1.17044859e+02, 1.11901973e-01, ...,
        4.00392504e+02, 1.00042118e-01, 5.10931007e-01],
       [4.78846432e+02, 1.11306781e+01, 8.51907314e+01, ...,
        9.78441708e+03, 1.95067672e+04, 5.73390184e+01]])

In [25]:
def top_topic_features(model, feature_names, num_features=15):
    """Return the highest-weighted feature names for every topic.

    Parameters
    ----------
    model : object
        Fitted topic model exposing a 2-D ``components_`` NumPy array with one
        row per topic and one column per feature.
    feature_names : array-like of str
        Feature names ordered like the columns of ``model.components_``. Any
        sequence is accepted (list, tuple, ndarray).
    num_features : int, default=15
        Number of names to keep per topic. If it exceeds the number of
        features, all features are returned.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per topic, holding the names sorted from the
        largest to the smallest weight.
    """
    # Coerce to an ndarray so plain lists/tuples support the 2-D integer
    # fancy index used below.
    names = np.asarray(feature_names)
    # argsort is ascending, so reverse each row to get descending weights.
    ranked = model.components_.argsort(axis=1)[:, ::-1]
    return names[ranked[:, :num_features]]

In [26]:
# Vocabulary terms in column order of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
# dtype=str keeps a unicode array in both cases.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = np.array(tf_vectorizer.get_feature_names_out(), dtype=str)
else:
    feature_names = np.array(tf_vectorizer.get_feature_names(), dtype=str)
top_topic_features(lda, feature_names=feature_names)

array([['secretary', 'include', 'energy', 'subsection', 'federal',
        'technology', 'use', 'research', 'program', 'project',
        'paragraph', 'administrator', 'year', 'system', 'term'],
       ['person', 'information', 'law', 'report', 'subsection',
        'include', 'state', 'agency', 'federal', 'commission', 'term',
        'foreign', 'date', 'government', 'security'],
       ['loan', 'election', 'business', 'veteran', 'member', 'small',
        'veterans', 'individual', 'state', 'federal', 'subsection',
        'secretary', 'date', 'make', 'department'],
       ['land', 'secretary', 'water', 'area', 'national', 'federal',
        'indian', 'management', 'state', 'tribe', 'date', 'public',
        'wilderness', 'use', 'usc'],
       ['year', 'plan', 'amount', 'subsection', 'paragraph', 'drug',
        'ii', 'amend', 'respect', 'subparagraph', 'code', 'make',
        'payment', 'period', 'secretary'],
       ['health', 'service', 'care', 'secretary', 'child', 'include',
    

In [35]:
# Split the document-term matrix; the search below is fitted on X_train only.
X_train, X_test = train_test_split(word_vec, random_state=1659)

# Search space: a discrete set of topic counts, priors drawn uniformly from
# [0, 1], and learning_offset drawn uniformly from [10, 100] (loc=10, scale=90).
params = {'n_components': [10, 15, 20, 25],
          'doc_topic_prior': stats.uniform(),
          'topic_word_prior': stats.uniform(),
          'learning_offset': stats.uniform(10, 90)}
lda.set_params(verbose=0, n_jobs=-2)

# Seed the candidate sampling so the search is reproducible from a fresh
# kernel. A shared RandomState instance is used rather than an int seed: the
# search is fitted repeatedly with n_iter=1, and an int would make every fit()
# redraw the same candidate, whereas the instance advances between fits.
search_rng = np.random.RandomState(1659)
lda_cv = RandomizedSearchCV(lda, params, n_iter=1, n_jobs=-2, random_state=search_rng)

# Accumulator for one row of cv_results_ per fitted candidate.
results = {'mean_test_score': [],
           'std_test_score': [],
           'params': []}

In [36]:

# Number of single-candidate searches to run in this cell.
n_iter = 2

# Each fit evaluates one sampled candidate; copy that candidate's entry from
# cv_results_ into the matching accumulator list.
tracked_keys = ('mean_test_score', 'std_test_score', 'params')
for _ in range(n_iter):
    lda_cv.fit(X_train)
    latest = lda_cv.cv_results_
    for key in tracked_keys:
        results[key].append(latest[key][0])

In [37]:
# Tabulate the accumulated search results, write them to disk without the
# index column, and preview the first rows.
df_results = pd.DataFrame(data=results)
df_results.to_csv(path_or_buf='lda_tuning.csv', index=False)
df_results.head(5)

Unnamed: 0,mean_test_score,std_test_score,params
0,-12288870.0,838624.589772,"{'doc_topic_prior': 0.772393402455979, 'learni..."
1,-12349320.0,837992.711743,"{'doc_topic_prior': 0.24210823184129304, 'lear..."


In [39]:
# Column of sampled hyper-parameter dicts, one entry per recorded search fit.
# NOTE(review): `thing` is an uninformative name; it is kept because later
# cells index it by this name.
thing = df_results['params']

In [40]:
# Hyper-parameters recorded for the first row of the results table.
thing[0]

{'doc_topic_prior': 0.772393402455979,
 'learning_offset': 43.89834108330891,
 'n_components': 15,
 'topic_word_prior': 0.3674649966864506}

In [41]:
# Hyper-parameters recorded for the second row of the results table.
thing[1]

{'doc_topic_prior': 0.24210823184129304,
 'learning_offset': 20.32172534056673,
 'n_components': 10,
 'topic_word_prior': 0.33782122069279574}