In [1]:
import re

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.model_selection import RandomizedSearchCV, train_test_split
from nltk.corpus import stopwords
from scipy import stats

In [2]:
import joblib

In [3]:
bill_text_df = pd.read_csv('116bill_text.csv')

In [4]:
bill_text_df.pop('Unnamed: 0')
bill_text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14167 entries, 0 to 14166
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   bill_num  14167 non-null  object
 1   type      14167 non-null  object
 2   text      14167 non-null  object
dtypes: object(3)
memory usage: 332.2+ KB


In [5]:
bill_text_df

Unnamed: 0,bill_num,type,text
0,s1379,enr,html body pre s ...
1,s1977,is,html body pre lt doc gt ...
2,s4439,is,html body pre lt doc gt ...
3,s3278,is,html body pre lt doc gt ...
4,s4861,is,html body pre lt doc gt ...
...,...,...,...
14162,hjres109,ih,html body pre lt doc gt ...
14163,hjres7,ih,html body pre lt doc gt ...
14164,hjres107,enr,html body pre h j res ...
14165,hjres9,ih,html body pre lt doc gt ...


In [10]:
stopwords_ = set(stopwords.words('english'))
additional_stopwords = ('congress', 'act', 'states', 'united', 
                        'house', '116th', 'html', 'bill', 'introduced', 
                        'title','gt', 'subsection', 'paragraph', 'subparagraph',
                        'insert', 'section', 'mr', 'ms', 'shall', 'sec')
roman_numerals = ('i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv')
stopwords_ = stopwords_.union(additional_stopwords)
stopwords_ = stopwords_.union(roman_numerals)

tf_vectorizer = CountVectorizer(stop_words=stopwords_, 
                                max_df=0.85, min_df=2, 
                                max_features=5000)

word_vec = tf_vectorizer.fit_transform(bill_text_df.text)


In [11]:
#trying with tfidf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, max_df=0.9, min_df=3, stop_words=stopwords_)
word_vec = vectorizer.fit_transform(bill_text_df.text)

In [12]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(learning_method='online',
                                n_jobs=-2,
                                random_state=0, 
                                n_components = 14,)
lda.fit(word_vec)

LatentDirichletAllocation(learning_method='online', n_components=14, n_jobs=-2,
                          random_state=0)

In [77]:
joblib.dump(lda, 'lda_model.joblib')
joblib.dump(tf_vectorizer, 'tf_vec.joblib')

['tf_vec.joblib']

In [78]:
lda = joblib.load('lda_model.joblib')
tf_vectorizer = joblib.load('tf_vec.joblib')

In [13]:
lda.components_

array([[0.07142865, 0.07142857, 0.07142857, ..., 0.07142859, 0.07142863,
        0.07142857],
       [0.07142861, 0.07142857, 0.07143138, ..., 0.07142857, 0.07142857,
        0.07142857],
       [0.07142857, 0.07142857, 0.07142857, ..., 0.07142857, 0.07142857,
        0.07142857],
       ...,
       [0.07142857, 0.07142857, 0.07142857, ..., 0.07142857, 0.07142857,
        0.07142857],
       [0.07142859, 0.0714286 , 0.07142864, ..., 0.07142872, 0.07142868,
        0.07142857],
       [0.07142889, 0.07142911, 0.07142865, ..., 0.07142876, 0.07143023,
        0.07142917]])

In [14]:
def top_topic_features(model, feature_names, num_features=15):
    sorted_topics = feature_names[model.components_.argsort(axis=1)[:, ::-1][:, :num_features]]
    return sorted_topics


In [15]:
feature_names = np.array(tf_vectorizer.get_feature_names())
top_topic_features(lda, feature_names=feature_names)

array([['elderly', 'vote', 'ballot', 'volunteer', 'voter', 'ceiling',
        'register', 'magnum', 'absentee', 'political', 'rico', 'cash',
        'offering', 'battlefield', 'redesignate'],
       ['operator', 'addiction', 'ninth', 'horse', 'packer', 'pakistan',
        'oceans', 'franchisee', 'overall', 'abatement', 'competitive',
        'origination', 'hereafter', 'coronavirus', 'maximum'],
       ['futures', 'ride', 'battalion', 'acres', 'type', 'ar',
        'occupant', 'therapeutic', 'precede', 'class', 'car',
        'assessment', 'sustainment', 'merger', 'heading'],
       ['futures', 'ride', 'battalion', 'type', 'task', 'acres', 'ar',
        'precede', 'class', 'car', 'sustainment', 'merger', 'heading',
        'tax', 'forgive'],
       ['post', 'possibility', 'abortion', 'reef', 'build', 'offering',
        'cohort', 'preserve', 'facilities', 'members', 'desert',
        'locality', 'six', 'humanitarian', 'warren'],
       ['vegetation', 'motion', 'alzheimer', 'trail', 'mo

In [83]:
X_train, X_test = train_test_split(word_vec, random_state=0)

params = {'n_components': [8, 9, 10, 11, 12, 13, 14, 15, 16], 
          'doc_topic_prior': stats.uniform(),
          'topic_word_prior': stats.uniform(),
          'learning_offset': stats.uniform(10, 90)}
lda.set_params(**{'verbose': 0, 'n_jobs': -2})
lda_cv = RandomizedSearchCV(lda, params, n_iter=1, n_jobs=-2)

results = {'mean_test_score': [],
'std_test_score': [],
'params': []}

In [84]:
n_iter = 10

for _ in range(n_iter):
    lda_cv.fit(X_train)
    results['mean_test_score'].append(lda_cv.cv_results_['mean_test_score'][0])
    results['std_test_score'].append(lda_cv.cv_results_['std_test_score'][0])
    results['params'].append(lda_cv.cv_results_['params'][0])

In [94]:
df_results = pd.DataFrame(results)
df_results.to_csv('lda_tuning.csv', index=False)

In [95]:
df_results

Unnamed: 0,mean_test_score,std_test_score,params
0,-12713050.0,1050444.0,"{'doc_topic_prior': 0.12246925172545775, 'lear..."
1,-12695190.0,1047662.0,"{'doc_topic_prior': 0.8373986825327147, 'learn..."
2,-12778750.0,1078701.0,"{'doc_topic_prior': 0.39107048660397303, 'lear..."
3,-12670630.0,1050531.0,"{'doc_topic_prior': 0.41222948099047985, 'lear..."
4,-12839000.0,1073545.0,"{'doc_topic_prior': 0.8387933849302702, 'learn..."
5,-12758670.0,1050105.0,"{'doc_topic_prior': 0.27506917734206915, 'lear..."
6,-12707860.0,1049039.0,"{'doc_topic_prior': 0.6801782077844617, 'learn..."
7,-12752790.0,1048893.0,"{'doc_topic_prior': 0.3287094284099852, 'learn..."
8,-12675660.0,1059974.0,"{'doc_topic_prior': 0.73544623886862, 'learnin..."
9,-12736490.0,1043974.0,"{'doc_topic_prior': 0.2994912602346441, 'learn..."


In [86]:
thing = df_results['params']

In [93]:
for i in range(10):
    print(thing[i])

{'doc_topic_prior': 0.12246925172545775, 'learning_offset': 42.936564106507426, 'n_components': 14, 'topic_word_prior': 0.8894895831552581}
{'doc_topic_prior': 0.8373986825327147, 'learning_offset': 50.727836635620285, 'n_components': 14, 'topic_word_prior': 0.012842900659330136}
{'doc_topic_prior': 0.39107048660397303, 'learning_offset': 54.06028475400152, 'n_components': 9, 'topic_word_prior': 0.06761739037782155}
{'doc_topic_prior': 0.41222948099047985, 'learning_offset': 91.86860557710408, 'n_components': 16, 'topic_word_prior': 0.4890108026487656}
{'doc_topic_prior': 0.8387933849302702, 'learning_offset': 65.49550904704746, 'n_components': 8, 'topic_word_prior': 0.8947303138113302}
{'doc_topic_prior': 0.27506917734206915, 'learning_offset': 43.875365804729775, 'n_components': 11, 'topic_word_prior': 0.9673306224691209}
{'doc_topic_prior': 0.6801782077844617, 'learning_offset': 26.54815391242329, 'n_components': 14, 'topic_word_prior': 0.6573087723900943}
{'doc_topic_prior': 0.3287

In [16]:
#from reviewmodel import ReviewLDA # OOP file for LDA and GridSearchCV process
import pyLDAvis
import pyLDAvis.sklearn


In [34]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words=stopwords_)
word_vec = vectorizer.fit_transform(bill_text_df.text)

neg_lda = LatentDirichletAllocation(learning_method='online',
                                n_jobs=-1,
                                random_state=0, 
                                n_components = 20)

neg_lda.fit(word_vec)



  and should_run_async(code)


LatentDirichletAllocation(learning_method='online', n_components=20, n_jobs=-1,
                          random_state=0)

  and should_run_async(code)


In [35]:
neg_ldavis = pyLDAvis.sklearn.prepare(neg_lda, word_vec, vectorizer) #Pass model, document-term matrix, and tfidf vectors to pyLDAvis.sklearn package
pyLDAvis.display(neg_ldavis) #Display visual

  and should_run_async(code)


TypeError: Object of type complex is not JSON serializable