In [1]:
import numpy as np
import pandas as pd
import pymongo
import os
from bson.json_util import loads, dumps
import psycopg2
from sqlalchemy import create_engine

In [2]:
postgres_password = os.environ['POSTGRES_PASSWORD']
mongo_username = os.environ['MONGO_INITDB_ROOT_USERNAME']
mongo_password = os.environ['MONGO_INITDB_ROOT_PASSWORD']
mongo_init_db = os.environ['MONGO_INITDB_DATABASE']

In [3]:
myclient = pymongo.MongoClient(f"mongodb://{mongo_username}:{mongo_password}@mongo:27017/{mongo_init_db}?authSource=admin")
contrans_db = myclient['contrans']
bills = contrans_db['bills']

In [4]:
myquery = bills.find({}, 
                     {'_id':0,
                     'sponsor_id':1,
                     'bill_text':1})
bills_df = pd.DataFrame.from_records(loads(dumps(myquery)))
bills_df['bill_text'] = bills_df['bill_text'].astype('str')

In [5]:
bills_df = bills_df.groupby(['sponsor_id'])['bill_text'].apply(' '.join).reset_index()

In [6]:
bills_df = bills_df.set_index('sponsor_id')

In [7]:
bills_df

Unnamed: 0_level_0,bill_text
sponsor_id,Unnamed: 1_level_1
,This Act may be cited as the COVID–19 Health D...
A000055,"Whereas created by farmers, led by farmers, an..."
A000148,This Act may be cited as the No Bank Accounts ...
A000369,That the House has heard with profound sorrow ...
A000370,This Act may be cited as the Protecting Wages ...
...,...
W000827,This Act may be cited as the Advanced Aviation...
Y000033,This Act may be cited as the Bringing Oligarch...
Y000062,Whereas family service learning is a method un...
Y000064,This Act may be cited as the Ukraine Human Rig...


In [8]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

In [9]:
tfIdfVectorizer= TfidfVectorizer(stop_words='english', 
                                 max_df = .8, 
                                 ngram_range = (1,3))
tfIdf = tfIdfVectorizer.fit_transform(bills_df['bill_text'])

In [10]:
df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names_out(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False).reset_index()
df = df.rename({'index':'word', 'TF-IDF':'tf_idf'}, axis=1)
df.head(10)

Unnamed: 0,word,tf_idf
0,bifia program,0.192886
1,bifia,0.192886
2,project,0.177575
3,secured loan,0.142876
4,assistant secretary,0.129462
5,assistant,0.112714
6,hydrogen,0.105573
7,line credit,0.096787
8,obligor,0.096476
9,credit,0.08506


In [11]:
%%time
charwords = pd.DataFrame()
for t in range(0,tfIdf.shape[0]):
    #print(f'Now working on {t} of {tfIdf.shape[0]}', end="\r")
    df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names_out(), columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False).reset_index().head(10)
    df = df.rename({'index':'word', 'TF-IDF':'tf_idf'}, axis=1)
    df['sponsor_id'] = bills_df.index[t]
    charwords = pd.concat([charwords, df], ignore_index=True)

CPU times: user 42min 36s, sys: 1min 16s, total: 43min 52s
Wall time: 43min 34s


In [12]:
engine = create_engine('postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}'.format(
    user = 'postgres',
    password = postgres_password,
    host = 'postgres',
    port = '5432',
    db = 'contrans'))

In [13]:
charwords.to_sql('charwords', con=engine, if_exists='replace', index=False, chunksize=1000)

5500

In [14]:
myquery = '''
SELECT *
FROM charwords
'''
pd.read_sql_query(myquery, con=engine)

Unnamed: 0,word,tf_idf,sponsor_id
0,bifia program,0.192886,
1,bifia,0.192886,
2,project,0.177575,
3,secured loan,0.142876,
4,assistant secretary,0.129462,
...,...,...,...
5495,assistant,0.112714,Z000017
5496,hydrogen,0.105573,Z000017
5497,line credit,0.096787,Z000017
5498,obligor,0.096476,Z000017


In [15]:
def get_top_bills(t):
    df = pd.DataFrame(t.T.todense(), index=tfIdfVectorizer.get_feature_names_out(), columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False).reset_index().head(10)
    df = df.rename({'index':'word', 'TF-IDF':'tf_idf'}, axis=1)
    #df['sponsor_id'] = bills_df.index[t]
    return df

In [16]:
%%time
top_bills_list = [get_top_bills(t) for t in tfIdf]

CPU times: user 47min, sys: 1min 22s, total: 48min 22s
Wall time: 48min


In [17]:
charwords = pd.read_sql_query(myquery, con=engine)

In [18]:
charwords.to_csv('charwords.csv')