In [1]:
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

In [2]:
# Try to stem, lemmatize, remove stop words, and use tf/idf on subject areas and titles

In [3]:
# The predefined subjects
important_subjects = ['Antitrust', 'Banking and Finance', 'Bankruptcy', 'Corporate Mergers and Acquisitions', 
                      'Employee Benefits', 'Health', 'Intellectual Property',  'Labor and Employment', 'Securities', 
                      'Tax']

In [4]:
# save it as a TextBlob object
subjects = [TextBlob(s) for s in important_subjects]

In [12]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/joeljoel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joeljoel/nltk_data...


True

In [9]:
# initialize stemmer
stemmer = SnowballStemmer('english')

# stem each word
print([stemmer.stem(word) for s in subjects for word in s.words])

['antitrust', 'bank', 'and', 'financ', 'bankruptci', 'corpor', 'merger', 'and', 'acquisit', 'employe', 'benefit', 'health', 'intellectu', 'properti', 'labor', 'and', 'employ', 'secur', 'tax']


In [10]:

# define a function that accepts text and returns a list of lemmas
def split_into_lemmas(text):
    text = text.lower()
    words = TextBlob(text).words
    return [word.lemmatize() for word in words]

def split_into_lemmas2(text):
    text = text.lower()
    words = TextBlob(text).words
    
    return " ".join([word.lemmatize() for word in words])

def stem_words(text):

    return stemmer.stem(text)

# use split_into_lemmas as the feature extraction function (WARNING: SLOW!)

# TfidfVectorizer
#vect = TfidfVectorizer(preprocessor=stem_words, ngram_range=(1,3), stop_words='english')
vect = TfidfVectorizer(preprocessor=split_into_lemmas2, ngram_range=(1,3), stop_words='english')

In [16]:
pd.DataFrame(vect.fit_transform(important_subjects).toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,acquisition,antitrust,banking,banking finance,bankruptcy,benefit,corporate,corporate merger,corporate merger acquisition,employee,...,health,intellectual,intellectual property,labor,labor employment,merger,merger acquisition,property,security,tax
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.408248,0.0,0.0,0.0,0.0,0.0,0.408248,0.408248,0.408248,0.0,...,0.0,0.0,0.0,0.0,0.0,0.408248,0.408248,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0,0.57735,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
## Now try the same queries, but in python!
import psycopg2

# connect:
dbname='bills_db'
username='joeljoel'
con = psycopg2.connect(database = dbname, user = username)

# query:
sql_query = """
SELECT DISTINCT(subject) FROM bill_subject;
"""
all_subjects = pd.read_sql_query(sql_query,con)['subject']

all_subjects.head()

  all_subjects = pd.read_sql_query(sql_query,con)['subject']


0                  Venezuela
1    Congressional elections
2         Dominican Republic
3          Medical education
4                   Portugal
Name: subject, dtype: object

In [19]:
pd.DataFrame(vect.fit_transform(all_subjects).toarray(), columns=vect.get_feature_names())

AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names'

In [14]:
vect.fit_transform(all_subjects)

NameError: name 'all_subjects' is not defined

In [5]:
# Now let's do the same for the titles
# query:
sql_query = """
SELECT bill_name FROM us_bills;
"""
title_query = pd.read_sql_query(sql_query,con)['bill_name']

In [6]:
title_query.head()

0    Regarding consent to assemble outside the seat...
1    Recognizing the challenges and burdens associa...
2    Expressing the sense of the Congress regarding...
3    Supporting the Association of American Veterin...
4    Providing for a joint session of Congress to r...
Name: bill_name, dtype: object

In [7]:
test_frame = pd.DataFrame(vect.fit_transform(title_query).toarray(), columns=vect.get_feature_names())

NameError: name 'vect' is not defined

In [111]:
test_frame[test_frame['tax'] > 0]

Unnamed: 0,00,00 cv,00 cv 03110,000,000 000,000 000 000,000 000 50,000 000 auditor,000 000 cause,000 000 consecutive,...,zone order,zone order help,zone purpose,zone transit,zone transit zone,zone united,zone united state,zoological,zoological veterinary,zoological veterinary medicine
21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Check to see if this will work on the full data for U.S. bills
# Now let's do the same for the titles
# query:
sql_query = """
SELECT bill_text FROM us_bills;
"""
bill_query = pd.read_sql_query(sql_query,con)['bill_text']


  bill_query = pd.read_sql_query(sql_query,con)['bill_text']


In [21]:
len(bill_query)

10933

In [22]:
%time full_text_frame = pd.DataFrame(vect.fit_transform(bill_query).toarray(), columns=vect.get_feature_names())

AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names'

In [23]:
# Check to see if this will work on the full data for NY bills


In [24]:
full_text_frame[full_text_frame['tax'] > 0]

NameError: name 'full_text_frame' is not defined

0    {"\n","[Congressional Bills 114th Congress]\n"...
1    {"\n","[Congressional Bills 114th Congress]\n"...
2    {"\n","[Congressional Bills 114th Congress]\n"...
3    {"\n","[Congressional Bills 114th Congress]\n"...
4    {"\n","[Congressional Bills 114th Congress]\n"...
Name: bill_text, dtype: object