# Welcome and Setup

This project will investigate the ability to detect ideology from the speech of individuals in Congress. The first few cells will import the necessary packages, set up the vectorizers, and read in the data. The format thereafter is as follows:

    1)  Prepare data for bag of words models
    2)  Inspect resulting data
    3)  Reshape data for LSTM model
    4)  Pad and sequence in prep for LSTM Model
    5)  Evaluate BOW models
    6)  Collect ideologies from BOW models to compare against GovTrack data
    7)  Run LSTM model
    8)  Collect ideologies from LSTM models to compare against GovTrack data
    9)  Graph comparisons and performance of models


In [1]:
import pandas as pd
import numpy as np

import datetime


from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, Embedding, LSTM, Dropout, Flatten, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

import re

import seaborn as sns
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
big_start = datetime.datetime.now()

In [3]:
#cd into the directory of the inputs
!cd /home/kr.redfield/w266/project/Inputs

### Initiliaze Instances of Vectorizers

We will only use one instance of each vectorizer throughout the notebook. They are initialized here. For the vectorizer, English stopwords are removed.

In [4]:
#set up your tfidf vectorizer
#we use english stop words and accept the rest as default options
tf = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    strip_accents='unicode',
    max_features=None)

#set up your count vectorizer
#we use english stop words and accept the rest as default options

count_vec = CountVectorizer(stop_words = "english",strip_accents='unicode')

In [5]:
def replace_nth_char(string_orig, char_orig, char_rep, n = None):
    """Function to replace nth occurence of character with a different character."""
        
    str_l = string_orig.split(char_orig)
    ret_str = str_l[0] + "|" + str_l[1]
    for i in range(2,len(str_l)):
        ret_str += str_l[i]
    
    return ret_str

In [6]:
start = datetime.datetime.now()

#let's read in the data, shall we?
def read_congress_speech(yr, sample = True):
    '''Function to read in the raw data from HeinOnline download'''
    
    #just plain read it in using the text format
    with open("./speeches_{}.txt".format(yr), "r+", errors='ignore') as file:
        data = file.readlines()
    
    #its pipe delimited, but at some points in the file, there are pipes occuring in the text itself
    #we need to sub out those instances so we can read it into a table
    for i in range(len(data)):
        if data[i].count("|") > 1:
            data[i] = replace_nth_char(data[i],"|","/")
    
    #once you've replace it, write it back to the file
    with open("./temp_clean.txt", "w") as file:
        for line in data:
            file.write(line)

    #read in the cleaned speech data (note: use latin1. unicode doesn't work)
    x = pd.read_table("./temp_clean.txt",delimiter="|", encoding="latin1")
    
    #read in the data mapping speeches to the characteristics of the speaker, including party, state, chamber, etc.
    y = pd.read_table("./{}_SpeakerMap.txt".format(yr),delimiter="|")

    #merge the two together
    data = pd.merge(x,y,on='speech_id',how='inner')
    
    #create a variable assigning it to the number congress you're reading in
    data['congress_num'] = yr
    
    #string cleaning to lowercase
    data = data[data['party'].apply(lambda x: x.lower() in ['r','d'])]
    data = data[data['chamber'].apply(lambda x: x.lower() in ['s','h'])]

    #two things going on here
    #first, getting rid of anything that's not a string in speech
    #second, getting rid of any speech fewer than 30 words in length
    data = data[data['speech'].apply(lambda x: False if not isinstance(x,str) else len(x.split(" ")) > 30)]
    
    #third, dropping any speech greater than 500 words in length
    data = data[data['speech'].apply(lambda x: len(x.split(" ")) < 500)]

    print("Read in speeches for Congress {}. Data contains {} speeches.".format(yr, data.shape[0]))
    
    #if we don't trim it down a bit, this thing is unmanageably large
    #let's just take 40% of the data we read in
    if sample:
        data = data.sample(frac=.4)

    return data

train_df = pd.DataFrame()

#iterate through all congresses except the most recent two
for yr in ["097","098","099","100","101","102","103","104","105","106","107","108","109","110","111","112"]:

    train_df = train_df.append(read_congress_speech(yr))

#make sure we don't have nonetype speeches
train_df = train_df[train_df['speech'].notnull()]

#and fit the tfidf vectorizer
tf.fit(train_df['speech'])

#you worked hard to import. save your work
train_df.to_csv("train_df.csv")

print("Runtime is {}.".format(datetime.datetime.now()-start))

#shuffle it up per standard practice
train_df = train_df.sample(frac=1)

Read in speeches for Congress 097. Data contains 81211 speeches.
Read in speeches for Congress 098. Data contains 82657 speeches.
Read in speeches for Congress 099. Data contains 83178 speeches.
Read in speeches for Congress 100. Data contains 79886 speeches.
Read in speeches for Congress 101. Data contains 70439 speeches.
Read in speeches for Congress 102. Data contains 69103 speeches.
Read in speeches for Congress 103. Data contains 69042 speeches.
Read in speeches for Congress 104. Data contains 82845 speeches.
Read in speeches for Congress 105. Data contains 59736 speeches.
Read in speeches for Congress 106. Data contains 60525 speeches.
Read in speeches for Congress 107. Data contains 49553 speeches.
Read in speeches for Congress 108. Data contains 54947 speeches.
Read in speeches for Congress 109. Data contains 51171 speeches.
Read in speeches for Congress 110. Data contains 56331 speeches.
Read in speeches for Congress 111. Data contains 43161 speeches.
Read in speeches for Cong

In [7]:
val_df = pd.DataFrame()

#read in the two most recent congresses to use for validation
#we will compare these against GovTrack scores
#note: not sampling this down. we'll end up with the full deal

for yr in ["113","114"]:
    val_df = val_df.append(read_congress_speech(yr, sample = False))

#shuffling this as well
val_df = val_df.sample(frac=1)
val_df.head()

Read in speeches for Congress 113. Data contains 35951 speeches.
Read in speeches for Congress 114. Data contains 32385 speeches.


Unnamed: 0,speech_id,speech,speakerid,lastname,firstname,chamber,state,gender,party,district,nonvoting,congress_num
2381,1130003810,Mr. President. today I wish to recognize two o...,113121841,HELLER,DEAN,S,NV,M,R,,voting,113
21654,1130035067,In the time since this body last adjourned. st...,113121520,THOMPSON,GLENN,H,PA,M,R,5.0,voting,113
23793,1130038287,Mr. Speaker. I rise today in strong support of...,113120970,RENACCI,JAMES,H,OH,M,R,16.0,voting,113
21179,1130034262,At the beginning of the consideration of this ...,113119160,DEFAZIO,PETER,H,OR,M,D,4.0,voting,113
71146,1130116260,I ask unanimous consent that when the Senate c...,113120961,REID,HARRY,S,NV,M,D,,voting,113


In [10]:
govtrack_report_cards = pd.DataFrame()

#you can download the GovTrack Ideology scores at https://www.govtrack.us/congress/members/report-cards/2013 ... 2014... etc.
#a little manual manipulation made them such that we can sweep through and put them into one df
for yr in ['2013','2014','2015','2016','2017']:
    yr_reportcard = pd.read_excel(r"Ideology_13_17.xlsx",sheet_name=yr)
    govtrack_report_cards = govtrack_report_cards.append(yr_reportcard, ignore_index=True)

#because there are several years in a single congress, we take the mean of their ideology of all the years that member participated in that congress
govtrack_report_cards = govtrack_report_cards.groupby(['congress','chamber','state','last_name']).ideology.mean().reset_index()
govtrack_report_cards.rename(columns={"last_name":"lastname", 'congress':'congress_num'}, inplace = True)

#force congress_num to string for purposes of merging to the train_df later
govtrack_report_cards['congress_num'] = govtrack_report_cards['congress_num'].apply(str)
govtrack_report_cards.head()

Unnamed: 0,congress_num,chamber,state,lastname,ideology
0,113,H,AK,young,0.644915
1,113,H,AL,aderholt,0.662598
2,113,H,AL,bachus,0.707156
3,113,H,AL,brooks,0.759287
4,113,H,AL,byrne,0.675195


# Prepare Data for N-Gram Models

Because the preparation will differ across models, here we prepare the data in two different ways to be employed for various n-gram models. We will test the use of both TDIDF Vectorizer and Count Vectorizer to compare their performance. Both vectorizers automatically remove the effects of punctuation and capitalization to standardize the data for use in the models.

In [11]:
start = datetime.datetime.now()

train_df = pd.read_csv("train_df.csv")

#retrieve training data from dataframe for TfIDF models
x_tf = train_df['speech']
x_tf = tf.fit_transform(train_df['speech'])

#retrieve test data from dataframe for TfIDF models
y_tf = train_df['party']
y_tf = np.where(y_tf.str.lower() == 'r',1,0)

#retrieve validation prediction from dataframe for TfIDF models
x_val_tf = val_df['speech']
x_val_tf = tf.transform(val_df['speech'])

#retrieve validation actual results from dataframe for TfIDF models
y_val_tf = val_df['party']
y_val_tf = np.where(y_val_tf.str.lower() == 'r',1,0)

print("Runtime is {}.".format(datetime.datetime.now()-start))

Runtime is 0:01:19.999515.


In [12]:
start = datetime.datetime.now()

#retrieve training data from dataframe for Count models
x_countvec = train_df['speech']
x_countvec = count_vec.fit_transform(train_df['speech'])

#retrieve test data from dataframe for Count models
y_countvec = train_df['party']
y_countvec = np.where(y_countvec.str.lower() == 'r',1,0)

#retrieve validation prediction from dataframe for Count models
x_val_count = val_df['speech']
x_val_count = count_vec.transform(val_df['speech'])

#retrieve validation actual results from dataframe for Count models
y_val_count = val_df['party']
y_val_count = np.where(y_val_count.str.lower() == 'r',1,0)


print("Runtime is {}.".format(datetime.datetime.now()-start))


Runtime is 0:01:07.868742.


In [13]:
X_tf_train, X_tf_test, y_tf_train, y_tf_test = train_test_split(x_tf, y_tf, test_size= 0.2, random_state=81392)

In [14]:
X_countvec_train, X_countvec_test, y_countvec_train, y_countvec_test = train_test_split(x_countvec, y_countvec, test_size= 0.2, random_state=81392)

### Inspect Data

Here we'll take a look at some basic statistics of this monstrosity we've created.

In [15]:
#get the weights from the fitted TfIDF
weights = tf.idf_

#get a dataframe from it to look at what words are most likely (worth the least)
word_vs_weight = dict(zip(tf.get_feature_names(), weights))
idf_inspect = pd.DataFrame.from_dict(word_vs_weight, orient='index').reset_index()
idf_inspect.columns=('token','weight')

#sort it so the lowest weighted words are at the top
idf_inspect = idf_inspect.sort_values(by='weight')

#and throw it into a list so we can use it in a couple cells
bottom_50 = idf_inspect.head(50)['token'].tolist()

#let's take a look
idf_inspect.head(50)

Unnamed: 0,token,weight
251648,mr,1.322735
296632,president,1.913069
378145,time,2.128622
351628,speaker,2.204055
37897,amendment,2.460022
91763,committee,2.507589
80531,chairman,2.509969
163746,gentleman,2.522812
284702,people,2.562267
338640,senator,2.567488


In [16]:
#count up number of records in train_df
print("{} records in dataset.".format(train_df.shape[0]))

#assess distribution of length of record by Congress
speeches_by_year = train_df.groupby("congress_num").speech.count().reset_index()

#get number of speakers
speakers_by_year = train_df.groupby("congress_num").speakerid.count().reset_index()

#count number of democrat/republican speakers
party_split_train = train_df['party'].value_counts()
party_split_val = val_df['party'].value_counts()

413983 records in dataset.


In [17]:
speeches_by_year

Unnamed: 0,congress_num,speech
0,97,32484
1,98,33063
2,99,33271
3,100,31954
4,101,28176
5,102,27641
6,103,27617
7,104,33138
8,105,23894
9,106,24210


In [18]:
speakers_by_year

Unnamed: 0,congress_num,speakerid
0,97,32484
1,98,33063
2,99,33271
3,100,31954
4,101,28176
5,102,27641
6,103,27617
7,104,33138
8,105,23894
9,106,24210


In [19]:
party_split_train

D    213237
R    200746
Name: party, dtype: int64

In [20]:
party_split_val

R    35339
D    32997
Name: party, dtype: int64

# Prepare Data for LSTM Model

Because the preparation will differ across models, here we prepare the data in to be sent to the LSTM. Because we will send it to be embedded in text form, we take out the 50 words with the highest tfidf weight, standardize the strings to lowercase, and turn into tokenized sequences.

## Reduce Length of Each Speech in Reshape

Currently, the data is intractably long because the speeches can be up to 20,000 words in length. We will divide speeches into component sentences so the LSTM can train later. In doing so, we will limit ourselves to looking at speeches between 20 and 150 words in length. Sentences fewer than 20 words in length are very likely to be procedural. Going above 150 words would again make the LSTM run for far too long.

Finally, we remove any sentence containing a word belonging to a list of words with the lowest weight from the TfIDF fit above. This will ensure that we remove the majority of procedural sentences.

In [21]:
#control for the user if you already have the data all cleaned up and ready for analysis
#if you want to just read in the already created reshaped dataset, set this to False
create_new_sentence_df = False

def gen_sentence_df(input_df=train_df, max_sentence_length=200, export=False):
    '''function to break up the dataset into smaller sentences and get rid of any speeches that have low weight words'''
    
    sentence_df = pd.DataFrame()

    #you'll get a memory error on the reshape section if you try to do it all at once
    #so we iterate through some chunks of it
    for i, chunk in enumerate(np.array_split(input_df,20)):
        
        print("Processing chunk {}".format(i))
        
        #first, drop any speech that is longer than what we're tolerating
        temp = chunk[chunk['speech'].apply(lambda x: len(x.split(". ")) < max_sentence_length)]
        
        #next, split on the periods. this should give us just the sentences in the speech
        temp = temp['speech'].str.split('.',expand=True)
        temp.columns = ["sentence{}".format(x) for x in range(len(temp.columns))]
        
        #merge the speech_id back onto the split sentences
        temp = pd.concat([temp,chunk[chunk['speech'].apply(lambda x: len(x.split(". ")) < max_sentence_length)].loc[:,['speech_id',"party"]]],axis=1)
        
        #reshape that to long for processing later
        temp = pd.wide_to_long(temp, stubnames="sentence", i = "speech_id", j = "sentence_num").reset_index()
        
        #get rid of the nulls that populated as a result and then lowercase everything
        temp = temp[temp['sentence'].notnull()]
        temp['sentence'] = temp['sentence'].str.lower()
        
        #only keep those sentences greater than 20 words in length. Any shorter and its probably procedural
        temp = temp[temp['sentence'].apply(lambda x: len(x.split(" ")) > 20)]
        
        #get rid of sentences containing the 50 procedural words identified by the TfIDF Vectorizer
        #this should leave us with a pretty clean, non-procedural dataset
        temp = temp[~temp['sentence'].isin(bottom_50)]
        temp.rename(columns={'sentence':'speech'}, inplace=True)
        
        #and append it to the running list
        sentence_df = sentence_df.append(temp)
    
    if export:
        sentence_df.to_csv("sentence_df_train.csv")
        
    return sentence_df

start = datetime.datetime.now()

if create_new_sentence_df:
    #do this for the training data or just import it if its already done
    sentence_df_train = gen_sentence_df(input_df=train_df, max_sentence_length=150, export=True)

else:
    sentence_df_train = pd.read_csv("sentence_df_train.csv")
    
#clean for validation data
sentence_df_val = gen_sentence_df(input_df=val_df, max_sentence_length=150, export=False)

print("Runtime is {}.".format(datetime.datetime.now()-start))


Processing chunk 0
Processing chunk 1
Processing chunk 2
Processing chunk 3
Processing chunk 4
Processing chunk 5
Processing chunk 6
Processing chunk 7
Processing chunk 8
Processing chunk 9
Processing chunk 10
Processing chunk 11
Processing chunk 12
Processing chunk 13
Processing chunk 14
Processing chunk 15
Processing chunk 16
Processing chunk 17
Processing chunk 18
Processing chunk 19
Runtime is 0:00:26.279066.


In [22]:
#check out how many republican/democrat speeches we have now
sentence_df_train['party'].value_counts()

D    431627
R    392214
Name: party, dtype: int64

In [23]:
sentence_df_val['party'].value_counts()

D    66756
R    66607
Name: party, dtype: int64

In [24]:
#just for fun, let's check out the new lowest and highest weights resulting from a new fit of TfIDF on the sentence dataframe
tf2 = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    strip_accents='unicode',
    max_features=None)

#fit new one on new speeches
tf2.fit(sentence_df_train['speech'])

#get the weights from the fitted TfIDF
weights = tf2.idf_

#get a dataframe from it to look at what words are most likely (worth the least)
word_vs_weight = dict(zip(tf2.get_feature_names(), weights))
idf_inspect = pd.DataFrame.from_dict(word_vs_weight, orient='index').reset_index()
idf_inspect.columns=('token','weight')

#sort it so the lowest weighted words are at the top
idf_inspect = idf_inspect.sort_values(by='weight')

#and throw it into a list so we can use it in a couple cells
bottom_20 = idf_inspect.head(20)['token'].tolist()

#let's take a look (it'll never be perfect)
idf_inspect.head(20)

Unnamed: 0,token,weight
15558,amendment,3.778943
38762,committee,3.833096
122794,people,3.843403
164679,time,3.990456
154803,states,4.013566
80279,house,4.124257
147059,senate,4.124699
40992,congress,4.171079
147186,senator,4.196392
95803,legislation,4.23545


## Tokenize and Pad LSTM Input

Previously, we have reshaped the data to be shorter and to remove procedural sentences. Now, we tokenize the sequence so that we're inputting numbers to the LSTM and pad them so they are all of equal length. The LSTM requires vectors of equal length to be input to the network.

In [25]:
start = datetime.datetime.now()

#initialize a tokenizer for the speech sequences
tkn = Tokenizer()

#flexibly determine the max length of the speeches to feed to feed to the LSTM
sentence_df_train['speech_len'] = sentence_df_train['speech'].apply(lambda x: len(x.split(" ")))
max_speech_len = sentence_df_train.speech_len.max()

print("Max speech length is: {}".format(max_speech_len))

#start reducing your data to just the arrays the LSTM needs
x_test = sentence_df_train['speech']
#and fit the tokenizer using that data
tkn.fit_on_texts(x_test)

#note, tokenizer reserves 0 for a not word
#so the size is going to be one larger than expected
vocabulary_size = len(tkn.word_index) + 1

print("vocabulary size is: {}".format(vocabulary_size))

#transform your train data to integer sequences
x_lstm = tkn.texts_to_sequences(x_test)
#and fill in with zeroes for those speeches shorter than the longest speech
#all obs will now have .shape = (None,max_speech_len)
x_lstm = pad_sequences(x_lstm, maxlen=max_speech_len)

#convert your target (party) to int
#republican's will be 1 to match GovTracker system
y_lstm = sentence_df_train['party']
y_lstm = np.where(y_lstm.str.lower() == 'r',1,0)

x_lstm = np.asarray(x_lstm)

print("Runtime is {}.".format(datetime.datetime.now()-start))


Max speech length is: 205
vocabulary size is: 182413
Runtime is 0:00:57.438891.


In [26]:
start = datetime.datetime.now()

#ensure that the length of the validation data does not exceed the length of the training/test data
x_val_orig = sentence_df_val[sentence_df_val['speech'].apply(lambda x: len(x.split(" ")) <= max_speech_len)]
x_val = x_val_orig['speech']

#fit the validation data on the already generated training sequencer
x_val = tkn.texts_to_sequences(x_val)
x_val = pad_sequences(x_val, maxlen=max_speech_len)

#same target mapping to int as above
y_val = sentence_df_val['party']
y_val = np.where(y_val.str.lower() == 'r',1,0)

x_val = np.asarray(x_val)

print("Runtime is {}.".format(datetime.datetime.now()-start))


Runtime is 0:00:04.702839.


In [27]:
X_lstm_train, X_lstm_test, y_lstm_train, y_lstm_test = train_test_split(x_lstm, y_lstm, test_size= 0.2, random_state=81392)

In [28]:
print("Shape of x_train is: {}".format(X_lstm_train.shape))
print("Shape of y_train is: {}".format(y_lstm_train.shape))
print("Shape of x_val is: {}".format(x_val.shape))


Shape of x_train is: (659072, 205)
Shape of y_train is: (659072,)
Shape of x_val is: (133363, 205)


# Run N-Gram Models and Establish Baseline

We will use logistic regression to establish a baseline against which we will compare more sophisticated models. First, we will pass the created tfidf and count vectorized data and predict party affiliation in the test set using a grid search. Then, we will assess the "strength" of a member's party affiliation using the results of that best model.

In [29]:
start = datetime.datetime.now()

print("Computing logistic TfIDF model...")

#using cvgridsearch to try to optimize the logistic regression
#first, we'll use the TFIDF data
param_grid = {"C":np.linspace(0.2,2,5).tolist(), "penalty":['l1','l2']}
lg = LogisticRegression()
grid = GridSearchCV(lg, param_grid, n_jobs = -1)

#fit the training data with the best parameters found
grid.fit(X_tf_train,y_tf_train)
#and predict on test
lg_predict_tf = grid.predict(X_tf_test)

print("Best performance parameters are found to be:\n\n{}\n".format(grid.best_estimator_))
print("Accuracy: {}".format(accuracy_score(lg_predict_tf, y_tf_test)))
print("Runtime is {}.".format(datetime.datetime.now()-start))


Computing logistic TfIDF model...
Best performance parameters are found to be:

LogisticRegression(C=1.55, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Accuracy: 0.6461103663176202
Runtime is 0:04:10.552313.


In [30]:
#let's predict the probability that a given speech is by a democrat or republican
ideology_logit_tf = pd.DataFrame(grid.predict_proba(x_val_tf),columns=['Democrat_Logit_TF', 'Republican_Logit_TF'])
print("Ideology shape is: {}".format(ideology_logit_tf.shape))
print("Original df shape is: {}".format(val_df.shape))

#and put that in a df
ideology_logit_tf = pd.concat([ideology_logit_tf.reset_index(),val_df.reset_index()], axis=1, sort=False)
ideology_logit_tf.head()

Ideology shape is: (68336, 2)
Original df shape is: (68336, 12)


Unnamed: 0,index,Democrat_Logit_TF,Republican_Logit_TF,index.1,speech_id,speech,speakerid,lastname,firstname,chamber,state,gender,party,district,nonvoting,congress_num
0,0,0.640163,0.359837,2381,1130003810,Mr. President. today I wish to recognize two o...,113121841,HELLER,DEAN,S,NV,M,R,,voting,113
1,1,0.365035,0.634965,21654,1130035067,In the time since this body last adjourned. st...,113121520,THOMPSON,GLENN,H,PA,M,R,5.0,voting,113
2,2,0.141107,0.858893,23793,1130038287,Mr. Speaker. I rise today in strong support of...,113120970,RENACCI,JAMES,H,OH,M,R,16.0,voting,113
3,3,0.522,0.478,21179,1130034262,At the beginning of the consideration of this ...,113119160,DEFAZIO,PETER,H,OR,M,D,4.0,voting,113
4,4,0.598095,0.401905,71146,1130116260,I ask unanimous consent that when the Senate c...,113120961,REID,HARRY,S,NV,M,D,,voting,113


In [31]:
start = datetime.datetime.now()

print("Computing logistic count model...")

#again, using a cvgridsearch to find the optimal parameters for this logistic
#this time with count vectorized data
param_grid = {"C":[1.0,1.1,1.2], "penalty":['l1','l2']}
lg = LogisticRegression()
grid = GridSearchCV(lg, param_grid, n_jobs = -1)

#standard train and test
grid.fit(X_countvec_train, y_countvec_train)
lg_predict = grid.predict(X_countvec_test)

print("Best performance parameters are found to be:\n\n{}\n".format(grid.best_estimator_))
print("Accuracy: {}".format(accuracy_score(lg_predict, y_countvec_test)))

print("Runtime is {}.".format(datetime.datetime.now()-start))

Computing logistic count model...
Best performance parameters are found to be:

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Accuracy: 0.6320156527410413
Runtime is 0:20:25.802514.


In [32]:
#predicting on val_data and making a df
ideology_logit_count = pd.DataFrame(grid.predict_proba(x_val_count),columns=['Democrat_Logit_Count', 'Republican_Logit_Count'])

print("Ideology shape is: {}".format(ideology_logit_count.shape))
print("Original df shape is: {}".format(val_df.shape))

ideology_logit_count = pd.concat([ideology_logit_count.reset_index(),val_df.reset_index()], axis=1, sort=False)

ideology_logit_count.head()

Ideology shape is: (68336, 2)
Original df shape is: (68336, 12)


Unnamed: 0,index,Democrat_Logit_Count,Republican_Logit_Count,index.1,speech_id,speech,speakerid,lastname,firstname,chamber,state,gender,party,district,nonvoting,congress_num
0,0,0.742285,0.257715,2381,1130003810,Mr. President. today I wish to recognize two o...,113121841,HELLER,DEAN,S,NV,M,R,,voting,113
1,1,0.297284,0.702716,21654,1130035067,In the time since this body last adjourned. st...,113121520,THOMPSON,GLENN,H,PA,M,R,5.0,voting,113
2,2,0.030419,0.969581,23793,1130038287,Mr. Speaker. I rise today in strong support of...,113120970,RENACCI,JAMES,H,OH,M,R,16.0,voting,113
3,3,0.788607,0.211393,21179,1130034262,At the beginning of the consideration of this ...,113119160,DEFAZIO,PETER,H,OR,M,D,4.0,voting,113
4,4,0.573605,0.426395,71146,1130116260,I ask unanimous consent that when the Senate c...,113120961,REID,HARRY,S,NV,M,D,,voting,113


In [33]:
start = datetime.datetime.now()

print("Computing multinomial naive bayes TfIDF model...")

#now we'll move onto naive bayes just for baseline testing
#first, using tfidf data
multi_nb = MultinomialNB()

#fitting
multi_nb.fit(X_tf_train,y_tf_train)

#predict and evaluate
multi_nb_predict = multi_nb.predict(X_tf_test)
accuracy_score(multi_nb_predict, y_tf_test)

print("Accuracy: {}".format(accuracy_score(multi_nb_predict, y_tf_test)))

print("Runtime is {}.".format(datetime.datetime.now()-start))


Computing multinomial naive bayes TfIDF model...
Accuracy: 0.6138386656521372
Runtime is 0:00:00.415328.


In [34]:
#same as before, predict the probability using this model and append to a df
ideology_mnb_tf = pd.DataFrame(multi_nb.predict_proba(x_val_tf),columns=['Democrat_MNB_TF', 'Republican_MNB_TF'])

print("Ideology shape is: {}".format(ideology_mnb_tf.shape))
print("Original df shape is: {}".format(val_df.shape))

ideology_mnb_tf = pd.concat([ideology_mnb_tf.reset_index(),val_df.reset_index()], axis=1, sort=False)

ideology_mnb_tf.head()

Ideology shape is: (68336, 2)
Original df shape is: (68336, 12)


Unnamed: 0,index,Democrat_MNB_TF,Republican_MNB_TF,index.1,speech_id,speech,speakerid,lastname,firstname,chamber,state,gender,party,district,nonvoting,congress_num
0,0,0.734846,0.265154,2381,1130003810,Mr. President. today I wish to recognize two o...,113121841,HELLER,DEAN,S,NV,M,R,,voting,113
1,1,0.632021,0.367979,21654,1130035067,In the time since this body last adjourned. st...,113121520,THOMPSON,GLENN,H,PA,M,R,5.0,voting,113
2,2,0.359752,0.640248,23793,1130038287,Mr. Speaker. I rise today in strong support of...,113120970,RENACCI,JAMES,H,OH,M,R,16.0,voting,113
3,3,0.519849,0.480151,21179,1130034262,At the beginning of the consideration of this ...,113119160,DEFAZIO,PETER,H,OR,M,D,4.0,voting,113
4,4,0.401202,0.598798,71146,1130116260,I ask unanimous consent that when the Senate c...,113120961,REID,HARRY,S,NV,M,D,,voting,113


In [35]:
start = datetime.datetime.now()

print("Computing multinomial naive bayes count model...")

#finally, naive bayes using the count vectorized data
multi_nb_count = MultinomialNB()
multi_nb_count.fit(X_countvec_train, y_countvec_train)
multi_nb_count_predict = multi_nb.predict(X_countvec_test)

print("Accuracy: {}".format(accuracy_score(multi_nb_count_predict, y_countvec_test)))

print("Runtime is {}.".format(datetime.datetime.now()-start))



Computing multinomial naive bayes count model...
Accuracy: 0.6120994722030991
Runtime is 0:00:00.469263.


In [36]:
ideology_mnb_count = pd.DataFrame(multi_nb_count.predict_proba(x_val_count),columns=['Democrat_MNB_Count', 'Republican_MNB_Count'])

print("Ideology shape is: {}".format(ideology_mnb_count.shape))
print("Original df shape is: {}".format(val_df.shape))

ideology_mnb_count = pd.concat([ideology_mnb_count.reset_index(),val_df.reset_index()], axis=1, sort=False)

ideology_mnb_count.head()

Ideology shape is: (68336, 2)
Original df shape is: (68336, 12)


Unnamed: 0,index,Democrat_MNB_Count,Republican_MNB_Count,index.1,speech_id,speech,speakerid,lastname,firstname,chamber,state,gender,party,district,nonvoting,congress_num
0,0,1.0,1.628992e-07,2381,1130003810,Mr. President. today I wish to recognize two o...,113121841,HELLER,DEAN,S,NV,M,R,,voting,113
1,1,0.884591,0.1154093,21654,1130035067,In the time since this body last adjourned. st...,113121520,THOMPSON,GLENN,H,PA,M,R,5.0,voting,113
2,2,0.00365,0.9963501,23793,1130038287,Mr. Speaker. I rise today in strong support of...,113120970,RENACCI,JAMES,H,OH,M,R,16.0,voting,113
3,3,0.01324,0.9867601,21179,1130034262,At the beginning of the consideration of this ...,113119160,DEFAZIO,PETER,H,OR,M,D,4.0,voting,113
4,4,0.008353,0.9916473,71146,1130116260,I ask unanimous consent that when the Senate c...,113120961,REID,HARRY,S,NV,M,D,,voting,113


## Average Strength of Prediction for Each Speaker for Each Congress for Comparison

In the end, we want to compare the strength of our prediction to the report card given by GovTrack. So, we will take the mean adn median of the predictions made across all the speeches of each speaker and, later, we will compare to the GovTrack website scores.

This will result in a single row for each speaker and a variety of "ideology" scores for that speaker.

In [37]:
#average and median of logit_tf
ideo1 = pd.DataFrame(ideology_logit_tf.groupby(["speakerid",'lastname','congress_num','state','chamber'])['Republican_Logit_TF'].mean()).add_prefix('Mean_').reset_index()
ideo1['Median_Republican_Logit_TF'] = ideology_logit_tf.groupby(["speakerid",'lastname','congress_num','state','chamber'])['Republican_Logit_TF'].median().reset_index(drop=True)

#average and median of logit_count
ideo2 = pd.DataFrame(ideology_logit_count.groupby(["speakerid",'lastname','congress_num','state','chamber'])['Republican_Logit_Count'].mean()).add_prefix('Mean_').reset_index()
ideo2['Median_Republican_Logit_Count'] = ideology_logit_count.groupby(["speakerid",'lastname','congress_num','state','chamber'])['Republican_Logit_Count'].median().reset_index(drop=True)

#average and median of mnb_tf
ideo3 = pd.DataFrame(ideology_mnb_tf.groupby(["speakerid",'lastname','congress_num','state','chamber'])['Republican_MNB_TF'].mean()).add_prefix('Mean_').reset_index()
ideo3['Median_Republican_MNB_TF'] = ideology_mnb_tf.groupby(["speakerid",'lastname','congress_num','state','chamber'])['Republican_MNB_TF'].median().reset_index(drop=True)

#average and median of mnb_count
ideo4 = pd.DataFrame(ideology_mnb_count.groupby(["speakerid",'lastname','congress_num','state','chamber'])['Republican_MNB_Count'].mean()).add_prefix('Mean_').reset_index()
ideo4['Median_Republican_MNB_Count'] = ideology_mnb_count.groupby(["speakerid",'lastname','congress_num','state','chamber'])['Republican_MNB_Count'].median().reset_index(drop=True)


In [38]:
compare = ideo1.merge(ideo2, on=["speakerid",'lastname','congress_num','state','chamber'], how = 'outer')
compare = compare.merge(ideo3, on=["speakerid",'lastname','congress_num','state','chamber'], how = 'outer')
compare = compare.merge(ideo4, on=["speakerid",'lastname','congress_num','state','chamber'], how = 'outer')
compare['lastname'] = compare['lastname'].str.lower()
compare = compare.merge(govtrack_report_cards, on=['lastname','congress_num','state','chamber'], how = 'left')
compare.sort_values('lastname')

Unnamed: 0,speakerid,lastname,congress_num,state,chamber,Mean_Republican_Logit_TF,Median_Republican_Logit_TF,Mean_Republican_Logit_Count,Median_Republican_Logit_Count,Mean_Republican_MNB_TF,Median_Republican_MNB_TF,Mean_Republican_MNB_Count,Median_Republican_MNB_Count,ideology
1043,114123020,abraham,114,LA,H,0.521462,0.513696,0.523814,0.546923,0.416981,0.411378,0.324388,0.168580,0.867061
549,113122780,adams,113,NC,H,0.133865,0.133199,0.089352,0.066523,0.239439,0.234506,0.000150,0.000060,0.493277
1019,114122780,adams,114,NC,H,0.302362,0.257302,0.260036,0.191810,0.312170,0.282230,0.084534,0.000074,0.284845
120,113118490,aderholt,113,AL,H,0.534652,0.529928,0.539273,0.536954,0.461485,0.450967,0.488593,0.420390,0.662598
594,114118490,aderholt,114,AL,H,0.610934,0.591235,0.542959,0.531144,0.513840,0.494785,0.674863,0.690831,0.723550
1029,114122880,aguilar,114,CA,H,0.306813,0.298724,0.221849,0.202659,0.314050,0.316140,0.079934,0.000729,0.336885
573,114118281,alexander,114,TN,S,0.480137,0.472827,0.500933,0.493600,0.456110,0.458365,0.559240,0.589733,0.717694
99,113118281,alexander,113,TN,S,0.479416,0.481825,0.496588,0.473124,0.460000,0.461955,0.578156,0.653706,0.700479
1039,114122980,allen,114,GA,H,0.620710,0.648010,0.631545,0.649821,0.473755,0.478267,0.540325,0.517028,0.893076
595,114118500,amash,114,MI,H,0.647488,0.707507,0.704832,0.786389,0.523375,0.554414,0.830911,0.959592,0.561310


# Run LSTM Model and Compare to Baseline

After having established a reasonable baseline, we will move our analysis to an LSTM model, attempting to use the full context of each speech to predict party affiliation. This will use the sequenced data prepared above. Our model generates a custom embedding layer based on our data. 

We use 100 LSTM cells and a small dropout rate to prevent overfitting. We use binary crossentropy as our loss because we have a binary class we're trying to predict. It is standard to use the ADAM optimizer. Because we are predicting probabilities of belonging in a class, we use MSE as our metric.

Once training is complete, we will use speeches from the 113th and 114th Congress to assess the weighted text score of that member's party affiliation.

In [39]:
embedding_vector_length = 32
batch_size = 10

model = Sequential()

#this will create the custom embedding layer to use in the LSTM cells
model.add(Embedding(vocabulary_size, embedding_vector_length, input_length=max_speech_len, mask_zero = True))

#adding 100 LSTM cells
model.add(LSTM(100))

#adding dropout to mitigate overfitting
model.add(Dropout(.2))

#dense layer will make a probability prediction for each class
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['mean_squared_error'])

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 205, 32)           5837216   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 5,890,517
Trainable params: 5,890,517
Non-trainable params: 0
_________________________________________________________________
None


In [40]:
from numpy.random import choice

#if you need this to run much faster, you can use these training and test sets instead 
#you can manipulate the "frac="" to get a faster train. Lower fraction = faster training
X_train_brief = pd.DataFrame(X_lstm_train).sample(frac=.35, random_state = 821)
y_train_brief = pd.DataFrame(y_lstm_train).sample(frac=.35, random_state = 821)
X_test_brief = pd.DataFrame(X_lstm_test).sample(frac=.35, random_state = 952)
y_test_brief = pd.DataFrame(y_lstm_test).sample(frac=.35, random_state = 952)

#We don't have all day. Stop training if it isn't getting better after an epoch
early_stop = [EarlyStopping(patience=1)]

#full version
#uncomment this for a long run
#commenting out 12/8 since model is saved for now
# model.fit(X_lstm_train, y_lstm_train, validation_data=(X_lstm_test, y_lstm_test), epochs=10, callbacks = early_stop)

#brief version for brief data created above
#uncomment this for a quick run
# model.fit(X_train_brief, y_train_brief, validation_data=(X_test_brief, y_test_brief), epochs=10, callbacks = early_stop)

#that took a long time. make sure you save
# model.save('lstm_model.h5')


## Retrieve Model and Evaluate Predictions

Since we've saved the model, we can load it and use it to predict the probability of each sentence belonging to one party or the other. We'll add this to the compare model we were building above as well.

We'll also take a look at the distribution of ideology per each member's sentences organized by their GovTrack score.

In [42]:
val_df.columns

Index(['speech_id', 'speech', 'speakerid', 'lastname', 'firstname', 'chamber',
       'state', 'gender', 'party', 'district', 'nonvoting', 'congress_num'],
      dtype='object')

In [44]:
start = datetime.datetime.now()

#loading whatever model you just ran, make your prediction on the val data
model = load_model('lstm_model.h5')

#predict the probability that each speech is a republican speech, as before
ideology_lstm = pd.DataFrame(model.predict_proba(x_val), columns=['Republican_LSTM']).sort_values('Republican_LSTM', ascending=True)
print("Ideology shape is: {}".format(ideology_lstm.shape))
print("Original df shape is: {}".format(x_val_orig.shape))

#add back on the speech_id and party of the speaker
ideology_lstm = pd.concat([ideology_lstm.reset_index(),x_val_orig.reset_index()], axis=1, sort=False)
ideology_lstm = ideology_lstm.drop('index',axis=1)

#merge on the speaker_id, state, chamber, etc. so we can merge over to the GovTrack data
ideology_lstm = pd.merge(ideology_lstm,val_df.loc[:,["speech_id","speakerid",'lastname','congress_num','state','chamber']],on='speech_id',how='left')
ideology_lstm['lastname'] = ideology_lstm['lastname'].str.lower()
print("Runtime is {}.".format(datetime.datetime.now()-start))

ideology_lstm.head()



Ideology shape is: (133363, 1)
Original df shape is: (133363, 4)
Runtime is 0:04:53.878622.


Unnamed: 0,Republican_LSTM,speech_id,sentence_num,party,speech,speakerid,lastname,congress_num,state,chamber
0,0.003727,1130036147,0,R,my amendment prohibits the use of funds to be ...,113121660,walberg,113,MI,H
1,0.004893,1130030496,0,D,the most disingenuous thing is there are a num...,113119180,delauro,113,CT,H
2,0.005571,1130039882,0,D,my understanding as well during subcommittee c...,113121640,visclosky,113,IN,H
3,0.005895,1130066865,0,D,i ask unanimous consent that the bill be read ...,113117441,levin,113,MI,S
4,0.006163,1130103610,0,D,does the senator know the origin of the law wh...,113118321,durbin,113,IL,S


## Evaluate Performance of LSTM Model

We'll inspect the loss and accuracy of the model. Interestingly, it performs the worst of any model. We will evaluate the resulting probabilities against the other model's predictions by member.

In [45]:
start = datetime.datetime.now()

score = model.evaluate(x_val, y_val)
lstm_predict = model.predict_classes(x_val)

print("Accuracy: {}".format(accuracy_score(lstm_predict, y_val)))
print('Validation loss:', score[0])

print("Runtime is {}.".format(datetime.datetime.now()-start))

Accuracy: 0.5707355113487249
Validation loss: 0.704889317958568
Runtime is 0:09:53.372702.


In [46]:
#average and median of logit_tf
ideo5 = pd.DataFrame(ideology_lstm.groupby(["speakerid",'lastname','congress_num','state','chamber'])['Republican_LSTM'].mean()).add_prefix('Mean_').reset_index()
ideo5['Median_Republican_LSTM'] = ideology_lstm.groupby(["speakerid",'lastname','congress_num','state','chamber'])['Republican_LSTM'].median().reset_index(drop=True)
compare = compare.merge(ideo5, on=["speakerid",'lastname','congress_num','state','chamber'], how = 'outer')
ideology_lstm = ideology_lstm.merge(compare.loc[:,['ideology','speakerid','congress_num']], on=['speakerid','congress_num'])
compare.to_csv("comparison_dataset.csv")
ideology_lstm.to_csv("lstm_prediction_results.csv")
compare

Unnamed: 0,speakerid,lastname,congress_num,state,chamber,Mean_Republican_Logit_TF,Median_Republican_Logit_TF,Mean_Republican_Logit_Count,Median_Republican_Logit_Count,Mean_Republican_MNB_TF,Median_Republican_MNB_TF,Mean_Republican_MNB_Count,Median_Republican_MNB_Count,ideology,Mean_Republican_LSTM,Median_Republican_LSTM
0,113117261,kerry,113,MA,S,0.419810,0.419810,0.188736,0.188736,0.435204,0.435204,0.254330,0.254330,,0.312277,0.313775
1,113117271,lautenberg,113,NJ,S,0.314499,0.267700,0.317093,0.357472,0.325873,0.296932,0.148370,0.016686,,0.330449,0.179318
2,113117281,cowan,113,MA,S,0.424082,0.462549,0.365430,0.342376,0.359700,0.359668,0.090852,0.015068,,0.472234,0.448243
3,113117290,bonner,113,AL,H,0.540400,0.517502,0.433519,0.367092,0.444340,0.401275,0.501026,0.418555,,0.529536,0.490974
4,113117310,young,113,FL,H,0.540930,0.546145,0.557972,0.539138,0.469070,0.472296,0.583142,0.633210,,0.443917,0.421097
5,113117321,chiesa,113,NJ,S,0.410509,0.432627,0.419022,0.435544,0.457922,0.471725,0.393929,0.297060,,0.679315,0.679315
6,113117330,watt,113,NC,H,0.396118,0.453058,0.415525,0.378241,0.426009,0.435990,0.376820,0.277586,0.331317,0.389959,0.370778
7,113117340,radel,113,FL,H,0.563370,0.621258,0.598507,0.672574,0.489167,0.487043,0.649102,0.671706,0.776537,0.361774,0.414313
8,113117351,baucus,113,MT,S,0.469112,0.480774,0.462465,0.453369,0.437751,0.432686,0.514717,0.501201,0.351129,0.358612,0.376384
9,113117360,andrews,113,NJ,H,0.396705,0.396498,0.408106,0.427939,0.427847,0.429096,0.458490,0.493554,0.354866,0.459407,0.442276


In [47]:
print("All done! Total runtime: {}".format(datetime.datetime.now()-big_start))

All done! Total runtime: 1:01:35.799056
