# Logistic Regression using TF-IDF

In [35]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from helpers import vocabulary
from sklearn import linear_model

In [36]:
# Load data and vocabulary preprocessed
data = pickle.load(open("./data/data.p", "rb"))
vocab = pickle.load(open("./vocab.p", "rb"))

In [74]:
# Define TF-IDF vectorizer; Using top 1000 words as stop words.
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    vocabulary=vocab.word_to_id,
    stop_words=[w for w, i in vocab.unigram_counts.most_common(1000)])  

In [75]:
# Filter out other political party
df = data[(data.political_party=='r')|(data.political_party=='d')]

# Shuffle data
df = df.sample(frac=1)
df.loc[:,'y'] = [1 if p == 'd' else 0 for p in df.political_party]

# Divide data into train/dev/test
train_number = int(len(df)*0.8)
dev_number = int(len(df)*0.1)

train_data = df[0:train_number]
dev_data = df[train_number:train_number+dev_number]
test_data = df[train_number+dev_number:]

train_x = tfidf.fit_transform(train_data.text)
train_y = train_data.y

dev_x = tfidf.transform(dev_data.text)
dev_y = dev_data.y

test_x = tfidf.transform(test_data.text)
test_y = test_data.y

pickle.dump(train_data, open("train.p", "wb"))
pickle.dump(dev_data, open("dev.p", "wb"))
pickle.dump(test_data, open("test.p", "wb"))

In [145]:
print(train_x.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_y.shape)
print(test_x.shape)
print(test_y.shape)

(17052, 10000)
(17052,)
(2131, 10000)
(2131,)
(2133, 10000)
(2133,)


In [77]:
# Fit logitic regression (L1)
clf_l1 = linear_model.LogisticRegression(penalty='l1')
clf_l1.fit(train_x, train_y)

print(clf_l1.score(train_x, train_y))
print(clf_l1.score(dev_x, dev_y))

0.7995543044804129
0.7705302674800563


In [78]:
# Fit logitic regression (L2)
clf_l2 = linear_model.LogisticRegression(penalty='l2')
clf_l2.fit(train_x, train_y)

print(clf_l2.score(train_x, train_y))
print(clf_l2.score(dev_x, dev_y))

0.8134529673938541
0.7470671046457062


In [80]:
# Try different hyperparameters
for c in np.arange(0.1, 1.1, 0.1):
    clf_l1 = linear_model.LogisticRegression(penalty='l1', C=c)
    clf_l1.fit(train_x, train_y)

    print(c)
    print(clf_l1.score(train_x, train_y))
    print(clf_l1.score(dev_x, dev_y))
    print()

0.1
0.6667839549612948
0.669638667292351

0.2
0.6924700914848698
0.6954481464101361

0.30000000000000004
0.7118813042458363
0.7109338338808071

0.4
0.7300023457658925
0.7297043641482872

0.5
0.7440769411212761
0.7400281557954013

0.6
0.7565095003518649
0.7433129985922102

0.7000000000000001
0.7691766361717101
0.7541060534960112

0.8
0.7803190241613887
0.7639605818864383

0.9
0.7902885292047853
0.7672454246832473

1.0
0.7995543044804129
0.7705302674800563



In [106]:
# Final model with L1 regularization with C = 1.0
clf_l1 = linear_model.LogisticRegression(penalty='l1', C=1.0)
clf_l1.fit(train_x, train_y)

print(clf_l1.score(train_x, train_y))
print(clf_l1.score(test_x, test_y))

0.7995543044804129
0.7477730895452415


In [143]:
# Find top 10 words for both Democrats and Republicans
coeff = clf_l1.coef_.flatten()
democrat_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i], reverse=True)[0:10])
republican_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i])[0:10])

print("Democrat Top 10:")
for i, d in enumerate(democrat_top_10):
    print(i+1, '-', vocab.id_to_word.get(d))

print()
print("Republican Top 10:")
for i, r in enumerate(republican_top_10):
    print(i+1, '-', vocab.id_to_word.get(r))

Democrat Top 10:
1 - frankfurter
2 - fuller
3 - whereof
4 - problem
5 - insofar
6 - compare
7 - douglas
8 - furthermore
9 - exhibited
10 - consequently

Republican Top 10:
1 - brennan
2 - waite
3 - holmes
4 - sutherland
5 - stevens
6 - isso
7 - pursuance
8 - matthews
9 - brewer
10 - observed


In [316]:
out, _ = pd.cut([int(x) for x in train_data.year_filed], 10, retbins=True)
bins = pd.cut([int(x) for x in train_data.year_filed], 10, labels=False)

preds = np.log(clf_l1.predict_proba(train_x)[:,1])
preds_df = pd.DataFrame({'bins': bins, 'log_prob': preds})

agg_df = preds_df.groupby('bins').agg(lambda x: -np.mean(x.log_prob))
agg_df.index = out.categories
agg_df.columns = ['cross_entropy_loss']

agg_df

Unnamed: 0,cross_entropy_loss
"(1791.774, 1814.6]",1.058688
"(1814.6, 1837.2]",0.526999
"(1837.2, 1859.8]",0.518075
"(1859.8, 1882.4]",1.821959
"(1882.4, 1905.0]",1.644657
"(1905.0, 1927.6]",1.507513
"(1927.6, 1950.2]",0.964998
"(1950.2, 1972.8]",0.771208
"(1972.8, 1995.4]",1.315938
"(1995.4, 2018.0]",1.225883
