# Logistic Regression using TF-IDF

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from helpers import vocabulary
from sklearn import linear_model
from sklearn import metrics

In [2]:
# Load data and vocabulary preprocessed
data = pickle.load(open("./data/data.p", "rb"))
vocab = pickle.load(open("./vocab.p", "rb"))
vocab_12314 = pickle.load(open("./vocab_12314.p", "rb"))
vocab_6155 = pickle.load(open("./vocab_6155.p", "rb"))

# Additional pre-processing of data set

In [3]:
# Filter out other political party
df = data[(data.political_party=='r')|(data.political_party=='d')]

# Shuffle data
df = df.sample(frac=1, random_state=38291)
df.loc[:,'y'] = [1 if p == 'd' else 0 for p in df.political_party]

# Divide data into train/dev/test
train_number = int(len(df)*0.8)
dev_number = int(len(df)*0.1)

train_data = df[0:train_number]
dev_data = df[train_number:train_number+dev_number]
test_data = df[train_number+dev_number:]

pickle.dump(train_data, open("train.p", "wb"))
pickle.dump(dev_data, open("dev.p", "wb"))
pickle.dump(test_data, open("test.p", "wb"))

In [4]:
# Filter out data points pre-1935 because Dem / Rep platforms switched meaning around that time
df.year_filed = [int(x) for x in df.year_filed]
df_post1935 = df[(df.year_filed>int(1935))]

# Shuffle data
df_post1935 = df_post1935.sample(frac=1, random_state=38291)
df_post1935.loc[:,'y'] = [1 if p == 'd' else 0 for p in df_post1935.political_party]

# Divide data into train/dev/test
train_number_post1935 = int(len(df_post1935)*0.8)
dev_number_post1935 = int(len(df_post1935)*0.1)

train_data_post1935 = df_post1935[0:train_number_post1935]
dev_data_post1935 = df_post1935[train_number_post1935:train_number_post1935+dev_number_post1935]
test_data_post1935 = df_post1935[train_number_post1935+dev_number_post1935:]

pickle.dump(train_data_post1935, open("train_post1935.p", "wb"))
pickle.dump(dev_data_post1935, open("dev_post1935.p", "wb"))
pickle.dump(test_data_post1935, open("test_post1935.p", "wb"))

In [5]:
# Double check df sizes make sense
print(data.shape)
print(df.shape)
print(df_post1935.shape)

(22232, 6)
(21316, 7)
(7391, 7)


# Most Common Class Baseline

In [10]:
# actual values
train_y = train_data.y
dev_y = dev_data.y
test_y = test_data.y

# predicted values
print("There are %d Democrat examples, and %d Republican examples" % (sum(train_y==1), sum(train_y==0)))
train_pred = np.zeros(len(train_y))
test_pred = np.zeros(len(test_y))

# calculate accuracy
print("Training set accuracy is: %.2f" % metrics.accuracy_score(train_y, train_pred))
print("Test set accuracy is: %.2f" % metrics.accuracy_score(test_y, test_pred))

There are 5773 Democrat examples, and 11279 Republican examples
Training set accuracy is: 0.66
Test set accuracy is: 0.65


# Initial Regression Parameters

In [63]:
# Define TF-IDF vectorizer; Using top 1000 words as stop words.
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    vocabulary=vocab.word_to_id,
    stop_words=[w for w, i in vocab.unigram_counts.most_common(1000)])  

In [64]:
# Transform to TFIDF data sets

train_x = tfidf.fit_transform(train_data.text)
train_y = train_data.y

dev_x = tfidf.transform(dev_data.text)
dev_y = dev_data.y

test_x = tfidf.transform(test_data.text)
test_y = test_data.y

In [29]:
print(train_x.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_y.shape)
print(test_x.shape)
print(test_y.shape)

(17052, 10000)
(17052,)
(2131, 10000)
(2131,)
(2133, 10000)
(2133,)


In [30]:
# Fit logitic regression (L1)
clf_l1 = linear_model.LogisticRegression(penalty='l1')
clf_l1.fit(train_x, train_y)

print(clf_l1.score(train_x, train_y))
print(clf_l1.score(dev_x, dev_y))

0.800903119869
0.76208352886


In [31]:
# Fit logitic regression (L2)
clf_l2 = linear_model.LogisticRegression(penalty='l2')
clf_l2.fit(train_x, train_y)

print(clf_l2.score(train_x, train_y))
print(clf_l2.score(dev_x, dev_y))

0.814273985456
0.741905208822


In [32]:
# Try different hyperparameters
for c in np.arange(0.1, 1.1, 0.1):
    clf_l1 = linear_model.LogisticRegression(penalty='l1', C=c)
    clf_l1.fit(train_x, train_y)

    print(c)
    print(clf_l1.score(train_x, train_y))
    print(clf_l1.score(dev_x, dev_y))
    print()

0.1
0.666490734225
0.668230877522

0.2
0.693642974431
0.69544814641

0.3
0.712643678161
0.709995307367

0.4
0.73099929627
0.719849835758

0.5
0.747654234107
0.732989206945

0.6
0.761435608726
0.741435945565

0.7
0.773926812104
0.752229000469

0.8
0.78471733521
0.755983106523

0.9
0.792634295097
0.76208352886

1.0
0.800903119869
0.76208352886



In [65]:
# Final model with L1 regularization with C = 1.0
clf_l1 = linear_model.LogisticRegression(penalty='l1', C=1.0)
clf_l1.fit(train_x, train_y)

print(clf_l1.score(train_x, train_y))
print(clf_l1.score(test_x, test_y))

0.800903119869
0.746835443038


In [66]:
# Find top 10 words for both Democrats and Republicans
coeff = clf_l1.coef_.flatten()
democrat_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i], reverse=True)[0:10])
republican_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i])[0:10])

print("Democrat Top 10:")
for i, d in enumerate(democrat_top_10):
    print(i+1, '-', vocab.id_to_word.get(d), '-', round(coeff[d],2))

print()
print("Republican Top 10:")
for i, r in enumerate(republican_top_10):
    print(i+1, '-', vocab.id_to_word.get(r), '-', round(coeff[r],2))

Democrat Top 10:
1 - frankfurter - 32.41
2 - fuller - 30.75
3 - whereof - 22.49
4 - douglas - 16.06
5 - problem - 14.94
6 - compare - 13.75
7 - furthermore - 12.84
8 - insofar - 12.26
9 - consequently - 9.25
10 - remarked - 9.21

Republican Top 10:
1 - brennan - -24.81
2 - holmes - -21.61
3 - waite - -19.02
4 - sutherland - -18.17
5 - brewer - -11.85
6 - observed - -9.51
7 - isso - -9.27
8 - pursuance - -8.9
9 - stevens - -8.29
10 - besides - -8.0


In [67]:
out, _ = pd.cut([int(x) for x in train_data.year_filed], 10, retbins=True)
bins = pd.cut([int(x) for x in train_data.year_filed], 10, labels=False)

preds = np.log(clf_l1.predict_proba(train_x)[:,1])
preds_df = pd.DataFrame({'bins': bins, 'log_prob': preds})

agg_df = preds_df.groupby('bins').agg(lambda x: -np.mean(x.log_prob))
agg_df.index = out.categories
agg_df.columns = ['cross_entropy_loss']

agg_df

Unnamed: 0,cross_entropy_loss
"(1791.774, 1814.6]",1.169719
"(1814.6, 1837.2]",0.492416
"(1837.2, 1859.8]",0.527713
"(1859.8, 1882.4]",1.843899
"(1882.4, 1905.0]",1.652408
"(1905.0, 1927.6]",1.517391
"(1927.6, 1950.2]",0.974788
"(1950.2, 1972.8]",0.749886
"(1972.8, 1995.4]",1.305375
"(1995.4, 2018.0]",1.250749


Conclusion: removing judge names did not lower accuracy by that much, but gave us a more meaningful parameter set

# Modified Regression Parameters - exclude judges in stop words

In [68]:
# Define TF-IDF vectorizer; Using top 1000 words as stop words.
def dummy_fun(doc):
    return doc

# Remove judge names b/c these are strong indicators of political party without indicating how language is used in opinions
stop_word_set = [w for w, i in vocab.unigram_counts.most_common(1000)]
stop_word_set.extend([judge_name for judge_name in set(w.lower() for w in data.name_last)])

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    vocabulary=vocab.word_to_id,
    stop_words=stop_word_set)  

# Transform to TFIDF data sets

train_x = tfidf.fit_transform(train_data.text)
train_y = train_data.y

dev_x = tfidf.transform(dev_data.text)
dev_y = dev_data.y

test_x = tfidf.transform(test_data.text)
test_y = test_data.y

# print shapes for confirmation
print(train_x.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_y.shape)
print(test_x.shape)
print(test_y.shape)

(17052, 10000)
(17052,)
(2131, 10000)
(2131,)
(2133, 10000)
(2133,)


In [49]:
# Fit logitic regression (L1)
clf_l1 = linear_model.LogisticRegression(penalty='l1')
clf_l1.fit(train_x, train_y)

print("L1 stats:")
print(clf_l1.score(train_x, train_y))
print(clf_l1.score(dev_x, dev_y))

# Fit logitic regression (L2)
clf_l2 = linear_model.LogisticRegression(penalty='l2')
clf_l2.fit(train_x, train_y)

print("L2 stats:")
print(clf_l2.score(train_x, train_y))
print(clf_l2.score(dev_x, dev_y))

L1 stats:
0.772695285011
0.735804786485
L2 stats:
0.802076002815
0.726419521351


In [50]:
# Try different hyperparameters
for c in np.arange(0.1, 1.1, 0.1):
    clf_l1 = linear_model.LogisticRegression(penalty='l1', C=c)
    clf_l1.fit(train_x, train_y)

    print(c)
    print(clf_l1.score(train_x, train_y))
    print(clf_l1.score(dev_x, dev_y))
    print()

0.1
0.661447337556
0.660722665415

0.2
0.676577527563
0.678085405913

0.3
0.690476190476
0.690755513843

0.4
0.703377902885
0.697325199437

0.5
0.718273516303
0.702956358517

0.6
0.730764719681
0.713749413421

0.7
0.742904058175
0.717503519474

0.8
0.755277973258
0.724073205068

0.9
0.764015951208
0.732050680432

1.0
0.772695285011
0.735804786485



In [69]:
# Final model with L1 regularization with C = 1.0
clf_l1 = linear_model.LogisticRegression(penalty='l1', C=1.0)
clf_l1.fit(train_x, train_y)

print(clf_l1.score(train_x, train_y))
print(clf_l1.score(test_x, test_y))

0.772695285011
0.719643694327


In [70]:
# Find top 10 words for both Democrats and Republicans
coeff = clf_l1.coef_.flatten()
democrat_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i], reverse=True)[0:10])
republican_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i])[0:10])

print("Democrat Top 10:")
for i, d in enumerate(democrat_top_10):
    print(i+1, '-', vocab.id_to_word.get(d), '-', round(coeff[d],2))

print()
print("Republican Top 10:")
for i, r in enumerate(republican_top_10):
    print(i+1, '-', vocab.id_to_word.get(r), '-', round(coeff[r],2))

Democrat Top 10:
1 - whereof - 23.8
2 - problem - 17.81
3 - insofar - 14.23
4 - compare - 12.47
5 - furthermore - 12.44
6 - remarked - 9.38
7 - eg - 8.83
8 - stern - 8.74
9 - peters - 7.89
10 - exhibited - 7.44

Republican Top 10:
1 - isso - -10.07
2 - contrast - -9.41
3 - sect - -9.29
4 - pursuance - -9.08
5 - observed - -8.92
6 - quote - -7.67
7 - besides - -7.56
8 - wall - -6.53
9 - observing - -6.29
10 - argue - -6.13


In [71]:
out, _ = pd.cut([int(x) for x in train_data.year_filed], 10, retbins=True)
bins = pd.cut([int(x) for x in train_data.year_filed], 10, labels=False)

preds = np.log(clf_l1.predict_proba(train_x)[:,1])
preds_df = pd.DataFrame({'bins': bins, 'log_prob': preds})

agg_df = preds_df.groupby('bins').agg(lambda x: -np.mean(x.log_prob))
agg_df.index = out.categories
agg_df.columns = ['cross_entropy_loss']

agg_df

Unnamed: 0,cross_entropy_loss
"(1791.774, 1814.6]",1.329298
"(1814.6, 1837.2]",0.461046
"(1837.2, 1859.8]",0.539928
"(1859.8, 1882.4]",1.642566
"(1882.4, 1905.0]",1.492848
"(1905.0, 1927.6]",1.37362
"(1927.6, 1950.2]",0.92183
"(1950.2, 1972.8]",0.790945
"(1972.8, 1995.4]",1.223812
"(1995.4, 2018.0]",1.248863


While removing judge names reduced accuracy, it better identifies the types of language we are intending to capture. Thus, we will keep this modification going forward

# Modified Regression Parameters - post 1935 data

In [72]:
# Define TF-IDF vectorizer; Using top 1000 words as stop words.
def dummy_fun(doc):
    return doc

# Remove judge names b/c these are strong indicators of political party without indicating how language is used in opinions
stop_word_set = [w for w, i in vocab.unigram_counts.most_common(1000)]
stop_word_set.extend([judge_name for judge_name in set(w.lower() for w in data.name_last)])

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    vocabulary=vocab.word_to_id,
    stop_words=stop_word_set)  

# Transform to TFIDF data sets

train_x = tfidf.fit_transform(train_data_post1935.text)
train_y = train_data_post1935.y

dev_x = tfidf.transform(dev_data_post1935.text)
dev_y = dev_data_post1935.y

test_x = tfidf.transform(test_data_post1935.text)
test_y = test_data_post1935.y

# print shapes for confirmation
print(train_x.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_y.shape)
print(test_x.shape)
print(test_y.shape)

(5912, 10000)
(5912,)
(739, 10000)
(739,)
(740, 10000)
(740,)


In [73]:
# Fit logitic regression (L1)
clf_l1 = linear_model.LogisticRegression(penalty='l1')
clf_l1.fit(train_x, train_y)

print("L1 stats:")
print(clf_l1.score(train_x, train_y))
print(clf_l1.score(dev_x, dev_y))

# Fit logitic regression (L2)
clf_l2 = linear_model.LogisticRegression(penalty='l2')
clf_l2.fit(train_x, train_y)

print("L2 stats:")
print(clf_l2.score(train_x, train_y))
print(clf_l2.score(dev_x, dev_y))

L1 stats:
0.730886332882
0.711772665765
L2 stats:
0.814952638701
0.726657645467


In [74]:
# Try different hyperparameters
for c in np.arange(0.1, 1.1, 0.1):
    clf_l2 = linear_model.LogisticRegression(penalty='l2', C=c)
    clf_l2.fit(train_x, train_y)

    print(c)
    print(clf_l2.score(train_x, train_y))
    print(clf_l2.score(dev_x, dev_y))
    print()

0.1
0.725473612991
0.723951285521

0.2
0.742895805142
0.729364005413

0.3
0.756596752368
0.725304465494

0.4
0.768437077131
0.722598105548

0.5
0.777909336942
0.72801082544

0.6
0.787550744249
0.72801082544

0.7
0.79702300406
0.725304465494

0.8
0.802943166441
0.723951285521

0.9
0.809032476319
0.726657645467

1.0
0.814952638701
0.726657645467



In [78]:
# Try different hyperparameters
for c in np.arange(0.1, 1.1, 0.1):
    clf_l1 = linear_model.LogisticRegression(penalty='l1', C=c)
    clf_l1.fit(train_x, train_y)

    print(c)
    print(clf_l1.score(train_x, train_y))
    print(clf_l1.score(dev_x, dev_y))
    print()

0.1
0.52300405954
0.499323410014

0.2
0.527571041949
0.499323410014

0.3
0.64851150203
0.661705006766

0.4
0.674391069012
0.695534506089

0.5
0.688261163735
0.709066305819

0.6
0.697564276049
0.703653585927

0.7
0.707882273342
0.707713125846

0.8
0.716170500677
0.719891745602

0.9
0.723443843031
0.718538565629

1.0
0.730886332882
0.711772665765



In [82]:
# Final model with L1 regularization with C = 1.0
clf_l2 = linear_model.LogisticRegression(penalty='l2', C=0.5)
clf_l2.fit(train_x, train_y)

print(clf_l2.score(train_x, train_y))
print(clf_l2.score(test_x, test_y))

0.777909336942
0.675675675676


In [76]:
# Find top 10 words for both Democrats and Republicans
coeff = clf_l2.coef_.flatten()
democrat_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i], reverse=True)[0:10])
republican_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i])[0:10])

print("Democrat Top 10:")
for i, d in enumerate(democrat_top_10):
    print(i+1, '-', vocab.id_to_word.get(d), '-', round(coeff[d],2))

print()
print("Republican Top 10:")
for i, r in enumerate(republican_top_10):
    print(i+1, '-', vocab.id_to_word.get(r), '-', round(coeff[r],2))

Democrat Top 10:
1 - problem - 1.12
2 - eg - 0.98
3 - contempt - 0.87
4 - usc - 0.86
5 - army - 0.85
6 - pointed - 0.82
7 - railroads - 0.81
8 - enemy - 0.79
9 - administrator - 0.78
10 - cite - 0.78

Republican Top 10:
1 - governmental - -1.06
2 - subsection - -0.98
3 - collateral - -0.91
4 - quoting - -0.89
5 - briefs - -0.88
6 - text - -0.86
7 - argue - -0.86
8 - approach - -0.86
9 - prosecutor - -0.85
10 - massachusetts - -0.84


In [77]:
out, _ = pd.cut([int(x) for x in train_data_post1935.year_filed], 10, retbins=True)
bins = pd.cut([int(x) for x in train_data_post1935.year_filed], 10, labels=False)

preds = np.log(clf_l2.predict_proba(train_x)[:,1])
preds_df = pd.DataFrame({'bins': bins, 'log_prob': preds})

agg_df = preds_df.groupby('bins').agg(lambda x: -np.mean(x.log_prob))
agg_df.index = out.categories
agg_df.columns = ['cross_entropy_loss']

agg_df

Unnamed: 0,cross_entropy_loss
"(1935.918, 1944.2]",0.516035
"(1944.2, 1952.4]",0.451311
"(1952.4, 1960.6]",0.553444
"(1960.6, 1968.8]",0.674378
"(1968.8, 1977.0]",0.905125
"(1977.0, 1985.2]",1.03153
"(1985.2, 1993.4]",1.098933
"(1993.4, 2001.6]",1.140037
"(2001.6, 2009.8]",1.099959
"(2009.8, 2018.0]",0.755164


Conclusion: reducing the time period did not increase accuracy

# Modified Regression Parameters - balanced class weight

In [83]:
# Define TF-IDF vectorizer; Using top 1000 words as stop words.
def dummy_fun(doc):
    return doc

# Remove judge names b/c these are strong indicators of political party without indicating how language is used in opinions
stop_word_set = [w for w, i in vocab.unigram_counts.most_common(1000)]
stop_word_set.extend([judge_name for judge_name in set(w.lower() for w in data.name_last)])

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    vocabulary=vocab.word_to_id,
    stop_words=stop_word_set)  

# Transform to TFIDF data sets

train_x = tfidf.fit_transform(train_data.text)
train_y = train_data.y

dev_x = tfidf.transform(dev_data.text)
dev_y = dev_data.y

test_x = tfidf.transform(test_data.text)
test_y = test_data.y

# print shapes for confirmation
print(train_x.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_y.shape)
print(test_x.shape)
print(test_y.shape)

(17052, 10000)
(17052,)
(2131, 10000)
(2131,)
(2133, 10000)
(2133,)


In [84]:
# Fit logitic regression (L1)
clf_l1 = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
clf_l1.fit(train_x, train_y)

print("L1 stats:")
print(clf_l1.score(train_x, train_y))
print(clf_l1.score(dev_x, dev_y))

# Fit logitic regression (L2)
clf_l2 = linear_model.LogisticRegression(penalty='l2', class_weight='balanced')
clf_l2.fit(train_x, train_y)

print("L2 stats:")
print(clf_l2.score(train_x, train_y))
print(clf_l2.score(dev_x, dev_y))

L1 stats:
0.782782078349
0.719380572501
L2 stats:
0.819200093831
0.717034256218


In [85]:
# Try different hyperparameters
for c in np.arange(0.1, 1.1, 0.1):
    clf_l1 = linear_model.LogisticRegression(penalty='l1', C=c, class_weight='balanced')
    clf_l1.fit(train_x, train_y)

    print(c)
    print(clf_l1.score(train_x, train_y))
    print(clf_l1.score(dev_x, dev_y))
    print()

0.1
0.641391039174
0.654152979822

0.2
0.665317851278
0.672923510089

0.3
0.684729064039
0.676208352886

0.4
0.701618578466
0.682308775223

0.5
0.721440300258
0.690286250587

0.6
0.737508796622
0.704364148287

0.7
0.752580342482
0.717034256218

0.8
0.762373915083
0.720319099015

0.9
0.773516303073
0.718911309244

1.0
0.782840722496
0.719380572501



In [88]:
# Final model with L1 regularization with C = 1.0
clf_l1 = linear_model.LogisticRegression(penalty='l1', C=1.0, class_weight='balanced')
clf_l1.fit(train_x, train_y)

print(clf_l1.score(train_x, train_y))
print(clf_l1.score(test_x, test_y))

0.782840722496
0.704172526957


In [89]:
# Find top 10 words for both Democrats and Republicans
coeff = clf_l1.coef_.flatten()
democrat_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i], reverse=True)[0:10])
republican_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i])[0:10])

print("Democrat Top 10:")
for i, d in enumerate(democrat_top_10):
    print(i+1, '-', vocab.id_to_word.get(d), '-', round(coeff[d],2))

print()
print("Republican Top 10:")
for i, r in enumerate(republican_top_10):
    print(i+1, '-', vocab.id_to_word.get(r), '-', round(coeff[r],2))

Democrat Top 10:
1 - whereof - 24.25
2 - problem - 18.45
3 - insofar - 15.1
4 - furthermore - 13.93
5 - compare - 12.92
6 - remarked - 9.84
7 - eg - 9.22
8 - stern - 8.59
9 - whilst - 8.17
10 - peters - 8.14

Republican Top 10:
1 - isso - -12.17
2 - contrast - -10.37
3 - sect - -10.14
4 - quote - -8.98
5 - observing - -8.93
6 - pursuance - -8.93
7 - observed - -8.83
8 - besides - -8.59
9 - argue - -6.38
10 - wall - -6.18


In [100]:
out, _ = pd.cut([int(x) for x in train_data.year_filed], 20, retbins=True)
bins = pd.cut([int(x) for x in train_data.year_filed], 22, labels=False)

preds = np.log(clf_l1.predict_proba(train_x)[:,1])
preds_df = pd.DataFrame({'bins': bins, 'log_prob': preds})

agg_df = preds_df.groupby('bins').agg(lambda x: -np.mean(x.log_prob))
agg_df.index = out.categories
agg_df.columns = ['cross_entropy_loss']

agg_df

Unnamed: 0,cross_entropy_loss
"(1791.774, 1803.3]",0.930352
"(1803.3, 1814.6]",0.189055
"(1814.6, 1825.9]",0.265687
"(1825.9, 1837.2]",0.288959
"(1837.2, 1848.5]",0.460992
"(1848.5, 1859.8]",1.014621
"(1859.8, 1871.1]",1.466419
"(1871.1, 1882.4]",1.179084
"(1882.4, 1893.7]",0.989033
"(1893.7, 1905.0]",1.095646


Conclusion: weighted class distribution did not improve performance

# Modified Regression Parameters - larger vocabulary

In [101]:
# Define TF-IDF vectorizer; Using top 1000 words as stop words.
def dummy_fun(doc):
    return doc

# Remove judge names b/c these are strong indicators of political party without indicating how language is used in opinions
stop_word_set = [w for w, i in vocab_12314.unigram_counts.most_common(1000)]
stop_word_set.extend([judge_name for judge_name in set(w.lower() for w in data.name_last)])

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    vocabulary=vocab_12314.word_to_id,
    stop_words=stop_word_set)  

# Transform to TFIDF data sets

train_x = tfidf.fit_transform(train_data.text)
train_y = train_data.y

dev_x = tfidf.transform(dev_data.text)
dev_y = dev_data.y

test_x = tfidf.transform(test_data.text)
test_y = test_data.y

# print shapes for confirmation
print(train_x.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_y.shape)
print(test_x.shape)
print(test_y.shape)

(17052, 12314)
(17052,)
(2131, 12314)
(2131,)
(2133, 12314)
(2133,)


In [102]:
# Fit logitic regression (L1)
clf_l1 = linear_model.LogisticRegression(penalty='l1')
clf_l1.fit(train_x, train_y)

print("L1 stats:")
print(clf_l1.score(train_x, train_y))
print(clf_l1.score(dev_x, dev_y))

# Fit logitic regression (L2)
clf_l2 = linear_model.LogisticRegression(penalty='l2')
clf_l2.fit(train_x, train_y)

print("L2 stats:")
print(clf_l2.score(train_x, train_y))
print(clf_l2.score(dev_x, dev_y))

L1 stats:
0.771287825475
0.736274049742
L2 stats:
0.803952615529
0.723603941811


In [103]:
# Try different hyperparameters
for c in np.arange(0.1, 1.1, 0.1):
    clf_l1 = linear_model.LogisticRegression(penalty='l1', C=c)
    clf_l1.fit(train_x, train_y)

    print(c)
    print(clf_l1.score(train_x, train_y))
    print(clf_l1.score(dev_x, dev_y))
    print()

0.1
0.661447337556
0.660722665415

0.2
0.675639221206
0.678554669169

0.3
0.68930330753
0.68840919756

0.4
0.702498240676
0.697325199437

0.5
0.715869106263
0.702956358517

0.6
0.727656579873
0.713280150164

0.7
0.740089139104
0.718442045988

0.8
0.752638986629
0.721726888785

0.9
0.761025099695
0.727827311122

1.0
0.771287825475
0.736274049742



In [104]:
# Final model with L1 regularization with C = 1.0
clf_l1 = linear_model.LogisticRegression(penalty='l1', C=1.0)
clf_l1.fit(train_x, train_y)

print(clf_l1.score(train_x, train_y))
print(clf_l1.score(test_x, test_y))

0.771287825475
0.719174871074


In [105]:
# Find top 10 words for both Democrats and Republicans
coeff = clf_l1.coef_.flatten()
democrat_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i], reverse=True)[0:10])
republican_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i])[0:10])

print("Democrat Top 10:")
for i, d in enumerate(democrat_top_10):
    print(i+1, '-', vocab_12314.id_to_word.get(d), '-', round(coeff[d],2))

print()
print("Republican Top 10:")
for i, r in enumerate(republican_top_10):
    print(i+1, '-', vocab_12314.id_to_word.get(r), '-', round(coeff[r],2))

Democrat Top 10:
1 - whereof - 24.11
2 - problem - 18.47
3 - insofar - 14.78
4 - compare - 12.74
5 - furthermore - 12.34
6 - remarked - 9.39
7 - eg - 8.92
8 - stern - 8.64
9 - peters - 8.0
10 - exhibited - 7.48

Republican Top 10:
1 - isso - -10.06
2 - sect - -9.56
3 - observed - -9.32
4 - pursuance - -9.29
5 - contrast - -9.11
6 - quote - -7.65
7 - besides - -7.48
8 - wall - -6.63
9 - observing - -5.88
10 - anything - -5.77


In [106]:
out, _ = pd.cut([int(x) for x in train_data.year_filed], 10, retbins=True)
bins = pd.cut([int(x) for x in train_data.year_filed], 10, labels=False)

preds = np.log(clf_l1.predict_proba(train_x)[:,1])
preds_df = pd.DataFrame({'bins': bins, 'log_prob': preds})

agg_df = preds_df.groupby('bins').agg(lambda x: -np.mean(x.log_prob))
agg_df.index = out.categories
agg_df.columns = ['cross_entropy_loss']

agg_df

Unnamed: 0,cross_entropy_loss
"(1791.774, 1814.6]",1.339322
"(1814.6, 1837.2]",0.462582
"(1837.2, 1859.8]",0.547575
"(1859.8, 1882.4]",1.632333
"(1882.4, 1905.0]",1.483389
"(1905.0, 1927.6]",1.366678
"(1927.6, 1950.2]",0.922225
"(1950.2, 1972.8]",0.793996
"(1972.8, 1995.4]",1.215459
"(1995.4, 2018.0]",1.240703


Conclusion: practically the same results as vocab of 10,000 words

# Modified Regression Parameters - smaller vocab

In [107]:
# Define TF-IDF vectorizer; Using top 1000 words as stop words.
def dummy_fun(doc):
    return doc

# Remove judge names b/c these are strong indicators of political party without indicating how language is used in opinions
stop_word_set = [w for w, i in vocab_6155.unigram_counts.most_common(1000)]
stop_word_set.extend([judge_name for judge_name in set(w.lower() for w in data.name_last)])

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    vocabulary=vocab_6155.word_to_id,
    stop_words=stop_word_set)  

# Transform to TFIDF data sets

train_x = tfidf.fit_transform(train_data.text)
train_y = train_data.y

dev_x = tfidf.transform(dev_data.text)
dev_y = dev_data.y

test_x = tfidf.transform(test_data.text)
test_y = test_data.y

# print shapes for confirmation
print(train_x.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_y.shape)
print(test_x.shape)
print(test_y.shape)

(17052, 6155)
(17052,)
(2131, 6155)
(2131,)
(2133, 6155)
(2133,)


In [108]:
# Fit logitic regression (L1)
clf_l1 = linear_model.LogisticRegression(penalty='l1')
clf_l1.fit(train_x, train_y)

print("L1 stats:")
print(clf_l1.score(train_x, train_y))
print(clf_l1.score(dev_x, dev_y))

# Fit logitic regression (L2)
clf_l2 = linear_model.LogisticRegression(penalty='l2')
clf_l2.fit(train_x, train_y)

print("L2 stats:")
print(clf_l2.score(train_x, train_y))
print(clf_l2.score(dev_x, dev_y))

L1 stats:
0.776331222144
0.729704364148
L2 stats:
0.790464461647
0.724542468325


In [109]:
# Try different hyperparameters
for c in np.arange(0.1, 1.1, 0.1):
    clf_l1 = linear_model.LogisticRegression(penalty='l1', C=c)
    clf_l1.fit(train_x, train_y)

    print(c)
    print(clf_l1.score(train_x, train_y))
    print(clf_l1.score(dev_x, dev_y))
    print()

0.1
0.661740558292
0.660722665415

0.2
0.67992024396
0.683247301736

0.3
0.693936195168
0.695917409667

0.4
0.709125029322
0.701548568747

0.5
0.723610133709
0.706241201314

0.6
0.736805066854
0.714218676678

0.7
0.748827117054
0.717034256218

0.8
0.760731878958
0.719849835758

0.9
0.769528501056
0.726419521351

1.0
0.776331222144
0.729704364148



In [110]:
# Final model with L1 regularization with C = 1.0
clf_l1 = linear_model.LogisticRegression(penalty='l1', C=1.0)
clf_l1.fit(train_x, train_y)

print(clf_l1.score(train_x, train_y))
print(clf_l1.score(test_x, test_y))

0.776331222144
0.721050164088


In [111]:
# Find top 10 words for both Democrats and Republicans
coeff = clf_l1.coef_.flatten()
democrat_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i], reverse=True)[0:10])
republican_top_10 = list(sorted(range(len(coeff)), key=lambda i: coeff[i])[0:10])

print("Democrat Top 10:")
for i, d in enumerate(democrat_top_10):
    print(i+1, '-', vocab_6155.id_to_word.get(d), '-', round(coeff[d],2))

print()
print("Republican Top 10:")
for i, r in enumerate(republican_top_10):
    print(i+1, '-', vocab_6155.id_to_word.get(r), '-', round(coeff[r],2))

Democrat Top 10:
1 - whereof - 19.33
2 - problem - 16.5
3 - insofar - 13.56
4 - furthermore - 12.68
5 - compare - 11.8
6 - remarked - 8.87
7 - eg - 8.87
8 - peters - 7.5
9 - likewise - 7.39
10 - exhibited - 7.29

Republican Top 10:
1 - contrast - -10.31
2 - sect - -8.75
3 - pursuance - -8.56
4 - observed - -8.39
5 - quote - -7.63
6 - besides - -7.21
7 - argue - -6.8
8 - observing - -6.62
9 - wall - -5.9
10 - whereas - -5.66


In [112]:
out, _ = pd.cut([int(x) for x in train_data.year_filed], 10, retbins=True)
bins = pd.cut([int(x) for x in train_data.year_filed], 10, labels=False)

preds = np.log(clf_l1.predict_proba(train_x)[:,1])
preds_df = pd.DataFrame({'bins': bins, 'log_prob': preds})

agg_df = preds_df.groupby('bins').agg(lambda x: -np.mean(x.log_prob))
agg_df.index = out.categories
agg_df.columns = ['cross_entropy_loss']

agg_df

Unnamed: 0,cross_entropy_loss
"(1791.774, 1814.6]",1.306647
"(1814.6, 1837.2]",0.406512
"(1837.2, 1859.8]",0.536763
"(1859.8, 1882.4]",1.634456
"(1882.4, 1905.0]",1.522472
"(1905.0, 1927.6]",1.397685
"(1927.6, 1950.2]",0.919636
"(1950.2, 1972.8]",0.783053
"(1972.8, 1995.4]",1.253913
"(1995.4, 2018.0]",1.273757


Conclusion: practically the same as the larger vocabulary

# Modified Regression Parameters - smaller set of stop words

# Note: these are the values used in presentation slides

In [4]:
# Define TF-IDF vectorizer; Using top 1000 words as stop words.
def dummy_fun(doc):
    return doc

# Remove judge names b/c these are strong indicators of political party without indicating how language is used in opinions
# Stop word set up to position 200, corresponds to 99.65 percentile with words having counts> 38746
# Still includes common words like 'petitioners', but IDF weighting should take care of that 
stop_word_set = [w for w, i in vocab.unigram_counts.most_common(200)]
stop_word_set.extend([judge_name for judge_name in set(w.lower() for w in data.name_last)])

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    vocabulary=vocab.word_to_id,
    stop_words=stop_word_set)  

# Transform to TFIDF data sets

train_x = tfidf.fit_transform(train_data.text)
train_y = train_data.y

dev_x = tfidf.transform(dev_data.text)
dev_y = dev_data.y

test_x = tfidf.transform(test_data.text)
test_y = test_data.y

# print shapes for confirmation
print(train_x.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_y.shape)
print(test_x.shape)
print(test_y.shape)

(17052, 10000)
(17052,)
(2131, 10000)
(2131,)
(2133, 10000)
(2133,)


In [42]:
# Fit logitic regression (L1)
clf_l1 = linear_model.LogisticRegression(penalty='l1')
clf_l1.fit(train_x, train_y)

print("L1 stats:")
print(clf_l1.score(train_x, train_y))
print(clf_l1.score(dev_x, dev_y))

# Fit logitic regression (L2)
clf_l2 = linear_model.LogisticRegression(penalty='l2')
clf_l2.fit(train_x, train_y)

print("L2 stats:")
print(clf_l2.score(train_x, train_y))
print(clf_l2.score(dev_x, dev_y))

L1 stats:
0.789643443584
0.763022055373
L2 stats:
0.809523809524
0.756452369779


In [52]:
# Try different hyperparameters for L1
for c in np.arange(0.1, 1.1, 0.1):
    clf_l1 = linear_model.LogisticRegression(penalty='l1', C=c)
    clf_l1.fit(train_x, train_y)

    print(c)
    print(clf_l1.score(train_x, train_y))
    print(clf_l1.score(dev_x, dev_y))
    print()

0.1
0.663793103448
0.661191928672

0.2
0.681503635937
0.679023932426

0.3
0.700621627962
0.699202252464

0.4
0.719035890218
0.715626466448

0.5
0.734224724373
0.729235100892

0.6
0.749296270232
0.738620366025

0.7
0.760321369927
0.746128578132

0.8
0.771581046212
0.752229000469

0.9
0.78078817734
0.76067573909

1.0
0.789643443584
0.763022055373



In [53]:
# Try different hyperparameters for L2
for c in np.arange(0.1, 1.1, 0.1):
    clf_l2 = linear_model.LogisticRegression(penalty='l2', C=c)
    clf_l2.fit(train_x, train_y)

    print(c)
    print(clf_l2.score(train_x, train_y))
    print(clf_l2.score(dev_x, dev_y))
    print()

0.1
0.677339901478
0.670107930549

0.2
0.7107084213
0.694509619897

0.3
0.734459300962
0.717503519474

0.4
0.753342716397
0.728765837635

0.5
0.767417311752
0.733458470202

0.6
0.778266479005
0.742374472079

0.7
0.78776683087
0.746128578132

0.8
0.79562514661
0.748474894416

0.9
0.802603800141
0.753636790239

1.0
0.809523809524
0.756452369779



### Tuned Model and Results

In [5]:
# Final model with L1 regularization with C = 1.0
clf_l1 = linear_model.LogisticRegression(penalty='l1', C=1.0)
clf_l1.fit(train_x, train_y)

print("L1 model results:")
print(clf_l1.score(train_x, train_y))
print(clf_l1.score(test_x, test_y))

# Final model with L1 regularization with C = 1.0
clf_l2 = linear_model.LogisticRegression(penalty='l2', C=1.0)
clf_l2.fit(train_x, train_y)

print("L2 model results:")
print(clf_l2.score(train_x, train_y))
print(clf_l2.score(test_x, test_y))

L1 model results:
0.789643443584
0.743084857009
L2 model results:
0.809523809524
0.732301922175


In [55]:
# Find top 10 words for both Democrats and Republicans for L1 regularization
coeff = clf_l1.coef_.flatten()
democrat_top_20 = list(sorted(range(len(coeff)), key=lambda i: coeff[i], reverse=True)[0:20])
republican_top_20 = list(sorted(range(len(coeff)), key=lambda i: coeff[i])[0:20])

print("L1 results:")

print("Democrat Top 20:")
for i, d in enumerate(democrat_top_20):
    print(i+1, '-', vocab.id_to_word.get(d), '-', round(coeff[d],2))

print()
print("Republican Top 20:")
for i, r in enumerate(republican_top_20):
    print(i+1, '-', vocab.id_to_word.get(r), '-', round(coeff[r],2))

L1 results:
Democrat Top 20:
1 - whereof - 21.65
2 - argued - 21.5
3 - problem - 19.04
4 - compare - 13.03
5 - remarked - 10.02
6 - eg - 9.92
7 - ruled - 9.31
8 - furthermore - 9.01
9 - concerning - 8.62
10 - introduced - 8.59
11 - peters - 8.46
12 - hereby - 8.19
13 - insofar - 8.15
14 - therein - 7.77
15 - prior - 7.69
16 - exhibited - 7.53
17 - likewise - 7.52
18 - moreover - 7.49
19 - thereby - 7.0
20 - inasmuch - 6.91

Republican Top 20:
1 - delivered - -10.15
2 - argue - -10.14
3 - simply - -9.42
4 - sect - -8.84
5 - pursuance - -8.44
6 - far - -8.11
7 - shown - -7.75
8 - consider - -7.75
9 - latter - -7.49
10 - follows - -7.1
11 - wall - -7.04
12 - analysis - -6.86
13 - besides - -6.48
14 - nothing - -6.45
15 - id - -6.2
16 - least - -5.73
17 - observed - -5.67
18 - ruling - -5.61
19 - brought - -5.49
20 - principle - -5.47


In [56]:
# Find top 10 words for both Democrats and Republicans for L2 regularization
coeff = clf_l2.coef_.flatten()
democrat_top_20 = list(sorted(range(len(coeff)), key=lambda i: coeff[i], reverse=True)[0:20])
republican_top_20 = list(sorted(range(len(coeff)), key=lambda i: coeff[i])[0:20])

print("L2 results:")

print("Democrat Top 20:")
for i, d in enumerate(democrat_top_20):
    print(i+1, '-', vocab.id_to_word.get(d), '-', round(coeff[d],2))

print()
print("Republican Top 20:")
for i, r in enumerate(republican_top_20):
    print(i+1, '-', vocab.id_to_word.get(r), '-', round(coeff[r],2))

L2 results:
Democrat Top 20:
1 - argued - 6.15
2 - prior - 3.43
3 - hereby - 3.41
4 - whereof - 3.41
5 - peters - 3.22
6 - problem - 2.98
7 - legal - 2.78
8 - therein - 2.72
9 - thousand - 2.67
10 - concerning - 2.64
11 - introduced - 2.48
12 - now - 2.47
13 - circumstances - 2.46
14 - duly - 2.35
15 - ruled - 2.29
16 - compare - 2.28
17 - drawn - 2.26
18 - judiciary - 2.23
19 - decreed - 2.12
20 - theory - 2.11

Republican Top 20:
1 - sect - -2.67
2 - wall - -2.5
3 - brought - -2.38
4 - far - -2.37
5 - id - -2.3
6 - latter - -2.3
7 - delivered - -2.19
8 - shown - -2.15
9 - simply - -2.11
10 - testimony - -2.1
11 - pursuance - -2.09
12 - ruling - -2.08
13 - 2d - -2.06
14 - nothing - -2.03
15 - consider - -2.02
16 - accordance - -1.95
17 - duty - -1.94
18 - principle - -1.93
19 - follows - -1.9
20 - former - -1.87


In [59]:
# Get cross-entropy over time for L1 model
out, _ = pd.cut([int(x) for x in train_data.year_filed], 20, retbins=True)
bins = pd.cut([int(x) for x in train_data.year_filed], 22, labels=False)

preds = np.log(clf_l1.predict_proba(train_x)[:,1])
preds_df = pd.DataFrame({'bins': bins, 'log_prob': preds})

agg_df = preds_df.groupby('bins').agg(lambda x: -np.mean(x.log_prob))
agg_df.index = out.categories
agg_df.columns = ['cross_entropy_loss']

agg_df

Unnamed: 0,cross_entropy_loss
"(1791.774, 1803.3]",1.272506
"(1803.3, 1814.6]",0.336828
"(1814.6, 1825.9]",0.445353
"(1825.9, 1837.2]",0.441889
"(1837.2, 1848.5]",0.658037
"(1848.5, 1859.8]",1.462013
"(1859.8, 1871.1]",1.897501
"(1871.1, 1882.4]",1.646741
"(1882.4, 1893.7]",1.429614
"(1893.7, 1905.0]",1.56592


In [60]:
# Get cross-entropy over time for L2 model
out, _ = pd.cut([int(x) for x in train_data.year_filed], 20, retbins=True)
bins = pd.cut([int(x) for x in train_data.year_filed], 22, labels=False)

preds = np.log(clf_l2.predict_proba(train_x)[:,1])
preds_df = pd.DataFrame({'bins': bins, 'log_prob': preds})

agg_df = preds_df.groupby('bins').agg(lambda x: -np.mean(x.log_prob))
agg_df.index = out.categories
agg_df.columns = ['cross_entropy_loss']

agg_df

Unnamed: 0,cross_entropy_loss
"(1791.774, 1803.3]",0.944424
"(1803.3, 1814.6]",0.382964
"(1814.6, 1825.9]",0.525742
"(1825.9, 1837.2]",0.530336
"(1837.2, 1848.5]",0.641271
"(1848.5, 1859.8]",1.355997
"(1859.8, 1871.1]",1.692196
"(1871.1, 1882.4]",1.555249
"(1882.4, 1893.7]",1.343641
"(1893.7, 1905.0]",1.485937


In [23]:
# Get cross-entropy by judge for L1 model
preds = np.log(clf_l1.predict_proba(train_x)[:,1])
preds_df = pd.DataFrame({'bins': train_data.name_last+', '+train_data.name_first, 'log_prob': preds, 'year1': train_data.year_filed, 'year2': train_data.year_filed})

agg_df = preds_df.groupby('bins').agg(lambda x: [-np.mean(x.log_prob), np.min(x.year1), np.max(x.year2)])
agg_df.columns = ['cross_entropy_loss', 'min_year', 'max_year']

agg_df

Unnamed: 0_level_0,cross_entropy_loss,min_year,max_year
bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Alito, Samuel",1.119271,2006,2018
"Baldwin, Henry",0.278859,1830,1844
"Barbour, Philip",0.445166,1837,1840
"Black, Hugo",0.591280,1937,1971
"Blackmun, Harry",1.298135,1971,1994
"Blatchford, Samuel",1.664280,1882,1893
"Bradley, Joseph",1.670991,1870,1891
"Brandeis, Louis",0.748530,1916,1939
"Brennan, William",1.071540,1956,1990
"Brewer, David",1.906523,1890,1910


In [25]:
# Get cross-entropy by judge for L2 model
preds = np.log(clf_l2.predict_proba(train_x)[:,1])
preds_df = pd.DataFrame({'bins': train_data.name_last+', '+train_data.name_first, 'log_prob': preds, 'year1': train_data.year_filed, 'year2': train_data.year_filed})

agg_df = preds_df.groupby('bins').agg(lambda x: [-np.mean(x.log_prob), np.min(x.year1), np.max(x.year2)])
agg_df.columns = ['cross_entropy_loss', 'min_year', 'max_year']

agg_df

Unnamed: 0_level_0,cross_entropy_loss,min_year,max_year
bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Alito, Samuel",1.279106,2006,2018
"Baldwin, Henry",0.381915,1830,1844
"Barbour, Philip",0.641383,1837,1840
"Black, Hugo",0.571154,1937,1971
"Blackmun, Harry",1.301927,1971,1994
"Blatchford, Samuel",1.598906,1882,1893
"Bradley, Joseph",1.572278,1870,1891
"Brandeis, Louis",0.741049,1916,1939
"Brennan, William",1.089571,1956,1990
"Brewer, David",1.745609,1890,1910


Conclusion: Making stop word set smaller increased accuracy. Commonality of words is still taken care of by IDF, but allows for additional information if words are more commonly used by Dem / Rep. We will keep this modification going forward

# Modified Regression Parameters - L1 regularization followed by L2

In [11]:
# Define TF-IDF vectorizer; Using top 1000 words as stop words.
def dummy_fun(doc):
    return doc

# Remove judge names b/c these are strong indicators of political party without indicating how language is used in opinions
# Stop word set up to position 200, corresponds to 99.65 percentile with words having counts> 38746
# Still includes common words like 'petitioners', but IDF weighting should take care of that 
stop_word_set = [w for w, i in vocab.unigram_counts.most_common(200)]
stop_word_set.extend([judge_name for judge_name in set(w.lower() for w in data.name_last)])

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    vocabulary=vocab.word_to_id,
    stop_words=stop_word_set)  

# Transform to TFIDF data sets

train_x = tfidf.fit_transform(train_data.text)
train_y = train_data.y

dev_x = tfidf.transform(dev_data.text)
dev_y = dev_data.y

test_x = tfidf.transform(test_data.text)
test_y = test_data.y

# print shapes for confirmation
print(train_x.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_y.shape)
print(test_x.shape)
print(test_y.shape)

(17052, 10000)
(17052,)
(2131, 10000)
(2131,)
(2133, 10000)
(2133,)


In [38]:
# Fit logitic regression (L1)
clf_l1 = linear_model.LogisticRegression(penalty='l1')
clf_l1.fit(train_x, train_y)

print("L1 stats:")
print(clf_l1.score(train_x, train_y))
print(clf_l1.score(dev_x, dev_y))

# Fit logitic regression (L2)
clf_l2 = linear_model.LogisticRegression(penalty='l2')
clf_l2.fit(train_x, train_y)

print("L2 stats:")
print(clf_l2.score(train_x, train_y))
print(clf_l2.score(dev_x, dev_y))

L1 stats:
0.789643443584
0.763022055373
L2 stats:
0.809523809524
0.756452369779


In [39]:
# Remove features with coeff of 0 from L1 regression
print("There are %d coefficients equal to 0" % np.sum(clf_l1.coef_==0))
print("There are %d coefficients not equal to 0" % np.sum(clf_l1.coef_!=0))

keep_column = clf_l1.coef_[0] != 0

train_reduced = train_x[:,keep_column]
dev_reduced = dev_x[:,keep_column]
test_reduced = test_x[:,keep_column]

# confirm shapes
print(train_reduced.shape)
print(dev_reduced.shape)
print(test_reduced.shape)

# reduce vocabulary?
# would have done if this model was better

There are 9105 coefficients equal to 0
There are 895 coefficients not equal to 0
(17052, 895)
(2131, 895)
(2133, 895)


In [40]:
# Fit logitic regression (L2)
clf_l2 = linear_model.LogisticRegression(penalty='l2')
clf_l2.fit(train_reduced, train_y)

print("L2 stats:")
print(clf_l2.score(train_reduced, train_y))
print(clf_l2.score(dev_reduced, dev_y))

L2 stats:
0.763194933146
0.740497419052


In [25]:
# Try different hyperparameters
for c in np.arange(0.1, 1.1, 0.1):
    clf_l1 = linear_model.LogisticRegression(penalty='l1', C=c)
    clf_l1.fit(train_x, train_y)

    print(c)
    print(clf_l1.score(train_x, train_y))
    print(clf_l1.score(dev_x, dev_y))
    print()

0.1
0.663734459301
0.661191928672

0.2
0.681562280084
0.679023932426

0.3
0.700621627962
0.699202252464

0.4
0.718977246071
0.715626466448

0.5
0.734166080225
0.729235100892

0.6
0.749296270232
0.738620366025

0.7
0.760321369927
0.746128578132

0.8
0.771581046212
0.752229000469

0.9
0.78078817734
0.76067573909

1.0
0.789643443584
0.763022055373



In [26]:
# Try different hyperparameters
for c in np.arange(0.1, 1.1, 0.1):
    clf_l2 = linear_model.LogisticRegression(penalty='l2', C=c)
    clf_l2.fit(train_x, train_y)

    print(c)
    print(clf_l2.score(train_x, train_y))
    print(clf_l2.score(dev_x, dev_y))
    print()

0.1
0.677339901478
0.670107930549

0.2
0.7107084213
0.694509619897

0.3
0.734459300962
0.717503519474

0.4
0.753342716397
0.728765837635

0.5
0.767417311752
0.733458470202

0.6
0.778266479005
0.742374472079

0.7
0.78776683087
0.746128578132

0.8
0.79562514661
0.748474894416

0.9
0.802603800141
0.753636790239

1.0
0.809523809524
0.756452369779



In [27]:
# Try different hyperparameters
for c in np.arange(0.1, 1.1, 0.1):
    clf_l2 = linear_model.LogisticRegression(penalty='l2', C=c)
    clf_l2.fit(train_reduced, train_y)

    print(c)
    print(clf_l2.score(train_reduced, train_y))
    print(clf_l2.score(dev_reduced, dev_y))
    print()

0.1
0.668484635233
0.663538244955

0.2
0.688365001173
0.678085405913

0.3
0.705254515599
0.695917409667

0.4
0.718625381187
0.710464570624

0.5
0.729709125029
0.722196152041

0.6
0.739561341778
0.726419521351

0.7
0.747419657518
0.732050680432

0.8
0.753987802017
0.735804786485

0.9
0.758503401361
0.737681839512

1.0
0.763194933146
0.740497419052



In [28]:
# Final model for each with best parameters
clf_l1 = linear_model.LogisticRegression(penalty='l1', C=1.0)
clf_l1.fit(train_x, train_y)

print("L1:")
print(clf_l1.score(train_x, train_y))
print(clf_l1.score(test_x, test_y))

clf_l2 = linear_model.LogisticRegression(penalty='l2', C=1.0)
clf_l2.fit(train_x, train_y)

print("L2:")
print(clf_l2.score(train_x, train_y))
print(clf_l2.score(test_x, test_y))

clf_l2_reduced = linear_model.LogisticRegression(penalty='l2', C=1.0)
clf_l2_reduced.fit(train_reduced, train_y)

print("L2 post L1 feature selection:")
print(clf_l2_reduced.score(train_reduced, train_y))
print(clf_l2_reduced.score(test_reduced, test_y))

L1:
0.789643443584
0.743084857009
L2:
0.809523809524
0.732301922175
L2 post L1 feature selection:
0.763194933146
0.717768401313


Conclusion: Using L1 as a feature selection was not effective and did not improve accuracy scores

# Document Similarity Vectors

In [26]:
# Define TF-IDF vectorizer; Using top 1000 words as stop words.
def dummy_fun(doc):
    return doc

# Remove judge names b/c these are strong indicators of political party without indicating how language is used in opinions
# Stop word set up to position 200, corresponds to 99.65 percentile with words having counts> 38746
# Still includes common words like 'petitioners', but IDF weighting should take care of that 
stop_word_set = [w for w, i in vocab.unigram_counts.most_common(200)]
stop_word_set.extend([judge_name for judge_name in set(w.lower() for w in data.name_last)])

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    vocabulary=vocab.word_to_id,
    stop_words=stop_word_set)  

# Transform all data to TFIDF data sets (not modeling and calculating error, just looking at doc sim measures)

all_x = tfidf.fit_transform(df.text)

# print shapes for confirmation
print(all_x.shape)

(21316, 10000)


In [131]:
# Get document similarity measures over time

# note that linear_kernel is the same as cosine similarity when working on normalized vectors. TFIDF vectors are normalized vectors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

# rest index on dataframe to match indices in the TFIDF matrix
df = df.reset_index(drop=True)

# Add a column for the decade for grouping documents
df['time_bin'] = df['year_filed'].apply(lambda x: int(str(x[:3])+str(0)))
unique_bins = list(set(df['time_bin']))

# Get true/false with indices by party
dem = df.loc[:,'y'] == 1
rep = df.loc[:,'y'] == 0

# create array to store results
docsim_results = []

# Loop through bins to calculate internal party and cross part average cosine similarity
for bin in unique_bins:
    # get indices of values in each party in this time period
    time = df.loc[:,'time_bin'] == bin
    dem_in_period = dem & time
    rep_in_period = rep & time
    dem_in_period_idx = dem_in_period[dem_in_period].index
    rep_in_period_idx = rep_in_period[rep_in_period].index
    
    # get cosine similarity matrix - internal to each party and across parties
    # then take mean to get average cos similarity
    if len(dem_in_period_idx)>0:
        dem_internal_cos_sim = np.mean(linear_kernel(all_x[dem_in_period_idx], all_x[dem_in_period_idx]))
    else:
        dem_internal_cos_sim = 0
    if len(rep_in_period_idx)>0:
        rep_internal_cos_sim = np.mean(linear_kernel(all_x[rep_in_period_idx], all_x[rep_in_period_idx]))
    else:
        rep_internal_cos_sim = 0
    if len(rep_in_period_idx)>0 and len(dem_in_period_idx)>0:
        cross_party_cos_sim = np.mean(linear_kernel(all_x[rep_in_period_idx], all_x[dem_in_period_idx]))
    else:
        cross_party_cos_sim = 0
    
    docsim_results.append((bin,dem_internal_cos_sim,rep_internal_cos_sim,cross_party_cos_sim))

docsim_results = pd.DataFrame(docsim_results, columns=['decade', 'dem_cos_sim', 'rep_cos_sim', 'cross_party_cos_sim'])
docsim_results

Unnamed: 0,decade,dem_cos_sim,rep_cos_sim,cross_party_cos_sim
0,1920,0.050556,0.044428,0.044262
1,1800,0.269329,0.237059,0.051135
2,1930,0.054945,0.052312,0.050454
3,1940,0.056247,0.059768,0.052134
4,1950,0.057973,0.072282,0.060163
5,1830,0.104416,0.0,0.0
6,1960,0.058263,0.069704,0.060898
7,1840,0.076523,0.0,0.0
8,1970,0.065405,0.069145,0.064322
9,1850,0.06856,0.0,0.0
