In [2]:
import pickle
import numpy as np
import pandas as pd

In [3]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer
import sklearn.preprocessing
from  sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# 1. DEMOGRAPHIC APPROACH
## 1.1 Feature Generation

In [3]:
# Load the merged cases/justices table used throughout the demographic section.
# NOTE(review): read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load this file from a trusted source.
jc_data = pd.read_pickle("demographic_approach/data/cases_justices_merged.pk")

In [4]:
# Preview only the first rows: rendering the whole frame adds nothing for the
# reader and bloats the saved notebook.
jc_data.head(10)

Unnamed: 0,justice,justiceName,justice_vote,petitionerState_1,petitionerState_10,petitionerState_12,petitionerState_13,petitionerState_15,petitionerState_17,petitionerState_18,...,prespart_2,prespart_3,prespart_6,prespart_7,senparty_1,senparty_2,senparty_3,senparty_6,senparty_7,senparty_8
44883,95,BRWhite,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
44884,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
44885,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
44886,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
44887,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
44888,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
44889,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
44890,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
44891,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
44892,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [58]:
# Feature frame: every column except the target and the justice's name, with
# missing entries filled by pandas' default (linear) interpolation.
# NOTE(review): the interpolation runs down the rows, i.e. it blends values of
# neighbouring samples -- confirm that is the intended way to fill gaps here.
X = jc_data.drop(['justice_vote', 'justiceName'], axis=1).interpolate()

# Plain float64 matrix view of the features, echoed as the cell output.
Xdata = X.to_numpy().astype(np.float64)
Xdata

array([[ 95.,   0.,   0., ...,   0.,   0.,   0.],
       [ 95.,   0.,   0., ...,   0.,   0.,   0.],
       [ 95.,   0.,   0., ...,   0.,   0.,   0.],
       ...,
       [115.,   0.,   0., ...,   1.,   0.,   0.],
       [115.,   0.,   0., ...,   1.,   0.,   0.],
       [115.,   0.,   0., ...,   1.,   0.,   0.]])

In [59]:
# Target kept as a one-column frame, then converted to an (n, 1) int32 array
# that is echoed as the cell output.
Y = jc_data.loc[:, ['justice_vote']]
Ydata = Y.to_numpy().astype(np.int32)
Ydata

array([[1],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]], dtype=int32)

In [8]:
# Rebuild X and Y as NumPy arrays for modelling. Unlike the exploratory cell
# above, the justice identifier column is dropped from the features as well.
X = (
    jc_data
    .drop(["justice", "justiceName", "justice_vote"], axis=1)
    .to_numpy()
    .astype(np.float64)
)
# 1-D int32 vector of votes (the prediction target).
Y = jc_data.loc[:, "justice_vote"].to_numpy().astype(np.int32)

In [9]:
# Standardise each feature column with StandardScaler's default settings.
# NOTE(review): despite its name, `min_max_scaler` holds a StandardScaler, not
# a MinMaxScaler; the name is kept in case other cells refer to it.
min_max_scaler = sklearn.preprocessing.StandardScaler()
X = min_max_scaler.fit(X).transform(X)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count


In [10]:
# Fill every remaining NaN with the mean of its column; fitting and
# transforming are done in a single step on the same matrix.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
X = imp.fit_transform(X)

## 1.2 Logistic Regression Model

In [7]:
def validate_model(m, X_train, t_train, fit_params=None, cv=3):
    """Cross-validate an estimator and print per-fold and mean accuracies.

    Parameters
    ----------
    m : estimator
        Estimator handed to ``cross_validate``.
    X_train : array-like
        Feature matrix used for cross-validation.
    t_train : array-like
        Targets aligned with ``X_train``.
    fit_params : dict, optional
        Extra keyword arguments for the estimator's ``fit``. ``None`` (the
        default) or an empty dict means "no extra arguments".
    cv : int or cross-validation splitter, default 3
        Forwarded unchanged as the ``cv`` argument of ``cross_validate``.

    Returns
    -------
    dict
        The dictionary returned by ``cross_validate``; its ``'train_score'``
        and ``'test_score'`` entries hold the per-fold accuracies that are
        printed here.
    """
    # A `{}` default would be a single dict shared by every call, so the
    # default is None. `fit_params` is forwarded only when the caller actually
    # supplied some: recent scikit-learn releases deprecate that keyword in
    # favour of `params`, and omitting it keeps the common no-extra-arguments
    # call working there while remaining identical on older releases.
    extra_kwargs = {'fit_params': fit_params} if fit_params else {}
    cv_results = cross_validate(
        m,
        X_train,
        t_train,
        cv=cv,
        scoring='accuracy',
        return_train_score=True,
        verbose=10,
        n_jobs=-1,  # use every available core
        **extra_kwargs,
    )
    print("train accuracies: ", cv_results['train_score'])
    print("average train accuracies: ", np.mean(cv_results['train_score'] ))
    print("validation accuracies: ", cv_results['test_score'])
    print("average validation accuracies: ", np.mean(cv_results['test_score'] ))
    return cv_results

In [13]:
# Baseline: logistic regression with default hyper-parameters, scored by
# cross-validation on the demographic features.
lr = LR()
cv_results = validate_model(m=lr, X_train=X, t_train=Y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:   28.4s remaining:   28.4s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:   28.6s remaining:   17.2s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:   28.6s remaining:    9.5s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   28.6s remaining:    0.0s


KeyboardInterrupt: 

In [7]:
import sys

# Names that IPython itself places in the namespace (plus this list), which
# must not be reported as user data.
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# (name, size in bytes) for every remaining global, largest first. Private
# names, names that match a loaded module, and the IPython names above are
# skipped. sys.getsizeof reports the size of the object itself only.
sorted(
    (
        (name, sys.getsizeof(globals().get(name)))
        for name in dir()
        if not (name.startswith('_') or name in sys.modules or name in ipython_vars)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

[('X', 114363361), ('np', 80), ('pd', 80)]

# 2. NLP APPROACH
## 2.1 Feature Generation

In [4]:
# 1-gram data for the NLP approach: feature matrix first, then the targets.
# From here on X and Y refer to this data set.
X = np.load("nlp_approach/data/case_justices_1grams_merged_X.npy")
Y = np.load("nlp_approach/data/case_justices_1grams_merged_Y.npy")

In [None]:
# Hold out 20% of the samples for testing. The fixed seed makes the split, and
# therefore every hold-out score reported below, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# from sklearn.decomposition import PCA
# # Make an instance of the Model
# pca = PCA(.9)
# pca.fit(X_train)
# print("PCA components: ", pca.n_components_)
# X_train = pca.transform(X_train)
# X_test = pca.transform(X_test)

## 2.2 Logistic Regression and Random Forest Models

In [10]:
# 'sag' is a stochastic solver, so it is seeded to make the cross-validation
# scores repeatable from run to run.
lr = LR(tol=0.01, solver='sag', random_state=42)
cv_results = validate_model(lr, X, Y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  5.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  5.7min finished


train accuracies:  [0.54974911 0.56218073]
average train accuracies:  0.5559649202115637
validation accuracies:  [0.55869988 0.52588039]
average validation accuracies:  0.5422901315492066


In [15]:
# Random forest with 100 trees on all cores; seeded so that the bootstrap
# samples and feature subsets (and hence the scores) are repeatable.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True, random_state=42)
cv_results = validate_model(rf, X, Y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 11.1min remaining: 16.7min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 11.1min remaining:  7.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 11.1min remaining:    0.0s


KeyboardInterrupt: 

In [8]:
# 'sag' is a stochastic solver, so it is seeded to make the cross-validation
# scores repeatable from run to run.
lr = LR(tol=0.01, solver='sag', random_state=42)
cv_results = validate_model(lr, X, Y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  8.2min remaining:    0.0s


KeyboardInterrupt: 

In [6]:
# Fit the random forest on the training split and report accuracy on the
# held-out split. The seed makes the fitted forest, and so the printed score,
# repeatable for a given split.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True, random_state=42)
rf.fit(X_train, Y_train)
print("score: {}".format(rf.score(X_test, Y_test)))

score: 0.6252683919086902


In [1]:
# Fit logistic regression on the training split and report accuracy on the
# held-out split. 'sag' shuffles the data, so it is seeded to make the printed
# score repeatable for a given split.
lr = LR(tol=0.01, solver='sag', random_state=42)
lr.fit(X_train, Y_train)
print("score: {}".format(lr.score(X_test, Y_test)))

NameError: name 'LR' is not defined

score: 0.5564470561645384
