# Model Training, Evaluation, and Tuning
This notebook creates a model (Logistic Regression, Random Forest) for each feature set that we have (Demographic, Demographic + N-grams, Gemographioc + Mean Word Embedding). It first cleans up the data and formats it for training, then trains the models, and then evaluates them using cross-validation.

In [62]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [1]:
import pickle
import numpy as np
import pandas as pd

In [2]:
import sklearn.preprocessing
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import TfidfTransformer

# 1. Helper functions

In [3]:
# Performs k-fold cross-validation for a model.
def validate_model(m, X_train, t_train, fit_params={}, cv=5):
    cv_results = cross_validate(m, X_train, t_train, fit_params=fit_params,cv=cv, scoring=('accuracy'), return_train_score=True, verbose=10, n_jobs=-1)
    print("train accuracies: ", cv_results['train_score'])
    print("average train accuracies: ", np.mean(cv_results['train_score'] ))
    print("validation accuracies: ", cv_results['test_score'])
    print("average validation accuracies: ", np.mean(cv_results['test_score'] ))
    return cv_results

In [4]:
# Performs L1 feature selection using a Linear SVM
def L1_feature_select(X_train, X_test, Y_train, Y_test, C=0.01):
    lsvc = LinearSVC(C=C, penalty="l1", dual=False).fit(X_train, Y_train)
    model = SelectFromModel(lsvc, prefit=True)
    X_train_new = model.transform(X_train)
    X_test_new = model.transform(X_test)
    print("old train shape: {}".format(X_train.shape))
    print("new train shape: {}".format(X_train_new.shape))
    return X_train_new, X_test_new

# 2. DEMOGRAPHIC DATA ONLY
## 2.1 Data Cleaning

In [72]:
# get merged justice and case data
jc_data = pd.read_pickle("demographic_approach/data/cases_justices_merged.pk")

In [6]:
jc_data

Unnamed: 0,justice,justiceName,justice_vote,petitionerState_1,petitionerState_10,petitionerState_12,petitionerState_13,petitionerState_15,petitionerState_17,petitionerState_18,...,undsch_103.0,undsch_104.0,undsch_110.0,undsch_111.0,undsch_112.0,undsch_113.0,undsch_114.0,undsch_115.0,undsch_116.0,undsch_117.0
44883,95,BRWhite,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44884,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44885,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44886,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44887,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44888,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44889,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44890,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44891,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44892,95,BRWhite,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
# Create X, Y data
X = jc_data.drop(columns = ["justice", "justiceName", "justice_vote"]).values.astype(np.float64)
Y = jc_data["justice_vote"].values.astype(np.int32)

In [48]:
# Normalize features
min_max_scaler = sklearn.preprocessing.StandardScaler()
X = min_max_scaler.fit_transform(X)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count


In [49]:
# Impute features
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X)
X = imp.transform(X)

In [50]:
# split into training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [46]:
np.save("nlp_approach/data/case_justices_merged_X_train.npy", X_train)
np.save("nlp_approach/data/case_justices_merged_X_test.npy", X_test)
np.save("nlp_approach/data/case_justices_merged_Y_train.npy", Y_train)
np.save("nlp_approach/data/case_justices_merged_Y_test.npy", Y_test)

## 2.2 Feature Selection using L1 Regularization

In [51]:
X_train_new, X_test_new = L1_feature_select(X_train, X_test, Y_train, Y_test)

old train shape: (35393, 1581)
new train shape: (35393, 618)


In [53]:
np.save("nlp_approach/data/case_justices_merged_X_train_new.npy", X_train_new)
np.save("nlp_approach/data/case_justices_merged_X_test_new.npy", X_test_new)

## 2.2 Logistic Regression Model

In [52]:
lr = LR()
cv_results = validate_model(lr, X_train_new, Y_train, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   29.0s remaining:   43.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   29.3s remaining:   19.6s


train accuracies:  [0.64399237 0.64452214 0.64282687 0.64194391 0.64659556]
average train accuracies:  0.6439761729806548
validation accuracies:  [0.62424071 0.62664218 0.62904365 0.62296935 0.62893882]
average validation accuracies:  0.626366941026466


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   41.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   41.3s finished


## 2.3 RandomForest Model

In [53]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True)
cv_results = validate_model(rf, X_train_new, Y_train, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   40.8s remaining:  1.0min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   40.8s remaining:   27.2s


train accuracies:  [0.99173554 0.99251254 0.99141767 0.99177086 0.99184207]
average train accuracies:  0.9918557343777907
validation accuracies:  [0.72001695 0.71097613 0.70970476 0.70885718 0.71668786]
average validation accuracies:  0.7132485767968407


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   50.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   50.5s finished


# 3. DEMOGRAPHIC DATA + N-GRAMS
## 1-grams
## 3.1 Data cleaning

In [55]:
# 1-gram data. Note that imputing the data was already done in pre-processing.
Y_1gram = np.load("nlp_approach/data/case_justices_1grams_merged_Y.npy")
X_1gram = np.load("nlp_approach/data/case_justices_1grams_merged_X.npy")

In [56]:
X_train_1gram, X_test_1gram, Y_train_1gram, Y_test_1gram = train_test_split(X_1gram, Y_1gram, test_size=0.2, random_state=42)

In [57]:
np.save("nlp_approach/data/case_justices_1grams_merged_X_train.npy", X_train_1gram)
np.save("nlp_approach/data/case_justices_1grams_merged_X_test.npy", X_test_1gram)
np.save("nlp_approach/data/case_justices_1grams_merged_Y_train.npy", Y_train_1gram)
np.save("nlp_approach/data/case_justices_1grams_merged_Y_test.npy", Y_test_1gram)

## 3.2 Feature Section using L1-Regularization

In [58]:
X_train_1gram_new, X_test_1gram_new = L1_feature_select(X_train_1gram, X_test_1gram, Y_train_1gram, Y_test_1gram, C=0.1)



old train shape: (35393, 6581)
new train shape: (35393, 568)


In [59]:
np.save("nlp_approach/data/case_justices_1grams_merged_X_train_new.npy", X_train_1gram_new)
np.save("nlp_approach/data/case_justices_1grams_merged_X_test_new.npy", X_test_1gram_new)

In [None]:
X_train_1gram_new = np.load("nlp_approach/data/case_justices_1grams_merged_X_train_new.npy")
X_test_1gram_new = np.load("nlp_approach/data/case_justices_1grams_merged_X_test_new.npy")

## 3.3 Logistic Regression Model

In [60]:
lr = LR()
cv_results = validate_model(lr, X_train_1gram_new, Y_train_1gram, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   43.1s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   45.9s remaining:   30.6s


train accuracies:  [0.64137882 0.64102564 0.64130819 0.64236773 0.64384094]
average train accuracies:  0.6419842644495588
validation accuracies:  [0.62395819 0.63116259 0.63201017 0.62876112 0.62724318]
average validation accuracies:  0.6286270514582364


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   59.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   59.6s finished


## 3.4 Random Forest Model

In [61]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True)
cv_results = validate_model(rf, X_train_1gram_new, Y_train_1gram, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   33.4s remaining:   50.1s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   33.8s remaining:   22.5s


train accuracies:  [0.99085258 0.99155895 0.9904994  0.99092322 0.9907826 ]
average train accuracies:  0.9909233484128499
validation accuracies:  [0.63935584 0.64952677 0.63864953 0.63893205 0.64278649]
average validation accuracies:  0.6418501362617008


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   42.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   42.2s finished


## N-grams (1 to 5 )

## 3.1 Data cleaning

In [23]:
ngram_features = np.load("nlp_approach/data/ngrams/1to5grams_names.p")
ngram_features

['000',
 '10',
 '10 years',
 '100',
 '10th',
 '10th amendment',
 '11',
 '12',
 '12 years',
 '13',
 '14',
 '14th',
 '14th amendment',
 '15',
 '15 years',
 '150',
 '17',
 '1787',
 '18',
 '1948',
 '1954',
 '1960',
 '1961',
 '1964',
 '1965',
 '1966',
 '1968',
 '1969',
 '1970',
 '1971',
 '1973',
 '1974',
 '1975',
 '1976',
 '1980',
 '1983',
 '1985',
 '1992',
 '19th',
 '19th century',
 '20',
 '20 years',
 '20 years ago',
 '200',
 '200 years',
 '200 years ago',
 '21',
 '23',
 '23 years',
 '24',
 '24 years',
 '25',
 '25 years',
 '28',
 '2d',
 '30',
 '30 years',
 '40',
 '50',
 '50 years',
 '632',
 '70',
 '700',
 '78',
 '80',
 '90',
 '97',
 'aba',
 'abide',
 'abilities',
 'ability',
 'able',
 'abortion',
 'abortions',
 'absence',
 'absolute',
 'absolutely',
 'absolutely right',
 'absolutely senator',
 'abstract',
 'abuse',
 'abused',
 'aca',
 'academic',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'access',
 'accommodation',
 'accordance',
 'according',
 'according law',
 'according ru

In [22]:
# 1-gram data. Note that imputing the data was already done in pre-processing.
Y_ngram = np.load("nlp_approach/data/case_justices_1to5grams_merged_Y.npy")
X_ngram = np.load("nlp_approach/data/case_justices_1to5grams_merged_X.npy")

In [24]:
X_train_ngram, X_test_ngram, Y_train_ngram, Y_test_ngram = train_test_split(X_ngram, Y_ngram, test_size=0.2, random_state=42)


In [25]:
np.save("nlp_approach/data/case_justices_ngrams_merged_X_train.npy", X_train_ngram)
np.save("nlp_approach/data/case_justices_ngrams_merged_X_test.npy", X_test_ngram)
np.save("nlp_approach/data/case_justices_ngrams_merged_Y_train.npy", Y_train_ngram)
np.save("nlp_approach/data/case_justices_ngrams_merged_Y_test.npy", Y_test_ngram)

## 3.2 Feature Section using L1-Regularization

In [26]:
X_train_ngram_new, X_test_ngram_new = L1_feature_select(X_train_ngram, X_test_ngram, Y_train_ngram, Y_test_ngram, C=0.1)




old train shape: (35393, 6581)
new train shape: (35393, 568)


In [73]:
np.save("nlp_approach/data/case_justices_ngrams_merged_X_train_new.npy", X_train_ngram_new)
np.save("nlp_approach/data/case_justices_ngrams_merged_X_test_new.npy", X_test_ngram_new)

## 3.3 Logistic Regression Model

In [44]:
lr = LR()
cv_results = validate_model(lr, X_train_ngram_new, Y_train_ngram, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   42.1s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   42.2s remaining:   28.1s


train accuracies:  [0.64049587 0.63993078 0.63925973 0.64183796 0.64451194]
average train accuracies:  0.6412072541931664
validation accuracies:  [0.62551208 0.63314027 0.63299901 0.62791355 0.62286279]
average validation accuracies:  0.6284855410535368


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   56.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   56.1s finished


## 3.4 Random Forest Model

In [45]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True)
cv_results = validate_model(rf, X_train_ngram_new, Y_train_ngram, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   29.7s remaining:   44.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   30.0s remaining:   20.0s


train accuracies:  [0.99064067 0.99162958 0.99046408 0.99085258 0.9905707 ]
average train accuracies:  0.9908315240417875
validation accuracies:  [0.64726656 0.6471253  0.6372369  0.63398785 0.64038434]
average validation accuracies:  0.6412001912328662


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   38.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   38.5s finished


## with IDF Transform

In [30]:
X_ngram_idf = np.load("nlp_approach/data/case_justices_1to5grams_merged_X_idf.npy")

In [31]:
X_train_ngram_idf, X_test_ngram_idf, Y_train_ngram, Y_test_ngram = train_test_split(X_ngram_idf, Y_ngram, test_size=0.2, random_state=42)


In [32]:
np.save("nlp_approach/data/case_justices_ngrams_merged_X_train_idf.npy", X_train_ngram_idf)
np.save("nlp_approach/data/case_justices_ngrams_merged_X_test_idf.npy", X_test_ngram_idf)

## 3.2 Feature Section using L1-Regularization

In [39]:
X_train_ngram_idf_new, X_test_ngram_idf_new = L1_feature_select(X_train_ngram_idf, X_test_ngram_idf, Y_train_ngram, Y_test_ngram, C=1)




old train shape: (35393, 6581)
new train shape: (35393, 686)


In [41]:
np.save("nlp_approach/data/case_justices_ngrams_merged_X_train_idf_new.npy", X_train_ngram_idf_new)
np.save("nlp_approach/data/case_justices_ngrams_merged_X_test_idf_new.npy", X_test_ngram_idf_new)

## 3.3 Logistic Regression Model

In [40]:
lr = LR()
cv_results = validate_model(lr, X_train_ngram_idf_new, Y_train_ngram, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   21.3s remaining:   32.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   21.9s remaining:   14.6s


train accuracies:  [0.64328601 0.64413364 0.64332133 0.64166137 0.64161605]
average train accuracies:  0.6428036785187897
validation accuracies:  [0.6197203  0.62763102 0.6284786  0.62254556 0.61975413]
average validation accuracies:  0.6236259219739007


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   27.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   27.2s finished


## 3.4 Random Forest Model

In [42]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True)
cv_results = validate_model(rf, X_train_ngram_idf_new, Y_train_ngram, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   39.9s remaining:   59.9s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   40.5s remaining:   27.0s


train accuracies:  [0.99219467 0.99300699 0.99187681 0.99222999 0.99226586]
average train accuracies:  0.9923148652135797
validation accuracies:  [0.65715497 0.64316994 0.64882045 0.65305834 0.65394941]
average validation accuracies:  0.6512306229362109


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   50.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   50.7s finished


## Accuracy Results

|  | Training Accuracy | Validation Accuracy |
|----------------------------------------|-------------------|---------------------|
| Baseline (guess defendant) | 0.531 | 0.531 |
| LR (Demographic) | 0.644 | 0.627 |
| LR (Demographic, 1-grams) | 0.642 | 0.629 |
| LR (Demographic, 1 to 5-grams) | 0.641 | 0.628 |
| LR (Demographic, 1 to 5-grams, TF-IDF) | 0.643 | 0.624 |
| RF (Demographic) | 0.992 | 0.713 |
| RF (Demographic, 1-grams) | 0.991 | 0.643 |
| RF (Demographic, 1 to 5-grams) | 0.991 | 0.641 |
| RF (Demographic, 1 to 5-grams, TF-IDF) | 0.991 | 0.651 |
| RF (Past Votes) | 0.996 | 0.724 |

# 4. DEMOGRAPHIC + MEAN EMBEDDING

## 4.1 Data cleaning

In [73]:
mean_embeddings = np.load("mean_embeddings.npy")

In [74]:
# convert mean word embeddings into a dataframe
justicenames = ['AJGoldberg', 'AMKennedy', 'AScalia', 'BRWhite', 'DHSouter', 'EKagan', 'JGRoberts', 'JPStevens', 'LFPowell', 'NMGorsuch', 'RBGinsburg', 'SAAlito', 'SDOConnor', 'SGBreyer', 'SSotomayor', 'TMarshall', 'WHRehnquist']
emb_df = pd.DataFrame(data=np.array(mean_embeddings).T, columns=justicenames)
emb_df = emb_df.T
emb_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
AJGoldberg,0.008152,0.030855,0.049433,0.099664,-0.068314,-0.044576,0.054488,-0.046819,0.087485,0.026701,...,-0.08528,0.002186,-0.045725,0.010782,-0.040817,-0.002206,0.017133,-0.044007,0.024941,0.017669
AMKennedy,0.016872,0.03343,0.032639,0.100741,-0.055972,-0.030286,0.086012,-0.031739,0.097141,0.037108,...,-0.070913,0.002739,-0.030951,-0.00413,-0.028663,0.016892,0.026093,-0.060026,0.041306,-0.007428
AScalia,0.022761,0.024507,0.030656,0.10331,-0.064888,-0.022084,0.08864,-0.033492,0.107173,0.039799,...,-0.068686,0.017412,-0.030655,0.001107,-0.034818,0.004451,0.027409,-0.060654,0.038389,-0.004057
BRWhite,0.011146,0.046047,0.04047,0.097818,-0.038411,-0.063352,0.048497,-0.045905,0.095829,0.048845,...,-0.066685,-0.022377,-0.033402,-0.016695,-0.011504,0.010424,0.007548,-0.08053,0.042416,0.017462
DHSouter,0.025509,0.034823,0.031487,0.0987,-0.062366,-0.023857,0.088676,-0.042049,0.103281,0.047648,...,-0.072291,0.015554,-0.037238,0.005705,-0.029011,0.012824,0.021722,-0.058771,0.029084,-0.011395
EKagan,0.019558,0.038979,0.030725,0.107536,-0.054141,-0.035241,0.081142,-0.029598,0.095316,0.032488,...,-0.070401,0.00738,-0.036783,-0.005863,-0.039308,0.011736,0.030312,-0.057283,0.042815,-0.001315
JGRoberts,0.016157,0.041639,0.028075,0.102392,-0.052291,-0.027871,0.08431,-0.037156,0.101411,0.038104,...,-0.061336,0.003481,-0.024413,-0.000453,-0.028761,0.014732,0.032057,-0.062147,0.037187,-0.002238
JPStevens,0.025139,0.030309,0.0236,0.096372,-0.061139,-0.02011,0.091802,-0.039671,0.110155,0.039653,...,-0.069516,0.012484,-0.028121,0.006008,-0.026353,0.013316,0.029625,-0.065233,0.027977,-0.011153
LFPowell,0.015467,0.036296,0.037268,0.091282,-0.063734,-0.02775,0.070568,-0.036367,0.098408,0.032488,...,-0.078713,0.001923,-0.03897,0.016924,-0.024185,0.003162,0.013345,-0.070291,0.035007,-0.007677
NMGorsuch,0.027357,0.032483,0.031983,0.103614,-0.048414,-0.028371,0.072933,-0.031878,0.093986,0.04835,...,-0.055399,0.010445,-0.053723,-0.012857,-0.028771,0.005119,0.028553,-0.064721,0.042943,-0.001579


In [75]:
# merge mean word embeddings with justice and case data
emb_df["justiceName"] = emb_df.index
merged_jc_emb = jc_data.merge(emb_df, on='justiceName')
merged_jc_emb

Unnamed: 0,justice,justiceName,justice_vote,petitionerState_1,petitionerState_10,petitionerState_12,petitionerState_13,petitionerState_15,petitionerState_17,petitionerState_18,...,290,291,292,293,294,295,296,297,298,299
0,95,BRWhite,1,0,0,0,0,0,0,0,...,-0.066685,-0.022377,-0.033402,-0.016695,-0.011504,0.010424,0.007548,-0.080530,0.042416,0.017462
1,95,BRWhite,0,0,0,0,0,0,0,0,...,-0.066685,-0.022377,-0.033402,-0.016695,-0.011504,0.010424,0.007548,-0.080530,0.042416,0.017462
2,95,BRWhite,0,0,0,0,0,0,0,0,...,-0.066685,-0.022377,-0.033402,-0.016695,-0.011504,0.010424,0.007548,-0.080530,0.042416,0.017462
3,95,BRWhite,0,0,0,0,0,0,0,0,...,-0.066685,-0.022377,-0.033402,-0.016695,-0.011504,0.010424,0.007548,-0.080530,0.042416,0.017462
4,95,BRWhite,0,0,0,0,0,0,0,0,...,-0.066685,-0.022377,-0.033402,-0.016695,-0.011504,0.010424,0.007548,-0.080530,0.042416,0.017462
5,95,BRWhite,0,0,0,0,0,0,0,0,...,-0.066685,-0.022377,-0.033402,-0.016695,-0.011504,0.010424,0.007548,-0.080530,0.042416,0.017462
6,95,BRWhite,0,0,0,0,0,0,0,0,...,-0.066685,-0.022377,-0.033402,-0.016695,-0.011504,0.010424,0.007548,-0.080530,0.042416,0.017462
7,95,BRWhite,0,0,0,0,0,0,0,0,...,-0.066685,-0.022377,-0.033402,-0.016695,-0.011504,0.010424,0.007548,-0.080530,0.042416,0.017462
8,95,BRWhite,0,0,0,0,0,0,0,0,...,-0.066685,-0.022377,-0.033402,-0.016695,-0.011504,0.010424,0.007548,-0.080530,0.042416,0.017462
9,95,BRWhite,0,0,0,0,0,0,0,0,...,-0.066685,-0.022377,-0.033402,-0.016695,-0.011504,0.010424,0.007548,-0.080530,0.042416,0.017462


In [77]:
# create data and labels
X_emb = merged_jc_emb.drop(columns = ["justice", "justiceName", "justice_vote"]).values.astype(np.float64)
Y_emb = merged_jc_emb["justice_vote"].values.astype(np.int32)

# impute missing values
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_emb)
X_emb = imp.transform(X_emb)

In [86]:
np.save("nlp_approach/data/mean_embedding/cases_justices_emb_merged_X.npy", X_emb)
np.save("nlp_approach/data/mean_embedding/cases_justices_emb_merged_Y.npy", Y_emb)

In [78]:
# split in to training and test data
X_train_emb, X_test_emb, Y_train_emb, Y_test_emb = train_test_split(X_emb, Y_emb, test_size=0.2, random_state=42)


## 4.2 Feature Selection using L1 Regularization

In [79]:
X_train_emb_new, X_test_emb_new = L1_feature_select(X_train_emb, X_test_emb, Y_train_emb, Y_test_emb, C=0.1)




old train shape: (35393, 1881)
new train shape: (35393, 457)


## 4.3 Logistic Regression Model

In [80]:
# with L1 regularization feature selection
lr = LR()
cv_results = validate_model(lr, X_train_emb_new, Y_train_emb, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   11.2s remaining:   16.7s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   11.3s remaining:    7.5s


train accuracies:  [0.64077841 0.64077841 0.64109628 0.64194391 0.64366436]
average train accuracies:  0.6416522757030532
validation accuracies:  [0.62395819 0.63172765 0.63017375 0.62522955 0.62710188]
average validation accuracies:  0.6276382032422279


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.3s finished


In [85]:
# without L1 regularization feature selection
lr = LR()
cv_results = validate_model(lr, X_train_emb, Y_train_emb, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.9min remaining:  2.8min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  2.1min remaining:  1.4min


train accuracies:  [0.64204987 0.64275623 0.64356855 0.64233242 0.64716062]
average train accuracies:  0.6435735373226544
validation accuracies:  [0.62070914 0.62748976 0.62522955 0.61887272 0.62342801]
average validation accuracies:  0.6231458357398625


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min finished


## 4.4 Random Forest Model

In [81]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True)
cv_results = validate_model(rf, X_train_emb_new, Y_train_emb, cv=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   27.0s remaining:   40.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   27.2s remaining:   18.1s


train accuracies:  [0.99095854 0.9916649  0.99074663 0.99120576 0.99095918]
average train accuracies:  0.9911070006473824
validation accuracies:  [0.65715497 0.66676084 0.65574234 0.65235203 0.66030804]
average validation accuracies:  0.6584636422116191


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   37.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   37.3s finished


## For debugging purposes

In [7]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('X', 114363361), ('np', 80), ('pd', 80)]