In [1]:
import pandas as pd
import numpy as np

In [3]:
# Read the training and validation splits from CSV files in the working directory.
train, valid = map(
    pd.read_csv,
    ('hidden_state_vector_training.csv', 'hidden_state_vector_validation.csv'),
)

In [42]:
# Boolean mask over the column names: True where the name contains 'h_'.
# NOTE(review): str.contains matches 'h_' anywhere in the name, not only as a
# prefix -- confirm no non-hidden-state column contains that substring.
has_h_marker = train.columns.str.contains('h_')
# Keep only the columns whose name does not contain 'h_'.
f_variables = train.columns[~has_h_marker]

In [43]:
# Restrict both splits to the columns selected in `f_variables`.
f_train, f_valid = (frame.loc[:, f_variables] for frame in (train, valid))

# Features: every selected column except the last one.
# NOTE(review): assumes 'labels' is the last column -- TODO confirm.
X_train_f, X_test_f = (frame.iloc[:, :-1] for frame in (f_train, f_valid))

# Targets: the 'labels' column as a 2-D (n, 1) NumPy array.
y_train_f, y_test_f = (frame[['labels']].to_numpy() for frame in (f_train, f_valid))


In [15]:
# Features from the full column set: everything except the last column.
# NOTE(review): assumes 'labels' is the last column -- TODO confirm.
X_train, X_test = (frame.iloc[:, :-1] for frame in (train, valid))

# Targets: the 'labels' column as a 2-D (n, 1) NumPy array.
y_train, y_test = (frame[['labels']].to_numpy() for frame in (train, valid))


In [12]:
# Candidate tree counts; the same values appear as the `n_estimators` grid
# in the grid-search section of this notebook.
num_trees = [
    10, 20, 50, 100,
    200, 400, 800, 1500,
]

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def compute_metrics(labels, preds, scores=None):
    """Compute classification metrics for one set of predictions.

    Parameters
    ----------
    labels : array-like
        Ground-truth class labels.
    preds : array-like
        Predicted class labels (hard predictions).
    scores : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]``. When provided, the AUC is computed
        from these scores. When omitted, the AUC is computed from ``preds``
        exactly as before, which evaluates only the single operating point
        given by the hard predictions rather than the full ROC curve.

    Returns
    -------
    dict
        Mapping with the keys 'accuracy', 'precision', 'recall', 'f1' and
        'auc', in that order.
    """
    # Fall back to the hard predictions so existing two-argument calls keep
    # producing the same numbers.
    auc_input = preds if scores is None else scores
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds),
        'recall': recall_score(labels, preds),
        'f1': f1_score(labels, preds),
        'auc': roc_auc_score(labels, auc_input),
    }

# Single run

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Names of every column except the last one (the same columns as X_train).
feature_names = list(train.columns[:-1])

# Random forest trained on the full feature set, using all CPU cores.
forest = RandomForestClassifier(
    n_estimators=1500,
    criterion='log_loss',
    random_state=0,
    n_jobs=-1,
)
# y_train is an (n, 1) array; ravel() flattens it to the 1-D shape fit() expects.
forest.fit(X_train, y_train.ravel())

In [None]:
# Hard class predictions of `forest` on the validation features (X_test).
y_pred = forest.predict(X_test)

In [None]:
# Metrics of the current `y_pred` against the validation labels; this dict is
# later used as the 'random forest with context vector' row of the results table.
context_metrics = compute_metrics(y_test, y_pred)

In [66]:
# Random forest trained only on the columns whose name does not contain 'h_'.
# NOTE(review): this model uses 400 trees while the full-feature forest uses
# 1500, so the two results are not compared at equal ensemble size.
forest_wo_ct = RandomForestClassifier(
    n_estimators=400,
    criterion='log_loss',
    random_state=0,
    n_jobs=-1,
)

# y_train_f is an (n, 1) array; ravel() flattens it to 1-D for fit().
forest_wo_ct.fit(X_train_f, y_train_f.ravel())

In [67]:
# Hard class predictions of the reduced-feature forest on the reduced
# validation features.
y_pred = forest_wo_ct.predict(X_test_f)
# Score against y_test_f, the label array built together with X_test_f, so this
# cell no longer depends on the separate cell that defines y_test. Both arrays
# come from valid[['labels']], so the resulting metrics are unchanged.
wo_context_metrics = compute_metrics(y_test_f, y_pred)

# Grid Search CV

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.metrics import get_scorer

# Rank candidates by ROC AUC computed from continuous model scores.
# make_scorer(roc_auc_score) with default arguments scores the hard output of
# predict(), which reduces the ROC curve to a single operating point; the
# built-in 'roc_auc' scorer uses the estimator's scores instead.
scorer = get_scorer('roc_auc')

# NOTE: this rebinds `forest`, replacing the fitted single-run model created
# earlier in the notebook; re-run that cell before predicting with it again.
# n_jobs=1 on the estimator because GridSearchCV below already runs the fits
# in parallel (n_jobs=-1).
forest = RandomForestClassifier(random_state=0, criterion='log_loss',
                                n_jobs=1)

# Reuse the candidate tree counts defined in `num_trees` instead of repeating
# the literal list.
parameters = {'n_estimators': list(num_trees)}

clf = GridSearchCV(forest, parameters, scoring=scorer, n_jobs=-1,
                   verbose=2)

In [33]:
# Run the cross-validated search on the full feature set; ravel() flattens the
# (n, 1) label array to 1-D.
# NOTE: the recorded run below was stopped with KeyboardInterrupt before it
# finished, so `clf` was left unfitted in that session.
clf.fit(X_train, y_train.ravel())

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ....................................n_estimators=10; total time=  16.4s
[CV] END ....................................n_estimators=20; total time=  33.9s
[CV] END ....................................n_estimators=50; total time= 1.4min
[CV] END ...................................n_estimators=200; total time= 6.0min
[CV] END ...................................n_estimators=400; total time=11.8min
[CV] END ...................................n_estimators=800; total time=24.6min




KeyboardInterrupt: 

In [34]:
# Best score found by the search. This attribute only exists once clf.fit()
# has run to completion; the recorded AttributeError below follows from the
# interrupted fit.
clf.best_score_

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [None]:
# Best estimator selected by the grid search (requires a completed clf.fit()).
final_forest = clf.best_estimator_

In [None]:
# Hard class predictions of the selected forest on the validation features.
# NOTE: overwrites the `y_pred` produced by the earlier single-run cells.
y_pred = final_forest.predict(X_test)

In [None]:
# Display names of the metrics, in the same order as the keys returned by
# compute_metrics (accuracy, precision, recall, f1, auc).
metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC']

# Hard-coded reference scores, one per entry of `metrics`.
# NOTE(review): presumably taken from the Fernandes model being compared
# against -- the source is not shown in this notebook; confirm and cite it.
Fernandes_model_perf = [0.67, 0.67, 0.71, 0.69, 0.73]

# Metrics of the current `y_pred` against the validation labels.
our_perf = compute_metrics(y_test, y_pred)

In [68]:
# Display the metrics of the forest trained on the full feature set.
context_metrics

{'accuracy': 0.619288058571068,
 'precision': 0.6197145165634258,
 'recall': 0.5892445582586427,
 'f1': 0.6040955631399317,
 'auc': 0.618869229589864}

In [69]:
# Display the metrics of the forest trained without the 'h_' columns.
wo_context_metrics

{'accuracy': 0.6331734410502399,
 'precision': 0.6317594302294909,
 'recall': 0.6133162612035852,
 'f1': 0.6224012474012474,
 'auc': 0.6328966170344538}

In [None]:
# Results table: one row per model, one column per metric.
model_names = ['random forest with context vector', 'random forest without context vector']
# Pair the two models' scores metric by metric (positional, so both dicts must
# share the same key order -- they do when both come from compute_metrics).
paired_scores = list(zip(context_metrics.values(), wo_context_metrics.values()))
df = pd.DataFrame(
    paired_scores,
    index=list(context_metrics.keys()),
    columns=model_names,
).transpose()

In [None]:
# Emit the results table as LaTeX source; print() is intentional so the raw
# markup can be copied out.
print(df.to_latex())