# Training 
## All Data as Raw

This notebook finds the best parameters for each of the different classifiers that achieve the maximuma accuracy.

In [181]:
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_selection import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
%matplotlib inline

In [182]:
x = pd.read_pickle('x.pkl')
y = pd.read_pickle('y.pkl')

In [183]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 10,\
                                                    stratify = y)

In [184]:
def train_and_test(clf=DecisionTreeClassifier()):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    try:
        result = dict()
        result['accuracy']  = clf.score(x_test, y_test)
        result['precision'] = precision_score(y_test, y_pred)
        result['recall']    = recall_score(y_test,y_pred)
        result['f1']        = f1_score(y_test.values,y_pred)
        result['matrix']    = confusion_matrix(y_test.values, y_pred)
        dt_scores = cross_val_score(clf, x_train, y_train, cv=10)
        result['cval_score'] = np.mean(dt_scores)
#         result['score'] = clf.predict_proba(x_test)[:,1]
        return result
    except UndefinedMetricWarning:
        return None

# DECISION TREES

In [185]:
criterions = ['entropy', 'gini']
splitters = ['random', 'best']
depths = [5, 10, None]
min_samples_splits = [0.50, 0.25, 2]
min_samples_leafs = [1, 10]

In [186]:
results = []
for c, s, d, mss, msl in list(itertools.product(criterions, splitters, depths, min_samples_splits, min_samples_leafs)):
    clf = DecisionTreeClassifier(criterion=c, splitter=s, max_depth=d, min_samples_split=mss, min_samples_leaf=msl)
    result = train_and_test(clf)
    if result is not None:
        result['criterion'] = c
        result['splitter'] = s
        result['depth'] = d
        result['mss'] = mss
        result['msl'] = msl
        results.append(result)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [187]:
tree_df = pd.DataFrame.from_dict(results)
tree_df.head()

Unnamed: 0,accuracy,criterion,cval_score,depth,f1,matrix,msl,mss,precision,recall,splitter
0,0.742081,entropy,0.74753,5.0,0.0,"[[492, 0], [171, 0]]",1,0.5,0.0,0.0,random
1,0.742081,entropy,0.742643,5.0,0.0,"[[492, 0], [171, 0]]",10,0.5,0.0,0.0,random
2,0.719457,entropy,0.748279,5.0,0.243902,"[[447, 45], [141, 30]]",1,0.25,0.4,0.175439,random
3,0.739065,entropy,0.756597,5.0,0.498551,"[[404, 88], [85, 86]]",10,0.25,0.494253,0.502924,random
4,0.757164,entropy,0.769797,5.0,0.320675,"[[464, 28], [133, 38]]",1,2.0,0.575758,0.222222,random


In [188]:
tree_df.loc[[tree_df['accuracy'].idxmax(), tree_df['precision'].idxmax(), tree_df['cval_score'].idxmax()]]

Unnamed: 0,accuracy,criterion,cval_score,depth,f1,matrix,msl,mss,precision,recall,splitter
0,0.742081,entropy,0.74753,5.0,0.0,"[[492, 0], [171, 0]]",1,0.5,0.0,0.0,random
0,0.742081,entropy,0.74753,5.0,0.0,"[[492, 0], [171, 0]]",1,0.5,0.0,0.0,random
0,0.742081,entropy,0.74753,5.0,0.0,"[[492, 0], [171, 0]]",1,0.5,0.0,0.0,random


# RandomForestClassifier

In [189]:
n_estimators_list = [5, 10, 15, 20]
criterions = ['entropy', 'gini']
depths = [5, 10, None]
min_samples_splits = [0.50, 0.25, 2]
min_samples_leafs = [1, 10]

In [190]:
results = []
for nest, c, d, mss, msl in list(itertools.product(n_estimators_list, criterions, depths, min_samples_splits, min_samples_leafs)):
    clf = RandomForestClassifier(n_estimators=nest, criterion=c, max_depth=d, min_samples_split=mss, min_samples_leaf=msl)
    result = train_and_test(clf)
    if result is not None:
        result['n_estimators'] = nest
        result['criterion'] = c
        result['depth'] = d
        result['mss'] = mss
        result['msl'] = msl
        results.append(result)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

In [191]:
rf_df = pd.DataFrame.from_dict(results)
rf_df.head()

Unnamed: 0,accuracy,criterion,cval_score,depth,f1,matrix,msl,mss,n_estimators,precision,recall
0,0.742081,entropy,0.749417,5.0,0.0,"[[492, 0], [171, 0]]",1,0.5,5,0.0,0.0
1,0.739065,entropy,0.745662,5.0,0.03352,"[[487, 5], [168, 3]]",10,0.5,5,0.375,0.017544
2,0.754148,entropy,0.749072,5.0,0.137566,"[[487, 5], [158, 13]]",1,0.25,5,0.722222,0.076023
3,0.745098,entropy,0.758879,5.0,0.034286,"[[491, 1], [168, 3]]",10,0.25,5,0.75,0.017544
4,0.767722,entropy,0.78227,5.0,0.330435,"[[471, 21], [133, 38]]",1,2.0,5,0.644068,0.222222


In [192]:
rf_df.loc[[rf_df['accuracy'].idxmax(), rf_df['precision'].idxmax(), rf_df['cval_score'].idxmax()]]

Unnamed: 0,accuracy,criterion,cval_score,depth,f1,matrix,msl,mss,n_estimators,precision,recall
0,0.742081,entropy,0.749417,5.0,0.0,"[[492, 0], [171, 0]]",1,0.5,5,0.0,0.0
0,0.742081,entropy,0.749417,5.0,0.0,"[[492, 0], [171, 0]]",1,0.5,5,0.0,0.0
0,0.742081,entropy,0.749417,5.0,0.0,"[[492, 0], [171, 0]]",1,0.5,5,0.0,0.0


# AdaBoost

In [None]:
base_estimators=[None, RandomForestClassifier()]
n_estimators_list=[10, 30, 50, 100, 200]
learning_rate=[0.1, 0.5, 1.0]

In [None]:
results = []
for est, nest, lr in list(itertools.product(base_estimators, n_estimators_list, learning_rate)):
    clf = AdaBoostClassifier(base_estimator=est, n_estimators=nest, learning_rate=lr)
    result = train_and_test(clf)
    if result is not None:
        result['n_estimators'] = nest
        result['base_estimator'] = est 
        result['lr'] = lr
        results.append(result)

In [None]:
adaboost_df = pd.DataFrame.from_dict(results)
adaboost_df.head()

In [None]:
adaboost_df.loc[[adaboost_df['accuracy'].idxmax(), adaboost_df['precision'].idxmax(), adaboost_df['cval_score'].idxmax()]]

# K-Nearest Neighbor
http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
n_neighbors=[3, 5, 10, 20, 50]
leaf_size=[5, 10, 30, 50]
weights = ['uniform', 'distance']
ps=[1, 2]

In [None]:
results = []
for n, ls, w, p in list(itertools.product(n_neighbors, leaf_size, weights, ps)):
    clf = KNeighborsClassifier(n_neighbors=n, leaf_size=ls, weights=w, p=p)
    result = train_and_test(clf)
    if result is not None:
        result['n_neighbor'] = n
        result['leaf_size'] = ls
        result['p'] = p
        results.append(result)

In [None]:
knn_df = pd.DataFrame.from_dict(results)
knn_df.head()

In [None]:
knn_df.loc[[df['accuracy'].idxmax(), df['precision'].idxmax(), df['cval_score'].idxmax()]]

# Gaussian Naive Bayes
http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

In [None]:
clf = GaussianNB()
result = train_and_test(clf)

In [None]:
nb_df = pd.DataFrame.from_dict([result])
nb_df.head()

In [None]:
nb_df.loc[[df['accuracy'].idxmax(), df['precision'].idxmax(), df['cval_score'].idxmax()]]

---

In [None]:
#ROC Curve
y_score_tree = tree_df['score']
y_score_rf = rf_df['score']
y_score_ab = adaboost_df['score']
y_score_knn = knn_df['score']
y_score_nb = nb_df['score']
fpr_tree, tpr_tree, _ = roc_curve(y_test, y_score_tree)
fpr_rf, tpr_rf, _ = roc_curve(y_test,y_score_rf)
fpr_ab, tpr_ab, _ = roc_curve(y_test,y_score_ab)
fpr_knn, tpr_knn, _ = roc_curve(y_test,y_score_knn)
fpr_nb, tpr_nb, _ = roc_curve(y_test,y_score_nb)
plt.plot([0,1],[0,1],'k--')

plt.plot(fpr_tree,tpr_tree,label='Decision Tree')
plt.plot(fpr_rf,tpr_rf,label='RandomForest')
plt.plot(fpr_ab,tpr_ab,label='Adaboost')
plt.plot(fpr_knn,tpr_knn,label='KNN')
plt.plot(fpr_nb,tpr_nb,label='Naive Bayes')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('ROC Curves')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()

In [None]:
importances = ab_clf.feature_importances_
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(20):
    print("%d. %s (%f)" % (f + 1, x.iloc[:,indices[f]].name, importances[indices[f]]))

In [None]:
plt.figure(figsize=(10,7))
plt.title("Feature importance")
plt.barh(range(x_test.shape[1])[:10], importances[indices][:10],align="center")
plt.yticks(range(x_test.shape[1])[:10], x.iloc[:,indices].columns)
plt.ylim([-1, 10])