# Train ML methods

In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Mount Google Drive at /content/gdrive (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# List the contents of the current working directory.
!ls

In [None]:
# Switch into the EEG project folder on the mounted drive.
# NOTE(review): the path is relative, so this presumably assumes the kernel starts in /content — confirm.
%cd gdrive/MyDrive/Colab\ Notebooks/EEG

In [None]:
def find_filenames(ls):
  """Return the final path component of every path in ``ls``.

  Args:
    ls: iterable of path strings.

  Returns:
    A list with ``os.path.basename(path)`` for each input path, in input
    order. An empty input yields an empty list.
  """
  # Relies on the module-level ``import os`` in the imports cell.
  return [os.path.basename(file) for file in ls]

In [None]:
## import our classifiers
from sklearn.ensemble import RandomForestClassifier
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## The VotingClassifier
from sklearn.ensemble import VotingClassifier

## import accuracy metric
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## split data
from sklearn.model_selection import train_test_split

## validation and model selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


In [None]:
# Load the feature table; the path is relative to the current working directory.
FEATURES_CSV = "full_feature_corr.csv"
df_full = pd.read_csv(FEATURES_CSV)

In [None]:
# Remove exact duplicate rows, then print the row count next to the number of
# distinct subjects so the two can be compared.
df_full = df_full.drop_duplicates()
subject_col = df_full['subject']
print(len(subject_col))
print(len(subject_col.unique()))

In [None]:
# Separate the target column from the feature columns.
df_copy = df_full.copy()
y = df_copy.alcoholic
# drop() must NOT be called with inplace=True here: in that mode it returns
# None, which would leave X as None and break every later cell.
X = df_copy.drop(columns='alcoholic')
# If there is categorical data, one-hot encode it:
# X = pd.get_dummies(X)
X

In [None]:
# Hold out 20% of the rows as a test set, stratified on the label and with a
# fixed seed so the split is reproducible.
# NOTE(review): the split is row-level; if one subject contributes several rows,
# the same subject may land in both train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=440,
    stratify=y,
)

In [None]:
# Candidate models: three base classifiers plus a voting ensemble built from them.
rf = RandomForestClassifier(random_state=440)
lr = LogisticRegression(max_iter=100000)
svc = SVC(kernel='rbf')

base_estimators = [('lr', lr), ('rf', rf), ('svm', svc)]
clf_vote = VotingClassifier(base_estimators)

In [None]:
# Cross-validate every candidate model on the training split (5 folds each).
kfold = 5
models = [rf, lr, svc, clf_vote]

# One cross_validate() result dict per model, in the same order as `models`.
cv_results = [
    cross_validate(estimator, X_train, y_train, cv=kfold)
    for estimator in models
]

In [None]:
# Mean held-out score per model; entries follow the order of `cv_results`.
cv_results_avg = np.array(
    [np.mean(result['test_score']) for result in cv_results],
    dtype=float,
)

cv_results_avg

In [None]:
# Hyper-parameter grid for the random forest.
# 'sqrt' replaces the former 'auto': for classifiers the two were equivalent,
# but 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
# makes the fit fail.
parameters = {'n_estimators':[50, 100, 200, 500],
              'criterion':('gini','entropy'),
              'min_samples_split':[2,4,6,8],
              'max_features':('sqrt','log2'),
              'max_depth':[5,10,20,100, 200]
              }
# Exhaustive search over the grid, scored with `kfold`-fold cross-validation
# on the training split only.
clf_gridCV = GridSearchCV(rf, parameters, cv = kfold)
clf_gridCV.fit(X_train, y_train)
sorted(clf_gridCV.cv_results_.keys())

In [None]:
# Keep the best estimator found by the grid search.
rf_best = clf_gridCV.best_estimator_

# The summary is printed inline because the print_grid_search_metrics helper is
# defined in a later cell; calling it here raises NameError on a fresh kernel
# (Restart & Run All). The printed text is identical to the helper's output.
print('Best score:', clf_gridCV.best_score_)
print('Best parameters set:')
for param_name, param_value in sorted(clf_gridCV.best_params_.items()):
  print(param_name + ':' + str(param_value))


Best score: 0.7738721804511279

Best parameters set:

criterion:entropy

max_depth:5

max_features:auto

min_samples_split:8

n_estimators:100

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=440,
                       verbose=0, warm_start=False)

In [None]:
# Fit the selected forest on the training split and predict the held-out rows.
# NOTE(review): with GridSearchCV's default refit behaviour the best estimator
# is presumably already fitted on X_train, so this fit may be redundant — confirm.
y_pred = rf_best.fit(X_train, y_train).predict(X_test)

In [None]:
def print_grid_search_metrics(gs):
  """Print the best score and the winning hyper-parameters of a grid search.

  Args:
    gs: a fitted search object exposing ``best_score_`` and ``best_params_``.
  """
  print('Best score:', gs.best_score_)
  print('Best parameters set:')
  # Parameters are listed in alphabetical order of their names.
  for name, value in sorted(gs.best_params_.items()):
    print(name + ':' + str(value))

In [None]:
# Rank the features by the fitted forest's importance scores, highest first.
df_importance_features = pd.DataFrame(
    data=rf_best.feature_importances_,
    index=X_train.columns,
    columns=['importance'],
).sort_values(by='importance', ascending=False)

df_importance_features.head(10)

importance

1170	0.010216

1752	0.008834

1200	0.008252

1637	0.007758

943	  0.007506

475	  0.007002

1614	0.006604

235	  0.006530

546	  0.006334

1636	0.005944

In [None]:
# Calculate accuracy, precision, recall and specificity from a 2x2 confusion
# matrix laid out as [[tn, fp], [fn, tp]].
def cal_evaluation(classifier, cm):
  """Print accuracy, precision, recall and specificity for one classifier.

  Args:
    classifier: label printed on the first line of the report.
    cm: 2x2 confusion matrix indexable as ``cm[row][col]`` with the layout
      ``[[tn, fp], [fn, tp]]``.

  Returns:
    None; the metrics are only printed.

  A metric whose denominator is zero (for example precision when nothing was
  predicted positive) is reported as ``nan`` instead of raising
  ZeroDivisionError.
  """
  def _ratio(numerator, denominator):
    # Undefined ratios are reported as nan rather than crashing the report.
    return numerator / float(denominator) if denominator else float('nan')

  tn, fp = cm[0][0], cm[0][1]
  fn, tp = cm[1][0], cm[1][1]

  accuracy = _ratio(tp + tn, tp + fp + fn + tn)
  precision = _ratio(tp, tp + fp)
  recall = _ratio(tp, tp + fn)
  specificity = _ratio(tn, fp + tn)

  print(classifier)
  print("Accuracy is: " + str(accuracy))
  print("precision is: " + str(precision))
  print("recall is: " + str(recall))
  print('Specificity is', specificity)
  

# Print the metrics and plot the confusion matrix of every classifier
def draw_confusion_matrices(confusion_matricies):
    """Report metrics and draw a heat map for each confusion matrix.

    Args:
        confusion_matricies: iterable of ``(classifier_name, matrix)`` pairs,
            where ``matrix`` is a 2x2 confusion matrix. (The parameter keeps
            its original spelling so existing callers are unaffected.)

    For every pair the metrics are printed via ``cal_evaluation`` and one
    matplotlib figure is shown.
    """
    class_names = ['Not Alcoholic','Alcoholic']
    tick_positions = list(range(len(class_names)))
    # Iterate over the argument itself; the loop previously read a global
    # variable with a similar name and silently ignored what was passed in.
    for entry in confusion_matricies:
        classifier, cm = entry[0], entry[1]
        cal_evaluation(classifier, cm)
        fig = plt.figure(figsize=(10,10))
        ax = fig.add_subplot(111)
        cax = ax.matshow(cm, interpolation='nearest',cmap=plt.get_cmap('Reds'))
        ax.set_title('Confusion matrix for ' + classifier)
        fig.colorbar(cax)
        # Pin the tick locations before labelling them, so each class name
        # sits on its own row/column regardless of the default tick locator.
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(class_names)
        ax.set_yticks(tick_positions)
        ax.set_yticklabels(class_names)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        plt.show()

In [None]:
# Evaluate the tuned random forest on the held-out test predictions.
rf_confusion = confusion_matrix(y_test, y_pred)
confusion_matrices = [("Random Forest", rf_confusion)]
draw_confusion_matrices(confusion_matrices)

Random Forest

Accuracy is: 0.7464788732394366

precision is: 0.7413793103448276

recall is: 0.9347826086956522

Specificity is 0.4