### Import Package

In [2]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble, neighbors, tree, neural_network
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
import xgboost

import string
import os
import pandas as pd 
import numpy as np
import pickle
import data_reader


### Load Data

In [38]:
data = data_reader.read()
data.shape

(600, 176)

In [39]:
print(data[:10,-2:])

[[-0.05805464 -0.52113092]
 [ 0.32652215 -0.35021785]
 [-0.46058144 -1.51423159]
 [-0.39751291 -2.0050897 ]
 [ 0.7812306  -0.10708989]
 [ 1.630479    1.49625811]
 [-0.05849049 -0.81698579]
 [ 0.60994349  0.18275919]
 [-0.72237744 -1.48216038]
 [ 0.86143762  1.5444956 ]]


### Preprocessing

In [40]:
data = data[(np.abs(data[:,-2])>=0.3) & (np.abs(data[:,-1])>=0.3)]
data[:,-2:] = np.where(data[:,-2:] > 0.0, 1, 0)

In [41]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(data[:,:-2], data[:,-2:], test_size=0.2)

In [42]:
test_x = data[:,:-2]
test_x.shape

(235, 174)

### Model Training

In [43]:
def train_model(classifier, train_x, train_y, valid_x, valid_y):
    # fit the training dataset on the classifier
    classifier.fit(train_x, train_y)   
    # predict the labels on validation dataset
    predictions = classifier.predict(valid_x)
    
    return metrics.accuracy_score(predictions, valid_y), metrics.classification_report(valid_y, predictions), classifier

In [44]:
def test_models(classifier_ch, classifier_en, test_x):
    diff = []
    pred_ch = classifier_ch.predict(test_x)
    pred_en = classifier_en.predict(test_x)
    for idx in range(len(test_x)):
        if pred_ch[idx] != pred_en[idx]:
            diff.append(idx)
    return diff

### Logistic Regression

In [45]:
accuracy, report, classifier_ch = train_model(linear_model.LogisticRegression(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_glm: ',accuracy)
print(report)

ch_glm:  0.702127659574
             precision    recall  f1-score   support

        0.0       0.67      0.67      0.67        21
        1.0       0.73      0.73      0.73        26

avg / total       0.70      0.70      0.70        47



In [46]:
scores = cross_val_score(linear_model.LogisticRegression(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.72 (+/- 0.20)


In [47]:
accuracy, report, classifier_en = train_model(linear_model.LogisticRegression(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_glm: ',accuracy)
print(report)

en_glm:  0.723404255319
             precision    recall  f1-score   support

        0.0       0.68      0.65      0.67        20
        1.0       0.75      0.78      0.76        27

avg / total       0.72      0.72      0.72        47



In [48]:
scores = cross_val_score(linear_model.LogisticRegression(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.71 (+/- 0.07)


In [49]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
32
[0, 22, 24, 25, 28, 46, 57, 59, 71, 77, 83, 92, 99, 116, 121, 127, 131, 139, 169, 184, 186, 187, 192, 193, 199, 202, 203, 205, 211, 214, 226, 227]


### SVM (C=0.2, kernel=linear)

In [50]:
accuracy, report, classifier_ch= train_model(svm.SVC(C=0.2, kernel='linear'), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_svm: ',accuracy)
print(report)

ch_svm:  0.723404255319
             precision    recall  f1-score   support

        0.0       0.67      0.76      0.71        21
        1.0       0.78      0.69      0.73        26

avg / total       0.73      0.72      0.72        47



In [51]:
scores = cross_val_score(svm.SVC(C=0.2, kernel='linear'), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.72 (+/- 0.12)


In [52]:
accuracy, report, classifier_en = train_model(svm.SVC(C=0.2, kernel='linear'), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_svm: ',accuracy)
print(report)

en_svm:  0.723404255319
             precision    recall  f1-score   support

        0.0       0.67      0.70      0.68        20
        1.0       0.77      0.74      0.75        27

avg / total       0.73      0.72      0.72        47



In [53]:
scores = cross_val_score(svm.SVC(C=0.2, kernel='linear'), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.69 (+/- 0.13)


In [54]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
31
[15, 24, 25, 26, 48, 57, 59, 62, 71, 77, 83, 85, 92, 99, 115, 116, 124, 127, 131, 139, 169, 184, 186, 190, 192, 193, 202, 214, 226, 227, 231]


### SVM(C=100, kernel=linear)

In [55]:
accuracy, report, classifier_ch = train_model(svm.SVC(C=100, kernel='linear'), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_svm: ',accuracy)
print(report)

ch_svm:  0.617021276596
             precision    recall  f1-score   support

        0.0       0.56      0.71      0.63        21
        1.0       0.70      0.54      0.61        26

avg / total       0.64      0.62      0.62        47



In [56]:
scores = cross_val_score(svm.SVC(C=100, kernel='linear'), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.71 (+/- 0.18)


In [57]:
accuracy, report, classifier_en = train_model(svm.SVC(C=100, kernel='linear'), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_svm: ',accuracy)
print(report)

en_svm:  0.702127659574
             precision    recall  f1-score   support

        0.0       0.62      0.75      0.68        20
        1.0       0.78      0.67      0.72        27

avg / total       0.72      0.70      0.70        47



In [58]:
scores = cross_val_score(svm.SVC(C=100, kernel='linear'), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.68 (+/- 0.13)


In [59]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
33
[5, 12, 20, 22, 24, 25, 46, 57, 71, 83, 92, 99, 105, 112, 116, 121, 122, 127, 131, 135, 138, 139, 143, 169, 186, 187, 192, 193, 199, 211, 214, 227, 234]


### Naive Bayes (GaussianNB)

In [60]:
accuracy, report, classifier_ch = train_model(naive_bayes.GaussianNB(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_NB: ',accuracy)
print(report)

ch_NB:  0.702127659574
             precision    recall  f1-score   support

        0.0       0.73      0.52      0.61        21
        1.0       0.69      0.85      0.76        26

avg / total       0.71      0.70      0.69        47



In [61]:
scores = cross_val_score(naive_bayes.GaussianNB(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.71 (+/- 0.14)


In [62]:
accuracy, report, classifier_en = train_model(naive_bayes.GaussianNB(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_NB: ',accuracy)
print(report)

en_NB:  0.787234042553
             precision    recall  f1-score   support

        0.0       1.00      0.50      0.67        20
        1.0       0.73      1.00      0.84        27

avg / total       0.84      0.79      0.77        47



In [63]:
scores = cross_val_score(naive_bayes.GaussianNB(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.68 (+/- 0.14)


In [64]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
58
[8, 12, 17, 22, 25, 31, 32, 33, 42, 44, 46, 47, 51, 53, 56, 61, 68, 71, 75, 76, 77, 78, 80, 82, 84, 85, 88, 92, 99, 101, 104, 105, 108, 109, 112, 124, 125, 131, 152, 153, 162, 165, 172, 181, 184, 186, 190, 195, 202, 205, 212, 219, 221, 222, 223, 227, 228, 233]


### Naive Bayes (Multinomial)

In [65]:
accuracy, report, classifier_ch = train_model(naive_bayes.MultinomialNB(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_NB: ',accuracy)
print(report)

ch_NB:  0.787234042553
             precision    recall  f1-score   support

        0.0       0.79      0.71      0.75        21
        1.0       0.79      0.85      0.81        26

avg / total       0.79      0.79      0.79        47



In [66]:
scores = cross_val_score(naive_bayes.MultinomialNB(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.71 (+/- 0.19)


In [67]:
accuracy, report, classifier_en = train_model(naive_bayes.MultinomialNB(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_NB: ',accuracy)
print(report)

en_NB:  0.787234042553
             precision    recall  f1-score   support

        0.0       0.81      0.65      0.72        20
        1.0       0.77      0.89      0.83        27

avg / total       0.79      0.79      0.78        47



In [68]:
scores = cross_val_score(naive_bayes.MultinomialNB(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.72 (+/- 0.14)


In [69]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
26
[7, 30, 46, 48, 62, 71, 77, 80, 83, 85, 92, 115, 116, 117, 131, 137, 139, 144, 153, 157, 179, 192, 202, 203, 226, 227]


### RandomForest

In [70]:
accuracy, report, classifier_ch = train_model(ensemble.RandomForestClassifier(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_RF: ',accuracy)
print(report)

ch_RF:  0.808510638298
             precision    recall  f1-score   support

        0.0       0.77      0.81      0.79        21
        1.0       0.84      0.81      0.82        26

avg / total       0.81      0.81      0.81        47



In [71]:
scores = cross_val_score(ensemble.RandomForestClassifier(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.70 (+/- 0.20)


In [72]:
accuracy, report, classifier_en = train_model(ensemble.RandomForestClassifier(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_RF: ',accuracy)
print(report)

en_RF:  0.723404255319
             precision    recall  f1-score   support

        0.0       0.64      0.80      0.71        20
        1.0       0.82      0.67      0.73        27

avg / total       0.74      0.72      0.72        47



In [73]:
scores = cross_val_score(ensemble.RandomForestClassifier(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.69 (+/- 0.16)


In [74]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
35
[0, 5, 12, 22, 24, 25, 46, 57, 71, 80, 92, 99, 103, 105, 110, 111, 112, 116, 121, 131, 135, 138, 139, 143, 169, 181, 184, 186, 187, 192, 193, 199, 211, 214, 227]


### AdaBoost

In [75]:
accuracy, report, classfier_ch = train_model(ensemble.AdaBoostClassifier(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_AdaBoost: ',accuracy)
print(report)

ch_AdaBoost:  0.787234042553
             precision    recall  f1-score   support

        0.0       0.74      0.81      0.77        21
        1.0       0.83      0.77      0.80        26

avg / total       0.79      0.79      0.79        47



In [76]:
scores = cross_val_score(ensemble.AdaBoostClassifier(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.69 (+/- 0.13)


In [77]:
accuracy, report, classifier_en = train_model(ensemble.AdaBoostClassifier(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_AdaBoost: ',accuracy)
print(report)

en_AdaBoost:  0.787234042553
             precision    recall  f1-score   support

        0.0       0.81      0.65      0.72        20
        1.0       0.77      0.89      0.83        27

avg / total       0.79      0.79      0.78        47



In [78]:
scores = cross_val_score(ensemble.AdaBoostClassifier(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.70 (+/- 0.12)


In [79]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
34
[5, 22, 25, 28, 30, 36, 39, 46, 57, 59, 71, 83, 92, 99, 105, 111, 116, 117, 131, 133, 138, 139, 165, 169, 184, 186, 187, 192, 193, 195, 199, 211, 214, 227]


### KNN

In [80]:
accuracy, report, classifier_ch = train_model(neighbors.KNeighborsClassifier(n_neighbors=7), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_knn: ',accuracy)
print(report)

ch_knn:  0.595744680851
             precision    recall  f1-score   support

        0.0       0.53      0.76      0.63        21
        1.0       0.71      0.46      0.56        26

avg / total       0.63      0.60      0.59        47



In [81]:
scores = cross_val_score(neighbors.KNeighborsClassifier(n_neighbors=3), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.65 (+/- 0.12)


In [82]:
accuracy, report, classifier_en = train_model(neighbors.KNeighborsClassifier(n_neighbors=7), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_knn: ',accuracy)
print(report)

en_knn:  0.63829787234
             precision    recall  f1-score   support

        0.0       0.55      0.85      0.67        20
        1.0       0.81      0.48      0.60        27

avg / total       0.70      0.64      0.63        47



In [83]:
scores = cross_val_score(neighbors.KNeighborsClassifier(n_neighbors=3), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.60 (+/- 0.28)


In [84]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
18
[19, 46, 48, 57, 66, 73, 75, 85, 131, 137, 139, 164, 193, 206, 213, 214, 229, 233]


### XGBoost

In [85]:
accuracy, report, classifier_ch = train_model(xgboost.XGBClassifier(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_XGB: ',accuracy)
print(report)

ch_XGB:  0.765957446809
             precision    recall  f1-score   support

        0.0       0.75      0.71      0.73        21
        1.0       0.78      0.81      0.79        26

avg / total       0.77      0.77      0.77        47



In [86]:
scores = cross_val_score(xgboost.XGBClassifier(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.77 (+/- 0.16)


In [87]:
accuracy, report, classifier_en = train_model(xgboost.XGBClassifier(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_XGB: ',accuracy)
print(report)

en_XGB:  0.787234042553
             precision    recall  f1-score   support

        0.0       0.73      0.80      0.76        20
        1.0       0.84      0.78      0.81        27

avg / total       0.79      0.79      0.79        47



In [88]:
scores = cross_val_score(xgboost.XGBClassifier(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.69 (+/- 0.15)


In [89]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
35
[0, 3, 22, 24, 25, 28, 30, 39, 46, 57, 59, 62, 71, 77, 92, 99, 115, 116, 131, 138, 139, 141, 152, 162, 164, 169, 186, 187, 192, 193, 199, 203, 205, 214, 227]


### Decision Tree

In [90]:
accuracy, report, classifier_ch = train_model(tree.DecisionTreeClassifier(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_DT: ',accuracy)
print(report)

ch_DT:  0.765957446809
             precision    recall  f1-score   support

        0.0       0.71      0.81      0.76        21
        1.0       0.83      0.73      0.78        26

avg / total       0.77      0.77      0.77        47



In [91]:
scores = cross_val_score(tree.DecisionTreeClassifier(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.66 (+/- 0.27)


In [92]:
accuracy, report, classifier_en = train_model(tree.DecisionTreeClassifier(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_DT: ',accuracy)
print(report)

en_DT:  0.68085106383
             precision    recall  f1-score   support

        0.0       0.65      0.55      0.59        20
        1.0       0.70      0.78      0.74        27

avg / total       0.68      0.68      0.68        47



In [93]:
scores = cross_val_score(tree.DecisionTreeClassifier(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.65 (+/- 0.18)


In [94]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
35
[12, 20, 22, 24, 25, 34, 39, 46, 57, 71, 92, 99, 105, 110, 111, 112, 116, 127, 131, 133, 138, 139, 148, 169, 186, 187, 192, 193, 199, 203, 205, 211, 214, 227, 234]


### MLP

In [95]:
accuracy, report, classifier_ch = train_model(neural_network.MLPClassifier(max_iter=100), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_MLP: ',accuracy)
print(report)

ch_MLP:  0.659574468085
             precision    recall  f1-score   support

        0.0       0.61      0.67      0.64        21
        1.0       0.71      0.65      0.68        26

avg / total       0.66      0.66      0.66        47





In [96]:
scores = cross_val_score(neural_network.MLPClassifier(max_iter=100), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



K-fold Accuracy: 0.71 (+/- 0.15)




In [97]:
accuracy, report, classifier_en = train_model(neural_network.MLPClassifier(max_iter=100), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_MLP: ',accuracy)
print(report)

en_MLP:  0.765957446809
             precision    recall  f1-score   support

        0.0       0.71      0.75      0.73        20
        1.0       0.81      0.78      0.79        27

avg / total       0.77      0.77      0.77        47



In [98]:
scores = cross_val_score(neural_network.MLPClassifier(max_iter=100), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



K-fold Accuracy: 0.69 (+/- 0.08)




In [99]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
32
[0, 12, 22, 25, 28, 39, 46, 57, 59, 71, 83, 92, 99, 112, 116, 121, 127, 131, 135, 138, 139, 169, 186, 187, 192, 193, 199, 205, 211, 214, 221, 227]
