### Import Package

In [4]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble, neighbors, tree, neural_network
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# conda install py-xgboost
import xgboost
import string
import os
import pandas as pd 
import numpy as np
import pickle
import data_reader


### Load Data

In [5]:
data = data_reader.read()
data.shape

(600, 176)

In [8]:
print(data)
# print(data[:10,-2:])

[[ 0.          0.          0.         ...,  0.         -0.05805464
  -0.52113092]
 [ 0.          0.          0.         ...,  0.          0.32652215
  -0.35021785]
 [ 0.          0.          0.         ...,  0.         -0.46058144
  -1.51423159]
 ..., 
 [ 0.          0.          0.         ...,  0.         -0.35532401
   0.18141579]
 [ 0.          0.          0.         ...,  1.          0.34444384
   0.14184077]
 [ 0.          1.          0.         ...,  0.          0.24339635
   0.35543395]]


### Preprocessing

In [9]:
data[:,-2:] = np.where(data[:,-2:] > 0.0, 1, 0)

In [10]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(data[:,:-2], data[:,-2:], test_size=0.2)

In [11]:
test_x = data[:,:-2]

### Model Training

In [12]:
def train_model(classifier, train_x, train_y, valid_x, valid_y):
    # fit the training dataset on the classifier
    classifier.fit(train_x, train_y)   
    # predict the labels on validation dataset
    predictions = classifier.predict(valid_x)
    
    return metrics.accuracy_score(predictions, valid_y), metrics.classification_report(valid_y, predictions), classifier

In [13]:
def test_models(classifier_ch, classifier_en, test_x):
    diff = []
    pred_ch = classifier_ch.predict(test_x)
    pred_en = classifier_en.predict(test_x)
    for idx in range(len(test_x)):
        if pred_ch[idx] != pred_en[idx]:
            diff.append(idx)
    return diff

### Logistic Regression

In [14]:
accuracy, report, classifier_ch = train_model(linear_model.LogisticRegression(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_glm: ',accuracy)
print(report)

ch_glm:  0.683333333333
             precision    recall  f1-score   support

        0.0       0.69      0.64      0.66        58
        1.0       0.68      0.73      0.70        62

avg / total       0.68      0.68      0.68       120



In [15]:
scores = cross_val_score(linear_model.LogisticRegression(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.63 (+/- 0.11)


In [16]:
accuracy, report, classifier_en = train_model(linear_model.LogisticRegression(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_glm: ',accuracy)
print(report)

en_glm:  0.65
             precision    recall  f1-score   support

        0.0       0.65      0.55      0.60        56
        1.0       0.65      0.73      0.69        64

avg / total       0.65      0.65      0.65       120



In [17]:
scores = cross_val_score(linear_model.LogisticRegression(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.66 (+/- 0.10)


In [18]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
187
[0, 4, 11, 12, 16, 20, 22, 24, 27, 29, 32, 35, 40, 41, 46, 48, 50, 54, 56, 59, 60, 61, 72, 73, 76, 82, 84, 88, 91, 98, 101, 107, 108, 109, 110, 111, 115, 122, 123, 133, 135, 136, 139, 142, 144, 146, 147, 148, 149, 153, 156, 160, 167, 168, 169, 170, 175, 177, 184, 186, 187, 189, 195, 196, 199, 200, 202, 211, 213, 216, 218, 219, 234, 237, 244, 248, 253, 254, 257, 272, 277, 278, 279, 280, 282, 284, 289, 295, 298, 299, 311, 312, 313, 321, 323, 324, 325, 326, 327, 328, 334, 337, 342, 344, 357, 365, 366, 371, 372, 374, 375, 376, 382, 383, 393, 398, 399, 400, 403, 404, 406, 407, 408, 413, 414, 421, 428, 431, 433, 435, 436, 439, 457, 459, 460, 461, 463, 464, 467, 468, 470, 472, 473, 476, 477, 478, 480, 484, 486, 487, 488, 492, 496, 497, 503, 505, 508, 510, 516, 520, 523, 524, 526, 532, 533, 534, 538, 542, 550, 554, 555, 558, 566, 571, 573, 575, 576, 577, 578, 581, 582, 583, 584, 585, 590, 591, 595]


### SVM (C=0.2, kernel=linear)

In [19]:
accuracy, report, classifier_ch= train_model(svm.SVC(C=0.2, kernel='linear'), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_svm: ',accuracy)
print(report)

ch_svm:  0.7
             precision    recall  f1-score   support

        0.0       0.71      0.64      0.67        58
        1.0       0.69      0.76      0.72        62

avg / total       0.70      0.70      0.70       120



In [20]:
scores = cross_val_score(svm.SVC(C=0.2, kernel='linear'), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.65 (+/- 0.11)


In [21]:
accuracy, report, classifier_en = train_model(svm.SVC(C=0.2, kernel='linear'), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_svm: ',accuracy)
print(report)

en_svm:  0.683333333333
             precision    recall  f1-score   support

        0.0       0.68      0.61      0.64        56
        1.0       0.69      0.75      0.72        64

avg / total       0.68      0.68      0.68       120



In [22]:
scores = cross_val_score(svm.SVC(C=0.2, kernel='linear'), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.67 (+/- 0.07)


In [23]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
183
[0, 4, 11, 12, 20, 22, 27, 29, 32, 34, 35, 40, 44, 46, 48, 50, 56, 57, 58, 59, 60, 61, 71, 72, 73, 76, 82, 84, 88, 91, 98, 101, 107, 109, 110, 111, 122, 123, 133, 135, 136, 139, 140, 142, 144, 145, 146, 148, 149, 153, 156, 164, 167, 168, 169, 170, 175, 177, 178, 184, 186, 189, 193, 195, 196, 199, 200, 202, 211, 216, 218, 244, 248, 251, 253, 254, 257, 272, 278, 279, 280, 282, 284, 289, 291, 298, 300, 311, 312, 313, 319, 321, 323, 325, 326, 327, 328, 329, 334, 335, 337, 342, 357, 365, 366, 371, 375, 376, 382, 383, 385, 391, 398, 399, 400, 401, 402, 403, 404, 406, 408, 409, 413, 414, 421, 428, 433, 435, 439, 440, 456, 457, 459, 460, 461, 463, 464, 467, 468, 470, 472, 473, 476, 480, 484, 486, 488, 492, 497, 499, 503, 508, 516, 518, 520, 522, 524, 526, 529, 532, 533, 534, 538, 542, 550, 554, 555, 556, 558, 566, 567, 571, 573, 575, 576, 577, 578, 582, 583, 584, 585, 590, 595]


### SVM(C=100, kernel=linear)

In [24]:
accuracy, report, classifier_ch = train_model(svm.SVC(C=100, kernel='linear'), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_svm: ',accuracy)
print(report)

ch_svm:  0.608333333333
             precision    recall  f1-score   support

        0.0       0.59      0.62      0.61        58
        1.0       0.63      0.60      0.61        62

avg / total       0.61      0.61      0.61       120



In [25]:
scores = cross_val_score(svm.SVC(C=100, kernel='linear'), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.62 (+/- 0.12)


In [26]:
accuracy, report, classifier_en = train_model(svm.SVC(C=100, kernel='linear'), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_svm: ',accuracy)
print(report)

en_svm:  0.616666666667
             precision    recall  f1-score   support

        0.0       0.60      0.55      0.57        56
        1.0       0.63      0.67      0.65        64

avg / total       0.62      0.62      0.62       120



In [27]:
scores = cross_val_score(svm.SVC(C=100, kernel='linear'), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.64 (+/- 0.09)


In [28]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
216
[0, 1, 4, 11, 12, 17, 18, 20, 22, 29, 32, 34, 35, 40, 41, 45, 46, 48, 50, 51, 53, 54, 56, 59, 61, 70, 71, 73, 74, 76, 82, 84, 88, 91, 92, 98, 101, 102, 107, 109, 110, 111, 115, 122, 123, 129, 133, 135, 136, 139, 140, 142, 144, 145, 146, 147, 148, 151, 152, 153, 159, 160, 167, 168, 169, 170, 173, 175, 177, 180, 182, 184, 187, 189, 194, 195, 196, 199, 200, 202, 203, 207, 211, 216, 218, 219, 228, 234, 235, 240, 241, 243, 244, 246, 248, 253, 254, 265, 272, 277, 279, 282, 291, 295, 296, 298, 304, 308, 311, 313, 314, 315, 317, 318, 321, 323, 325, 326, 327, 328, 334, 337, 339, 340, 342, 346, 354, 356, 357, 358, 365, 366, 367, 371, 372, 375, 376, 382, 383, 393, 395, 398, 399, 400, 401, 404, 406, 407, 409, 411, 413, 414, 416, 417, 425, 429, 431, 434, 435, 439, 442, 443, 447, 451, 459, 460, 463, 464, 468, 470, 472, 473, 476, 477, 479, 480, 482, 485, 487, 488, 492, 496, 497, 498, 503, 505, 507, 508, 509, 510, 520, 522, 524, 526, 529, 532, 535, 536, 538, 540,

### Naive Bayes (GaussianNB)

In [24]:
accuracy, report, classifier_ch = train_model(naive_bayes.GaussianNB(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_NB: ',accuracy)
print(report)

ch_NB:  0.666666666667
             precision    recall  f1-score   support

        0.0       0.64      0.79      0.71        61
        1.0       0.71      0.54      0.62        59

avg / total       0.67      0.67      0.66       120



In [25]:
scores = cross_val_score(naive_bayes.GaussianNB(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.64 (+/- 0.08)


In [26]:
accuracy, report, classifier_en = train_model(naive_bayes.GaussianNB(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_NB: ',accuracy)
print(report)

en_NB:  0.558333333333
             precision    recall  f1-score   support

        0.0       0.58      0.42      0.49        60
        1.0       0.55      0.70      0.61        60

avg / total       0.56      0.56      0.55       120



In [27]:
scores = cross_val_score(naive_bayes.GaussianNB(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.67 (+/- 0.11)


In [28]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
183
[0, 4, 7, 11, 12, 20, 22, 24, 27, 37, 40, 41, 48, 51, 54, 55, 57, 59, 60, 67, 76, 81, 84, 87, 88, 98, 101, 103, 107, 109, 112, 118, 120, 121, 122, 128, 132, 133, 135, 137, 139, 140, 142, 143, 144, 147, 153, 158, 160, 163, 165, 167, 170, 171, 178, 180, 184, 188, 190, 192, 196, 197, 198, 200, 203, 205, 206, 213, 215, 216, 219, 228, 231, 234, 238, 245, 246, 251, 252, 253, 254, 256, 257, 262, 271, 279, 280, 281, 282, 283, 286, 287, 288, 289, 292, 293, 294, 295, 298, 299, 311, 314, 318, 319, 321, 323, 326, 327, 328, 334, 335, 340, 341, 344, 347, 351, 354, 361, 364, 365, 371, 376, 382, 383, 385, 388, 390, 392, 393, 398, 401, 403, 404, 406, 409, 410, 414, 417, 428, 433, 436, 438, 439, 443, 460, 463, 464, 466, 467, 468, 471, 476, 487, 497, 500, 503, 505, 508, 510, 516, 518, 519, 520, 524, 525, 532, 533, 534, 539, 541, 546, 547, 548, 558, 561, 562, 578, 581, 582, 584, 590, 594, 595]


### Naive Bayes (Multinomial)

In [29]:
accuracy, report, classifier_ch = train_model(naive_bayes.MultinomialNB(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_NB: ',accuracy)
print(report)

ch_NB:  0.683333333333
             precision    recall  f1-score   support

        0.0       0.66      0.79      0.72        61
        1.0       0.72      0.58      0.64        59

avg / total       0.69      0.68      0.68       120



In [30]:
scores = cross_val_score(naive_bayes.MultinomialNB(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.65 (+/- 0.10)


In [31]:
accuracy, report, classifier_en = train_model(naive_bayes.MultinomialNB(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_NB: ',accuracy)
print(report)

en_NB:  0.6
             precision    recall  f1-score   support

        0.0       0.59      0.65      0.62        60
        1.0       0.61      0.55      0.58        60

avg / total       0.60      0.60      0.60       120



In [32]:
scores = cross_val_score(naive_bayes.MultinomialNB(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.67 (+/- 0.07)


In [33]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
155
[0, 4, 7, 11, 12, 16, 20, 22, 24, 27, 32, 37, 46, 55, 59, 62, 65, 66, 71, 76, 88, 93, 109, 111, 114, 122, 125, 129, 132, 133, 136, 139, 140, 142, 144, 146, 148, 149, 153, 156, 158, 160, 163, 165, 167, 171, 173, 175, 179, 184, 188, 195, 196, 197, 200, 202, 205, 209, 215, 216, 222, 234, 237, 246, 250, 252, 254, 256, 257, 262, 273, 279, 280, 282, 283, 292, 293, 303, 307, 311, 319, 321, 323, 326, 327, 328, 329, 334, 335, 342, 345, 364, 365, 369, 371, 372, 379, 382, 383, 385, 390, 398, 399, 403, 410, 413, 414, 436, 438, 440, 442, 443, 451, 460, 462, 463, 464, 468, 471, 474, 480, 486, 487, 493, 499, 508, 516, 519, 520, 524, 525, 526, 529, 532, 533, 534, 538, 542, 543, 545, 548, 553, 557, 558, 562, 566, 567, 569, 575, 581, 582, 584, 590, 591, 595]


### RandomForest

In [34]:
accuracy, report, classifier_ch = train_model(ensemble.RandomForestClassifier(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_RF: ',accuracy)
print(report)

ch_RF:  0.558333333333
             precision    recall  f1-score   support

        0.0       0.55      0.69      0.61        61
        1.0       0.57      0.42      0.49        59

avg / total       0.56      0.56      0.55       120



In [35]:
scores = cross_val_score(ensemble.RandomForestClassifier(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.61 (+/- 0.16)


In [36]:
accuracy, report, classifier_en = train_model(ensemble.RandomForestClassifier(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_RF: ',accuracy)
print(report)

en_RF:  0.65
             precision    recall  f1-score   support

        0.0       0.63      0.72      0.67        60
        1.0       0.67      0.58      0.63        60

avg / total       0.65      0.65      0.65       120



In [37]:
scores = cross_val_score(ensemble.RandomForestClassifier(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.59 (+/- 0.09)


In [38]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
197
[0, 1, 11, 17, 20, 27, 34, 38, 41, 46, 48, 50, 51, 54, 55, 56, 58, 59, 60, 65, 70, 71, 74, 75, 82, 83, 84, 88, 90, 93, 98, 103, 107, 109, 110, 111, 115, 122, 123, 124, 126, 133, 135, 139, 140, 144, 146, 147, 151, 152, 153, 167, 169, 173, 180, 184, 187, 194, 195, 196, 202, 205, 206, 207, 213, 214, 215, 216, 217, 218, 220, 221, 227, 228, 234, 235, 240, 245, 246, 248, 254, 255, 257, 263, 264, 265, 267, 272, 275, 278, 282, 291, 311, 313, 314, 315, 321, 323, 327, 328, 330, 333, 339, 340, 342, 346, 347, 349, 354, 364, 365, 371, 376, 392, 393, 395, 396, 398, 399, 400, 401, 402, 404, 406, 407, 408, 409, 411, 413, 414, 416, 420, 421, 427, 429, 433, 434, 435, 436, 438, 456, 459, 460, 463, 464, 467, 471, 474, 476, 479, 480, 483, 486, 488, 490, 491, 492, 493, 497, 503, 505, 507, 508, 509, 512, 516, 519, 522, 524, 529, 532, 536, 539, 545, 546, 547, 548, 551, 552, 554, 555, 558, 560, 562, 565, 567, 571, 573, 575, 577, 578, 582, 583, 585, 594, 595, 597]


### AdaBoost

In [39]:
accuracy, report, classfier_ch = train_model(ensemble.AdaBoostClassifier(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_AdaBoost: ',accuracy)
print(report)

ch_AdaBoost:  0.641666666667
             precision    recall  f1-score   support

        0.0       0.64      0.69      0.66        61
        1.0       0.65      0.59      0.62        59

avg / total       0.64      0.64      0.64       120



In [40]:
scores = cross_val_score(ensemble.AdaBoostClassifier(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.65 (+/- 0.14)


In [41]:
accuracy, report, classifier_en = train_model(ensemble.AdaBoostClassifier(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_AdaBoost: ',accuracy)
print(report)

en_AdaBoost:  0.625
             precision    recall  f1-score   support

        0.0       0.63      0.62      0.62        60
        1.0       0.62      0.63      0.63        60

avg / total       0.63      0.62      0.62       120



In [42]:
scores = cross_val_score(ensemble.AdaBoostClassifier(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.64 (+/- 0.11)


In [43]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
211
[0, 1, 6, 11, 12, 17, 19, 22, 29, 41, 50, 52, 54, 56, 59, 60, 62, 71, 74, 75, 76, 77, 80, 81, 82, 83, 84, 88, 90, 98, 107, 109, 111, 112, 115, 118, 122, 123, 124, 125, 132, 133, 136, 137, 139, 140, 144, 147, 148, 151, 153, 156, 160, 163, 166, 167, 175, 178, 179, 180, 184, 187, 188, 192, 194, 195, 196, 197, 202, 205, 206, 207, 209, 212, 213, 214, 216, 217, 218, 219, 220, 227, 228, 231, 234, 235, 243, 245, 246, 248, 250, 253, 255, 256, 257, 262, 263, 264, 265, 272, 278, 282, 283, 287, 292, 294, 299, 300, 311, 313, 314, 315, 319, 321, 323, 325, 327, 328, 329, 333, 339, 340, 342, 344, 346, 347, 354, 358, 362, 364, 365, 366, 369, 371, 372, 376, 378, 385, 390, 393, 394, 395, 396, 398, 399, 400, 401, 403, 404, 407, 410, 411, 414, 429, 430, 431, 433, 436, 438, 440, 442, 443, 449, 456, 459, 463, 464, 467, 471, 472, 473, 474, 476, 479, 480, 483, 484, 487, 490, 491, 493, 497, 498, 500, 503, 508, 509, 516, 522, 524, 531, 532, 536, 542, 545, 546, 548, 550, 552

### KNN

In [44]:
accuracy, report, classifier_ch = train_model(neighbors.KNeighborsClassifier(n_neighbors=7), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_knn: ',accuracy)
print(report)

ch_knn:  0.6
             precision    recall  f1-score   support

        0.0       0.60      0.64      0.62        61
        1.0       0.60      0.56      0.58        59

avg / total       0.60      0.60      0.60       120



In [45]:
scores = cross_val_score(neighbors.KNeighborsClassifier(n_neighbors=3), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.58 (+/- 0.17)


In [46]:
accuracy, report, classifier_en = train_model(neighbors.KNeighborsClassifier(n_neighbors=7), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_knn: ',accuracy)
print(report)

en_knn:  0.608333333333
             precision    recall  f1-score   support

        0.0       0.65      0.47      0.54        60
        1.0       0.58      0.75      0.66        60

avg / total       0.62      0.61      0.60       120



In [47]:
scores = cross_val_score(neighbors.KNeighborsClassifier(n_neighbors=3), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.56 (+/- 0.11)


In [48]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
184
[1, 11, 13, 17, 18, 22, 26, 30, 37, 46, 48, 52, 53, 54, 56, 59, 60, 67, 76, 82, 83, 84, 85, 88, 90, 101, 103, 106, 107, 108, 111, 114, 116, 118, 123, 124, 126, 129, 131, 133, 135, 136, 137, 139, 148, 153, 157, 159, 160, 171, 177, 180, 181, 184, 191, 195, 196, 197, 198, 201, 202, 204, 205, 206, 207, 212, 215, 218, 219, 224, 228, 230, 231, 232, 235, 244, 246, 247, 250, 252, 255, 262, 265, 269, 270, 271, 272, 275, 278, 279, 282, 285, 286, 288, 291, 294, 295, 298, 299, 304, 311, 318, 319, 329, 330, 332, 334, 337, 338, 354, 361, 364, 365, 377, 378, 380, 385, 386, 392, 393, 394, 398, 399, 400, 402, 403, 408, 409, 410, 417, 424, 425, 433, 440, 448, 449, 453, 460, 469, 472, 476, 477, 478, 480, 484, 486, 487, 490, 492, 496, 506, 507, 508, 510, 511, 516, 518, 524, 525, 530, 532, 536, 537, 539, 540, 541, 543, 544, 548, 553, 554, 556, 557, 566, 569, 574, 577, 582, 586, 589, 593, 595, 598, 599]


### XGBoost

In [49]:
accuracy, report, classifier_ch = train_model(xgboost.XGBClassifier(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_XGB: ',accuracy)
print(report)

ch_XGB:  0.608333333333
             precision    recall  f1-score   support

        0.0       0.60      0.67      0.64        61
        1.0       0.62      0.54      0.58        59

avg / total       0.61      0.61      0.61       120



In [50]:
scores = cross_val_score(xgboost.XGBClassifier(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.62 (+/- 0.10)


In [51]:
accuracy, report, classifier_en = train_model(xgboost.XGBClassifier(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_XGB: ',accuracy)
print(report)

en_XGB:  0.625
             precision    recall  f1-score   support

        0.0       0.65      0.53      0.59        60
        1.0       0.61      0.72      0.66        60

avg / total       0.63      0.62      0.62       120



In [52]:
scores = cross_val_score(xgboost.XGBClassifier(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.64 (+/- 0.10)


In [53]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
190
[1, 4, 11, 17, 20, 22, 27, 34, 41, 48, 50, 54, 56, 59, 60, 65, 71, 76, 81, 82, 83, 84, 88, 93, 98, 101, 103, 106, 107, 109, 111, 121, 122, 123, 124, 125, 129, 132, 133, 135, 136, 139, 140, 144, 146, 147, 148, 151, 153, 166, 167, 170, 171, 175, 184, 187, 188, 194, 196, 197, 198, 209, 212, 213, 215, 216, 217, 218, 219, 231, 234, 235, 237, 240, 246, 248, 253, 254, 256, 262, 265, 272, 279, 282, 283, 289, 292, 300, 308, 311, 313, 314, 318, 321, 323, 325, 327, 328, 330, 337, 341, 342, 344, 347, 354, 358, 364, 365, 371, 372, 376, 382, 385, 388, 390, 393, 394, 398, 399, 400, 402, 403, 404, 407, 410, 411, 414, 416, 424, 429, 430, 433, 435, 436, 438, 440, 442, 449, 451, 456, 459, 460, 463, 464, 467, 468, 471, 472, 474, 476, 482, 484, 488, 497, 499, 500, 503, 508, 509, 512, 516, 518, 519, 520, 522, 524, 526, 528, 529, 531, 532, 536, 541, 543, 545, 555, 558, 560, 562, 565, 567, 573, 575, 577, 578, 582, 590, 591, 595, 597]


### Decision Tree

In [54]:
accuracy, report, classifier_ch = train_model(tree.DecisionTreeClassifier(), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_DT: ',accuracy)
print(report)

ch_DT:  0.641666666667
             precision    recall  f1-score   support

        0.0       0.64      0.69      0.66        61
        1.0       0.65      0.59      0.62        59

avg / total       0.64      0.64      0.64       120



In [55]:
scores = cross_val_score(tree.DecisionTreeClassifier(), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.56 (+/- 0.12)


In [56]:
accuracy, report, classifier_en = train_model(tree.DecisionTreeClassifier(), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_DT: ',accuracy)
print(report)

en_DT:  0.625
             precision    recall  f1-score   support

        0.0       0.62      0.63      0.63        60
        1.0       0.63      0.62      0.62        60

avg / total       0.63      0.62      0.62       120



In [57]:
scores = cross_val_score(tree.DecisionTreeClassifier(), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

K-fold Accuracy: 0.57 (+/- 0.11)


In [58]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
187
[1, 2, 4, 10, 11, 17, 20, 22, 27, 34, 48, 50, 51, 54, 56, 58, 59, 60, 62, 70, 71, 76, 82, 83, 84, 88, 90, 93, 98, 103, 107, 109, 111, 115, 118, 122, 123, 124, 126, 133, 135, 139, 140, 144, 146, 147, 151, 152, 153, 167, 169, 173, 175, 180, 184, 187, 189, 194, 196, 205, 206, 207, 213, 216, 217, 218, 219, 220, 227, 228, 234, 235, 240, 246, 248, 253, 254, 256, 257, 260, 263, 265, 267, 272, 275, 282, 291, 307, 308, 309, 311, 313, 314, 315, 319, 321, 323, 327, 328, 330, 337, 340, 342, 346, 347, 354, 365, 371, 372, 376, 385, 388, 392, 393, 395, 396, 398, 399, 400, 402, 404, 406, 407, 408, 409, 411, 413, 414, 416, 420, 429, 433, 434, 435, 436, 438, 449, 456, 459, 460, 463, 464, 471, 474, 476, 480, 482, 483, 486, 488, 490, 491, 492, 497, 499, 503, 508, 509, 512, 519, 522, 524, 532, 536, 539, 546, 547, 551, 554, 555, 558, 560, 561, 562, 565, 567, 571, 573, 575, 576, 578, 583, 584, 585, 591, 595, 597]


### MLP

In [59]:
accuracy, report, classifier_ch = train_model(neural_network.MLPClassifier(max_iter=100), train_x, train_y[:,0], valid_x, valid_y[:,0])
print('ch_MLP: ',accuracy)
print(report)

ch_MLP:  0.633333333333
             precision    recall  f1-score   support

        0.0       0.63      0.69      0.66        61
        1.0       0.64      0.58      0.61        59

avg / total       0.63      0.63      0.63       120





In [60]:
scores = cross_val_score(neural_network.MLPClassifier(max_iter=100), data[:,:-2], data[:,-2], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



K-fold Accuracy: 0.65 (+/- 0.11)




In [61]:
accuracy, report, classifier_en = train_model(neural_network.MLPClassifier(max_iter=100), train_x, train_y[:,1], valid_x, valid_y[:,1])
print('en_MLP: ',accuracy)
print(report)

en_MLP:  0.641666666667
             precision    recall  f1-score   support

        0.0       0.65      0.60      0.63        60
        1.0       0.63      0.68      0.66        60

avg / total       0.64      0.64      0.64       120



In [62]:
scores = cross_val_score(neural_network.MLPClassifier(max_iter=100), data[:,:-2], data[:,-1], cv=10)
print("K-fold Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



K-fold Accuracy: 0.65 (+/- 0.05)




In [63]:
diff_list = test_models(classifier_ch, classifier_en, test_x)
print("Different idx between two models: ")
print(len(diff_list))
print(diff_list)

Different idx between two models: 
182
[4, 11, 16, 17, 19, 20, 22, 27, 34, 48, 50, 51, 54, 56, 58, 59, 60, 70, 71, 76, 82, 83, 84, 88, 90, 93, 98, 107, 110, 111, 115, 122, 123, 124, 133, 135, 139, 140, 144, 146, 147, 151, 152, 153, 167, 169, 170, 173, 175, 184, 187, 194, 195, 196, 197, 205, 206, 207, 209, 212, 213, 215, 216, 218, 219, 220, 227, 228, 234, 235, 240, 245, 246, 248, 253, 254, 256, 257, 260, 263, 265, 272, 275, 282, 291, 307, 311, 313, 314, 319, 321, 323, 326, 327, 328, 330, 337, 340, 342, 346, 347, 354, 365, 371, 372, 376, 385, 392, 393, 395, 396, 398, 399, 400, 402, 404, 406, 407, 408, 409, 411, 413, 414, 416, 420, 427, 429, 433, 434, 435, 436, 438, 456, 459, 460, 463, 464, 471, 474, 476, 480, 482, 490, 491, 492, 497, 499, 503, 508, 509, 512, 516, 519, 522, 524, 526, 532, 534, 536, 546, 547, 548, 551, 554, 555, 556, 558, 560, 562, 565, 567, 571, 575, 576, 577, 578, 582, 584, 585, 591, 595, 597]
