In [8]:
import numpy as np
import warnings
import pickle
import os
from urllib.request import urlopen
import random as rand
%matplotlib inline
import matplotlib.pyplot as plt
import time

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, precision_recall_curve
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, multilabel_confusion_matrix, classification_report
from sklearn.metrics import label_ranking_average_precision_score, coverage_error, label_ranking_loss

from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_predict, cross_val_score

from sklearn import svm as svm, neighbors as nbrs, tree as tree, gaussian_process as GPC, naive_bayes as NB

from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier

from sklearn.neural_network import MLPClassifier

dir = 'C:/ ... /'
os.chdir(dir)

url_main = 'file:///C:/ ... /'
data_file_name = "2_states.pkl"
label_file_name = "2_labels.pkl"

#DATA
data = pickle.load(urlopen(url_main + data_file_name))
data = np.unpackbits(data).reshape(-1, 2500)
data = data.astype('int')
#data[np.where(data==0)] = -1

#LABELS
labels = pickle.load(urlopen(url_main + label_file_name))
labels = np.unpackbits(labels).reshape(40000, -1)

X = data
Y = labels

print('X shape:', X.shape)
print('Y shape:', Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.8, test_size = 0.2)

print('X_train shape:', X_train.shape)
print('Y_train shape:', Y_train.shape)

#sorted(sklearn.metrics.SCORERS.keys()) UNCOMMENT TO PRINT ALL SCORERS

X shape: (40000, 2500)
Y shape: (40000, 1)
X_train shape: (32000, 2500)
Y_train shape: (32000, 1)


In [19]:
clf1 = LogisticRegression(solver='lbfgs', random_state=42, verbose=1, max_iter=1000)

clf2 = svm.LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=1.0, 
                    multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, 
                    verbose=1, random_state=42, max_iter=1000)

clf3 = SGDClassifier(alpha=0.01, shuffle=True, early_stopping=True, 
                    learning_rate='adaptive', loss='hinge', eta0=0.001,
                    penalty='l2', random_state=42)

clf4 = NB.MultinomialNB(alpha=1.5, fit_prior=True)

clf5 = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=42,
                                  min_samples_split=50, min_samples_leaf=1)

clf6 = RandomForestClassifier(oob_score=True, n_estimators=400, n_jobs=-1,random_state=42)

clf7 = ExtraTreesClassifier(oob_score=True, n_estimators=400, n_jobs=-1, bootstrap=True,
                            random_state=42)

clf8 = BaggingClassifier(RandomForestClassifier(n_estimators=400),
                         max_samples=0.5, max_features=0.5, n_jobs=-1,random_state=42)

clf95 = GradientBoostingClassifier(random_state=42)

clf9 = HistGradientBoostingClassifier(random_state=42)


clf10 = MLPClassifier(solver='lbfgs', hidden_layer_sizes = (5, 1000), alpha = 1e-08,
                      warm_start=True, verbose=1, random_state=42)


def benchmark(clf, CVV_n):
    
    print("\n", clf, "\n")
    
    y_train_pred = cross_val_predict(clf, X_train, Y_train.ravel(), cv=2, n_jobs=-1)

    print(classification_report(Y_train, y_train_pred), '\n', 
          confusion_matrix(Y_train, y_train_pred), '\n',
          '\n Precision:', "{0:.4f}".format(precision_score(Y_train, y_train_pred)),
          '\n Recall:   ', "{0:.4f}".format(recall_score(Y_train, y_train_pred)),
          '\n Accuracy: ', "{0:.4f}".format(accuracy_score(Y_train, y_train_pred)),
          '\n F1 score: ', "{0:.4f}".format(f1_score(Y_train, y_train_pred)),
          '\n ROC/AUC:  ', "{0:.4f}".format(roc_auc_score(Y_train, y_train_pred)),
          '\n',)

    t1=time.time()

    CVV=cross_validate(clf, X_train, Y_train.ravel(), cv=CVV_n, return_train_score=True, n_jobs=-1,
                       scoring = ['accuracy',
                                  'balanced_accuracy',
                                  'recall',
                                  'precision',
                                  'average_precision',
                                  'roc_auc',
                                  'f1'])

    t2=time.time()

    for cvv_keys in sorted(CVV.keys()):
        print(cvv_keys, "{0:.4f}".format(CVV[str(cvv_keys)].mean()),
              "{0:.4f}".format(CVV[str(cvv_keys)].std()))

    print()
    print("{0:.2f}".format(t2-t1)+'s')
    print()

In [20]:
benchmark(clf4, 5) # test classifier, fastest


 MultinomialNB(alpha=1.5, class_prior=None, fit_prior=True) 

              precision    recall  f1-score   support

           0       0.62      0.83      0.71     18140
           1       0.59      0.33      0.42     13860

    accuracy                           0.61     32000
   macro avg       0.61      0.58      0.57     32000
weighted avg       0.61      0.61      0.58     32000
 
 [[15024  3116]
 [ 9292  4568]] 
 
 Precision: 0.5945 
 Recall:    0.3296 
 Accuracy:  0.6122 
 F1 score:  0.4241 
 ROC/AUC:   0.5789 

fit_time 0.8430 0.0460
score_time 0.8168 0.0145
test_accuracy 0.6061 0.0072
test_average_precision 0.4507 0.0062
test_balanced_accuracy 0.5690 0.0085
test_f1 0.3904 0.0212
test_precision 0.5915 0.0144
test_recall 0.2918 0.0214
test_roc_auc 0.5038 0.0098
train_accuracy 0.6414 0.0081
train_average_precision 0.4964 0.0092
train_balanced_accuracy 0.6085 0.0084
train_f1 0.4671 0.0137
train_precision 0.6552 0.0164
train_recall 0.3630 0.0124
train_roc_auc 0.5702 0.0036

6.61s

In [10]:
benchmark(clf1, 20)






 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=1,
                   warm_start=False) 

[[16155  2030]
 [ 8484  5331]] 
 Precision: 0.7242 
 Recall:    0.3859 
 Accuracy:  0.6714 
 F1 score:  0.5035 
 ROC/AUC:   0.6371

23.96s

fit_time 137.2472 30.0693
score_time 1.4520 0.5587
test_accuracy 0.6787 0.0075
test_average_precision 0.6191 0.0125
test_balanced_accuracy 0.6413 0.0079
test_f1 0.4964 0.0146
test_precision 0.7681 0.0206
test_recall 0.3670 0.0146
test_roc_auc 0.5146 0.0153
train_accuracy 0.7526 0.0012
train_average_precision 0.7580 0.0008
train_balanced_accuracy 0.7237 0.0012
train_f1 0.6412 0.0019
train_precision 0.8573 0.0021
train_recall 0.5121 0.0021
train_roc_auc 0.6930 0.0012

304.95s



In [9]:
benchmark(clf2, 20)






 LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l1', random_state=42, tol=0.0001,
          verbose=1) 

[[16386  1799]
 [ 8684  5131]] 
 Precision: 0.7404 
 Recall:    0.3714 
 Accuracy:  0.6724 
 F1 score:  0.4947 
 ROC/AUC:   0.6362

37.44s

fit_time 68.6563 20.1039
score_time 0.3747 0.2625
test_accuracy 0.6778 0.0077
test_average_precision 0.6184 0.0128
test_balanced_accuracy 0.6389 0.0084
test_f1 0.4866 0.0164
test_precision 0.7795 0.0195
test_recall 0.3540 0.0160
test_roc_auc 0.5145 0.0160
train_accuracy 0.7500 0.0010
train_average_precision 0.7553 0.0008
train_balanced_accuracy 0.7195 0.0011
train_f1 0.6314 0.0018
train_precision 0.8689 0.0020
train_recall 0.4958 0.0019
train_roc_auc 0.6886 0.0011

147.76s



In [11]:
benchmark(clf3, 20)






 SGDClassifier(alpha=0.01, average=False, class_weight=None, early_stopping=True,
              epsilon=0.1, eta0=0.001, fit_intercept=True, l1_ratio=0.15,
              learning_rate='adaptive', loss='hinge', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=42, shuffle=True, tol=0.001, validation_fraction=0.1,
              verbose=0, warm_start=False) 

[[17701   484]
 [10242  3573]] 
 Precision: 0.8807 
 Recall:    0.2586 
 Accuracy:  0.6648 
 F1 score:  0.3998 
 ROC/AUC:   0.6160

4.76s

fit_time 26.6128 7.7751
score_time 0.3008 0.0640
test_accuracy 0.6548 0.0066
test_average_precision 0.6210 0.0148
test_balanced_accuracy 0.6029 0.0076
test_f1 0.3577 0.0206
test_precision 0.9089 0.0221
test_recall 0.2229 0.0159
test_roc_auc 0.5159 0.0172
train_accuracy 0.7163 0.0015
train_average_precision 0.7541 0.0010
train_balanced_accuracy 0.6730 0.0018
train_f1 0.5199 0.0038
train_precision 0.9646 0.0017
train_recall 0.355

In [12]:
benchmark(clf4, 20)






 MultinomialNB(alpha=1.5, class_prior=None, fit_prior=True) 

[[15333  2852]
 [ 9451  4364]] 
 Precision: 0.6048 
 Recall:    0.3159 
 Accuracy:  0.6155 
 F1 score:  0.4150 
 ROC/AUC:   0.5795

1.51s

fit_time 2.0955 0.4147
score_time 0.3275 0.0876
test_accuracy 0.6119 0.0070
test_average_precision 0.4556 0.0112
test_balanced_accuracy 0.5742 0.0073
test_f1 0.3986 0.0167
test_precision 0.6024 0.0179
test_recall 0.2983 0.0178
test_roc_auc 0.5061 0.0114
train_accuracy 0.6416 0.0030
train_average_precision 0.4928 0.0067
train_balanced_accuracy 0.6078 0.0027
train_f1 0.4649 0.0038
train_precision 0.6540 0.0086
train_recall 0.3607 0.0045
train_roc_auc 0.5622 0.0025

25.13s



In [13]:
benchmark(clf5, 20)






 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=50,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best') 

[[16944  1241]
 [  822 12993]] 
 Precision: 0.9128 
 Recall:    0.9405 
 Accuracy:  0.9355 
 F1 score:  0.9265 
 ROC/AUC:   0.9361

6.97s

fit_time 36.0299 6.6898
score_time 0.1760 0.0392
test_accuracy 0.9367 0.0061
test_average_precision 0.9423 0.0077
test_balanced_accuracy 0.9366 0.0063
test_f1 0.9274 0.0070
test_precision 0.9192 0.0083
test_recall 0.9358 0.0097
test_roc_auc 0.9666 0.0047
train_accuracy 0.9666 0.0008
train_average_precision 0.9952 0.0002
train_balanced_accuracy 0.9678 0.0009
train_f1 0.9619 0.0010
train_precision 0.9476 0.0027
train_recall 0.9768 0.0030
train_

In [14]:
benchmark(clf6, 20)






 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=False) 

[[17963   222]
 [  161 13654]] 
 Precision: 0.9840 
 Recall:    0.9883 
 Accuracy:  0.9880 
 F1 score:  0.9862 
 ROC/AUC:   0.9881

28.99s

fit_time 261.4376 44.3880
score_time 8.0190 4.8211
test_accuracy 0.9881 0.0026
test_average_precision 0.9993 0.0002
test_balanced_accuracy 0.9880 0.0027
test_f1 0.9862 0.0030
test_precision 0.9847 0.0042
test_recall 0.9877 0.0047
test_roc_auc 0.9995 0.0002
train_accuracy 1.0000 0.0000
train_average_precision 1.0000 0.0000
train_balanced_accuracy 1.0000 0.0000
trai

In [15]:
benchmark(clf7, 20)






 ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-1,
                     oob_score=True, random_state=42, verbose=0,
                     warm_start=False) 

[[17967   218]
 [  156 13659]] 
 Precision: 0.9843 
 Recall:    0.9887 
 Accuracy:  0.9883 
 F1 score:  0.9865 
 ROC/AUC:   0.9884

31.88s

fit_time 298.7715 61.9743
score_time 10.4107 9.9449
test_accuracy 0.9878 0.0022
test_average_precision 0.9993 0.0002
test_balanced_accuracy 0.9878 0.0024
test_f1 0.9859 0.0026
test_precision 0.9844 0.0035
test_recall 0.9874 0.0045
test_roc_auc 0.9995 0.0002
train_accuracy 1.0000 0.0000
train_average_precision 1.0000 0.0000
train_balanced_accuracy 1.0000 0.0000
train_f1 1.0000 0

In [16]:
benchmark(clf8, 20)






 BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features='auto',
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        n_estimators=400,
                                                        n_jobs=None,
 

In [19]:
benchmark(clf9, 3)






 HistGradientBoostingClassifier(l2_regularization=0.0, learning_rate=0.1,
                               loss='auto', max_bins=256, max_depth=None,
                               max_iter=100, max_leaf_nodes=31,
                               min_samples_leaf=20, n_iter_no_change=None,
                               random_state=42, scoring=None, tol=1e-07,
                               validation_fraction=0.1, verbose=0) 

[[17977   208]
 [  157 13658]] 
 Precision: 0.9850 
 Recall:    0.9886 
 Accuracy:  0.9886 
 F1 score:  0.9868 
 ROC/AUC:   0.9886

104.75s

fit_time 115.4460 0.7392
score_time 2.7361 0.0543
test_accuracy 0.9886 0.0005
test_average_precision 0.9993 0.0000
test_balanced_accuracy 0.9887 0.0004
test_f1 0.9868 0.0006
test_precision 0.9845 0.0017
test_recall 0.9891 0.0009
test_roc_auc 0.9995 0.0000
train_accuracy 1.0000 0.0000
train_average_precision 1.0000 0.0000
train_balanced_accuracy 1.0000 0.0000
train_f1 1.0000 0.0000
train_precision 1.0000 0.0000
train_recal

In [4]:
benchmark(clf10, 3)






 MLPClassifier(activation='relu', alpha=1e-08, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 1000), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=1, warm_start=True) 

[[16995  1163]
 [  934 12908]] 
 Precision: 0.9173 
 Recall:    0.9325 
 Accuracy:  0.9345 
 F1 score:  0.9249 
 ROC/AUC:   0.9342

163.27s

fit_time 264.4805 1.2596
score_time 2.2526 0.0213
test_accuracy 0.9429 0.0057
test_average_precision 0.9689 0.0043
test_balanced_accuracy 0.9432 0.0054
test_f1 0.9348 0.0063
test_precision 0.9250 0.0123
test_recall 0.9449 0.0090
test_roc_auc 0.9815 0.0034
train_accuracy 0.9618 0.0143
train_average_precision 0.9890 0.0074
train_balanced_accuracy 0.9630 0.01

In [3]:
benchmark(clf95, 3)






 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False) 

[[17971   187]
 [  259 13583]] 
 Precision: 0.9864 
 Recall:    0.9813 
 Accuracy:  0.9861 
 F1 score:  0.9838 
 ROC/AUC:   0.9855

243.03s

fit_time 356.8168 0.3415
score_time 1.1574 0.0350
test_accuracy 0.9860 0.0008
test_average_precision 0.9990 0.0001
test_balanced_accuracy 0.9854 0.0009
test_f1 0.9838 0.0009
tes