In [1]:
import numpy as np
import warnings
import pickle
import os
from urllib.request import urlopen
import random as rand
%matplotlib inline
import matplotlib.pyplot as plt
import time

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, precision_recall_curve
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, multilabel_confusion_matrix, classification_report
from sklearn.metrics import label_ranking_average_precision_score, coverage_error, label_ranking_loss

from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_predict, cross_val_score
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

from sklearn import svm as svm, neighbors as nbrs, tree as tree, gaussian_process as GPC, naive_bayes as NB

from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.multiclass import OneVsRestClassifier

from sklearn.preprocessing import MultiLabelBinarizer

dir = os.getcwd()
os.chdir(dir)

#print(dir)

url_main = 'file:///'+dir+'/'
data_file_name = "3_states.pkl"
label_file_name = "3_labels.npy"

data = pickle.load(urlopen(url_main + data_file_name))
data = np.unpackbits(data).reshape(-1, 2500)
data = data.astype('int')

labels = np.load(urlopen(url_main + label_file_name))

X = data
Y = labels

print('X shape:', X.shape)
print('Y shape:', Y.shape)

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.8, test_size = 0.2)

#sorted(sklearn.metrics.SCORERS.keys()) UNCOMMENT TO PRINT ALL SCORERS

X shape: (30000, 2500)
Y shape: (30000, 1)


In [2]:
clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=42,
                          verbose=1, max_iter=1000, n_jobs=-1)

clf2 = svm.LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=1.0, 
                    multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, 
                    verbose=1, random_state=42, max_iter=1000)

clf3 = NB.MultinomialNB(alpha=1.5, fit_prior=True)

clf4 = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=42,
                                  min_samples_split=50, min_samples_leaf=1)

clf5 = BaggingClassifier(RandomForestClassifier(n_estimators=400),
                         max_samples=0.5, max_features=0.5, n_jobs=-1,random_state=42)

clf6 = RandomForestClassifier(oob_score=True, n_estimators=400, n_jobs=-1,random_state=42)

clf7 = ExtraTreesClassifier(oob_score=True, n_estimators=400, n_jobs=-1, bootstrap=True,
                            random_state=42)

clf8 = GradientBoostingClassifier(random_state=42)

clf9 = HistGradientBoostingClassifier(random_state=42)

clf10 = MLPClassifier(solver='lbfgs', hidden_layer_sizes = (5, 1000), alpha = 1e-08,
                      warm_start=True, verbose=1, random_state=42)

In [3]:
def cvv_benchmark(clf, n, data, labels): # ordinary CVV
    
    print("\n", clf, "\n\n CROSS-VALIDATION:\n")
    
    t1=time.time()

    CVV=cross_validate(clf, X, Y.ravel(), cv=n, return_train_score=False, n_jobs=-1,
                       scoring = ['accuracy',
                                  'precision_macro',
                                  'recall_macro',
                                  'f1_macro'])

    t2=time.time()
    
    for rows in CVV:
        print(rows, "{0:.4f}".format(CVV[rows].mean()), "{0:.4f}".format(CVV[rows].std()))
    
    print()
    print("{0:.2f}".format((t2-t1)/n)+'s per one iteration of cross-validate')
    print()
    
    return CVV

In [4]:
def strat_benchmark(clf, n, data, labels): # n-fold cross-validation, stratified sampling
      
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    t1 = time.time()
    
    SSS = StratifiedShuffleSplit(n_splits=n, test_size=0.2, random_state=42)
    for train_index, test_index in SSS.split(data, labels):
        x_train, x_test = data[train_index], data[test_index]
        y_train, y_test = labels[train_index].ravel(), labels[test_index].ravel()
        clf.fit(x_train, y_train.ravel())
        y_pred = clf.predict(x_test)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred, average='macro'))
        recall_scores.append(recall_score(y_test, y_pred, average='macro'))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    
    t2 = time.time()
    
    print('\nSTRATIFIED CROSS VALIDATION:\n')
    print('Test Accuracy', "{0:.4f}".format(np.mean(accuracy_scores)), "{0:.4f}".format(np.std(accuracy_scores)))
    print('Test Precision', "{0:.4f}".format(np.mean(precision_scores)), "{0:.4f}".format(np.std(precision_scores)))
    print('Test Recall', "{0:.4f}".format(np.mean(recall_scores)), "{0:.4f}".format(np.std(recall_scores)))
    print('Test F1-measure', "{0:.4f}".format(np.mean(f1_scores)), "{0:.4f}".format(np.std(f1_scores))) 
    
    print('\n'+"{0:.2f}".format((t2-t1)/n)+'s per one iteration of stratified cross-validate')
    

In [5]:
def totalbench(clf, n, data, labels):
    cvv_benchmark(clf, n, data, labels)
    strat_benchmark(clf, n, data, labels)

totalbench(clf3, 3, X, Y) # test benchmark, fastest


 MultinomialNB(alpha=1.5, class_prior=None, fit_prior=True) 

 CROSS-VALIDATION:

fit_time 1.7604 0.0275
score_time 1.6630 0.1438
test_accuracy 0.4184 0.0068
test_precision_macro 0.4158 0.0071
test_recall_macro 0.4184 0.0068
test_f1_macro 0.4161 0.0064

4.03s per one iteration of cross-validate


STRATIFIED CROSS VALIDATION:

Test Accuracy 0.4204 0.0043
Test Precision 0.4175 0.0046
Test Recall 0.4204 0.0043
Test F1-measure 0.4182 0.0046

2.59s per one iteration of stratified cross-validate


In [6]:
totalbench(clf1, 3, X, Y)


 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=1,
                   warm_start=False) 

 CROSS-VALIDATION:

fit_time 192.9670 2.0667
score_time 1.1299 0.4953
test_accuracy 0.3579 0.0028
test_precision_macro 0.3583 0.0027
test_recall_macro 0.3579 0.0028
test_f1_macro 0.3575 0.0027

66.34s per one iteration of cross-validate



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  3.6min finished



STRATIFIED CROSS VALIDATION:

Test Accuracy 0.3578 0.0072
Test Precision 0.3583 0.0074
Test Recall 0.3578 0.0072
Test F1-measure 0.3575 0.0072

217.63s per one iteration of stratified cross-validate


In [7]:
totalbench(clf2, 3, X, Y)


 LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l1', random_state=42, tol=0.0001,
          verbose=1) 

 CROSS-VALIDATION:

fit_time 201.7764 6.7810
score_time 0.6727 0.0742
test_accuracy 0.3565 0.0026
test_precision_macro 0.3565 0.0027
test_recall_macro 0.3565 0.0026
test_f1_macro 0.3563 0.0027

71.84s per one iteration of cross-validate

[LibLinear][LibLinear][LibLinear]
STRATIFIED CROSS VALIDATION:

Test Accuracy 0.3566 0.0065
Test Precision 0.3567 0.0065
Test Recall 0.3566 0.0065
Test F1-measure 0.3565 0.0065

188.40s per one iteration of stratified cross-validate


In [8]:
totalbench(clf3, 3, X, Y)


 MultinomialNB(alpha=1.5, class_prior=None, fit_prior=True) 

 CROSS-VALIDATION:

fit_time 0.4932 0.0323
score_time 0.5397 0.0254
test_accuracy 0.4184 0.0068
test_precision_macro 0.4158 0.0071
test_recall_macro 0.4184 0.0068
test_f1_macro 0.4161 0.0064

1.83s per one iteration of cross-validate


STRATIFIED CROSS VALIDATION:

Test Accuracy 0.4204 0.0043
Test Precision 0.4175 0.0046
Test Recall 0.4204 0.0043
Test F1-measure 0.4182 0.0046

0.54s per one iteration of stratified cross-validate


In [9]:
totalbench(clf4, 3, X, Y)


 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=50,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best') 

 CROSS-VALIDATION:

fit_time 10.7215 0.1425
score_time 0.3454 0.0126
test_accuracy 0.8782 0.0010
test_precision_macro 0.8823 0.0012
test_recall_macro 0.8782 0.0010
test_f1_macro 0.8794 0.0010

3.97s per one iteration of cross-validate


STRATIFIED CROSS VALIDATION:

Test Accuracy 0.8767 0.0062
Test Precision 0.8818 0.0042
Test Recall 0.8767 0.0062
Test F1-measure 0.8781 0.0058

11.72s per one iteration of stratified cross-validate


In [10]:
totalbench(clf5, 3, X, Y)


 BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features='auto',
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        n_estimators=400,
                                                        n_jobs=None,
     

In [11]:
totalbench(clf6, 3, X, Y)


 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=True, random_state=42, verbose=0,
                       warm_start=False) 

 CROSS-VALIDATION:

fit_time 50.3065 1.2136
score_time 2.0521 0.1199
test_accuracy 1.0000 0.0000
test_precision_macro 1.0000 0.0000
test_recall_macro 1.0000 0.0000
test_f1_macro 1.0000 0.0000

18.13s per one iteration of cross-validate


STRATIFIED CROSS VALIDATION:

Test Accuracy 1.0000 0.0000
Test Precision 1.0000 0.0000
Test Recall 1.0000 0.0000
Test F1-measure 1.0000 0.0000

31.51s per one iteration of stratified cross-validate


In [12]:
totalbench(clf7, 3, X, Y)


 ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-1,
                     oob_score=True, random_state=42, verbose=0,
                     warm_start=False) 

 CROSS-VALIDATION:

fit_time 49.5463 0.1499
score_time 2.8871 0.1144
test_accuracy 1.0000 0.0000
test_precision_macro 1.0000 0.0000
test_recall_macro 1.0000 0.0000
test_f1_macro 1.0000 0.0000

17.67s per one iteration of cross-validate


STRATIFIED CROSS VALIDATION:

Test Accuracy 1.0000 0.0000
Test Precision 1.0000 0.0000
Test Recall 1.0000 0.0000
Test F1-measure 1.0000 0.0000

31.31s per one iteration of stratified cross-validate


In [16]:
totalbench(clf8, 2, X, Y)


 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False) 

 CROSS-VALIDATION:

fit_time 652.0205 0.7953
score_time 1.2795 0.0125
test_accuracy 1.0000 0.0000
test_precision_macro 1.0000 0.0000
test_recall_macro 1.0000 0.0000
test_f1_macro 1.0000 0.0000

330.76s per one iteration of cross-validate


STRATIFIED CROSS VALIDATION:

Test Accuracy 1.0000 0.0000
Test Precision 1.0000 0.

In [14]:
totalbench(clf9, 2, X, Y)


 HistGradientBoostingClassifier(l2_regularization=0.0, learning_rate=0.1,
                               loss='auto', max_bins=256, max_depth=None,
                               max_iter=100, max_leaf_nodes=31,
                               min_samples_leaf=20, n_iter_no_change=None,
                               random_state=42, scoring=None, tol=1e-07,
                               validation_fraction=0.1, verbose=0) 

 CROSS-VALIDATION:

fit_time 360.8353 0.8461
score_time 7.2965 0.0728
test_accuracy 1.0000 0.0000
test_precision_macro 1.0000 0.0000
test_recall_macro 1.0000 0.0000
test_f1_macro 1.0000 0.0000

185.01s per one iteration of cross-validate


STRATIFIED CROSS VALIDATION:

Test Accuracy 1.0000 0.0000
Test Precision 1.0000 0.0000
Test Recall 1.0000 0.0000
Test F1-measure 1.0000 0.0000

226.75s per one iteration of stratified cross-validate


In [15]:
totalbench(clf10, 3, X, Y)


 MLPClassifier(activation='relu', alpha=1e-08, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 1000), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=1, warm_start=True) 

 CROSS-VALIDATION:

fit_time 321.7102 1.7780
score_time 1.4022 0.2575
test_accuracy 0.8354 0.0010
test_precision_macro 0.8384 0.0003
test_recall_macro 0.8354 0.0010
test_f1_macro 0.8329 0.0014

110.38s per one iteration of cross-validate


STRATIFIED CROSS VALIDATION:

Test Accuracy 0.8759 0.0199
Test Precision 0.8793 0.0195
Test Recall 0.8759 0.0199
Test F1-measure 0.8746 0.0196

213.42s per one iteration of stratified cross-validate
