In [79]:
import numpy as np
import warnings
import pickle
import os
from urllib.request import urlopen
import random as rand
%matplotlib inline
import matplotlib.pyplot as plt
import time

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, precision_recall_curve
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, multilabel_confusion_matrix, classification_report
from sklearn.metrics import label_ranking_average_precision_score, coverage_error, label_ranking_loss

from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_predict, cross_val_score
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

from sklearn import svm as svm, neighbors as nbrs, tree as tree, gaussian_process as GPC, naive_bayes as NB

from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.multiclass import OneVsRestClassifier

from sklearn.preprocessing import MultiLabelBinarizer

# --- Data location -----------------------------------------------------------
# NOTE(review): `dir` shadows the builtin dir(); the name is kept only so that
# any other cell reading it keeps working. The path is absolute and
# machine-specific, so this cell only runs where that folder exists.
dir = 'C:/Users/10/Desktop/proper/'
os.chdir(dir)
url_main = 'file:///C:/Users/10/Desktop/proper/'

data_file_name = "3_states.pkl"
label_file_name = "3_labels.npy"

#DATA
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The `with` block closes the underlying handle once the payload is read.
with urlopen(url_main + data_file_name) as data_stream:
    data = pickle.load(data_stream)
# The pickle holds bit-packed bytes: expand to single bits, 2500 per sample row.
data = np.unpackbits(data).reshape(-1, 2500)
data = data.astype('int')
#data[np.where(data==0)] = -1

#LABELS
with urlopen(url_main + label_file_name) as label_stream:
    labels = np.load(label_stream)
#labels = np.unpackbits(labels).reshape(30000, -1)

X = data
Y = labels

print('X shape:', X.shape)
print('Y shape:', Y.shape)

# Seed the hold-out split so it is reproducible across kernel restarts
# (same seed value the estimators in this notebook use).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.8, test_size = 0.2,
                                                    random_state=42)

#print('X_train shape:', X_train.shape)
#print('Y_train shape:', Y_train.shape)

#sorted(sklearn.metrics.SCORERS.keys()) UNCOMMENT TO PRINT ALL SCORERS

X shape: (30000, 2500)
Y shape: (30000, 1)


In [14]:
# Candidate estimators for the benchmark. Every estimator that accepts a seed
# is created with random_state=42 (clf5's inner forest is passed unseeded).

# clf1: multinomial logistic regression (L-BFGS solver).
clf1 = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    random_state=42,
    verbose=1,
    max_iter=1000,
    n_jobs=-1,
)

# clf2: linear SVM, L1 penalty, one-vs-rest.
clf2 = svm.LinearSVC(
    penalty='l1',
    loss='squared_hinge',
    dual=False,
    tol=0.0001,
    C=1.0,
    multi_class='ovr',
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    verbose=1,
    random_state=42,
    max_iter=1000,
)

# clf3: multinomial naive Bayes.
clf3 = NB.MultinomialNB(
    alpha=1.5,
    fit_prior=True,
)

# clf4: single decision tree, entropy criterion.
clf4 = tree.DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    random_state=42,
    min_samples_split=50,
    min_samples_leaf=1,
)

# clf5: bagging ensemble whose base estimator is a 400-tree random forest;
# each member sees half of the samples and half of the features.
clf5 = BaggingClassifier(
    RandomForestClassifier(n_estimators=400),
    max_samples=0.5,
    max_features=0.5,
    n_jobs=-1,
    random_state=42,
)

# clf6: random forest with out-of-bag scoring enabled.
clf6 = RandomForestClassifier(
    oob_score=True,
    n_estimators=400,
    n_jobs=-1,
    random_state=42,
)

# clf7: extra-trees; bootstrap=True is set alongside oob_score=True.
clf7 = ExtraTreesClassifier(
    oob_score=True,
    n_estimators=400,
    n_jobs=-1,
    bootstrap=True,
    random_state=42,
)

# clf8 / clf9: gradient boosting, classic and histogram-based, library defaults.
clf8 = GradientBoostingClassifier(random_state=42)

clf9 = HistGradientBoostingClassifier(random_state=42)

# clf10: multi-layer perceptron with hidden layers of 5 and 1000 units.
clf10 = MLPClassifier(
    solver='lbfgs',
    hidden_layer_sizes = (5, 1000),
    alpha = 1e-08,
    warm_start=True,
    verbose=1,
    random_state=42,
)

def benchmark(clf, CVV_n, data=None, labels=None):
    """
    Cross-validate ``clf`` and print the mean/std of every reported score.

    Parameters
    ----------
    clf : estimator
        Classifier handed to ``cross_validate``.
    CVV_n : int
        Value passed as ``cv``; also used as the divisor for the per-iteration
        time printed at the end, so it must be a number.
    data : array-like, optional
        Feature matrix. Defaults to the notebook-level ``X``.
    labels : array-like, optional
        Target values; flattened to 1-D before use. Defaults to the
        notebook-level ``Y``.

    Returns
    -------
    dict
        The dictionary returned by ``cross_validate`` (fit/score times plus
        train and test accuracy, macro precision, macro recall and macro F1).
    """
    # Fall back to the notebook globals only when the caller passes nothing,
    # so existing two-argument calls keep working.
    features = X if data is None else data
    targets = np.ravel(Y if labels is None else labels)

    print("\n", clf, "\n")

    # perf_counter is monotonic, unlike time.time(), so the measured interval
    # is not affected by system clock adjustments.
    started = time.perf_counter()

    CVV = cross_validate(clf, features, targets, cv=CVV_n, return_train_score=True, n_jobs=-1,
                         scoring = ['accuracy',
                                    'precision_macro',
                                    'recall_macro',
                                    'f1_macro'])

    elapsed = time.perf_counter() - started

    for key in CVV:
        print(key, "{0:.4f}".format(CVV[key].mean()), "{0:.4f}".format(CVV[key].std()))

    print()
    # Total wall-clock time divided by CVV_n. With n_jobs=-1 the splits may run
    # concurrently, so this is an average share of wall time, not the cost of
    # one isolated fit.
    print("{0:.2f}".format(elapsed/CVV_n)+'s per one iteration of cross-validate')
    print()

    return CVV

In [13]:
benchmark(clf3, 3) # smoke test of benchmark() with clf3 (MultinomialNB), the fastest classifier; cv=3


 MultinomialNB(alpha=1.5, class_prior=None, fit_prior=True) 

fit_time 0.6452 0.0147
score_time 0.6757 0.0221
test_accuracy 0.4184 0.0068
train_accuracy 0.4871 0.0027
test_precision_macro 0.4158 0.0071
train_precision_macro 0.4873 0.0026
test_recall_macro 0.4184 0.0068
train_recall_macro 0.4871 0.0027
test_f1_macro 0.4161 0.0064
train_f1_macro 0.4870 0.0028

1.39s per one iteration of cross-validate



{'fit_time': array([0.62433863, 0.65557861, 0.65557861]),
 'score_time': array([0.64438701, 0.69129133, 0.69129133]),
 'test_accuracy': array([0.4270146 , 0.41044104, 0.41774177]),
 'train_accuracy': array([0.48594859, 0.48447578, 0.49087546]),
 'test_precision_macro': array([0.42511076, 0.40804602, 0.41428225]),
 'train_precision_macro': array([0.48657528, 0.48443935, 0.4907894 ]),
 'test_recall_macro': array([0.4270146 , 0.41044104, 0.41774177]),
 'train_recall_macro': array([0.48594859, 0.48447578, 0.49087546]),
 'test_f1_macro': array([0.42427945, 0.40867957, 0.41529269]),
 'train_f1_macro': array([0.48560932, 0.4844489 , 0.49082423])}

In [26]:
# clf1: multinomial LogisticRegression, cv=3.
benchmark(clf1, 3) 

fit_time 112.7080 0.9023
score_time 1.0259 0.2981
test_accuracy 0.3579 0.0028
train_accuracy 0.5466 0.0012
test_precision_macro 0.3583 0.0027
train_precision_macro 0.5466 0.0012
test_recall_macro 0.3579 0.0028
train_recall_macro 0.5466 0.0012
test_f1_macro 0.3575 0.0027
train_f1_macro 0.5466 0.0012


{'fit_time': array([112.7519722 , 111.58162975, 113.79044199]),
 'score_time': array([1.02280211, 1.39251757, 0.66232395]),
 'test_accuracy': array([0.35472905, 0.36143614, 0.35753575]),
 'train_accuracy': array([0.54535454, 0.54622269, 0.54832258]),
 'test_precision_macro': array([0.35523103, 0.3617542 , 0.35784214]),
 'train_precision_macro': array([0.54535861, 0.54613379, 0.54829517]),
 'test_recall_macro': array([0.35472905, 0.36143614, 0.35753575]),
 'train_recall_macro': array([0.54535454, 0.54622269, 0.54832258]),
 'test_f1_macro': array([0.35450518, 0.36105437, 0.35681581]),
 'train_f1_macro': array([0.5453565 , 0.54615702, 0.54830829])}

In [96]:
# clf2: L1-penalised LinearSVC (one-vs-rest), cv=3.
benchmark(clf2, 3)


 LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l1', random_state=42, tol=0.0001,
          verbose=1) 

fit_time 111.9821 4.9794
score_time 0.4071 0.0128
test_accuracy 0.3565 0.0026
train_accuracy 0.5425 0.0010
test_precision_macro 0.3565 0.0027
train_precision_macro 0.5424 0.0010
test_recall_macro 0.3565 0.0026
train_recall_macro 0.5425 0.0010
test_f1_macro 0.3563 0.0027
train_f1_macro 0.5423 0.0011

40.70s per one iteration of cross-validate



{'fit_time': array([113.06403518, 117.46721196, 105.41512322]),
 'score_time': array([0.40627956, 0.39190793, 0.4232347 ]),
 'test_accuracy': array([0.35692861, 0.35943594, 0.35313531]),
 'train_accuracy': array([0.54370437, 0.54117294, 0.54272286]),
 'test_precision_macro': array([0.35715001, 0.3593965 , 0.35298288]),
 'train_precision_macro': array([0.54361204, 0.54109761, 0.54253786]),
 'test_recall_macro': array([0.35692861, 0.35943594, 0.35313531]),
 'train_recall_macro': array([0.54370437, 0.54117294, 0.54272286]),
 'test_f1_macro': array([0.35686183, 0.35925819, 0.35264277]),
 'train_f1_macro': array([0.5435794 , 0.54097091, 0.54249059])}

In [28]:
# clf3: MultinomialNB, cv=3 (same call as the earlier smoke-test cell).
benchmark(clf3, 3) 

fit_time 0.4704 0.0074
score_time 0.6105 0.0221
test_accuracy 0.4184 0.0068
train_accuracy 0.4871 0.0027
test_precision_macro 0.4158 0.0071
train_precision_macro 0.4873 0.0026
test_recall_macro 0.4184 0.0068
train_recall_macro 0.4871 0.0027
test_f1_macro 0.4161 0.0064
train_f1_macro 0.4870 0.0028


{'fit_time': array([0.47564411, 0.4600172 , 0.47564411]),
 'score_time': array([0.62614942, 0.57927632, 0.62614942]),
 'test_accuracy': array([0.4270146 , 0.41044104, 0.41774177]),
 'train_accuracy': array([0.48594859, 0.48447578, 0.49087546]),
 'test_precision_macro': array([0.42511076, 0.40804602, 0.41428225]),
 'train_precision_macro': array([0.48657528, 0.48443935, 0.4907894 ]),
 'test_recall_macro': array([0.4270146 , 0.41044104, 0.41774177]),
 'train_recall_macro': array([0.48594859, 0.48447578, 0.49087546]),
 'test_f1_macro': array([0.42427945, 0.40867957, 0.41529269]),
 'train_f1_macro': array([0.48560932, 0.4844489 , 0.49082423])}

In [29]:
# clf4: entropy-criterion DecisionTreeClassifier, cv=3.
benchmark(clf4, 3)

fit_time 8.9772 0.1603
score_time 0.2977 0.0123
test_accuracy 0.8782 0.0010
train_accuracy 0.9113 0.0006
test_precision_macro 0.8823 0.0012
train_precision_macro 0.9161 0.0010
test_recall_macro 0.8782 0.0010
train_recall_macro 0.9113 0.0006
test_f1_macro 0.8794 0.0010
train_f1_macro 0.9124 0.0006


{'fit_time': array([8.78933525, 9.18112183, 8.96121836]),
 'score_time': array([0.28241062, 0.31250596, 0.29803777]),
 'test_accuracy': array([0.87782444, 0.87958796, 0.87718772]),
 'train_accuracy': array([0.9110411 , 0.91215439, 0.91075446]),
 'test_precision_macro': array([0.88122773, 0.88404881, 0.8816834 ]),
 'train_precision_macro': array([0.91485069, 0.91738681, 0.91595492]),
 'test_recall_macro': array([0.87782444, 0.87958796, 0.87718772]),
 'train_recall_macro': array([0.9110411 , 0.91215439, 0.91075446]),
 'test_f1_macro': array([0.87884764, 0.88081031, 0.87844962]),
 'train_f1_macro': array([0.91193286, 0.91324508, 0.91187876])}

In [30]:
# clf5: BaggingClassifier over 400-tree random forests, cv=3.
benchmark(clf5, 3)

fit_time 69.5088 0.5960
score_time 24.9886 0.7278
test_accuracy 1.0000 0.0000
train_accuracy 1.0000 0.0000
test_precision_macro 1.0000 0.0000
train_precision_macro 1.0000 0.0000
test_recall_macro 1.0000 0.0000
train_recall_macro 1.0000 0.0000
test_f1_macro 1.0000 0.0000
train_f1_macro 1.0000 0.0000


{'fit_time': array([70.31835532, 69.30712962, 68.90085769]),
 'score_time': array([25.60580301, 25.39330411, 23.96670938]),
 'test_accuracy': array([1., 1., 1.]),
 'train_accuracy': array([1., 1., 1.]),
 'test_precision_macro': array([1., 1., 1.]),
 'train_precision_macro': array([1., 1., 1.]),
 'test_recall_macro': array([1., 1., 1.]),
 'train_recall_macro': array([1., 1., 1.]),
 'test_f1_macro': array([1., 1., 1.]),
 'train_f1_macro': array([1., 1., 1.])}

In [35]:
# clf6: RandomForestClassifier with 400 trees, cv=3.
benchmark(clf6, 3)

fit_time 42.0375 0.5421
score_time 1.9426 0.2249
test_accuracy 1.0000 0.0000
train_accuracy 1.0000 0.0000
test_precision_macro 1.0000 0.0000
train_precision_macro 1.0000 0.0000
test_recall_macro 1.0000 0.0000
train_recall_macro 1.0000 0.0000
test_f1_macro 1.0000 0.0000
train_f1_macro 1.0000 0.0000


{'fit_time': array([41.29394078, 42.5711534 , 42.24740624]),
 'score_time': array([1.62596822, 2.07512879, 2.12661052]),
 'test_accuracy': array([1., 1., 1.]),
 'train_accuracy': array([1., 1., 1.]),
 'test_precision_macro': array([1., 1., 1.]),
 'train_precision_macro': array([1., 1., 1.]),
 'test_recall_macro': array([1., 1., 1.]),
 'train_recall_macro': array([1., 1., 1.]),
 'test_f1_macro': array([1., 1., 1.]),
 'train_f1_macro': array([1., 1., 1.])}

In [31]:
# clf7: ExtraTreesClassifier with 400 trees, cv=3.
benchmark(clf7, 3)

fit_time 47.2850 0.0963
score_time 2.7380 0.0862
test_accuracy 1.0000 0.0000
train_accuracy 1.0000 0.0000
test_precision_macro 1.0000 0.0000
train_precision_macro 1.0000 0.0000
test_recall_macro 1.0000 0.0000
train_recall_macro 1.0000 0.0000
test_f1_macro 1.0000 0.0000
train_f1_macro 1.0000 0.0000


{'fit_time': array([47.26941299, 47.41005325, 47.17567658]),
 'score_time': array([2.85960841, 2.68455958, 2.66975546]),
 'test_accuracy': array([1., 1., 1.]),
 'train_accuracy': array([1., 1., 1.]),
 'test_precision_macro': array([1., 1., 1.]),
 'train_precision_macro': array([1., 1., 1.]),
 'test_recall_macro': array([1., 1., 1.]),
 'train_recall_macro': array([1., 1., 1.]),
 'test_f1_macro': array([1., 1., 1.]),
 'train_f1_macro': array([1., 1., 1.])}

In [32]:
# clf8: GradientBoostingClassifier, cv=3.
benchmark(clf8, 3)

fit_time 960.5446 1.1778
score_time 0.9465 0.0246
test_accuracy 1.0000 0.0000
train_accuracy 1.0000 0.0000
test_precision_macro 1.0000 0.0000
train_precision_macro 1.0000 0.0000
test_recall_macro 1.0000 0.0000
train_recall_macro 1.0000 0.0000
test_f1_macro 1.0000 0.0000
train_f1_macro 1.0000 0.0000


{'fit_time': array([960.72795177, 959.01922536, 961.88667059]),
 'score_time': array([0.97122002, 0.95525241, 0.91294885]),
 'test_accuracy': array([1., 1., 1.]),
 'train_accuracy': array([1., 1., 1.]),
 'test_precision_macro': array([1., 1., 1.]),
 'train_precision_macro': array([1., 1., 1.]),
 'test_recall_macro': array([1., 1., 1.]),
 'train_recall_macro': array([1., 1., 1.]),
 'test_f1_macro': array([1., 1., 1.]),
 'train_f1_macro': array([1., 1., 1.])}

In [33]:
# clf9: HistGradientBoostingClassifier, cv=3.
benchmark(clf9, 3)

fit_time 334.8191 1.7788
score_time 2.8501 0.3519
test_accuracy 1.0000 0.0000
train_accuracy 1.0000 0.0000
test_precision_macro 1.0000 0.0000
train_precision_macro 1.0000 0.0000
test_recall_macro 1.0000 0.0000
train_recall_macro 1.0000 0.0000
test_f1_macro 1.0000 0.0000
train_f1_macro 1.0000 0.0000


{'fit_time': array([333.28187704, 333.86336613, 337.31217718]),
 'score_time': array([2.54038167, 2.66753697, 3.34228373]),
 'test_accuracy': array([1., 1., 1.]),
 'train_accuracy': array([1., 1., 1.]),
 'test_precision_macro': array([1., 1., 1.]),
 'train_precision_macro': array([1., 1., 1.]),
 'test_recall_macro': array([1., 1., 1.]),
 'train_recall_macro': array([1., 1., 1.]),
 'test_f1_macro': array([1., 1., 1.]),
 'train_f1_macro': array([1., 1., 1.])}

In [34]:
# clf10: MLPClassifier with hidden layers (5, 1000), cv=3.
benchmark(clf10, 3)

fit_time 260.3913 1.9397
score_time 1.0928 0.1510
test_accuracy 0.8354 0.0010
train_accuracy 0.9814 0.0020
test_precision_macro 0.8384 0.0003
train_precision_macro 0.9816 0.0019
test_recall_macro 0.8354 0.0010
train_recall_macro 0.9814 0.0020
test_f1_macro 0.8329 0.0014
train_f1_macro 0.9814 0.0020


{'fit_time': array([258.2733202 , 259.94048023, 262.95996761]),
 'score_time': array([1.29165792, 1.06092715, 0.92592168]),
 'test_accuracy': array([0.83653269, 0.83568357, 0.83408341]),
 'train_accuracy': array([0.98134813, 0.98385081, 0.97905105]),
 'test_precision_macro': array([0.83882527, 0.838051  , 0.83828717]),
 'train_precision_macro': array([0.98147692, 0.9839967 , 0.97925246]),
 'test_recall_macro': array([0.83653269, 0.83568357, 0.83408341]),
 'train_recall_macro': array([0.98134813, 0.98385081, 0.97905105]),
 'test_f1_macro': array([0.83440839, 0.83332907, 0.83107638]),
 'train_f1_macro': array([0.98133037, 0.98384894, 0.97902314])}

In [93]:
def validate(clf, n, data, labels):
    '''
    Score ``clf`` on ``n`` random stratified 80/20 train/test splits.

    This is repeated shuffle-split validation, not n-fold cross-validation:
    each of the ``n`` splits independently holds out 20% of the samples, with
    class proportions preserved in both parts.

    Parameters
    ----------
    clf : estimator
        Classifier to evaluate. A fresh clone is fitted on every split, so the
        object passed in is left unfitted and no state (e.g. ``warm_start``
        weights) carries over from one split to the next.
    n : int
        Number of random splits.
    data : ndarray
        Feature matrix, indexed by row with integer index arrays.
    labels : ndarray
        1-D array of class labels aligned with ``data``.

    Returns
    -------
    dict
        Maps 'Accuracy', 'Precision', 'Recall' and 'F1-measure' to an array
        holding one score per split. Precision, recall and F1 are
        macro-averaged. The mean and standard deviation of each are printed.
    '''
    # Local import keeps this block self-contained; sklearn is already a
    # dependency of the notebook.
    from sklearn.base import clone

    metric_names = ('Accuracy', 'Precision', 'Recall', 'F1-measure')
    scores = {name: [] for name in metric_names}

    # Stratified splitter, matching the documented intent (the original code
    # used a plain ShuffleSplit despite the docstring and the `sss` name).
    sss = StratifiedShuffleSplit(n_splits=n, test_size=0.2, random_state=42)
    for train_index, test_index in sss.split(data, labels):
        x_train, x_test = data[train_index], data[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # Fit an unfitted copy so every split starts from scratch.
        model = clone(clf)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        scores['Accuracy'].append(accuracy_score(y_test, y_pred))
        scores['Precision'].append(precision_score(y_test, y_pred, average='macro'))
        scores['Recall'].append(recall_score(y_test, y_pred, average='macro'))
        scores['F1-measure'].append(f1_score(y_test, y_pred, average='macro'))

    # Explicit name order keeps the printout stable.
    for name in metric_names:
        print(name, "{0:.4f}".format(np.mean(scores[name])), "{0:.4f}".format(np.std(scores[name])))

    return {name: np.asarray(scores[name]) for name in metric_names}

In [95]:
# Evaluate clf5 (bagged random forests) on 3 random 80/20 splits; Y is
# flattened to 1-D before being passed as the label vector.
validate(clf5, 3, X, Y.ravel())

Accuracy 1.0000 0.0000
Precision 1.0000 0.0000
Recall 1.0000 0.0000
F1-measure 1.0000 0.0000
