In [None]:
import numpy as np
import warnings
import pickle
import os
from urllib.request import urlopen
import random as rand
%matplotlib inline
import matplotlib.pyplot as plt
import time

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, precision_recall_curve
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, multilabel_confusion_matrix, classification_report
from sklearn.metrics import label_ranking_average_precision_score, coverage_error, label_ranking_loss

from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.model_selection import cross_val_predict, cross_val_score, ShuffleSplit, StratifiedShuffleSplit

from sklearn import svm as svm, neighbors as nbrs, tree as tree, gaussian_process as GPC, naive_bayes as NB

from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier

from sklearn.neural_network import MLPClassifier

dir = os.getcwd()
os.chdir(dir)

#print(dir)

url_main = 'file:///'+dir+'/'
data_file_name = "UNIQUE_STATES_40k.pkl"
label_file_name = "UNIQUE_LABELS_40k.pkl"

#DATA
data = np.load(urlopen(url_main + data_file_name))
data = np.unpackbits(data).reshape(-1, 2500)
data = data.astype('int')

#LABELS
labels = pickle.load(urlopen(url_main + label_file_name))
labels = np.unpackbits(labels).reshape(40000, -1)

X = data
Y = labels

print('X shape:', X.shape)
print('Y shape:', Y.shape)

#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.8, test_size = 0.2)

#print('X_train shape:', X_train.shape)
#print('Y_train shape:', Y_train.shape)

#sorted(sklearn.metrics.SCORERS.keys()) UNCOMMENT TO PRINT ALL SCORERS

In [45]:
clf1 = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=42, verbose=1,
                          max_iter=1000, n_jobs = -1)

clf2 = svm.LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=1.0,
                    fit_intercept=True, intercept_scaling=1, class_weight=None, 
                    verbose=1, random_state=42, max_iter=1000)

clf3 = NB.MultinomialNB(alpha=1.5, fit_prior=True)

clf4 = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=42,
                                  min_samples_split=50, min_samples_leaf=1)

clf5 = BaggingClassifier(RandomForestClassifier(n_estimators=400),
                         max_samples=0.5, max_features=0.5, n_jobs=-1,random_state=42)

clf6 = RandomForestClassifier(oob_score=False, n_estimators=400, n_jobs=-1,random_state=42)

clf7 = ExtraTreesClassifier(oob_score=False, n_estimators=400, n_jobs=-1, bootstrap=True,
                            random_state=42)

clf8 = GradientBoostingClassifier(random_state=42)

clf9 = HistGradientBoostingClassifier(random_state=42)

clf10 = MLPClassifier(solver='lbfgs', hidden_layer_sizes = (5, 1000), alpha = 1e-08,
                      warm_start=True, verbose=1, random_state=42)

In [46]:
def cvv_benchmark(clf, n, data, labels): # ordinary CVV
    
    print("\n", clf, "\n\n CROSS-VALIDATION:\n")
    
    t1 = time.time()

    CVV = cross_validate(clf, data, labels.ravel(), cv=n, return_train_score=False, n_jobs=-1,
                       scoring = ['accuracy',
                                  'precision',
                                  'recall',
                                  'f1'])

    t2 = time.time()

    for rows in CVV:
        print(rows, "{0:.4f}".format(CVV[rows].mean()), "{0:.4f}".format(CVV[rows].std()))
    
    print()
    print("{0:.2f}".format((t2-t1)/n)+'s per one iteration of cross-validate')
    print()
    
#    return CVV

In [56]:
def strat_benchmark(clf, n, data, labels): # n-fold cross-validation, stratified sampling
      
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    t1 = time.time()
    
    SSS = StratifiedShuffleSplit(n_splits=n, test_size=0.2, random_state=42)
    for train_index, test_index in SSS.split(data, labels):
        x_train, x_test = data[train_index], data[test_index]
        y_train, y_test = labels[train_index].ravel(), labels[test_index].ravel()
        clf.fit(x_train, y_train.ravel())
        y_pred = clf.predict(x_test)
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred))
    
    t2 = time.time()
    
    print('\nSTRATIFIED CROSS VALIDATION:\n')
    print('Test Accuracy', "{0:.4f}".format(np.mean(accuracy_scores)), "{0:.4f}".format(np.std(accuracy_scores)))
    print('Test Precision', "{0:.4f}".format(np.mean(precision_scores)), "{0:.4f}".format(np.std(precision_scores)))
    print('Test Recall', "{0:.4f}".format(np.mean(recall_scores)), "{0:.4f}".format(np.std(recall_scores)))
    print('Test F1-measure', "{0:.4f}".format(np.mean(f1_scores)), "{0:.4f}".format(np.std(f1_scores))) 
    
    print('\n'+"{0:.2f}".format((t2-t1)/n)+'s per one iteration of stratified cross-validate')
    

In [57]:
def totalbench(clf, n, data, labels):
    cvv_benchmark(clf, n, data, labels)
    strat_benchmark(clf, n, data, labels)

totalbench(clf3, 3, X, Y) # test benchmark, fastest

In [59]:
totalbench(clf1, 3, X, Y)


 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=42,
                   solver='lbfgs', tol=0.0001, verbose=1, warm_start=False) 

 CROSS-VALIDATION:

fit_time 70.5476 18.7508
score_time 1.5172 0.8391
test_accuracy 0.8501 0.0979
test_precision 0.8116 0.1276
test_recall 0.8768 0.0555
test_f1 0.8410 0.0957

28.95s per one iteration of cross-validate



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  6.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  6.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 10.2min finished


STRATIFIED CROSS VALIDATION:

Test Accuracy 0.9323 0.0022
Test Precision 0.9521 0.0039
Test Recall 0.8890 0.0014
Test F1-measure 0.9195 0.0026

454.06s per one iteration of stratified cross-validate


In [60]:
totalbench(clf2, 3, X, Y)


 LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l1', random_state=42, tol=0.0001,
          verbose=1) 

 CROSS-VALIDATION:

fit_time 91.5743 59.5063
score_time 0.5660 0.0460
test_accuracy 0.8536 0.0851
test_precision 0.8157 0.1069
test_recall 0.8666 0.0712
test_f1 0.8397 0.0896

56.00s per one iteration of cross-validate

[LibLinear][LibLinear][LibLinear]STRATIFIED CROSS VALIDATION:

Test Accuracy 0.9336 0.0020
Test Precision 0.9522 0.0029
Test Recall 0.8922 0.0031
Test F1-measure 0.9212 0.0024

115.64s per one iteration of stratified cross-validate


In [None]:
totalbench(clf3, 3, X, Y)

In [61]:
totalbench(clf4, 3, X, Y)


 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=50,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best') 

 CROSS-VALIDATION:

fit_time 14.4560 1.9425
score_time 0.3560 0.0072
test_accuracy 0.8012 0.1369
test_precision 0.7365 0.1842
test_recall 0.9467 0.0528
test_f1 0.8185 0.1221

7.49s per one iteration of cross-validate

STRATIFIED CROSS VALIDATION:

Test Accuracy 0.9550 0.0037
Test Precision 0.9482 0.0069
Test Recall 0.9484 0.0027
Test F1-measure 0.9483 0.0041

14.52s per one iteration of stratified cross-validate


In [62]:
totalbench(clf5, 3, X, Y)


 BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features='auto',
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        n_estimators=400,
                                                        n_jobs=None,
     

In [63]:
totalbench(clf6, 3, X, Y)


 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False) 

 CROSS-VALIDATION:

fit_time 43.8913 7.3658
score_time 5.6153 4.0035
test_accuracy 0.8168 0.2162
test_precision 0.7808 0.2267
test_recall 0.9623 0.0533
test_f1 0.8486 0.1687

18.08s per one iteration of cross-validate

STRATIFIED CROSS VALIDATION:

Test Accuracy 0.9895 0.0002
Test Precision 0.9946 0.0016
Test Recall 0.9811 0.0016
Test F1-measure 0.9878 0.0003

23.00s per one iteration of stratified cross-validate


In [64]:
totalbench(clf7, 3, X, Y)


 ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-1,
                     oob_score=False, random_state=42, verbose=0,
                     warm_start=False) 

 CROSS-VALIDATION:

fit_time 51.8114 8.0418
score_time 5.8535 4.0490
test_accuracy 0.8183 0.2131
test_precision 0.7811 0.2247
test_recall 0.9635 0.0516
test_f1 0.8494 0.1667

20.71s per one iteration of cross-validate

STRATIFIED CROSS VALIDATION:

Test Accuracy 0.9892 0.0003
Test Precision 0.9944 0.0016
Test Recall 0.9808 0.0014
Test F1-measure 0.9876 0.0003

27.46s per one iteration of stratified cross-validate


In [68]:
totalbench(clf8, 2, X, Y)


 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False) 

 CROSS-VALIDATION:

fit_time 471.9470 2.5256
score_time 1.4446 0.0770
test_accuracy 0.7977 0.1681
test_precision 0.7372 0.1900
test_recall 0.9302 0.0698
test_f1 0.8156 0.1466

240.40s per one iteration of cross-validate

STRATIFIED CROSS VALIDATION:

Test Accuracy 0.9854 0.0002
Test Precision 0.9921 0.0009
Test Recall 0.

In [67]:
totalbench(clf9, 2, X, Y)


 HistGradientBoostingClassifier(l2_regularization=0.0, learning_rate=0.1,
                               loss='auto', max_bins=256, max_depth=None,
                               max_iter=100, max_leaf_nodes=31,
                               min_samples_leaf=20, n_iter_no_change=None,
                               random_state=42, scoring=None, tol=1e-07,
                               validation_fraction=0.1, verbose=0) 

 CROSS-VALIDATION:

fit_time 141.9721 7.2925
score_time 8.3975 1.2843
test_accuracy 0.7659 0.1118
test_precision 0.6735 0.1089
test_recall 0.9456 0.0502
test_f1 0.7844 0.0919

81.59s per one iteration of cross-validate

STRATIFIED CROSS VALIDATION:

Test Accuracy 0.9920 0.0000
Test Precision 0.9928 0.0014
Test Recall 0.9888 0.0014
Test F1-measure 0.9908 0.0000

105.18s per one iteration of stratified cross-validate


In [66]:
totalbench(clf10, 3, X, Y)


 MLPClassifier(activation='relu', alpha=1e-08, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 1000), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=1, warm_start=True) 

 CROSS-VALIDATION:

fit_time 241.7237 161.0382
score_time 1.7361 0.2262
test_accuracy 0.7173 0.1957
test_precision 0.5050 0.4073
test_recall 0.6529 0.4617
test_f1 0.5560 0.4139

124.05s per one iteration of cross-validate

STRATIFIED CROSS VALIDATION:

Test Accuracy 0.9899 0.0044
Test Precision 0.9887 0.0046
Test Recall 0.9880 0.0059
Test F1-measure 0.9884 0.0051

114.37s per one iteration of stratified cross-validate
