In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import metrics
from time import time

import os, sys

# Put the parent of the working directory at the front of the import path
# (presumably where the local `utils` package imported below lives).
parentPath = os.path.abspath(os.pardir)
if sys.path.count(parentPath) == 0:
    sys.path.insert(0, parentPath)

from utils.experiment import *

# Load data ** Choose your path before getting started **

In [4]:
# Locate the directory that contains the project checkout by cutting the
# current working directory at the project folder name.
PROJECT_DIR_NAME = 'ECG-Arritmia-Paper1'

path = os.getcwd()
# partition() always yields exactly three parts, so a missing or repeated
# project folder name cannot break the unpacking the way str.split() did;
# the explicit check below then reports the actual problem.
rootDir, marker, _ = path.partition(PROJECT_DIR_NAME)
if not marker:
    raise FileNotFoundError(
        "Run this notebook from inside the '{}' project tree "
        "(current working directory: {})".format(PROJECT_DIR_NAME, path))

# Location of the extracted feature table, relative to rootDir.
path = os.path.join(PROJECT_DIR_NAME, 'Database_extracted', 'Dataset_with_features')
filename = 'MIT-BIH__DS1_5classes__SCM.csv'
dataset = pd.read_csv(os.path.join(rootDir, path, filename))

In [5]:
# Keep the columns at positions 0-4 and 8; the last kept column is the one
# later used as the class label.
# NOTE: this overwrites `dataset`, so re-running this cell on its own fails
# (position 8 no longer exists once only six columns are left).
selected_positions = [0, 1, 2, 3, 4, 8]
dataset = dataset.iloc[:, selected_positions]

In [6]:
# Preview the first rows of the selected columns as a sanity check.
dataset.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var9
0,0.834552,0.666667,2.576261,0.417694,0.923901,0
1,0.852697,0.673684,2.490621,0.421475,0.651352,0
2,0.848921,0.666667,2.537718,0.429076,0.708984,0
3,0.854177,0.688172,2.570455,0.390772,0.580959,0
4,0.858868,0.673684,2.497976,0.43851,0.630987,0


# Data preprocessing

In this step we are going to standardize our dataset.

In [7]:
from sklearn.preprocessing import StandardScaler

# Scaler instance; it is fitted on the feature columns in the next cell and
# handed to save_pipeline() at the end of the notebook.
std_scaler = StandardScaler()

Separate the features from the labels and standardize the features.

In [6]:
# Work on one dense array: every column except the last is a feature,
# the last column holds the class label.
# NOTE(review): the scaler is fitted on all rows before the cross-validated
# search further down, so every validation fold has already influenced the
# scaling statistics.
dataset_values = dataset.values
data_std = std_scaler.fit_transform(dataset_values[:, :-1])
data_label = dataset_values[:, -1]

# Classifiers specifications

In [7]:
# Non-linear models:
from sklearn.neural_network import MLPClassifier

# Kernel-based models
from sklearn.svm import SVC

# Seed for the stochastic parts of MLP training so that re-running the
# notebook produces the same models.
RANDOM_SEED = 42

##### Base estimators (their hyperparameters are tuned by the search below):

# MLP: Adam optimiser with a small initial step size and a generous
# iteration budget.
# NOTE(review): scikit-learn documents `learning_rate` as only used when
# solver='sgd', so 'adaptive' looks like a no-op with solver='adam' —
# confirm and drop it if so.
mlp_clf = MLPClassifier(solver='adam', learning_rate='adaptive',
                        max_iter=1300, learning_rate_init=5e-04, tol=1e-4,
                        random_state=RANDOM_SEED)

# SVM with an RBF kernel; `C` and `gamma` are not set here.
svm_rbf_clf = SVC(kernel='rbf')

## Hyperparameter tuning by randomized search:

Classifiers definitions:

In [8]:
# Registry of the candidate models, keyed by the same labels that are used
# for the search spaces.
classifiers = {
    'MLP': mlp_clf,
    'SVM-RBF': svm_rbf_clf,
}

Define the parameter ranges to search:

In [9]:
# Candidate values the randomized search samples from, per classifier.
# MLP: values for `hidden_layer_sizes`, 2..499.
mlp_hidden_sizes = list(np.arange(2, 500))
# SVM-RBF: powers of two, gamma in 2^-15..2^2 and C in 2^-5..2^14.
svm_gamma_grid = [2**exponent for exponent in range(-15, 3)]
svm_c_grid = [2**exponent for exponent in range(-5, 15)]

param_dist_dict = {
    'MLP': {"hidden_layer_sizes": mlp_hidden_sizes},
    'SVM-RBF': {'gamma': svm_gamma_grid, 'C': svm_c_grid},
}

In [10]:
from sklearn.model_selection import RandomizedSearchCV

# Settings shared by every search.
SEARCH_SEED = 42          # fixes which candidates are sampled, for reproducible runs
RESULTS_DIR = 'Results'   # the wall-clock time of each search is written here

# Create the output directory up front: otherwise a missing folder is only
# discovered by open() after the first (long) search has already finished.
os.makedirs(RESULTS_DIR, exist_ok=True)

# One entry per classifier; each placeholder is replaced by the fitted search.
random_search = {name: [] for name in classifiers}

# NOTE: the loop variable must stay `clf` — a later cell prints it.
for clf in param_dist_dict:
    # NOTE(review): n_jobs=100 asks for 100 workers regardless of how many
    # cores are available; consider n_jobs=-1 on smaller machines.
    random_search[clf] = RandomizedSearchCV(
        classifiers[clf], param_dist_dict[clf], cv=5, n_iter=10, verbose=5,
        n_jobs=100, scoring='accuracy', random_state=SEARCH_SEED)

    # Time the complete search for this classifier.
    start_time = time()
    random_search[clf].fit(data_std, data_label)
    end_time = time() - start_time  # elapsed seconds, despite the name
    print('Elapsed time: {}'.format(end_time))

    # `with` closes the file even if the write fails.
    with open(os.path.join(RESULTS_DIR, 'train_time-{}.txt'.format(clf)), 'w') as f:
        f.write('{}'.format(end_time))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] hidden_layer_sizes=283 ..........................................
[CV] hidden_layer_sizes=283 ..........................................
[CV] hidden_layer_sizes=283 ..........................................
[CV] hidden_layer_sizes=283 ..........................................
[CV] hidden_layer_sizes=283 ..........................................
[CV] hidden_layer_sizes=179 ..........................................
[CV] hidden_layer_sizes=179 ..........................................
[CV] hidden_layer_sizes=179 ..........................................
[CV] hidden_layer_sizes=179 ..........................................
[CV] hidden_layer_sizes=179 ..........................................
[CV] hidden_layer_sizes=112 ..........................................
[CV] hidden_layer_sizes=112 ..........................................
[CV] hidden_layer_sizes=112 ..........................................
[CV] hidden_laye

[Parallel(n_jobs=100)]: Done   5 out of  50 | elapsed:  2.4min remaining: 21.2min


[CV] .. hidden_layer_sizes=44, score=0.9324244801883091, total= 8.3min
[CV] .. hidden_layer_sizes=44, score=0.8723759073965077, total= 8.8min
[CV] . hidden_layer_sizes=179, score=0.9041687101520353, total= 9.5min
[CV] .. hidden_layer_sizes=96, score=0.8603099862664312, total= 9.5min
[CV] . hidden_layer_sizes=129, score=0.9408591604550804, total= 9.9min
[CV] ... hidden_layer_sizes=99, score=0.939486072969792, total=10.1min
[CV] .. hidden_layer_sizes=96, score=0.9287956061200471, total=10.2min
[CV] .. hidden_layer_sizes=96, score=0.9049534085335948, total=10.3min
[CV] .. hidden_layer_sizes=179, score=0.853737492642731, total=10.6min
[CV] . hidden_layer_sizes=266, score=0.9387995292271479, total=10.9min
[CV] ... hidden_layer_sizes=99, score=0.853050814204434, total=11.0min


[Parallel(n_jobs=100)]: Done  16 out of  50 | elapsed: 11.0min remaining: 23.5min


[CV] . hidden_layer_sizes=266, score=0.9035707278791446, total=11.2min
[CV] .. hidden_layer_sizes=99, score=0.9055419323197645, total=11.3min
[CV] . hidden_layer_sizes=257, score=0.8548165587600549, total=11.4min
[CV] . hidden_layer_sizes=112, score=0.9331110239309534, total=11.7min
[CV] . hidden_layer_sizes=257, score=0.9406630051000392, total=11.7min
[CV] . hidden_layer_sizes=112, score=0.8467726113400039, total=11.8min
[CV] . hidden_layer_sizes=179, score=0.9413495488426834, total=12.0min
[CV] .. hidden_layer_sizes=112, score=0.813474551338629, total=12.2min
[CV] . hidden_layer_sizes=112, score=0.9027859525210908, total=12.2min
[CV] .. hidden_layer_sizes=99, score=0.9021973710025505, total=12.3min
[CV] .. hidden_layer_sizes=44, score=0.9054438450220696, total=12.4min


[Parallel(n_jobs=100)]: Done  27 out of  50 | elapsed: 12.5min remaining: 10.6min


[CV] .. hidden_layer_sizes=44, score=0.9042574063174417, total=12.7min
[CV] . hidden_layer_sizes=257, score=0.8285770324605276, total=12.9min
[CV] ... hidden_layer_sizes=96, score=0.902393564842064, total=13.0min
[CV] . hidden_layer_sizes=283, score=0.9402706943899568, total=13.0min
[CV] . hidden_layer_sizes=129, score=0.9063266307013241, total=13.3min
[CV] .. hidden_layer_sizes=129, score=0.807100127488477, total=13.3min
[CV] . hidden_layer_sizes=179, score=0.8069039913700108, total=13.3min
[CV] . hidden_layer_sizes=283, score=0.8242620378542709, total=13.4min
[CV] . hidden_layer_sizes=112, score=0.9101520353114272, total=13.5min
[CV] . hidden_layer_sizes=266, score=0.8451049637041397, total=13.5min
[CV] . hidden_layer_sizes=129, score=0.8454973513831666, total=13.5min


[Parallel(n_jobs=100)]: Done  38 out of  50 | elapsed: 13.5min remaining:  4.3min


[CV] .. hidden_layer_sizes=44, score=0.8151417083455919, total=13.6min
[CV] .. hidden_layer_sizes=283, score=0.853737492642731, total=13.6min
[CV] . hidden_layer_sizes=283, score=0.8998528690534576, total=13.7min
[CV] .... hidden_layer_sizes=96, score=0.84348337746396, total=13.7min
[CV] . hidden_layer_sizes=266, score=0.8278905560458959, total=13.7min
[CV] .. hidden_layer_sizes=257, score=0.903080243280361, total=13.8min
[CV] .. hidden_layer_sizes=266, score=0.904561059342815, total=13.8min
[CV] . hidden_layer_sizes=257, score=0.9057381069151544, total=13.9min
[CV] .. hidden_layer_sizes=99, score=0.8199470432480142, total=14.0min
[CV] . hidden_layer_sizes=179, score=0.9031783402001177, total=14.0min
[CV] . hidden_layer_sizes=129, score=0.9020992740827938, total=14.0min
[CV] . hidden_layer_sizes=283, score=0.9028840494408475, total=14.2min


[Parallel(n_jobs=100)]: Done  50 out of  50 | elapsed: 14.2min finished


Elapsed time: 868.5715322494507
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] gamma=0.000244140625, C=4 .......................................
[CV] gamma=0.000244140625, C=4 .......................................
[CV] gamma=0.000244140625, C=4 .......................................
[CV] gamma=0.000244140625, C=4 .......................................
[CV] gamma=0.000244140625, C=4 .......................................
[CV] gamma=0.0001220703125, C=0.25 ...................................
[CV] gamma=0.0001220703125, C=0.25 ...................................
[CV] gamma=0.0001220703125, C=0.25 ...................................
[CV] gamma=0.0001220703125, C=0.25 ...................................
[CV] gamma=0.0001220703125, C=0.25 ...................................
[CV] gamma=0.0625, C=0.03125 .........................................
[CV] gamma=0.0625, C=0.03125 .........................................
[CV] gamma=0.0625, C=0.03125 ..........................

[Parallel(n_jobs=100)]: Done   5 out of  50 | elapsed:  7.7min remaining: 69.4min


[CV] . gamma=0.0625, C=0.03125, score=0.898970083374203, total= 5.4min
[CV]  gamma=0.0625, C=0.03125, score=0.8923212709620476, total= 5.5min
[CV]  gamma=0.000244140625, C=4, score=0.898970083374203, total= 5.3min
[CV]  gamma=0.000244140625, C=4, score=0.8988819144762652, total= 5.3min
[CV] .. gamma=0.125, C=0.0625, score=0.8514268902618417, total= 5.5min
[CV]  gamma=0.0625, C=0.03125, score=0.8990582695703355, total= 5.6min
[CV]  gamma=0.000244140625, C=4, score=0.8990582695703355, total= 5.5min
[CV]  gamma=0.0625, C=0.03125, score=0.8990582695703355, total= 5.8min
[CV] .. gamma=0.125, C=0.0625, score=0.9016184404119667, total= 5.8min
[CV]  gamma=0.0625, C=0.03125, score=0.8988819144762652, total= 5.8min
[CV]  gamma=0.000244140625, C=4, score=0.8990582695703355, total= 5.7min


[Parallel(n_jobs=100)]: Done  16 out of  50 | elapsed:  8.8min remaining: 18.7min


[CV] .. gamma=0.125, C=0.0625, score=0.8997449480086326, total= 6.0min
[CV]  gamma=0.000244140625, C=4, score=0.8987937628714328, total= 6.1min
[CV] .. gamma=0.125, C=0.0625, score=0.8988819144762652, total= 6.1min
[CV] .. gamma=0.125, C=0.0625, score=0.9078869923484403, total= 6.3min
[CV] ......... gamma=0.5, C=4, score=0.8564842063959192, total=10.0min
[CV] ......... gamma=0.5, C=4, score=0.8456408747670884, total=10.9min
[CV] .......... gamma=0.5, C=4, score=0.936837975676736, total=11.5min
[CV] ....... gamma=0.125, C=32, score=0.866686286050618, total=12.8min
[CV] ......... gamma=0.5, C=4, score=0.9025897586815774, total=12.6min
[CV] ......... gamma=0.5, C=4, score=0.9168219715546837, total=12.8min
[CV] ....... gamma=4, C=0.125, score=0.9139774399215301, total=13.8min


[Parallel(n_jobs=100)]: Done  27 out of  50 | elapsed: 15.9min remaining: 13.6min


[CV] ....... gamma=4, C=0.125, score=0.8935648420639591, total=14.4min
[CV] ........ gamma=4, C=0.25, score=0.8530940472688046, total=14.5min
[CV] ....... gamma=4, C=0.125, score=0.8588800627635579, total=14.6min
[CV] ........ gamma=4, C=0.25, score=0.9143697891123099, total=14.8min
[CV] ........ gamma=4, C=0.25, score=0.9028840494408475, total=15.0min
[CV] ........ gamma=4, C=0.25, score=0.8733568765940749, total=15.1min
[CV] ....... gamma=4, C=0.125, score=0.9021973710025505, total=15.3min
[CV] ......... gamma=4, C=0.5, score=0.8603099862664312, total=15.7min
[CV] ....... gamma=4, C=0.125, score=0.9335033346410357, total=15.6min
[CV] .......... gamma=4, C=0.5, score=0.848092576247916, total=16.0min
[CV] ......... gamma=4, C=0.5, score=0.9159391858754291, total=15.9min


[Parallel(n_jobs=100)]: Done  38 out of  50 | elapsed: 17.6min remaining:  5.5min


[CV] ......... gamma=4, C=0.5, score=0.9027859525210908, total=16.6min
[CV] ......... gamma=4, C=0.25, score=0.937328364064339, total=16.6min
[CV] ......... gamma=4, C=0.5, score=0.9390937622597096, total=16.9min
[CV] ...... gamma=0.125, C=32, score=0.8449543983524566, total=18.6min
[CV] ...... gamma=0.125, C=32, score=0.9320321694782268, total=18.9min
[CV] ....... gamma=0.125, C=32, score=0.901706886403767, total=19.6min
[CV] ...... gamma=0.125, C=32, score=0.9178028445316332, total=20.2min
[CV]  gamma=0.001953125, C=16384, score=0.8491713248994802, total=24.9min
[CV]  gamma=0.001953125, C=16384, score=0.9029821463606043, total=25.2min
[CV]  gamma=0.001953125, C=16384, score=0.9006278202864431, total=25.7min
[CV]  gamma=0.001953125, C=16384, score=0.9120156939676312, total=25.8min
[CV]  gamma=0.001953125, C=16384, score=0.9031973322871715, total=26.1min


[Parallel(n_jobs=100)]: Done  50 out of  50 | elapsed: 26.2min finished


Elapsed time: 1651.0508208274841


# Save all models

In [11]:
# `clf` is the loop variable left over from the hyperparameter search cell,
# so this shows the key of the last classifier that was processed.
print(clf)

SVM-RBF


In [12]:
# Hand the searches (a dict keyed by classifier name) to save_models.
# save_models is not defined in this notebook; presumably it comes from the
# star import of utils.experiment — TODO confirm what it writes and where.
save_models(random_search)

{'MLP': RandomizedSearchCV(cv=8, error_score='raise',
           estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
        beta_2=0.999, early_stopping=False, epsilon=1e-08,
        hidden_layer_sizes=(100,), learning_rate='adaptive',
        learning_rate_init=0.0005, max_iter=1300, momentum=0.9,
        nesterovs_momentum=True, power_t=0.5, random_state=None,
        shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
        verbose=False, warm_start=False),
           fit_params=None, iid=True, n_iter=20, n_jobs=100,
           param_distributions={'hidden_layer_sizes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, ...81, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000]},
     

In [13]:
# Pick the MLP search object for inspection in the next cell.
u = random_search['MLP']

In [14]:
# Per-candidate cross-validation results (fit/score times, mean scores and
# the sampled parameters).
u.cv_results_



{'mean_fit_time': array([815.47842536, 712.35253181, 735.59505739, 768.82377558,
        704.33608208, 764.58576913, 125.41355691, 756.6995378 ,
        679.49937758, 669.20606089]),
 'mean_score_time': array([0.06226726, 0.07450514, 0.0326221 , 0.04779453, 0.03339429,
        0.08390164, 0.03966637, 0.0768734 , 0.03386216, 0.03690991]),
 'mean_test_score': array([0.88420041, 0.88186598, 0.88125785, 0.88037508, 0.88404347,
        0.88657407, 0.87855069, 0.88398462, 0.8879865 , 0.88592671]),
 'mean_train_score': array([0.92162981, 0.9202468 , 0.9199133 , 0.92048219, 0.92049202,
        0.92141894, 0.91854011, 0.92133557, 0.91970243, 0.91922672]),
 'param_hidden_layer_sizes': masked_array(data=[283, 179, 112, 129, 99, 257, 26, 266, 96, 44],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'hidden_layer_sizes': 283},
  {'hidden_layer_sizes': 179},
  {'hidden_laye

# Compute the Naive Bayes training time

In [15]:
from sklearn.naive_bayes import GaussianNB

# Time a single Gaussian Naive Bayes fit on the standardized data.
start_time = time()
bayes_clf = GaussianNB()
bayes_clf.fit(data_std, data_label)
end_time = time() - start_time  # elapsed seconds, despite the name

print('Elapsed time: {}'.format(end_time))

# Make sure the output folder exists, and let `with` close the file even if
# the write fails.
os.makedirs('Results', exist_ok=True)
with open('Results/train_time-Bayes.txt', 'w') as f:
    f.write('{}'.format(end_time))

Elapsed time: 0.009860038757324219


# Save pipeline to .pkl file

In [9]:
# Hand the scaler used for standardization to save_pipeline.
# save_pipeline is not defined in this notebook; presumably it comes from the
# star import of utils.experiment — TODO confirm the output file and format.
save_pipeline(std_scaler)