### Imports

In [19]:
import numpy as np
import pickle
import sklearn
import matplotlib.pyplot as plt

from constants import CLASSIFIERS_FOLDER
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.base import clone

from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network.multilayer_perceptron import MLPClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors.classification import RadiusNeighborsClassifier
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.linear_model.ridge import RidgeClassifierCV
from sklearn.linear_model.ridge import RidgeClassifier
from sklearn.linear_model.passive_aggressive import PassiveAggressiveClassifier
from sklearn.gaussian_process.gpc import GaussianProcessClassifier
#from sklearn.ensemble.voting_classifier import VotingClassifier
from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.ensemble.bagging import BaggingClassifier
from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.svm.classes import OneClassSVM
#from sklearn.mixture import DPGMM
#from sklearn.mixture import GMM
from sklearn.mixture import GaussianMixture
#from sklearn.mixture import VBGMM

from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

### Load Data

In [20]:
# X = np.load('X.npy')
# X = np.load('X_left.npy')
X = np.load('X_right.npy')
y = np.load('y.npy')

### Split Test Train

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=0)

### Learning Curve

In [22]:
def learning_curve(model, X_train, X_test, y_train, y_test):
    train_scores = []
    test_scores = []
    
    for i in np.arange(0.1, 1.1, 0.1):
        train_size = int(len(X_train) * i)
        X_train_ = X_train[:train_size]
        y_train_ = y_train[:train_size]
        
        model_ = clone(model)
        model_.fit(X_train_, y_train_)
        train_pred = model_.predict(X_train_)
        test_pred = model_.predict(X_test)
        
        train_acc = accuracy_score(y_train_, train_pred)
        test_acc = accuracy_score(y_test, test_pred)
        
        train_scores.append(train_acc)
        test_scores.append(test_acc)
        
        print(f'{int(i * 100)}%', end=' ')
    print()
        
    plt.plot(train_scores, label='Train')
    plt.plot(test_scores, label='Test')
    plt.title('Learning Curve')
    plt.ylabel('Accuracy')
    plt.xlabel('% of Training Size')
    plt.xticks(range(10), (np.arange(0.1, 1.1, 0.1) * 100).astype(int))
    plt.ylim(0.0, 1.01)
    plt.legend()
    
    print(f'Final Training Accuracy: {train_scores[-1] * 100}%')
    print(f'Final Testing Accuracy: {test_scores[-1] * 100}%')

### Models

In [23]:
classifiers = [
    ExtraTreeClassifier(),
    DecisionTreeClassifier(),
    MLPClassifier(),
    NearestCentroid(),
    #RadiusNeighborsClassifier(),
    KNeighborsClassifier(),
    #ClassifierChain(),
    #MultiOutputClassifier(),
    #OutputCodeClassifier(),
    #OneVsOneClassifier(),
    #OneVsRestClassifier(),
    SGDClassifier(),
    RidgeClassifierCV(),
    RidgeClassifier(),
    PassiveAggressiveClassifier(),
    ##GaussianProcessClassifier(),
#     VotingClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    RandomForestClassifier(),
    CalibratedClassifierCV(),
    BernoulliNB(),
    GaussianNB(),
    LabelPropagation(),
    LabelSpreading(),
    LogisticRegression(),
    LogisticRegressionCV(),
    Perceptron(),
    #MultinomialNB(),
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis(),
    SVC(),
    LinearSVC(),
    #NuSVC(),
    ##OneClassSVM(),
#     DPGMM(),
#     GMM(),
    GaussianMixture(),
#     VBGMM()
]

### Test All Models

In [24]:
scores = []
for classifier in classifiers:
    name2 = str(type(classifier))[16:-2].split('.')
    name2 = ''.join([c for c in name2[-1] if c.isupper()]).lower() + '-' + '-'.join(x[0] for x in name2[:-1])
    name = type(classifier).__name__ + ' '+ name2
    ignore_warnings(category=ConvergenceWarning)(classifier.fit)(X_train, y_train)
    train_pred = classifier.predict(X_train)
    test_pred = classifier.predict(X_test)
    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)
    scores.append((name, train_acc, test_acc))
    print(name, train_acc, test_acc)
    #learning_curve(classifer, X_train, X_test, y_train, y_test)

ExtraTreeClassifier etc-t-t 0.9540833060019679 0.7433628318584071
DecisionTreeClassifier dtc-t-t 0.9540833060019679 0.8023598820058997
MLPClassifier mlpc-n-m 0.8934076746474254 0.8495575221238938
NearestCentroid nc-n-n 0.3876680878976714 0.4277286135693215
KNeighborsClassifier knc-n-c 0.8838963594621188 0.8584070796460177
SGDClassifier sgdc-l-s 0.670055755985569 0.6843657817109144
RidgeClassifierCV rccv-l-r 0.722859954083306 0.6902654867256637
RidgeClassifier rc-l-r 0.722859954083306 0.6902654867256637
PassiveAggressiveClassifier pac-l-p 0.5080354214496556 0.5250737463126843
AdaBoostClassifier abc-e-w 0.23122335191866186 0.23008849557522124
GradientBoostingClassifier gbc-e-g 0.9540833060019679 0.8377581120943953
BaggingClassifier bc-e-b 0.9508035421449655 0.8495575221238938
ExtraTreesClassifier etc-e-f 0.9540833060019679 0.8613569321533924




RandomForestClassifier rfc-e-f 0.9530993768448671 0.8584070796460177




CalibratedClassifierCV cccv-c 0.7520498524106264 0.7345132743362832
BernoulliNB bnb-n 0.26697277795998686 0.2920353982300885
GaussianNB gnb-n 0.558543784847491 0.5899705014749262


  probabilities /= normalizer


LabelPropagation lp-s-l 0.9540833060019679 0.15634218289085547


  probabilities /= normalizer


LabelSpreading ls-s-l 0.9540833060019679 0.15634218289085547
LogisticRegression lr-l-l 0.8740570678911118 0.8289085545722714




LogisticRegressionCV lrcv-l-l 0.8478189570350935 0.8377581120943953
Perceptron p-l-p 0.6589045588717612 0.6548672566371682
QuadraticDiscriminantAnalysis qda-d 0.9176779271892423 0.6696165191740413




LinearDiscriminantAnalysis lda-d 0.8461790751065923 0.8141592920353983
SVC svc-s-c 0.9540833060019679 0.18584070796460178
LinearSVC lsvc-s-c 0.625778943916038 0.6135693215339233
GaussianMixture gm-m-g 0.14266972777959988 0.12389380530973451


In [25]:
scores.sort(key=lambda x: x[2], reverse=True)
for i, score in enumerate(scores):
    print(i, score[0], score[1], score[2])

0 ExtraTreesClassifier etc-e-f 0.9540833060019679 0.8613569321533924
1 KNeighborsClassifier knc-n-c 0.8838963594621188 0.8584070796460177
2 RandomForestClassifier rfc-e-f 0.9530993768448671 0.8584070796460177
3 MLPClassifier mlpc-n-m 0.8934076746474254 0.8495575221238938
4 BaggingClassifier bc-e-b 0.9508035421449655 0.8495575221238938
5 GradientBoostingClassifier gbc-e-g 0.9540833060019679 0.8377581120943953
6 LogisticRegressionCV lrcv-l-l 0.8478189570350935 0.8377581120943953
7 LogisticRegression lr-l-l 0.8740570678911118 0.8289085545722714
8 LinearDiscriminantAnalysis lda-d 0.8461790751065923 0.8141592920353983
9 DecisionTreeClassifier dtc-t-t 0.9540833060019679 0.8023598820058997
10 ExtraTreeClassifier etc-t-t 0.9540833060019679 0.7433628318584071
11 CalibratedClassifierCV cccv-c 0.7520498524106264 0.7345132743362832
12 RidgeClassifierCV rccv-l-r 0.722859954083306 0.6902654867256637
13 RidgeClassifier rc-l-r 0.722859954083306 0.6902654867256637
14 SGDClassifier sgdc-l-s 0.6700557559

### Save

In [26]:
#str(type(NearestCentroid()))[16:-2].split('.')

In [27]:
if not os.path.isdir(CLASSIFIERS_FOLDER):
    os.mkdir(CLASSIFIERS_FOLDER)
    
for classifier in classifiers:
    name = str(type(classifier))[16:-2].split('.')
    name = ''.join([c for c in name[-1] if c.isupper()]).lower() + '-' + '-'.join(x[0] for x in name[:-1])
    with open(os.path.join(CLASSIFIERS_FOLDER, name), 'wb') as file:
        pickle.dump(classifier, file)