### Imports

In [1]:
import os
import pickle
import warnings

import numpy as np
import sklearn
import matplotlib.pyplot as plt

from constants import CLASSIFIERS_FOLDER

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.base import clone

from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network.multilayer_perceptron import MLPClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors.classification import RadiusNeighborsClassifier
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.linear_model.ridge import RidgeClassifierCV
from sklearn.linear_model.ridge import RidgeClassifier
from sklearn.linear_model.passive_aggressive import PassiveAggressiveClassifier
from sklearn.gaussian_process.gpc import GaussianProcessClassifier
#from sklearn.ensemble.voting_classifier import VotingClassifier
from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.ensemble.bagging import BaggingClassifier
from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.svm.classes import OneClassSVM
#from sklearn.mixture import DPGMM
#from sklearn.mixture import GMM
from sklearn.mixture import GaussianMixture
#from sklearn.mixture import VBGMM

from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

### Load Data

In [2]:
# Only one feature file is active at a time; swap the commented lines below
# to load one of the other feature files instead of 'X_right.npy'.
# X = np.load('X.npy')
# X = np.load('X_left.npy')
X, y = np.load('X_right.npy'), np.load('y.npy')

### Train/Test Split

In [3]:
# Hold out 10% of the samples for testing; the fixed seed makes the shuffled
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
    shuffle=True,
)

### Learning Curve

In [4]:
def learning_curve(model, X_train, X_test, y_train, y_test):
    """Plot train/test accuracy of ``model`` for growing training-set sizes.

    For 10%, 20%, ..., 100% of the training data (always the leading slice,
    in the order given), a clone of ``model`` is fitted on that slice and
    scored with ``accuracy_score`` on the slice itself and on the full test
    set. Each completed step is printed as a percentage, both curves are
    drawn on a new matplotlib figure, and the accuracies of the last step
    are printed.

    Parameters
    ----------
    model : estimator
        Passed to ``clone`` for every step; ``model`` itself is never fitted.
    X_train, y_train : sliceable sequences
        Training samples and labels; must support ``len`` and ``[:n]``.
    X_test, y_test : sequences
        Held-out samples and labels used for the test curve.

    Returns
    -------
    None
    """
    # Integer percentages avoid the rounding error that accumulates with a
    # float step (np.arange(0.1, 1.1, 0.1)) and make the slice size exact.
    percentages = list(range(10, 101, 10))

    train_scores = []
    test_scores = []

    for pct in percentages:
        train_size = len(X_train) * pct // 100
        X_train_ = X_train[:train_size]
        y_train_ = y_train[:train_size]

        # Fit a clone so every step starts from the same unfitted settings.
        model_ = clone(model)
        model_.fit(X_train_, y_train_)
        train_pred = model_.predict(X_train_)
        test_pred = model_.predict(X_test)

        train_scores.append(accuracy_score(y_train_, train_pred))
        test_scores.append(accuracy_score(y_test, test_pred))

        print(f'{pct}%', end=' ')
    print()

    # Start a new figure so that repeated calls (e.g. one per classifier in a
    # loop) do not pile all their curves onto the same axes.
    plt.figure()
    plt.plot(train_scores, label='Train')
    plt.plot(test_scores, label='Test')
    plt.title('Learning Curve')
    plt.ylabel('Accuracy')
    plt.xlabel('% of Training Size')
    plt.xticks(range(len(percentages)), percentages)
    plt.ylim(0.0, 1.01)
    plt.legend()

    print(f'Final Training Accuracy: {train_scores[-1] * 100}%')
    print(f'Final Testing Accuracy: {test_scores[-1] * 100}%')

### Models

In [5]:
# Candidate models, each constructed with its default arguments. The order
# of this sequence is the order in which they are fitted, reported and saved.
#
# Disabled entries (not part of the run):
#   RadiusNeighborsClassifier, MultinomialNB, NuSVC,
#   GaussianProcessClassifier, OneClassSVM,
#   ClassifierChain, MultiOutputClassifier, OutputCodeClassifier,
#   OneVsOneClassifier, OneVsRestClassifier,
#   VotingClassifier, DPGMM, GMM, VBGMM (their imports are commented out too).
#
# NOTE(review): GaussianMixture comes from sklearn.mixture rather than from a
# classifier module -- confirm that comparing its predict() output with y via
# accuracy is meaningful before relying on its score.
classifiers = [
    estimator_class()
    for estimator_class in (
        ExtraTreeClassifier,
        DecisionTreeClassifier,
        MLPClassifier,
        NearestCentroid,
        KNeighborsClassifier,
        SGDClassifier,
        RidgeClassifierCV,
        RidgeClassifier,
        PassiveAggressiveClassifier,
        AdaBoostClassifier,
        GradientBoostingClassifier,
        BaggingClassifier,
        ExtraTreesClassifier,
        RandomForestClassifier,
        CalibratedClassifierCV,
        BernoulliNB,
        GaussianNB,
        LabelPropagation,
        LabelSpreading,
        LogisticRegression,
        LogisticRegressionCV,
        Perceptron,
        QuadraticDiscriminantAnalysis,
        LinearDiscriminantAnalysis,
        SVC,
        LinearSVC,
        GaussianMixture,
    )
]

### Test All Models

In [6]:
# Fit every candidate on the training split and collect
# (class name, train accuracy, test accuracy) tuples in `scores`.
# The instances in `classifiers` are fitted in place, so the later save cell
# pickles the fitted models.
scores = []
for classifier in classifiers:
    name = type(classifier).__name__
    # Silence ConvergenceWarning, and only while fitting; the standard
    # library context manager replaces sklearn's test-only ignore_warnings
    # helper and restores the previous warning filters on exit.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=ConvergenceWarning)
        classifier.fit(X_train, y_train)
    train_pred = classifier.predict(X_train)
    test_pred = classifier.predict(X_test)
    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)
    scores.append((name, train_acc, test_acc))
    print(name, train_acc, test_acc)
    #learning_curve(classifier, X_train, X_test, y_train, y_test)

ExtraTreeClassifier 0.9540833060019679 0.7315634218289085
DecisionTreeClassifier 0.9540833060019679 0.8171091445427728
MLPClassifier 0.9203017382748442 0.8702064896755162
NearestCentroid 0.3876680878976714 0.4277286135693215
KNeighborsClassifier 0.8838963594621188 0.8584070796460177
SGDClassifier 0.6569367005575598 0.6548672566371682
RidgeClassifierCV 0.722859954083306 0.6902654867256637
RidgeClassifier 0.722859954083306 0.6902654867256637
PassiveAggressiveClassifier 0.5401771072482782 0.504424778761062
GaussianProcessClassifier 0.9540833060019679 0.058997050147492625
AdaBoostClassifier 0.23122335191866186 0.22123893805309736
GradientBoostingClassifier 0.9540833060019679 0.8377581120943953
BaggingClassifier 0.9517874713020662 0.8761061946902655
ExtraTreesClassifier 0.9540833060019679 0.8761061946902655




RandomForestClassifier 0.9521154476877665 0.8584070796460177




CalibratedClassifierCV 0.7520498524106264 0.7345132743362832
BernoulliNB 0.26697277795998686 0.2920353982300885
GaussianNB 0.558543784847491 0.5899705014749262


  probabilities /= normalizer


LabelPropagation 0.9540833060019679 0.15634218289085547


  probabilities /= normalizer


LabelSpreading 0.9540833060019679 0.15634218289085547
LogisticRegression 0.8740570678911118 0.8289085545722714




LogisticRegressionCV 0.8478189570350935 0.8377581120943953
Perceptron 0.6589045588717612 0.6548672566371682
QuadraticDiscriminantAnalysis 0.9176779271892423 0.6696165191740413




LinearDiscriminantAnalysis 0.8461790751065923 0.8141592920353983




SVC 0.9540833060019679 0.18584070796460178
LinearSVC 0.5073794686782551 0.44542772861356933




OneClassSVM 0.05673991472613972 0.0058997050147492625
GaussianMixture 0.14266972777959988 0.12389380530973451


In [7]:
# Rank the models in place from highest to lowest test accuracy (third field
# of each tuple); models with equal test accuracy keep their current order.
scores.sort(key=lambda entry: entry[2], reverse=True)
for position, (label, acc_train, acc_test) in enumerate(scores):
    print(position, label, acc_train, acc_test)

0 BaggingClassifier 0.9517874713020662 0.8761061946902655
1 ExtraTreesClassifier 0.9540833060019679 0.8761061946902655
2 MLPClassifier 0.9203017382748442 0.8702064896755162
3 KNeighborsClassifier 0.8838963594621188 0.8584070796460177
4 RandomForestClassifier 0.9521154476877665 0.8584070796460177
5 GradientBoostingClassifier 0.9540833060019679 0.8377581120943953
6 LogisticRegressionCV 0.8478189570350935 0.8377581120943953
7 LogisticRegression 0.8740570678911118 0.8289085545722714
8 DecisionTreeClassifier 0.9540833060019679 0.8171091445427728
9 LinearDiscriminantAnalysis 0.8461790751065923 0.8141592920353983
10 CalibratedClassifierCV 0.7520498524106264 0.7345132743362832
11 ExtraTreeClassifier 0.9540833060019679 0.7315634218289085
12 RidgeClassifierCV 0.722859954083306 0.6902654867256637
13 RidgeClassifier 0.722859954083306 0.6902654867256637
14 QuadraticDiscriminantAnalysis 0.9176779271892423 0.6696165191740413
15 SGDClassifier 0.6569367005575598 0.6548672566371682
16 Perceptron 0.65890

### Save

In [8]:
#str(type(NearestCentroid()))[16:-2].split('.')

In [9]:
# Create the output folder (and any missing parent folders); an already
# existing folder is fine.
os.makedirs(CLASSIFIERS_FOLDER, exist_ok=True)

for classifier in classifiers:
    classifier_type = type(classifier)
    # File name = lower-cased capital letters of the class name, followed by
    # the first letter of every module below the top-level package, e.g. a
    # RandomForestClassifier defined in sklearn.ensemble.forest -> 'rfc-e-f'.
    # Reading __name__/__module__ gives the same names as slicing the class
    # repr at a fixed offset, without depending on the package name length.
    acronym = ''.join(c for c in classifier_type.__name__ if c.isupper()).lower()
    module_initials = [part[0] for part in classifier_type.__module__.split('.')[1:]]
    name = acronym + '-' + '-'.join(module_initials)
    with open(os.path.join(CLASSIFIERS_FOLDER, name), 'wb') as file:
        pickle.dump(classifier, file)