In [None]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Installed below as a silent replacement for ``warnings.warn``.
    """
    return None


# Replace the standard-library hook with the no-op above: code that looks up
# ``warnings.warn`` at call time after this cell has run emits nothing.
warnings.warn = warn

# Classification

In [1]:
from sklearn.datasets import make_classification

# Fixed random_state so the synthetic dataset is identical on every run;
# the global NumPy seed is only set in a later cell, after this data exists.
X, y = make_classification(n_samples=100, n_features=100, random_state=42)

In [18]:
import time
import numpy as np
# Seed NumPy's global random number generator so that code relying on it in
# later cells is repeatable across runs.
np.random.seed(42)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, GridSearchCV

from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [8]:
# Number of distinct target labels in y; used as the cluster count for KMeans.
n_clusters = np.unique(y).size

# Display name -> unfitted estimator. Insertion order is the order in which
# results are printed at the end of the notebook.
models = dict([
    ('KMeans', KMeans(n_clusters=n_clusters)),
    ('Logistic Regression', LogisticRegression()),
    ('Linear Discriminant', LinearDiscriminantAnalysis()),
    ('Nearest Neighbors', KNeighborsClassifier()),
    ('Linear SVM', SVC(kernel='linear', gamma='auto')),
    ('Gaussian Process', GaussianProcessClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Neural Net', MLPClassifier(alpha=1, max_iter=1000)),
    ('AdaBoost', AdaBoostClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('QDA', QuadraticDiscriminantAnalysis()),
])

# Hyper-parameter grids, keyed by the same display names used in `models`.
# Only models that are tuned by grid search have an entry here.
parameters = {}
parameters['Nearest Neighbors'] = {'n_neighbors': [1, 3, 5, 7]}
parameters['Linear SVM'] = {'C': [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3]}
parameters['Decision Tree'] = {'max_depth': [None, 3, 5, 7]}
parameters['Random Forest'] = {'n_estimators': [30, 100, 300]}
parameters['Neural Net'] = {
    'hidden_layer_sizes': [30, 100, 300],
    'activation': ['logistic', 'tanh', 'relu'],
}
parameters['AdaBoost'] = {'n_estimators': [30, 100, 300]}

# Models that are cross-validated as-is, with the settings given in `models`.
# Every name listed here must be a key of `models`; the former 'RBF SVM' entry
# had no matching model and raised KeyError when looked up, so it was dropped.
only_execute = ['KMeans', 'Logistic Regression', 'Linear Discriminant', 'Gaussian Process', 'Naive Bayes', 'QDA']
# Models whose hyper-parameters are tuned by grid search over `parameters[name]`.
search_execute = ['Nearest Neighbors', 'Linear SVM', 'Decision Tree', 'Random Forest', 'Neural Net', 'AdaBoost']

In [16]:
def _timed_cross_validate(name, estimator, X, y):
    """Standardise, cross-validate `estimator` on accuracy, and print the CPU time used."""
    start = time.process_time()
    pipeline = Pipeline([('transformer', StandardScaler()), ('estimator', estimator)])
    scores = cross_validate(pipeline, X, y, scoring='accuracy')
    # process_time() counts CPU time of this process, not wall-clock time.
    print('Elapsed time of {} is {:.6f} seconds.'.format(name, time.process_time() - start))
    return scores


def experimental(X, y):
    """Cross-validate every configured model on (X, y).

    Names in the module-level `only_execute` list are evaluated with the
    estimator stored in `models` as-is. Names in `search_execute` are wrapped
    in a `GridSearchCV` over `parameters[name]`; because the search object is
    itself the estimator handed to `cross_validate`, the grid search is re-run
    on the training part of every outer fold.

    Each estimator is preceded by exactly one `StandardScaler`. No fit on the
    full data set is performed up front: `cross_validate` clones its estimator
    for every fold, so such a fit would be discarded.

    Parameters
    ----------
    X : features, passed unchanged to `cross_validate`.
    y : target labels, passed unchanged to `cross_validate`.

    Returns
    -------
    dict
        Maps each model name to the dict returned by `cross_validate`
        (its 'test_score' entry holds the per-fold accuracies).

    Raises
    ------
    KeyError
        If a listed name is missing from `models` (or, for searched models,
        from `parameters`).
    """
    ans = {}

    for c in only_execute:
        ans[c] = _timed_cross_validate(c, models[c], X, y)

    for c in search_execute:
        search = GridSearchCV(models[c], param_grid=parameters[c])
        ans[c] = _timed_cross_validate(c, search, X, y)

    return ans

In [None]:
# Run the full benchmark; `ans` maps each model name to its cross-validation results.
ans = experimental(X=X, y=y)

In [None]:
# Print each model's test accuracy averaged over the cross-validation folds.
# For the definition of scoring='accuracy' (subset accuracy in the multilabel
# case, where the predicted label set must exactly match y_true), see:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
for model_name in models:
    fold_scores = ans[model_name]['test_score']
    print(model_name, fold_scores.mean())