# Basic Modeling Benchmarks
In this notebook I benchmark several learning models on the same data, without any preprocessing, feature selection, or hyperparameter tuning, to get a general idea of which models perform well on this multiclass classification problem. Both the training and the validation data are resampled so that minority classes get appropriate representation.

8 models that we'll consider:
* Logistic Regression
* SVC with Linear Kernel
* SVC with RBF Kernel
* k-Nearest Neighbors
* Decision Trees (as a Random Forest ensemble)
* Artificial Neural Networks
* Naive Bayes
* AdaBoost


In [3]:
# Prepare the data

import ml_utils as mu
from sklearn.model_selection import train_test_split

def _balanced_xy(frame):
    """Split ``frame`` via ``mu.split_x_and_y`` and pass the result through
    ``mu.resample_to_equal_class_sizes``, returning whatever that helper returns
    (unpacked by the callers below as a features/labels pair)."""
    return mu.resample_to_equal_class_sizes(*mu.split_x_and_y(frame))


# Everything that touches ml_utils runs inside HiddenPrints
# (presumably it silences the helpers' console output -- confirm in ml_utils).
with mu.HiddenPrints():
    # Training set, then its class-balanced features/labels.
    data = mu.get_training_data()
    X, y = _balanced_xy(data)

    # Validation set is requested with the training columns and balanced the same way.
    valid_data = mu.get_validation_data(data.columns, use_mean_adjusted_data=True)
    X_valid, y_valid = _balanced_xy(valid_data)


# Hold back 20% of the balanced training data, stratified by label, with a fixed seed.
# NOTE(review): resampling happens BEFORE this split. If
# resample_to_equal_class_sizes oversamples by duplicating rows, copies of the
# same row may land in both partitions and inflate the holdout score -- confirm.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
    stratify=y,
)

In [11]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import StratifiedKFold, cross_val_score

from sklearn.metrics import confusion_matrix

# Seed shared by every estimator that accepts one, so that a fresh
# "Restart Kernel -> Run All" reproduces the same scores.
RANDOM_STATE = 43

# Untuned benchmark models; hyperparameters are fixed on purpose.
classifiers = [
    KNeighborsClassifier(5),
    SVC(kernel='linear', C=0.025),
    SVC(kernel='rbf', gamma=2, C=1),
    RandomForestClassifier(max_depth=5, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    LogisticRegression(C=1e5, random_state=RANDOM_STATE),
    MLPClassifier(hidden_layer_sizes=[25,25,25], alpha=1, random_state=RANDOM_STATE)
]


def _report_split(fitted, features, labels, title):
    """Print the accuracy and confusion matrix of ``fitted`` on one data split.

    Parameters
    ----------
    fitted : estimator that has already been fit (must expose ``predict`` and ``score``).
    features, labels : the split to evaluate on.
    title : str, prefix for the score line (e.g. ``"Holdout"``).

    Returns
    -------
    The predictions made for ``features``.
    """
    predictions = fitted.predict(features)
    print("%s Data Score: %f" % (title, fitted.score(features, labels)))
    print()
    print(confusion_matrix(labels, predictions))
    print()
    return predictions


# The splitter is the same for every model, so build it once.
cv_splitter = StratifiedKFold(5)

for model in classifiers:
    print("=======================================================================")
    print(model)
    print('----------------------------------')
    # cross_val_score is called before the fit below and its result is only summarised.
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv_splitter)
    print("Cross-Validation Mean Accuracy: %f" % cv_scores.mean())
    print('----------------------------------')
    clf = model.fit(X_train, y_train)
    # Held-out 20% of the (resampled) training data.
    y_pred = _report_split(clf, X_holdout, y_holdout, "Holdout")
    print('----------------------------------')
    # Separately loaded validation set.
    y_valid_pred = _report_split(clf, X_valid, y_valid, "Validation")

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
----------------------------------
Cross-Validation Mean Accuracy: 0.818948
----------------------------------
Holdout Data Score: 0.853125

[[76  1  2  1]
 [ 3 42 16 19]
 [ 0  1 79  0]
 [ 0  4  0 76]]

----------------------------------
Validation Data Score: 0.710938

[[32  0  0  0]
 [ 0 15  9  8]
 [ 0  0 22 10]
 [ 0  1  9 22]]

SVC(C=0.025, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
----------------------------------
Cross-Validation Mean Accuracy: 0.664608
----------------------------------
Holdout Data Score: 0.687500

[[76  0  2  2]
 [ 5 42 15 18]
 [ 0 11 55 14]
 [ 0  7 26 47]]

----------------------------------
Validation Data Score: 0.812500

[[