In [87]:
import heapq
import warnings

import mglearn as mglearn
import np as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

import sklearn_evaluation
from sklearn.calibration import CalibratedClassifierCV
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_selection import SelectFromModel, SelectPercentile, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.svm import SVC, LinearSVC

!pip install numpy scipy scikit-learn matplotlib pandas
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import StackingClassifier
import heapq


You should consider upgrading via the '/Users/engineer/workspace/cse590-machine-learning/venv/bin/python -m pip install --upgrade pip' command.[0m


read data from files

In [121]:
X = pd.read_csv("../dataset/extracted_features.csv").values
y = pd.read_csv("../dataset/labels.csv").values
# images = pd.read_csv("../dataset/raw_images.csv").values
X_train_default, X_test_default, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)
y_train = y_train.ravel()


perform 4 feature selection methods

In [122]:
index_for_removal = np.array(
    [
        0, 10, 14, 17, 25, 26, 35, 39, 49, 59, 62,
    ]
)
X_train_zero_variance = np.delete(X_train_default, index_for_removal, 1)
X_test_zero_variance = np.delete(X_test_default, index_for_removal, 1)

select = SelectPercentile(percentile=90)
select.fit(X_train_zero_variance, y_train)
X_train_univariate = select.transform(X_train_zero_variance)
X_test_univariate = select.transform(X_test_zero_variance)

rf = RandomForestClassifier(
    n_estimators=200,
    max_features=3,
).fit(X_train_default, y_train)

select_sfm = SelectFromModel(
    rf,
    threshold='0.1*mean',
)
select_sfm.fit(X_train_univariate, y_train)
X_train_from_model = select_sfm.transform(X_train_univariate)
X_test_from_model = select_sfm.transform(X_test_univariate)

select_rfe = RFE(
    RandomForestClassifier(
        n_estimators=200, random_state=42
    ),
    n_features_to_select=40
)

select_rfe.fit(X_train_from_model, y_train)
X_train_iterative = select_rfe.transform(X_train_from_model)
X_test_iterative = select_rfe.transform(X_test_from_model)


standard scaler: default and post-feature selection

In [123]:
scaler_standard_default = StandardScaler().fit(X_train_default)
X_train_standard_default = scaler_standard_default.transform(X_train_default)
X_test_standard_default = scaler_standard_default.transform(X_test_default)

scaler_standard_refined = StandardScaler().fit(X_train_iterative)
X_train_standard_refined = scaler_standard_refined.transform(X_train_iterative)
X_test_standard_refined = scaler_standard_refined.transform(X_test_iterative)


robust scaler: default and post-feature selection

In [124]:
scaler_robust_default = RobustScaler().fit(X_train_default)
X_train_robust_default = scaler_robust_default.transform(X_train_default)
X_test_robust_default = scaler_robust_default.transform(X_test_default)

scaler_robust_refined = RobustScaler().fit(X_train_iterative)
X_train_robust_refined = scaler_robust_refined.transform(X_train_iterative)
X_test_robust_refined = scaler_robust_refined.transform(X_test_iterative)


minmax scaler: default and post-feature selection

In [125]:
scaler_minmax_default = MinMaxScaler().fit(X_train_default)
X_train_minmax_default = scaler_minmax_default.transform(X_train_default)
X_test_minmax_default = scaler_minmax_default.transform(X_test_default)

scaler_minmax_refined = MinMaxScaler().fit(X_train_iterative)
X_train_minmax_refined = scaler_minmax_refined.transform(X_train_iterative)
X_test_minmax_refined = scaler_minmax_refined.transform(X_test_iterative)


PCA: default and post-feature selection

In [126]:
pca_default = PCA(n_components=56, random_state=42)
X_train_pca_default = pca_default.fit_transform(X_train_standard_default)
X_test_pca_default = pca_default.transform(X_test_standard_default)

pca_refined = PCA(n_components=37, random_state=42)
X_train_pca_refined = pca_refined.fit_transform(X_train_standard_refined)
X_test_pca_refined = pca_refined.transform(X_test_standard_refined)


In [127]:
data_train_default = {
    'default': X_train_default,
    'standard': X_train_standard_default,
    'robust': X_train_robust_default,
    'minmax': X_train_minmax_default,
    'pca': X_train_pca_default
}
data_test_default = {
    'default': X_test_default,
    'standard': X_test_standard_default,
    'robust': X_test_robust_default,
    'minmax': X_test_minmax_default,
    'pca': X_test_pca_default
}
data_default = {
    'train': data_train_default,
    'test': data_test_default
}


In [128]:
data_train_refined = {
    'default': X_train_iterative,
    'standard': X_train_standard_refined,
    'robust': X_train_robust_refined,
    'minmax': X_train_minmax_refined,
    'pca': X_train_pca_refined
}
data_test_refined = {
    'default': X_test_iterative,
    'standard': X_test_standard_refined,
    'robust': X_test_robust_refined,
    'minmax': X_test_minmax_refined,
    'pca': X_test_pca_refined
}
data_refined = {
    'train': data_train_refined,
    'test': data_test_refined
}


In [136]:
score_types = {'accuracy', 'auc', 'f1'}


def canTrain(key, canAcceptNegative):
    if key == 'standard' or key == 'robust' or key == 'pca':
        return canAcceptNegative
    else:
        return True


def grid_search_all_data(classifier, param_grid, data, canAcceptNegative):
    grid_results = {}

    for key in data['train']:
        if not canTrain(key, canAcceptNegative):
            continue

        X_train_data = data['train'][key]
        grid_search = GridSearchCV(
            classifier,
            param_grid,
            cv=4,
            return_train_score=True
        )
        grid_results[key] = grid_search.fit(X_train_data, y_train)

    # print_grid_search_results(grid_search, hyper_parameters)
    return grid_results


In [137]:



def score_model_after_cv(classifier, grid_results, data, requiresCalibration, canAcceptNegative):
    scores = {}

    for key in grid_results:
        if not canTrain(key, canAcceptNegative):
            continue

        clf = classifier.set_params(**grid_results[key].best_params_)
        if requiresCalibration:
            clf = CalibratedClassifierCV(clf)
        X_train_data = data['train'][key]
        X_test_data = data['test'][key]
        clf.fit(X_train_data, y_train)
        scores[key] = {}
        scores[key]['train acc'] = clf.score(X_train_data, y_train)
        scores[key]['test acc'] = clf.score(X_test_data, y_test)
        y_pred = clf.predict_proba(X_test_data)
        scores[key]['auc'] = roc_auc_score(y_test, y_pred, multi_class='ovr')
        # scores[key]['f1'] = f1_score(y_test, y_pred)

    return scores

    # print(mlp.score(X_train_minmax, y_train))
    # print(mlp.score(X_test_minmax, y_test))
    # print(roc_auc_score(y_test, mlp.predict_proba(X_test_minmax), multi_class='ovr'))


In [250]:
def display_grid_results(grid_results):
    keys = list(grid_results.keys())
    param_by_data = {}
    param_names = list(grid_results[keys[0]].best_params_.keys())
    print(param_names)

    for key in grid_results:
        param_values = []
        for param in grid_results[key].best_params_:
            param_values.append(grid_results[key].best_params_[param])
        param_by_data[key] = param_values
    print(param_by_data)
    df = pd.DataFrame.from_dict(param_by_data,  orient='index', columns=param_names)
    return df

KNN

In [65]:
knn_param_grid = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10]
}

knn_grid_results_default = grid_search_all_data(KNeighborsClassifier(), knn_param_grid, data_default, True)

In [64]:
knn_grid_results_refined = grid_search_all_data(KNeighborsClassifier(), knn_param_grid, data_refined, True)

In [57]:
# pd.DataFrame(knn_default_grid_results)
print(knn_grid_results_default['default'].best_params_)

{'n_neighbors': 5}


In [67]:
knn_scores_default = score_model_after_cv(
    classifier=KNeighborsClassifier(), grid_results=knn_grid_results_default, data=data_default,
    requiresCalibration=False, canAcceptNegative=True
)

In [80]:
pd.DataFrame(knn_scores_default)

Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.909296,0.866479,0.67493,0.898028,0.867042
test acc,0.853604,0.790541,0.581081,0.828829,0.790541
auc,0.974448,0.939579,0.883708,0.952293,0.939512


In [78]:
knn_scores_refined = score_model_after_cv(
    classifier=KNeighborsClassifier(), grid_results=knn_grid_results_refined, data=data_refined,
    requiresCalibration=False, canAcceptNegative=True
)


In [79]:
pd.DataFrame(knn_scores_refined)

Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.908732,0.864789,0.744789,0.897465,0.849014
test acc,0.86036,0.804054,0.567568,0.826577,0.79955
auc,0.974616,0.957757,0.844677,0.966323,0.959398


In [232]:

display_grid_results(knn_grid_results_default)

['n_neighbors']
{'default': [5], 'standard': [5], 'robust': [9], 'minmax': [4], 'pca': [5]}


Unnamed: 0,n_neighbors
default,5
standard,5
robust,9
minmax,4
pca,5


In [233]:
display_grid_results(knn_grid_results_refined)



['n_neighbors']
{'default': [5], 'standard': [7], 'robust': [4], 'minmax': [5], 'pca': [9]}


Unnamed: 0,n_neighbors
default,5
standard,7
robust,4
minmax,5
pca,9


Logreg

In [69]:
logreg_param_grid = {
    'C': np.linspace(0, 5, 20)
}


In [72]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    logreg_grid_results_default = grid_search_all_data(LogisticRegression(), logreg_param_grid, data_default, True)

  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weigh

In [73]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    logreg_grid_results_refined = grid_search_all_data(LogisticRegression(), logreg_param_grid, data_refined, True)

  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weight),
  loss += 0.5 * alpha * squared_norm(w)
  grad[:, :n_features] += alpha * w
  args=(X, target, 1.0 / C, sample_weigh

In [81]:
logreg_scores_default = score_model_after_cv(
    classifier=LogisticRegression(), grid_results=logreg_grid_results_default, data=data_default,
    requiresCalibration=False, canAcceptNegative=True
)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [82]:
pd.DataFrame(logreg_scores_default)


Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.976338,0.957746,0.96,0.941972,0.957746
test acc,0.912162,0.916667,0.887387,0.923423,0.916667
auc,0.991583,0.992378,0.988755,0.992831,0.992378


In [83]:
logreg_scores_refined = score_model_after_cv(
    classifier=LogisticRegression(), grid_results=logreg_grid_results_refined, data=data_refined,
    requiresCalibration=False, canAcceptNegative=True
)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [84]:
pd.DataFrame(logreg_scores_refined)


Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.962254,0.942535,0.943099,0.934648,0.936901
test acc,0.907658,0.912162,0.891892,0.912162,0.898649
auc,0.990123,0.991504,0.988231,0.991812,0.989709


In [235]:

display_grid_results(logreg_grid_results_default)


['C']
{'default': [0.2631578947368421], 'standard': [0.2631578947368421], 'robust': [0.5263157894736842], 'minmax': [4.7368421052631575], 'pca': [0.2631578947368421]}


Unnamed: 0,C
default,0.263158
standard,0.263158
robust,0.526316
minmax,4.736842
pca,0.263158


In [237]:
display_grid_results(logreg_grid_results_refined)

['C']
{'default': [0.2631578947368421], 'standard': [0.2631578947368421], 'robust': [0.2631578947368421], 'minmax': [4.7368421052631575], 'pca': [0.2631578947368421]}


Unnamed: 0,C
default,0.263158
standard,0.263158
robust,0.263158
minmax,4.736842
pca,0.263158


LSVC

In [144]:
lsvc_param_grid = {
    'C': np.linspace(0, 5, 20)
}


In [145]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    lsvc_grid_results_default = grid_search_all_data(LinearSVC(), lsvc_param_grid, data_default, True)


4 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/engineer/workspace/cse590-machine-learning/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/engineer/workspace/cse590-machine-learning/venv/lib/python3.9/site-packages/sklearn/svm/_classes.py", line 257, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/Users/engineer/workspace/cse590-machine-learning/venv/lib/python3.9/site-packages/sklearn/svm/_base.py", line 1186, in _fit_liblinear
    raw_coef_, n_iter_ = liblinear.tr

In [146]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    lsvc_grid_results_refined = grid_search_all_data(LinearSVC(), lsvc_param_grid, data_refined, True)


4 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/engineer/workspace/cse590-machine-learning/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/engineer/workspace/cse590-machine-learning/venv/lib/python3.9/site-packages/sklearn/svm/_classes.py", line 257, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/Users/engineer/workspace/cse590-machine-learning/venv/lib/python3.9/site-packages/sklearn/svm/_base.py", line 1186, in _fit_liblinear
    raw_coef_, n_iter_ = liblinear.tr

In [147]:
lsvc_scores_default = score_model_after_cv(
    classifier=LinearSVC(), grid_results=lsvc_grid_results_default, data=data_default, requiresCalibration=True,
    canAcceptNegative=True
)



In [148]:

pd.DataFrame(lsvc_scores_default)

Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.943662,0.947606,0.949296,0.930141,0.947606
test acc,0.912162,0.916667,0.916667,0.912162,0.916667
auc,0.985756,0.986922,0.986757,0.98758,0.986886


In [152]:
lsvc_scores_refined = score_model_after_cv(
    classifier=LinearSVC(), grid_results=lsvc_grid_results_refined, data=data_refined, requiresCalibration=True,
    canAcceptNegative=True
)



In [153]:
pd.DataFrame(lsvc_scores_refined)

Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.929577,0.932958,0.934648,0.92169,0.925634
test acc,0.90991,0.907658,0.903153,0.905405,0.896396
auc,0.985518,0.985873,0.985634,0.986456,0.984051


Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.931268,0.932958,0.933521,0.92169,0.925634
test acc,0.900901,0.907658,0.903153,0.905405,0.898649
auc,0.984959,0.985888,0.985189,0.986456,0.984056


In [238]:

display_grid_results(lsvc_grid_results_default)

['C']
{'default': [0.2631578947368421], 'standard': [0.2631578947368421], 'robust': [0.2631578947368421], 'minmax': [0.5263157894736842], 'pca': [0.2631578947368421]}


Unnamed: 0,C
default,0.263158
standard,0.263158
robust,0.263158
minmax,0.526316
pca,0.263158


In [239]:

display_grid_results(lsvc_grid_results_refined)

['C']
{'default': [0.2631578947368421], 'standard': [0.2631578947368421], 'robust': [0.2631578947368421], 'minmax': [0.7894736842105263], 'pca': [0.5263157894736842]}


Unnamed: 0,C
default,0.263158
standard,0.263158
robust,0.263158
minmax,0.789474
pca,0.526316


## MNB

In [154]:
mnb_param_grid = {
    'alpha': np.linspace(0, 2, 20)
}

In [155]:
mnb_grid_results_default = grid_search_all_data(MultinomialNB(), mnb_param_grid, data_default, False)




In [156]:
mnb_grid_results_refined = grid_search_all_data(MultinomialNB(), mnb_param_grid, data_refined, False)



In [140]:
mnb_scores_default = score_model_after_cv(
    classifier=MultinomialNB(),
    grid_results=mnb_grid_results_default,
    data=data_default,
    requiresCalibration=False,
    canAcceptNegative=True
)

In [157]:

pd.DataFrame(mnb_scores_default)

Unnamed: 0,default,minmax
train acc,0.901408,0.842254
test acc,0.898649,0.84009
auc,0.987748,0.978726


In [158]:

mnb_scores_refined = score_model_after_cv(
    classifier=MultinomialNB(),
    grid_results=mnb_grid_results_refined,
    data=data_refined,
    requiresCalibration=False,
    canAcceptNegative=True
)




In [159]:
pd.DataFrame(mnb_scores_refined)


Unnamed: 0,default,minmax
train acc,0.892958,0.828169
test acc,0.88964,0.835586
auc,0.9868,0.976534


In [240]:

display_grid_results(mnb_grid_results_default)

['alpha']
{'default': [0.21052631578947367], 'minmax': [0.10526315789473684]}


Unnamed: 0,alpha
default,0.210526
minmax,0.105263


In [241]:

display_grid_results(mnb_grid_results_refined)


['alpha']
{'default': [0.0], 'minmax': [0.0]}


Unnamed: 0,alpha
default,0.0
minmax,0.0


## Random Forest

In [161]:
rf_param_grid = {
    'max_features': [1, 3, 5],
    'n_estimators': [200, 400, 600],
    'random_state': [42]
}

In [162]:

rf_grid_results_default = grid_search_all_data(RandomForestClassifier(), rf_param_grid, data_default, True)

In [163]:

rf_grid_results_refined = grid_search_all_data(RandomForestClassifier(), rf_param_grid, data_refined, True)

In [164]:
rf_scores_default = score_model_after_cv(
    classifier=RandomForestClassifier(),
    grid_results=rf_grid_results_default,
    data=data_default,
    requiresCalibration=False,
    canAcceptNegative=True
)

In [165]:

pd.DataFrame(rf_scores_default)

Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.991549,0.991549,0.991549,0.991549,0.991549
test acc,0.88964,0.88964,0.88964,0.88964,0.876126
auc,0.98657,0.98657,0.98657,0.98657,0.984269


In [166]:

rf_scores_refined = score_model_after_cv(
    classifier=RandomForestClassifier(),
    grid_results=rf_grid_results_refined,
    data=data_refined,
    requiresCalibration=False,
    canAcceptNegative=True
)

In [167]:

pd.DataFrame(rf_scores_refined)

Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.991549,0.991549,0.991549,0.991549,0.991549
test acc,0.887387,0.887387,0.887387,0.887387,0.878378
auc,0.986719,0.986719,0.986719,0.986719,0.982399


In [242]:

display_grid_results(rf_grid_results_default)

['max_features', 'n_estimators', 'random_state']
{'default': [3, 400, 42], 'standard': [3, 400, 42], 'robust': [3, 400, 42], 'minmax': [3, 400, 42], 'pca': [5, 400, 42]}


Unnamed: 0,max_features,n_estimators,random_state
default,3,400,42
standard,3,400,42
robust,3,400,42
minmax,3,400,42
pca,5,400,42


In [243]:

display_grid_results(rf_grid_results_refined)

['max_features', 'n_estimators', 'random_state']
{'default': [3, 600, 42], 'standard': [3, 600, 42], 'robust': [3, 600, 42], 'minmax': [3, 600, 42], 'pca': [3, 600, 42]}


Unnamed: 0,max_features,n_estimators,random_state
default,3,600,42
standard,3,600,42
robust,3,600,42
minmax,3,600,42
pca,3,600,42


## GBRT


In [168]:
gbrt_param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [1, 3],
    'learning_rate': [0.01, 0.1],
    'random_state': [42]
}

In [169]:

gbrt_grid_results_default = grid_search_all_data(GradientBoostingClassifier(), gbrt_param_grid, data_default, True)

In [170]:

gbrt_grid_results_refined = grid_search_all_data(GradientBoostingClassifier(), gbrt_param_grid, data_refined, True)

In [171]:

gbrt_scores_default = score_model_after_cv(
    classifier=GradientBoostingClassifier(),
    grid_results=gbrt_grid_results_default,
    data=data_default,
    requiresCalibration=False,
    canAcceptNegative=True
)

In [172]:

pd.DataFrame(gbrt_scores_default)

Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.974648,0.974648,0.974648,0.974648,0.991549
test acc,0.878378,0.878378,0.878378,0.878378,0.858108
auc,0.988681,0.988681,0.988681,0.988681,0.986279


In [173]:

gbrt_scores_refined = score_model_after_cv(
    classifier=GradientBoostingClassifier(),
    grid_results=gbrt_grid_results_refined,
    data=data_refined,
    requiresCalibration=False,
    canAcceptNegative=True
)

In [174]:

pd.DataFrame(gbrt_scores_refined)


Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.973521,0.973521,0.973521,0.973521,0.991549
test acc,0.871622,0.871622,0.871622,0.871622,0.86036
auc,0.988029,0.988029,0.988029,0.988029,0.987662


In [244]:

display_grid_results(gbrt_grid_results_default)

['learning_rate', 'max_depth', 'n_estimators', 'random_state']
{'default': [0.1, 1, 400, 42], 'standard': [0.1, 1, 400, 42], 'robust': [0.1, 1, 400, 42], 'minmax': [0.1, 1, 400, 42], 'pca': [0.1, 3, 400, 42]}


Unnamed: 0,learning_rate,max_depth,n_estimators,random_state
default,0.1,1,400,42
standard,0.1,1,400,42
robust,0.1,1,400,42
minmax,0.1,1,400,42
pca,0.1,3,400,42


In [245]:

display_grid_results(gbrt_grid_results_refined)

['learning_rate', 'max_depth', 'n_estimators', 'random_state']
{'default': [0.1, 1, 400, 42], 'standard': [0.1, 1, 400, 42], 'robust': [0.1, 1, 400, 42], 'minmax': [0.1, 1, 400, 42], 'pca': [0.1, 3, 400, 42]}


Unnamed: 0,learning_rate,max_depth,n_estimators,random_state
default,0.1,1,400,42
standard,0.1,1,400,42
robust,0.1,1,400,42
minmax,0.1,1,400,42
pca,0.1,3,400,42


## KVSM
### linear

In [181]:
ksvm_param_grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.1, 1, 5, 10],
    'degree': [1, 2, 3],
    'gamma': [.1, 1, 10],
    'probability': [True]
}

In [183]:

ksvm_linear_grid_results_default = grid_search_all_data(SVC(), ksvm_param_grid, data_default, True)

In [185]:

ksvm_linear_grid_results_refined = grid_search_all_data(SVC(), ksvm_param_grid, data_refined, True)

In [184]:


ksvm_scores_default = score_model_after_cv(
    classifier=SVC(),
    grid_results=ksvm_linear_grid_results_default,
    data=data_default,
    requiresCalibration=False,
    canAcceptNegative=True
)

In [189]:

pd.DataFrame(ksvm_scores_default)


Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.970704,0.961127,0.959437,0.981972,0.961127
test acc,0.912162,0.905405,0.862613,0.921171,0.905405
auc,0.992343,0.992409,0.987326,0.995784,0.992686


In [187]:

ksvm_scores_refined = score_model_after_cv(
    classifier=SVC(),
    grid_results=ksvm_linear_grid_results_refined,
    data=data_refined,
    requiresCalibration=False,
    canAcceptNegative=True
)

In [190]:

pd.DataFrame(ksvm_scores_refined)

Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.961127,0.945915,0.948169,0.96338,0.938028
test acc,0.90991,0.900901,0.855856,0.912162,0.891892
auc,0.992178,0.991542,0.986628,0.995137,0.990545


In [191]:
pd.DataFrame(ksvm_scores_refined)

Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.961127,0.945915,0.948169,0.96338,0.938028
test acc,0.90991,0.900901,0.855856,0.912162,0.891892
auc,0.992178,0.991542,0.986628,0.995137,0.990545


In [246]:

display_grid_results(ksvm_linear_grid_results_default)

['C', 'degree', 'gamma', 'kernel', 'probability']
{'default': [0.1, 1, 0.1, 'poly', True], 'standard': [0.1, 1, 0.1, 'linear', True], 'robust': [0.1, 1, 0.1, 'linear', True], 'minmax': [5, 1, 1, 'rbf', True], 'pca': [0.1, 1, 0.1, 'linear', True]}


Unnamed: 0,C,degree,gamma,kernel,probability
default,0.1,1,0.1,poly,True
standard,0.1,1,0.1,linear,True
robust,0.1,1,0.1,linear,True
minmax,5.0,1,1.0,rbf,True
pca,0.1,1,0.1,linear,True


In [247]:

display_grid_results(ksvm_linear_grid_results_refined)

['C', 'degree', 'gamma', 'kernel', 'probability']
{'default': [0.1, 1, 0.1, 'poly', True], 'standard': [0.1, 1, 0.1, 'linear', True], 'robust': [0.1, 1, 0.1, 'linear', True], 'minmax': [1, 1, 1, 'rbf', True], 'pca': [0.1, 1, 0.1, 'linear', True]}


Unnamed: 0,C,degree,gamma,kernel,probability
default,0.1,1,0.1,poly,True
standard,0.1,1,0.1,linear,True
robust,0.1,1,0.1,linear,True
minmax,1.0,1,1.0,rbf,True
pca,0.1,1,0.1,linear,True


# MLP


In [252]:
mlp_param_grid = {
    'activation': ['identity', 'relu', 'logistic'],
    'hidden_layer_sizes': [128, 256, 512],
    'alpha': [0.001, 0.005, 0.01]
}

In [254]:

mlp_grid_results_default = grid_search_all_data(MLPClassifier(), mlp_param_grid, data_default, True)




In [258]:

mlp_grid_results_refined = grid_search_all_data(MLPClassifier(), mlp_param_grid, data_refined, True)



In [253]:
mlp_scores_default = score_model_after_cv(
    classifier=MLPClassifier(),
    grid_results=mlp_grid_results_default,
    data=data_default,
    requiresCalibration=False,
    canAcceptNegative=True
)

NameError: name 'mlp_grid_results_default' is not defined

In [None]:

pd.DataFrame(mlp_scores_default)

In [259]:

mlp_scores_refined = score_model_after_cv(
    classifier=MLPClassifier(),
    grid_results=mlp_grid_results_refined,
    data=data_refined,
    requiresCalibration=False,
    canAcceptNegative=True
)



In [260]:

pd.DataFrame(mlp_scores_refined)

Unnamed: 0,default,standard,robust,minmax,pca
train acc,0.988169,0.988732,0.984789,0.986479,0.988732
test acc,0.927928,0.927928,0.912162,0.936937,0.918919
auc,0.996097,0.996343,0.99399,0.996657,0.994676


In [261]:
display_grid_results(mlp_grid_results_refined)


['activation', 'alpha', 'hidden_layer_sizes']
{'default': ['logistic', 0.001, 128], 'standard': ['relu', 0.01, 128], 'robust': ['logistic', 0.005, 128], 'minmax': ['relu', 0.01, 512], 'pca': ['relu', 0.01, 128]}


Unnamed: 0,activation,alpha,hidden_layer_sizes
default,logistic,0.001,128
standard,relu,0.01,128
robust,logistic,0.005,128
minmax,relu,0.01,512
pca,relu,0.01,128


In [257]:
print('hw')

hw
