In [486]:
%load_ext autoreload
%autoreload 2

import sys

# implementation of reverse_shuffle
# Prepend a local eli5 checkout to sys.path so that `import eli5` below
# resolves to it before any other eli5 on the path.
# NOTE(review): absolute, machine-specific path -- presumably the author's
# checkout; confirm or make configurable before running elsewhere.
sys.path.insert(0, "/Users/user/git/eli5/")
import eli5

# Same mechanism for the local `sci` package, which provides the metrics,
# plotting and learning helpers imported below.
# NOTE(review): also an absolute, machine-specific path.
sys.path.insert(0, "/Users/user/git/datasci")
from sci.metrics import rmse, rmse_score, neg_rmse_score
import sci.plots as scip
import sci.learn as scil

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib
# Default figure size applied to every plot created after this point.
matplotlib.rcParams['figure.figsize'] = (10, 6)


import scikitplot as skplt

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised while fitting
# the models below are hidden too -- confirm that is intended.
warnings.filterwarnings('ignore')

import pandas as pd
# Render floats with 5 decimal places in DataFrame/Series output.
pd.set_option('display.float_format', '{:.5f}'.format)
# Allow up to 1000 rows to be displayed before pandas truncates the output.
pd.options.display.max_rows = 1000

# Notebook-wide seed, passed to the data loader below.
random_state = 7

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Data

In [3]:
# Load the "processed_min" dataset, passing the notebook-wide seed to the loader.
data = scil.load_data(
    "processed_min",
    random_state=random_state,
)

# Report the shape of the train and test feature matrices.
print("train: {}".format(data.X_train.shape))
print("test: {}".format(data.X_test.shape))

(250, 300) (250,)
train: (200, 300)
test: (50, 300)


## Choose Model

In [4]:
from sklearn.model_selection import *

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import *
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import *
from sklearn.ensemble import RandomForestClassifier

import xgboost
from xgboost import XGBClassifier, XGBRegressor

import lightgbm as lgb
from lightgbm import LGBMRegressor

from MeanModels import MeanModelsClassifier, MeanModelsRegressor
from StackingAveragedModels import StackingAveragedModels

## Setup

In [73]:
from mlxtend.evaluate import BootstrapOutOfBag, RandomHoldoutSplit
from sklearn.metrics import *

# Evaluation metric, kept both as a callable (`metric`) and as the matching
# sklearn scoring string (`scoring`).
metric = roc_auc_score
scoring = 'roc_auc'

# Publish the same choice as module-level attributes on sci.learn.
# Alternative scorer: make_scorer(scil.metric_global, needs_proba=True)
scil.metric_global = metric
scil.scoring_global = scoring

# Naming format:
#   cv_{total_splits}_{train/test-fold}
# e.g. cv_1000_20 = 20 folds x 50 repeats = 1000 train/test splits.
#
# Every randomised splitter is seeded with the notebook-wide `random_state`
# so that scores are reproducible from one run to the next.
cv_1_10 = RandomHoldoutSplit(valid_size=0.1, stratify=True, random_seed=random_state)
cv_5_5 = 5
cv_10_10 = 10
cv_1000_10 = RepeatedStratifiedKFold(n_splits=10, n_repeats=100, random_state=random_state)
cv_1000_20 = RepeatedStratifiedKFold(n_splits=20, n_repeats=50, random_state=random_state)

# Aliases by intended cost: one holdout split, plain 10-fold, 1000 repeated splits.
cv_single = cv_1_10
cv_fast = cv_10_10
cv_slow = cv_1000_20

## Bias-Variance Decomposition

In [83]:
from mlxtend.evaluate import bias_variance_decomp
def bv(model, loss='0-1_loss', num_rounds=200, random_seed=None):
    """Fit `model` and return its bias-variance decomposition.

    The model is first fitted through `scil.fit` on the notebook-level `data`,
    then handed to mlxtend's `bias_variance_decomp` together with the
    train/test splits of `data` converted to numpy arrays.

    Parameters
    ----------
    model : estimator
        Estimator passed to `scil.fit` and to `bias_variance_decomp`.
    loss : str, default '0-1_loss'
        Loss name forwarded to `bias_variance_decomp`.
    num_rounds : int, default 200
        Number of rounds forwarded to `bias_variance_decomp`.
    random_seed : int or None, default None
        Seed forwarded to `bias_variance_decomp`. When None, the
        notebook-wide `random_state` is used so repeated runs give the
        same decomposition.

    Returns
    -------
    tuple
        ``(avg_expected_loss, avg_bias, avg_var)``.
    """
    if random_seed is None:
        # Fall back to the notebook-wide seed instead of an unseeded run.
        random_seed = random_state

    scil.fit(data, model, as_numpy=True)
    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        model,
        data.X_train.values,
        data.y_train.values,
        data.X_test.values,
        data.y_test.values,
        loss=loss,
        num_rounds=num_rounds,
        random_seed=random_seed,
    )
    return avg_expected_loss, avg_bias, avg_var

# One (avg_expected_loss, avg_bias, avg_var) tuple per model, in the
# iteration order of `models_sk`; the model names are not needed here.
res = [bv(model) for model in models_sk.values()]

fitting KNeighborsClassifier ..
fitting SVC ..
fitting SVC ..
fitting SVC ..
fitting SVC ..
fitting GaussianProcessClassifier ..
fitting DecisionTreeClassifier ..
fitting DecisionTreeClassifier ..
fitting RandomForestClassifier ..
fitting RandomForestClassifier ..
fitting MLPClassifier ..
fitting AdaBoostClassifier ..
fitting SGDClassifier ..
fitting GaussianNB ..
fitting BernoulliNB ..
fitting LinearDiscriminantAnalysis ..
fitting QuadraticDiscriminantAnalysis ..
fitting LogisticRegression ..
fitting GradientBoostingClassifier ..
fitting NuSVC ..


In [84]:
# Tabulate the decomposition results: one row per model (labelled with its
# key in `models_sk`), one column per component returned by `bv`.
df = pd.DataFrame(
    res,
    columns=['avg_expected_loss', 'avg_bias', 'avg_var'],
    index=list(models_sk),
)

In [85]:
# Display the bias-variance table built in the previous cell.
df

Unnamed: 0,avg_expected_loss,avg_bias,avg_var
KNN,0.4502,0.44,0.2548
SVC-linear,0.4,0.4,0.0
SVC-rbf-def,0.3947,0.4,0.0099
SVC-rbf1,0.4,0.4,0.0
SVC-rbf2,0.4,0.4,0.0
GaussianProcessClassifier,0.4486,0.44,0.1742
DecisionTreeClassifier-def,0.4215,0.3,0.3679
DecisionTreeClassifier-5,0.4234,0.38,0.3442
RandomForestClassifier-def,0.4206,0.42,0.286
RandomForestClassifier-1,0.418,0.4,0.0994


## Bootstrap

In [87]:
# numpy is imported here because no visible cell of the notebook imports it;
# without this the cell only runs if `np` is already present in the kernel.
import numpy as np
from mlxtend.evaluate import bootstrap

# Synthetic sample: 100 draws from a normal distribution located at 5,
# generated from a fixed seed.
rng = np.random.RandomState(123)
x = rng.normal(loc=5., size=100)

# Bootstrap the sample mean over 1000 rounds with a 95% confidence interval.
original, std_err, ci_bounds = bootstrap(x,
                                         num_rounds=1000,
                                         func=np.mean,
                                         ci=0.95,
                                         seed=123)
print('Mean: %.2f, SE: +/- %.2f, CI95: [%.2f, %.2f]' % (original,
                                                        std_err,
                                                        ci_bounds[0],
                                                        ci_bounds[1]))

Mean: 5.03, SE: +/- 0.11, CI95: [4.80, 5.26]
