<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [3]:
# Some important imports
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import pickle
from mpl_toolkits.mplot3d import Axes3D
sns.set_context('notebook')

# Sklearn imports
import sklearn
from sklearn import calibration
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split

# Some imports that may require package installation
try:
    import missingno as msno
except ModuleNotFoundError:
    print('You need to run: pip install missingno')

# Own Scripts import
from scripts.energy_test_DP import *
from scripts.utils import *
from scripts.preprocessing import *
from scripts.plot import *
from scripts.model_selection import *

# get rid of warning due to deprecated modules in sklearn
# NOTE(review): simplefilter('ignore') is called without a category, so it
# silences *every* warning for the rest of the session, not only sklearn
# deprecation notices. Consider narrowing it (e.g. to DeprecationWarning /
# FutureWarning) so real problems stay visible.
import warnings
warnings.simplefilter('ignore')

# Constants
# Folder, relative to the notebook's working directory, that holds the input
# sample and receives the pickled tuning results written further down.
DATA_FOLDER = './Data'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Load the '27_06' sample through the project helper `usable_data`; per the log
# printed below it reads ./Data/sample_27_06.xlsx and drops some features.
# presumably x = feature vectors, y = sick/healthy labels, dates = one date per
# row -- TODO confirm against the helper's definition in scripts/.
x,y,dates = usable_data('27_06',DATA_FOLDER)

Reading ./Data/sample_27_06.xlsx
Performing some transformation
Saving to ./Data/sample_27_06_sick_only.pk
The sample is composed of : 24173 vectors of dimension 291
	n_sick		=   7110
	n_healthy	=  17063
Deleting 54 features (18.305%).


In [6]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement. Fall back
# to the legacy class only on old installations so this cell runs on either.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer as SimpleImputer
# Imported explicitly: `import sklearn` alone does not guarantee that the
# `sklearn.preprocessing` submodule is available as an attribute.
from sklearn.preprocessing import StandardScaler

# Each pipeline gets its own imputer/scaler instances so that fitting one
# pipeline never touches the state of another.

# Mean imputation only.
simple_preproc = make_pipeline(
    SimpleImputer(strategy='mean'))

# Mean imputation, then standardisation of every feature.
standard_preproc = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler())

# Mean imputation, standardisation, then PCA keeping as many components as
# needed to explain 95% of the variance.
PCA_preproc = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    PCA(0.95))

# Mean imputation followed by Linear Discriminant Analysis.
LDA_preproc = make_pipeline(
    SimpleImputer(strategy='mean'),
    LinearDiscriminantAnalysis())

# Name -> pipeline lookup of every preprocessing variant defined above.
preprocessing = {'simple_preproc':simple_preproc,
                 'standard_preproc':standard_preproc,
                 'PCA_preproc':PCA_preproc,
                 'LDA_preproc':LDA_preproc}

In [7]:
# Preprocessing selected for the first tuning run: `simple_preproc`, i.e. the
# pipeline that only performs mean imputation.
preproc = simple_preproc

In [None]:
# Two-stage tuning of a gradient-boosting classifier on top of `simple_preproc`.
# The preprocessing is bound here, inside the cell, so the search does not
# depend on which other cell last assigned `preproc`.
preproc = simple_preproc


def estimator(kw_args):
    """Build an unfitted pipeline: `preproc` followed by gradient boosting.

    Parameters
    ----------
    kw_args : dict
        Keyword arguments forwarded to ``GradientBoostingClassifier``.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    return make_pipeline(preproc,
                         sklearn.ensemble.GradientBoostingClassifier(**kw_args))


# Stage 1: coarse grid over the learning rate and the number of trees.
learning_rates = np.linspace(0.05,0.3,5)
# The loop variable is deliberately not called `x`, to avoid confusion with the
# dataset `x` passed to the search below.
n_estimators = [int(n) for n in np.linspace(20,100,10)]
params = {'learning_rate': learning_rates, 'n_estimators':n_estimators}

res_init = custom_GridSearchCV(x,y,dates,estimator,params,cv=5)

# Best stage-1 combination, as reported by the search under ['best']['best_args'].
optimal_args = res_init['best']['best_args']
opt_learning_rate = optimal_args['learning_rate']
opt_n_est = optimal_args['n_estimators']

# Stage 2: freeze learning rate / tree count at their stage-1 optimum and
# search over tree depth and the minimum number of samples needed to split.
params = {'learning_rate': [opt_learning_rate], 'n_estimators': [opt_n_est],
          'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200)}

res = custom_GridSearchCV(x,y,dates,estimator,params,cv=5)


[############################################----------------] 74.0% (37/50) Trying different combinations

In [None]:
# Persist the latest grid-search result (`res`) under DATA_FOLDER, using the
# most compact pickle protocol this interpreter supports.
with open(DATA_FOLDER+'/tuned_simple_preproc.pickle', 'wb') as handle:
    handle.write(pickle.dumps(res, protocol=pickle.HIGHEST_PROTOCOL))

In [None]:
# Two-stage tuning of a gradient-boosting classifier on top of `standard_preproc`
# (mean imputation + standardisation).
preproc = standard_preproc


def estimator(kw_args):
    """Build an unfitted pipeline: `preproc` followed by gradient boosting.

    Parameters
    ----------
    kw_args : dict
        Keyword arguments forwarded to ``GradientBoostingClassifier``.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # Must use `preproc`, not `simple_preproc`: the earlier copy-pasted factory
    # kept the simple pipeline here, so the "standard" search silently repeated
    # the simple one.
    return make_pipeline(preproc,
                         sklearn.ensemble.GradientBoostingClassifier(**kw_args))


# Stage 1: coarse grid over the learning rate and the number of trees.
learning_rates = np.linspace(0.05,0.3,5)
# The loop variable is deliberately not called `x`, to avoid confusion with the
# dataset `x` passed to the search below.
n_estimators = [int(n) for n in np.linspace(20,100,10)]
params = {'learning_rate': learning_rates, 'n_estimators':n_estimators}

res_init = custom_GridSearchCV(x,y,dates,estimator,params,cv=5)

# Best stage-1 combination, as reported by the search under ['best']['best_args'].
optimal_args = res_init['best']['best_args']
opt_learning_rate = optimal_args['learning_rate']
opt_n_est = optimal_args['n_estimators']

# Stage 2: freeze learning rate / tree count at their stage-1 optimum and
# search over tree depth and the minimum number of samples needed to split.
params = {'learning_rate': [opt_learning_rate], 'n_estimators': [opt_n_est],
          'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200)}

res = custom_GridSearchCV(x,y,dates,estimator,params,cv=5)

In [None]:
# Persist the latest grid-search result (`res`) under DATA_FOLDER, using the
# most compact pickle protocol this interpreter supports.
with open(DATA_FOLDER+'/tuned_standard_preproc.pickle', 'wb') as handle:
    handle.write(pickle.dumps(res, protocol=pickle.HIGHEST_PROTOCOL))