# Introdução à Seleção de Modelos


## Scikit-learn pipelines

Abaixo é apresentado um exemplo de pipeline que combina PCA e KNN

<img src="https://github.com/rasbt/stat451-machine-learning-fs20/raw/ee813e1c30a5610a2e6475a77c67c1174a63b75c/L05/code/images/sklearn-pipeline.png" width="400">


In [6]:
import numpy as np

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
iris = datasets.load_iris()

X = iris.data
y = iris.target

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 1, stratify = y)

In [4]:
pipe = Pipeline([
        ('z-score', StandardScaler()),
        ('reduce_dim', PCA(n_components = 3)),
        ('classify', KNeighborsClassifier(n_neighbors = 5))])

In [5]:
pipe.fit(X_train, y_train)

In [9]:
y_test_pred = pipe.predict(X_test)

In [10]:
accuracy_score(y_test, y_test_pred)

0.9666666666666667

## Scikit-learn grid-search

<img src="https://github.com/rasbt/stat451-machine-learning-fs20/raw/ee813e1c30a5610a2e6475a77c67c1174a63b75c/L05/code/images/holdout-tuning.png" width="400">

In [11]:
pipe = Pipeline([
        ('z-score', StandardScaler()),
        ('reduce_dim', PCA()),
        ('classify', KNeighborsClassifier())])

In [17]:
param_grid = {
    'reduce_dim__n_components': [1, 2, 3, 4],
    'classify__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15]
}

In [18]:
grid = GridSearchCV(pipe, cv = 10, n_jobs = 1, param_grid = param_grid, scoring='accuracy')

In [19]:
grid.fit(X_train, y_train)

In [15]:
print(grid.cv_results_)

{'mean_fit_time': array([0.00143981, 0.00128589, 0.00132163, 0.00128684, 0.00125968,
       0.00127091, 0.00127211, 0.00124526, 0.00125639, 0.00125852,
       0.00126507, 0.00125568, 0.0012481 , 0.00125279, 0.0012485 ,
       0.00122364]), 'std_fit_time': array([2.27266312e-04, 4.39909350e-05, 1.17615684e-04, 6.93971901e-05,
       2.53234316e-05, 4.14267221e-05, 3.21656972e-05, 1.67229857e-05,
       3.63622414e-05, 3.27066844e-05, 2.54097502e-05, 4.06919893e-05,
       1.97001998e-05, 1.62493063e-05, 6.90525405e-06, 3.00634484e-05]), 'mean_score_time': array([0.00201674, 0.00181785, 0.00187125, 0.00181916, 0.00180442,
       0.00178125, 0.00179985, 0.0017936 , 0.00179727, 0.00180435,
       0.00179968, 0.00179667, 0.00177596, 0.00178649, 0.00179143,
       0.0017895 ]), 'std_score_time': array([1.86275134e-04, 5.73333184e-05, 1.02795475e-04, 6.19441064e-05,
       5.29730512e-05, 1.45963780e-05, 3.94508119e-05, 4.33193436e-05,
       3.42407679e-05, 6.25556923e-05, 4.26898702e-05, 2.

In [20]:
grid.cv_results_['mean_test_score']

array([0.91666667, 0.89166667, 0.94166667, 0.94166667, 0.93333333,
       0.93333333, 0.95833333, 0.93333333, 0.94166667, 0.91666667,
       0.95833333, 0.95      , 0.89166667, 0.925     , 0.95833333,
       0.95      , 0.90833333, 0.93333333, 0.95833333, 0.95833333,
       0.91666667, 0.95      , 0.95833333, 0.96666667, 0.91666667,
       0.95      , 0.95833333, 0.96666667, 0.94166667, 0.94166667,
       0.96666667, 0.975     ])

In [21]:
print(grid.best_score_)
print(grid.best_params_)

0.9749999999999999
{'classify__n_neighbors': 15, 'reduce_dim__n_components': 4}


In [22]:
clf = grid.best_estimator_

In [28]:
y_test_pred2 = clf.predict(X_test)

In [29]:
accuracy_score(y_test, y_test_pred2)

0.9666666666666667