# Live Let's Data - Otimização de Hiperparâmetros

Nessa live vamos aprender sobre como testar hiperparâmetros de formas diferentes! Usando GridSearch, RandomizedSearch e BayesSearch.

Lembrando que tunar HP é muito legal, mas o que dá diferença mesmo no modelo é feature engineering!!!


Fortemente inspirado nesse [notebook](https://github.com/lukenew2/ds-demos/blob/master/notebooks/hyperparameter_optimization_skopt.ipynb), mas com a base do Titanic (de novo, Leon????) ao invés de MNIST.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [2]:
treino = pd.read_csv('./data/train.csv')
teste = pd.read_csv('./data/test.csv')

In [3]:
treino.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
teste.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
treino.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
treino['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [7]:
moda_embarked = treino['Embarked'].value_counts().index[0]
treino.loc[treino['Embarked'].isna(), 'Embarked'] = moda_embarked
treino.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
media_idade = treino.loc[~treino['Age'].isna(), 'Age'].mean()
treino.loc[treino['Age'].isna(), 'Age'] = media_idade
treino.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
X_treino = treino.drop(['Survived', 'Name', 'Ticket', 'Cabin'], axis="columns")
X_treino['Sex'] = X_treino['Sex'].map({'male': 0, 'female':1})
X_treino['Embarked'] = X_treino['Embarked'].map({'C': 0, 'S': 1, 'Q': 2})

X_treino.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,0,22.0,1,0,7.25,1
1,2,1,1,38.0,1,0,71.2833,0
2,3,3,1,26.0,0,0,7.925,1
3,4,1,1,35.0,1,0,53.1,1
4,5,3,0,35.0,0,0,8.05,1


In [10]:
y_treino = treino['Survived']
y_treino.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [11]:
teste.loc[treino['Embarked'].isna(), 'Embarked'] = moda_embarked
teste.loc[teste['Age'].isna(), 'Age'] = media_idade

In [12]:
X_teste = treino.drop(['Survived', 'Name', 'Ticket', 'Cabin'], axis="columns")
X_teste['Sex'] = X_teste['Sex'].map({'male': 0, 'female':1})
X_teste['Embarked'] = X_teste['Embarked'].map({'C': 0, 'S': 1, 'Q': 2})

X_teste.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,0,22.0,1,0,7.25,1
1,2,1,1,38.0,1,0,71.2833,0
2,3,3,1,26.0,0,0,7.925,1
3,4,1,1,35.0,1,0,53.1,1
4,5,3,0,35.0,0,0,8.05,1


In [13]:
X_treino.shape

(891, 8)

In [14]:
X_treino.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

## Sem tunagem

In [15]:
from sklearn.ensemble import RandomForestClassifier
from numpy import mean
from numpy import std

cv = KFold(n_splits=10, random_state=1, shuffle=True)

classificador_rf_sem_tunagem = RandomForestClassifier(random_state=42)

scores = cross_val_score(classificador_rf_sem_tunagem, X_treino, y_treino, scoring='accuracy', cv=cv, n_jobs=-1)

print('Acurácia sem tuning: %.4f (%.3f)' % (mean(scores), std(scores)))




Acurácia sem tuning: 0.8183 (0.041)


## Com tunagem GridSeach (testar todas as combinações)

In [16]:
from sklearn.model_selection import GridSearchCV

param_grid = {'bootstrap': [True],
     'max_depth': [6, 10],
     'max_features': ['auto', 'sqrt'],
     'min_samples_leaf': [3, 5],
     'min_samples_split': [4, 6],
     'n_estimators': [100, 350]
    }
     

forest_clf = RandomForestClassifier()

forest_grid_search = GridSearchCV(forest_clf, param_grid, cv=5,
                                  scoring="accuracy",
                                  return_train_score=True,
                                  verbose=True,
                                  n_jobs=-1)

forest_grid_search.fit(X_treino, y_treino)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [6, 10],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [3, 5],
                         'min_samples_split': [4, 6],
                         'n_estimators': [100, 350]},
             return_train_score=True, scoring='accuracy', verbose=True)

O grid de parâmetros (param_grid) faz com que o Scikit-Learn avalie 1 x 2 x 2 x 2 x 2 x 2 = 32 combinações dos hiperparâmetros:  bootstrap, max_depth, max_features, min_samples_leaf, min_samples_split e n_estimators.

Como estamos fazendo validação cruzada com 5 folds, vamos treinar cada uma das 32 combinações 5 vezes! Ou seja, teremos 160 combinações! 

In [17]:
forest_grid_search.best_params_

{'bootstrap': True,
 'max_depth': 6,
 'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'min_samples_split': 6,
 'n_estimators': 100}

In [18]:
forest_grid_search.best_estimator_

RandomForestClassifier(max_depth=6, max_features='sqrt', min_samples_leaf=5,
                       min_samples_split=6)

In [19]:
print('Acurácia com Grid Search: %.5f' % (forest_grid_search.best_score_))

Acurácia com Grid Search: 0.82492


## Com tunagem randomizada

In [20]:
from sklearn.model_selection import RandomizedSearchCV

param_space = {"bootstrap": [True],
        "max_depth": [6, 8, 10, 12, 14],
        "max_features": ['auto', 'sqrt','log2'],
        "min_samples_leaf": [2, 3, 4],
        "min_samples_split": [2, 3, 4, 5],
        "n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
}

forest_rand_search = RandomizedSearchCV(forest_clf, param_space, n_iter=32,
                                        scoring="accuracy", verbose=True, cv=5,
                                        n_jobs=-1, random_state=42)

forest_rand_search.fit(X_treino, y_treino)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=32,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True],
                                        'max_depth': [6, 8, 10, 12, 14],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [2, 3, 4],
                                        'min_samples_split': [2, 3, 4, 5],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000]},
                   random_state=42, scoring='accuracy', verbose=True)

In [21]:
forest_rand_search.best_params_

{'n_estimators': 200,
 'min_samples_split': 4,
 'min_samples_leaf': 4,
 'max_features': 'log2',
 'max_depth': 10,
 'bootstrap': True}

In [22]:
forest_rand_search.best_estimator_

RandomForestClassifier(max_depth=10, max_features='log2', min_samples_leaf=4,
                       min_samples_split=4, n_estimators=200)

In [23]:
print('Acurácia com Grid Search: %.5f' % (forest_rand_search.best_score_))

Acurácia com Grid Search: 0.82606


## Finalmente, com tunagem bayesiana

In [24]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

search_space = {"bootstrap": Categorical([True, False]), 
        "max_depth": Integer(6, 20), 
        "max_features": Categorical(['auto', 'sqrt','log2']), 
        "min_samples_leaf": Integer(2, 10),
        "min_samples_split": Integer(2, 10),
        "n_estimators": Integer(100, 500)
    }

forest_bayes_search = BayesSearchCV(forest_clf, search_space, n_iter=32,
                                    scoring="accuracy", n_jobs=-1, cv=5)

forest_bayes_search.fit(X_treino, y_treino)

BayesSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=32, n_jobs=-1,
              scoring='accuracy',
              search_spaces={'bootstrap': Categorical(categories=(True, False), prior=None),
                             'max_depth': Integer(low=6, high=20, prior='uniform', transform='normalize'),
                             'max_features': Categorical(categories=('auto', 'sqrt', 'log2'), prior=None),
                             'min_samples_leaf': Integer(low=2, high=10, prior='uniform', transform='normalize'),
                             'min_samples_split': Integer(low=2, high=10, prior='uniform', transform='normalize'),
                             'n_estimators': Integer(low=100, high=500, prior='uniform', transform='normalize')})

In [25]:
forest_bayes_search.best_params_

OrderedDict([('bootstrap', False),
             ('max_depth', 12),
             ('max_features', 'auto'),
             ('min_samples_leaf', 6),
             ('min_samples_split', 10),
             ('n_estimators', 500)])

In [26]:
forest_bayes_search.best_estimator_

RandomForestClassifier(bootstrap=False, max_depth=12, min_samples_leaf=6,
                       min_samples_split=10, n_estimators=500)

In [27]:
print('Acurácia com Grid Search: %.5f' % (forest_bayes_search.best_score_))

Acurácia com Grid Search: 0.82604
