# Otimização de Hiperparâmetros

## Carregando os dados

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive before reading: the CSV lives under /content/drive, so on a
# fresh kernel (Restart & Run All) the read below fails unless the mount has
# already happened in this cell.
from google.colab import drive
drive.mount('/content/drive')

# Load the analytical base table (ABT) and preview its first rows.
df_abt = pd.read_csv('/content/drive/MyDrive/projeto-ia-codigos/projeto-ia-aula8/propensao_revenda_abt (1).csv')
df_abt.head()

Unnamed: 0,data_ref_safra,seller_id,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,nao_revendeu_next_6m
0,2018-01-01,0015a82c2db000af6aaaf3ae2ecb0532,SP,3,3,1,2685.0,74,1
1,2018-01-01,001cca7ae9ae17fb1caed9dfb1094831,ES,171,207,9,21275.23,2,0
2,2018-01-01,002100f778ceb8431b7a1020ff7ab48f,SP,38,42,15,781.8,2,0
3,2018-01-01,003554e2dce176b5555353e4f3555ac8,GO,1,1,1,120.0,16,1
4,2018-01-01,004c9cd9d87a3c30c522c48c4fc07416,SP,130,141,75,16228.88,8,0


In [None]:
# Make Google Drive available to the Colab runtime under /content/drive.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Number of rows per reference date (data_ref_safra).
safra_column = df_abt['data_ref_safra']
safra_column.value_counts()

2018-03-01    1874
2018-02-01    1805
2018-01-01    1690
Name: data_ref_safra, dtype: int64

In [None]:
# Time-based split on data_ref_safra: rows before 2018-03-01 form the training
# base, rows from 2018-03-01 are held out as the out-of-time (OOT) base.
safra = df_abt['data_ref_safra']
df_abt_train = df_abt[safra < "2018-03-01"]
df_abt_oot   = df_abt[safra == "2018-03-01"]

In [None]:
# variáveis chaves da tabela
# Key columns of the table
key_vars = ['data_ref_safra', 'seller_id']

# Numeric variables
num_vars = [
    'tot_orders_12m',
    'tot_items_12m',
    'tot_items_dist_12m',
    'receita_12m',
    'recencia',
]

# Categorical variables
cat_vars = ['uf']

# Response variable / target
target = 'nao_revendeu_next_6m'

# Full feature list: categorical columns first, then numeric ones
features = [*cat_vars, *num_vars]

# Training features and training target
X_train = df_abt_train.loc[:, features]
y_train = df_abt_train.loc[:, target]

In [None]:
!pip install feature-engine==1.0.2

Collecting feature-engine==1.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/57/6d/0c7594c89bf07a7c447b1a251d4e04b07104d4a9332de71e1de42b78b838/feature_engine-1.0.2-py2.py3-none-any.whl (152kB)
[K     |██▏                             | 10kB 11.8MB/s eta 0:00:01[K     |████▎                           | 20kB 10.4MB/s eta 0:00:01[K     |██████▌                         | 30kB 6.7MB/s eta 0:00:01[K     |████████▋                       | 40kB 7.5MB/s eta 0:00:01[K     |██████████▊                     | 51kB 4.3MB/s eta 0:00:01[K     |█████████████                   | 61kB 4.3MB/s eta 0:00:01[K     |███████████████                 | 71kB 4.8MB/s eta 0:00:01[K     |█████████████████▏              | 81kB 5.0MB/s eta 0:00:01[K     |███████████████████▍            | 92kB 5.4MB/s eta 0:00:01[K     |█████████████████████▌          | 102kB 4.5MB/s eta 0:00:01[K     |███████████████████████▋        | 112kB 4.5MB/s eta 0:00:01[K     |█████████████████████████▉  

In [None]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Preprocessing steps followed by the classifier. The step names matter: the
# grid-search parameter keys (e.g. 'model__max_depth') refer to them.
numeric_imputer = ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars)
categoric_imputer = CategoricalImputer(variables=cat_vars)
one_hot_encoder = OneHotEncoder(variables=cat_vars)
tree_model = DecisionTreeClassifier(random_state=42)

dt_pipe = Pipeline(steps=[
    ('numeric_imputer', numeric_imputer),
    ('categoric_imputer', categoric_imputer),
    ('one_hot_encoder', one_hot_encoder),
    ('model', tree_model),
])

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter, shuffled with a fixed seed so folds are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Baseline: cross-validated ROC AUC of the untuned pipeline on the training base.
# The call previously passed `X, y`, which are never defined in this notebook
# (it raised NameError); the training data is held in `X_train` / `y_train`.
cv_results = cross_validate(dt_pipe, X_train, y_train, scoring='roc_auc', cv=skf, n_jobs=-1)
cv_results_df = pd.DataFrame(cv_results)
cv_results_df

NameError: ignored

In [None]:
# roc_auc média
# Mean of the per-fold test scores (ROC AUC) from the cross-validation above.
fold_scores = cv_results_df['test_score']
fold_scores.mean()

0.7641324920936757

In [None]:
# Hyperparameters of the final pipeline step (the 'model' step).
dt_pipe.named_steps['model'].get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': 42,
 'splitter': 'best'}

# GridSearch

Iremos utilizar a função `GridSearchCV`.

**Parâmetros**
* `estimator`: modelo de machine learning, também chamado estimador
* `param_grid`: grid em forma de dicionário com os parâmetros em que a busca será realizada
* `scoring`: métrica a ser otimizada
* `cv`: estratégia de validação a ser utilizada
* `n_jobs`: quantidade de cores do processador a ser utilizada para realizar o processamento em paralelo. O valor `-1` significa que todos os cores serão utilizados
* `refit`: se `True`, retreina o melhor modelo encontrado usando toda a base de treino
* `verbose`: mostra as mensagens. Quanto maior o número inteiro, mais mensagens serão mostradas.

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: tree depths 2 through 7 for the pipeline's 'model' step.
grid_parametros = {'model__max_depth': list(range(2, 8))}

# Grid search over the pipeline, scored by ROC AUC on the stratified folds.
grid_search = GridSearchCV(
    estimator=dt_pipe,
    param_grid=grid_parametros,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    refit=True,
    verbose=1,
)

In [None]:
# Run the grid search on the training base; being the last expression, the
# fitted search object is also displayed as the cell output.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.6s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('numeric_imputer',
                                        ArbitraryNumberImputer(arbitrary_number=-999,
                                                               imputer_dict=None,
                                                               variables=['tot_orders_12m',
                                                                          'tot_items_12m',
                                                                          'tot_items_dist_12m',
                                                                          'receita_12m',
                                                                          'recencia'])),
                                       ('categoric_imputer',
                                        CategoricalImputer(fill_val...
                                 

In [None]:
# retorna a melhor combinação de hiperparâmetros
# Best hyperparameter combination found by the grid search.
best_params = grid_search.best_params_
best_params

{'model__max_depth': 5}

In [None]:
# retorna o melhor score, nesse caso a roc-auc média usando um cv=5
# Best score of the search: here, the mean ROC AUC over the 5 CV folds
# for the best hyperparameter combination.
best_score = grid_search.best_score_
best_score

0.890518364384624

In [None]:
# retorna o melhor modelo do grid
# Best model (full pipeline) from the grid; the search was created with refit=True.
best_dt_model = grid_search.best_estimator_

## Avaliando a performance na base out of time

In [None]:
from sklearn.metrics import roc_auc_score

# Features and target of the out-of-time (OOT) base.
X_oot = df_abt_oot.loc[:, features]
y_oot = df_abt_oot.loc[:, target]

# Score the OOT base with the tuned pipeline; the second probability column
# is used as the score for the ROC AUC.
oot_probabilities = best_dt_model.predict_proba(X_oot)
y_proba_oot = oot_probabilities[:, 1]
rocauc_oot = roc_auc_score(y_true=y_oot, y_score=y_proba_oot)
print(f"Decision Tree: ROCAUC OOT = {rocauc_oot}")

Decision Tree: ROCAUC OOT = 0.8968114296299949


# Grid Search com mais hiperparâmetros

In [None]:
from sklearn.model_selection import GridSearchCV

# Wider search space: tree depth, split criterion and class weighting
# (6 x 2 x 2 = 24 combinations).
grid_parametros = {
    'model__max_depth': list(range(2, 8)),
    'model__criterion': ['gini', 'entropy'],
    'model__class_weight': ['balanced', None],
}

# Same search setup as before, now over the extended grid.
grid_search = GridSearchCV(
    estimator=dt_pipe,
    param_grid=grid_parametros,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    verbose=1,
    refit=True,
)

In [None]:
# Run the extended grid search on the training base; the fitted search object
# is displayed as the cell output.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    6.2s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('numeric_imputer',
                                        ArbitraryNumberImputer(arbitrary_number=-999,
                                                               imputer_dict=None,
                                                               variables=['tot_orders_12m',
                                                                          'tot_items_12m',
                                                                          'tot_items_dist_12m',
                                                                          'receita_12m',
                                                                          'recencia'])),
                                       ('categoric_imputer',
                                        CategoricalImputer(fill_val...
                                 

In [None]:
# melhor combinação de hiperparâmetros
# Best hyperparameter combination found by the extended grid search.
grid_search.best_params_

{'model__class_weight': 'balanced',
 'model__criterion': 'entropy',
 'model__max_depth': 4}

In [None]:
# score da melhor combinação de hiperparâmetros
# Score (mean cross-validated ROC AUC) of the best hyperparameter combination.
grid_search.best_score_

0.8957473988538434

## Avaliando a performance na base out of time

In [None]:
from sklearn.metrics import roc_auc_score

# Best pipeline from the extended grid search.
best_dt_model2 = grid_search.best_estimator_

# Score the OOT base; the second probability column is used as the score
# for the ROC AUC.
oot_probabilities2 = best_dt_model2.predict_proba(X_oot)
y_proba_oot = oot_probabilities2[:, 1]
rocauc_oot = roc_auc_score(y_true=y_oot, y_score=y_proba_oot)
print(f"Decision Tree: ROCAUC OOT = {rocauc_oot}")

Decision Tree: ROCAUC OOT = 0.9011605736948658


# Exercício: Faça uma busca de hiperparâmetros (grid search) para a regressão logística