# Hiperparâmetros

Tente encontrar a melhor escolha de hiperparâmetros para um modelo de árvore de decisão.

In [None]:
from google.colab import files
upload = files.upload()

Saving white-wine.csv to white-wine.csv


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
wine = pd.read_csv('white-wine.csv', sep=';')

In [None]:
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [None]:
wine.shape

(4898, 12)

In [None]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [None]:
wine.quality.value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [None]:
wine['target'] = np.where(wine['quality'] >= 6, 1, 0)

In [None]:
wine.target.value_counts()

1    3258
0    1640
Name: target, dtype: int64

# Decision tree | cross validation

In [None]:
X = wine.drop(columns=['quality', 'target'])
y = wine['target']

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model_sem_otim = DecisionTreeClassifier(max_depth = 2)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
results_no_optim = cross_val_score(model_sem_otim, X, y, scoring='accuracy', cv=5, n_jobs=2)

In [None]:
acc_sem_otimizacao = results_no_optim.mean()

In [None]:
print(acc_sem_otimizacao)

0.7311321423359946


# Grid Search

In [None]:
model_grid = DecisionTreeClassifier()

In [None]:
print("Hiperparâmetros do DecisionTreeClassifier:")
display(model_grid.get_params())

Hiperparâmetros do DecisionTreeClassifier:


{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [None]:
# Critério do split
criterions = ['gini', 'entropy']

# Profundidades máximas que iremos testar
max_depth = [int(x) for x in range(2, 16)]
max_depth.append(None)

# Número de pontos mínimos necessário para permitir um split no nó
min_samples_split = [int(x) for x in range(2, 11)]

# Número de pontos mínimos que podem existir em cada folha (nó final)
min_samples_leaf = [int(x) for x in range(1, 5)]

# Criamos o grid de escolhas
params_grid = {'criterion': criterions,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

display(params_grid)

{'criterion': ['gini', 'entropy'],
 'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None],
 'min_samples_leaf': [1, 2, 3, 4],
 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}

In [None]:
# Pegamos a classe GridSearchCV do sklearn
from sklearn.model_selection import GridSearchCV

In [None]:
# Construindo o objeto "otimizador via grid search com validação cruzada"
grid_search = GridSearchCV(estimator = model_grid, 
                           param_grid = params_grid, 
                           scoring='accuracy', 
                           cv = 5, 
                           verbose = 2,
                           n_jobs = 2)

In [None]:
%%time

grid_search.fit(X, y)

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits
CPU times: user 4.67 s, sys: 192 ms, total: 4.87 s
Wall time: 1min 28s


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=2,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                       14, 15, None],
                         'min_samples_leaf': [1, 2, 3, 4],
                         'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]},
             scoring='accuracy', verbose=2)

In [None]:
print(grid_search.cv_results_)

{'mean_fit_time': array([0.01014042, 0.009097  , 0.00955338, ..., 0.04573288, 0.04184041,
       0.04144793]), 'std_fit_time': array([0.00103645, 0.00139314, 0.00144664, ..., 0.00435281, 0.00052425,
       0.00118789]), 'mean_score_time': array([0.00217113, 0.00248404, 0.0023562 , ..., 0.00218573, 0.0021873 ,
       0.00213079]), 'std_score_time': array([1.43221484e-04, 6.43923508e-04, 5.51906002e-05, ...,
       2.44062381e-05, 1.18717840e-04, 1.62952874e-04]), 'param_criterion': masked_array(data=['gini', 'gini', 'gini', ..., 'entropy', 'entropy',
                   'entropy'],
             mask=[False, False, False, ..., False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[2, 2, 2, ..., None, None, None],
             mask=[False, False, False, ..., False, False, False],
       fill_value='?',
            dtype=object), 'param_min_samples_leaf': masked_array(data=[1, 1, 1, ..., 4, 4, 4],
             mask=[False, False, False,

In [None]:
df = pd.DataFrame(grid_search.cv_results_)

In [None]:
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01014,0.001036,0.002171,0.000143,gini,2,1,2,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.663265,0.720408,0.72449,0.758938,0.78856,0.731132,0.042038,220
1,0.009097,0.001393,0.002484,0.000644,gini,2,1,3,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.663265,0.720408,0.72449,0.758938,0.78856,0.731132,0.042038,220
2,0.009553,0.001447,0.002356,5.5e-05,gini,2,1,4,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.663265,0.720408,0.72449,0.758938,0.78856,0.731132,0.042038,220
3,0.009314,0.001829,0.002071,7.1e-05,gini,2,1,5,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.663265,0.720408,0.72449,0.758938,0.78856,0.731132,0.042038,220
4,0.009253,0.001986,0.003616,0.002645,gini,2,1,6,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.663265,0.720408,0.72449,0.758938,0.78856,0.731132,0.042038,220


In [None]:
display(grid_search.best_params_)

{'criterion': 'gini',
 'max_depth': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [None]:
acc_grid_search = grid_search.best_score_

In [None]:
print(acc_grid_search)

0.7443976569177211


# Random Search

In [None]:
model_random = DecisionTreeClassifier()

In [None]:
print("Hiperparâmetros do DecisionTreeClassifier:")
display(model_random.get_params())

Hiperparâmetros do DecisionTreeClassifier:


{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [None]:
# Critério do split
criterions = ['gini', 'entropy']

# Profundidades máximas que iremos testar
max_depth = [int(x) for x in range(2, 16)]
max_depth.append(None)

# Número de pontos mínimos necessário para permitir um split no nó
min_samples_split = [int(x) for x in range(2, 11)]

# Número de pontos mínimos que podem existir em cada folha (nó final)
min_samples_leaf = [int(x) for x in range(1, 5)]

# Criamos o grid de escolhas
params_grid = {'criterion': criterions,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

display(params_grid)

{'criterion': ['gini', 'entropy'],
 'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None],
 'min_samples_leaf': [1, 2, 3, 4],
 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# O parâmetro n_iter vai controlar o tamanho da nossa amostra.
random_search = RandomizedSearchCV(estimator = model_random, 
                                   param_distributions = params_grid, 
                                   scoring='accuracy',
                                   n_iter = 100, 
                                   cv = 5, 
                                   verbose=2,
                                   n_jobs = 2)

In [None]:
%%time

random_search.fit(X, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
CPU times: user 695 ms, sys: 56.8 ms, total: 752 ms
Wall time: 8.61 s


RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_iter=100,
                   n_jobs=2,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      None],
                                        'min_samples_leaf': [1, 2, 3, 4],
                                        'min_samples_split': [2, 3, 4, 5, 6, 7,
                                                              8, 9, 10]},
                   scoring='accuracy', verbose=2)

In [None]:
df = pd.DataFrame(random_search.cv_results_)

In [None]:
df.head() 

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.019287,0.001104,0.002208,0.000159,4,1,6,gini,"{'min_samples_split': 4, 'min_samples_leaf': 1...",0.695918,0.730612,0.729592,0.755873,0.785495,0.739498,0.029867,6
1,0.035319,0.002409,0.002473,0.000245,2,3,15,gini,"{'min_samples_split': 2, 'min_samples_leaf': 3...",0.62449,0.644898,0.687755,0.705822,0.723187,0.67723,0.03706,92
2,0.028925,0.00447,0.002201,0.000142,8,1,9,gini,"{'min_samples_split': 8, 'min_samples_leaf': 1...",0.643878,0.678571,0.721429,0.72523,0.779367,0.709695,0.045899,49
3,0.020708,0.002714,0.002408,0.000789,7,4,5,entropy,"{'min_samples_split': 7, 'min_samples_leaf': 4...",0.690816,0.715306,0.75,0.740552,0.776302,0.734595,0.029329,16
4,0.032157,0.002275,0.002222,8.1e-05,3,4,12,gini,"{'min_samples_split': 3, 'min_samples_leaf': 4...",0.637755,0.645918,0.67449,0.735444,0.742594,0.68724,0.04406,83


In [None]:
display(random_search.best_params_)

{'criterion': 'gini',
 'max_depth': 3,
 'min_samples_leaf': 2,
 'min_samples_split': 8}

In [None]:
acc_random_search = random_search.best_score_

In [None]:
print(acc_random_search)

0.7443976569177211


# Bayes Search

In [None]:
model_bayes = DecisionTreeClassifier()

In [None]:
print("Hiperparâmetros do DecisionTreeClassifier:")
display(model_bayes.get_params())

Hiperparâmetros do DecisionTreeClassifier:


{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [None]:
pip install scikit-optimize


Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[?25l[K     |███▎                            | 10 kB 25.1 MB/s eta 0:00:01[K     |██████▌                         | 20 kB 29.8 MB/s eta 0:00:01[K     |█████████▉                      | 30 kB 32.2 MB/s eta 0:00:01[K     |█████████████                   | 40 kB 31.5 MB/s eta 0:00:01[K     |████████████████▍               | 51 kB 16.3 MB/s eta 0:00:01[K     |███████████████████▋            | 61 kB 15.7 MB/s eta 0:00:01[K     |██████████████████████▉         | 71 kB 13.1 MB/s eta 0:00:01[K     |██████████████████████████▏     | 81 kB 14.0 MB/s eta 0:00:01[K     |█████████████████████████████▍  | 92 kB 15.4 MB/s eta 0:00:01[K     |████████████████████████████████| 100 kB 6.7 MB/s 
Collecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0


In [None]:
# Vamos definir os hiperparâmetros de busca
from skopt.space import Real, Categorical, Integer

# Critério do split
criterions = ['gini', 'entropy']

# Profundidades máximas que iremos testar
max_depth = Categorical([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None])

# Número de pontos mínimos necessário para permitir um split no nó
min_samples_split = Integer(2,11)

# Número de pontos mínimos que podem existir em cada folha (nó final)
min_samples_leaf = Integer(1, 5)

# Criamos o grid de escolhas
params_grid = {'criterion': criterions,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

display(params_grid)

{'criterion': ['gini', 'entropy'],
 'max_depth': Categorical(categories=(2, 3, 4, ..., 14, 15, None), prior=None),
 'min_samples_leaf': Integer(low=1, high=5, prior='uniform', transform='identity'),
 'min_samples_split': Integer(low=2, high=11, prior='uniform', transform='identity')}

In [None]:
from skopt import BayesSearchCV

In [None]:
# O parâmetro n_iter vai controlar o tamanho da nossa amostra.
bayes_search = BayesSearchCV(estimator = model_bayes, 
                              search_spaces = params_grid, 
                              n_iter = 10, 
                              scoring='accuracy',
                              cv = 5, 
                              verbose=1,
                              n_jobs = 2)

In [None]:
%%time

bayes_search.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
CPU times: user 1.03 s, sys: 448 ms, total: 1.48 s
Wall time: 2.97 s


BayesSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_iter=10, n_jobs=2,
              scoring='accuracy',
              search_spaces={'criterion': ['gini', 'entropy'],
                             'max_depth': Categorical(categories=(2, 3, 4, ..., 14, 15, None), prior=None),
                             'min_samples_leaf': Integer(low=1, high=5, prior='uniform', transform='normalize'),
                             'min_samples_split': Integer(low=2, high=11, prior='uniform', transform='normalize')},
              verbose=1)

In [None]:
df = pd.DataFrame(bayes_search.cv_results_)

In [None]:
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.025049,0.001341,0.00215,0.000238,gini,9,5,3,"{'criterion': 'gini', 'max_depth': 9, 'min_sam...",0.641837,0.666327,0.711224,0.733401,0.772217,0.705001,0.04658,5
1,0.032461,0.003294,0.00216,0.000202,gini,13,4,3,"{'criterion': 'gini', 'max_depth': 13, 'min_sa...",0.622449,0.631633,0.647959,0.73238,0.740552,0.674994,0.050918,10
2,0.04274,0.003686,0.002466,0.000566,entropy,13,1,9,"{'criterion': 'entropy', 'max_depth': 13, 'min...",0.65,0.654082,0.721429,0.713994,0.704801,0.688861,0.030549,8
3,0.037924,0.007133,0.002252,0.000279,entropy,10,5,5,"{'criterion': 'entropy', 'max_depth': 10, 'min...",0.65,0.676531,0.723469,0.723187,0.736466,0.701931,0.033016,7
4,0.025224,0.001946,0.002064,0.000222,entropy,7,3,6,"{'criterion': 'entropy', 'max_depth': 7, 'min_...",0.679592,0.707143,0.717347,0.730337,0.770174,0.720918,0.02975,3


In [None]:
display(bayes_search.best_params_)

OrderedDict([('criterion', 'entropy'),
             ('max_depth', 3),
             ('min_samples_leaf', 2),
             ('min_samples_split', 11)])

In [None]:
acc_bayes_search = bayes_search.best_score_

In [None]:
print(acc_bayes_search)

0.7388847428654813


# Comparando modelos:

In [None]:
print("Comparação das Acurácias: ")
print('Acurácia sem Otimização:         ', np.round(acc_sem_otimizacao, 3))
print('Acurácia com GridSearchCV:       ', np.round(acc_grid_search, 3))
print('Acurácia com RandomizedSearchCV: ', np.round(acc_random_search, 3))
print('Acurácia com BayesSearchCV:      ', np.round(acc_bayes_search, 3))

Comparação das Acurácias: 
Acurácia sem Otimização:          0.731
Acurácia com GridSearchCV:        0.744
Acurácia com RandomizedSearchCV:  0.744
Acurácia com BayesSearchCV:       0.739
