# Avaliando algoritmos de aprendizagem

## Grid-search

***

O Grid-search testa exaustivamente todas as combinações de uma grade de hiperparâmetros, avaliando cada uma por validação cruzada, para encontrar a configuração do modelo que resulta nas previsões mais "precisas".

[Documentação do `GridSearchCV` no scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

## Importando bibliotecas

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

# Classes do modelo de aprendizado
from sklearn.neighbors import KNeighborsClassifier

# Funções de avaliação dos modelos
from sklearn.metrics import classification_report, f1_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the Iris CSV from the course repository (requires network access) and,
# in the same chain, replace the string class labels in 'Species' with integer
# codes (useful later for plotting decision regions).
dataset = pd.read_csv(
    "https://raw.githubusercontent.com/johnattandouglas/monitoria-ml/main/Datasets/Iris.csv"
).assign(
    # factorize returns (codes, uniques); only the integer codes are kept.
    Species=lambda frame: pd.factorize(frame['Species'])[0]
)


### Separando o conjunto de dados

In [3]:
# Use only two features (sepal length and sepal width).
X = dataset.loc[:, ["SepalLengthCm", "SepalWidthCm"]]
# Select the target as a 1-D Series: scikit-learn estimators expect y with
# shape (n_samples,). A single-column DataFrame makes fit() emit a
# DataConversionWarning (column-vector y), which the global warning filter hides.
y = dataset.loc[:, "Species"]

# Stratified 70/30 train/test split. The fixed random_state makes the split,
# and therefore every score reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

## Treinamento do modelo com os parâmetros default 

In [4]:
# Baseline: a k-NN classifier with scikit-learn's default hyperparameters
# (k = 5), fitted on the training split. fit() returns the estimator itself.
model = KNeighborsClassifier().fit(X_train, y_train)

# Report per-class precision/recall/F1 of the baseline on the held-out test split.
print(classification_report(y_true=y_test, y_pred=model.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.69      0.60      0.64        15
           2       0.65      0.73      0.69        15

    accuracy                           0.78        45
   macro avg       0.78      0.78      0.78        45
weighted avg       0.78      0.78      0.78        45



## Seleção de parâmetros com o Grid-Search 

In [5]:
# Base estimator whose hyperparameters will be tuned.
model = KNeighborsClassifier()

# Search space: every (n_neighbors, metric) combination is evaluated.
parameters = {'n_neighbors': [11, 9, 7, 5, 3, 1],
              'metric':["euclidean", "manhattan"]}

grid = GridSearchCV(estimator = model,             # k-NN
                    param_grid = parameters,       # dict of candidate values (key-value pairs)
                    scoring = 'accuracy',          # evaluation metric
                    cv = 5)                        # number of cross-validation folds

# Run the cross-validated search on the training split only.
grid.fit(X_train, y_train)

# Predict the test split once and reuse the result below; the original
# assigned y_pred but then called grid.predict(X_test) a second time.
y_pred = grid.predict(X_test)

print("Melhor parametro:", grid.best_params_)
# Performance of the selected configuration on the held-out test split.
print(classification_report(y_test, y_pred))

Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 11}
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       0.64      0.60      0.62        15
           2       0.67      0.67      0.67        15

    accuracy                           0.76        45
   macro avg       0.75      0.76      0.75        45
weighted avg       0.75      0.76      0.75        45



In [6]:
# Show the full cross-validation log as a table: one row per candidate
# hyperparameter combination, with timings, per-split scores, mean/std and rank.
pd.DataFrame(data=grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001022,0.000747,0.006015,0.001945,euclidean,11,"{'metric': 'euclidean', 'n_neighbors': 11}",0.714286,0.761905,0.809524,0.857143,0.809524,0.790476,0.048562,1
1,0.011117,0.016538,0.004543,0.003156,euclidean,9,"{'metric': 'euclidean', 'n_neighbors': 9}",0.714286,0.761905,0.761905,0.809524,0.809524,0.771429,0.035635,6
2,0.001127,0.001493,0.002437,0.002752,euclidean,7,"{'metric': 'euclidean', 'n_neighbors': 7}",0.714286,0.857143,0.761905,0.809524,0.761905,0.780952,0.048562,5
3,0.002069,0.000707,0.002649,0.000415,euclidean,5,"{'metric': 'euclidean', 'n_neighbors': 5}",0.619048,0.857143,0.809524,0.761905,0.809524,0.771429,0.081927,6
4,0.001664,0.000749,0.00303,0.000751,euclidean,3,"{'metric': 'euclidean', 'n_neighbors': 3}",0.619048,0.761905,0.714286,0.761905,0.809524,0.733333,0.064594,10
5,0.000508,0.000643,0.003139,0.001371,euclidean,1,"{'metric': 'euclidean', 'n_neighbors': 1}",0.619048,0.666667,0.809524,0.714286,0.666667,0.695238,0.064594,11
6,0.000377,0.000572,0.00394,0.003917,manhattan,11,"{'metric': 'manhattan', 'n_neighbors': 11}",0.714286,0.761905,0.809524,0.857143,0.809524,0.790476,0.048562,1
7,0.003466,0.001137,0.002182,0.001825,manhattan,9,"{'metric': 'manhattan', 'n_neighbors': 9}",0.666667,0.761905,0.761905,0.809524,0.809524,0.761905,0.052164,8
8,0.001266,0.001041,0.003846,0.003067,manhattan,7,"{'metric': 'manhattan', 'n_neighbors': 7}",0.666667,0.857143,0.761905,0.857143,0.809524,0.790476,0.07127,1
9,0.000159,0.000319,0.003123,0.002028,manhattan,5,"{'metric': 'manhattan', 'n_neighbors': 5}",0.619048,0.904762,0.809524,0.809524,0.809524,0.790476,0.093314,4


In [7]:
# Base estimator whose hyperparameters will be tuned.
model = KNeighborsClassifier()

# Search space: every (n_neighbors, metric) combination is evaluated.
parameters = {'n_neighbors': [11, 9, 7, 5, 3, 1],
              'metric':["euclidean", "manhattan"]}

# Explicit cross-validation splitter: 5 stratified folds, shuffled with a
# fixed seed so the fold assignment is reproducible.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

grid = GridSearchCV(estimator = model,
                    param_grid = parameters,
                    scoring = 'accuracy',
                    cv=skf)  # use the StratifiedKFold splitter defined above

# Run the cross-validated search on the training split only.
grid.fit(X_train, y_train)

# Predict the test split once and reuse the result below; the original
# assigned y_pred but then called grid.predict(X_test) a second time.
y_pred = grid.predict(X_test)

print("Melhor parametro:", grid.best_params_)
# Performance of the selected configuration on the held-out test split.
print(classification_report(y_test, y_pred))

Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 7}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.69      0.73      0.71        15
           2       0.71      0.67      0.69        15

    accuracy                           0.80        45
   macro avg       0.80      0.80      0.80        45
weighted avg       0.80      0.80      0.80        45



In [8]:
# Report the selected hyperparameters and their mean cross-validated accuracy.
print("Melhores parâmetros:", grid.best_params_)
print("Melhor acurácia média:", grid.best_score_)

# List the mean cross-validated accuracy of every candidate that was evaluated.
print("\nDetalhes dos resultados:")
results = grid.cv_results_
for params, mean_score in zip(results['params'], results['mean_test_score']):
    print(f"{params}, Acurácia média: {mean_score:.3f}")

Melhores parâmetros: {'metric': 'euclidean', 'n_neighbors': 7}
Melhor acurácia média: 0.819047619047619

Detalhes dos resultados:
{'metric': 'euclidean', 'n_neighbors': 11}, Acurácia média: 0.771
{'metric': 'euclidean', 'n_neighbors': 9}, Acurácia média: 0.781
{'metric': 'euclidean', 'n_neighbors': 7}, Acurácia média: 0.819
{'metric': 'euclidean', 'n_neighbors': 5}, Acurácia média: 0.790
{'metric': 'euclidean', 'n_neighbors': 3}, Acurácia média: 0.733
{'metric': 'euclidean', 'n_neighbors': 1}, Acurácia média: 0.714
{'metric': 'manhattan', 'n_neighbors': 11}, Acurácia média: 0.781
{'metric': 'manhattan', 'n_neighbors': 9}, Acurácia média: 0.810
{'metric': 'manhattan', 'n_neighbors': 7}, Acurácia média: 0.800
{'metric': 'manhattan', 'n_neighbors': 5}, Acurácia média: 0.819
{'metric': 'manhattan', 'n_neighbors': 3}, Acurácia média: 0.762
{'metric': 'manhattan', 'n_neighbors': 1}, Acurácia média: 0.714
