# Avaliando algoritmos de aprendizagem

## Grid-search

***

O Grid-search é usado para encontrar os hiperparâmetros ideais de um modelo que resultem em previsões mais "precisas". 

[Documentação do GridSearchCV (scikit-learn)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

## Importando bibliotecas

In [10]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

# Classes do modelo de aprendizado
from sklearn.neighbors import KNeighborsClassifier

# Funções de avaliação dos modelos
from sklearn.metrics import classification_report, f1_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the Iris CSV from the course repository and, in the same chain, replace
# the textual class labels with integer codes (numbered in order of first
# appearance) so that decision regions can be visualized later
dataset = (
    pd.read_csv("https://raw.githubusercontent.com/johnattandouglas/monitoria-ml/main/Datasets/Iris.csv")
    .assign(Species=lambda frame: pd.factorize(frame['Species'])[0])
)


### Separando o conjunto de dados

In [3]:
# Use only two features: SepalLengthCm and SepalWidthCm
X = dataset.loc[:, ["SepalLengthCm", "SepalWidthCm"]]
# Select the target as a 1-D Series: a single-column DataFrame makes
# scikit-learn estimators emit a DataConversionWarning on every fit
y = dataset.loc[:, "Species"]

# Split into training (70%) and test (30%) sets, stratified by class;
# the fixed seed keeps the split identical across kernel restarts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

## Treinamento do modelo com os parâmetros default 

In [4]:
# Baseline: kNN with the library defaults (the original note states k=5),
# fitted on the training split in a single chained call
model = KNeighborsClassifier().fit(X_train, y_train)

# Report per-class precision/recall/F1 on the held-out test split
print(classification_report(y_true=y_test, y_pred=model.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.70      0.93      0.80        15
           2       0.90      0.60      0.72        15

    accuracy                           0.84        45
   macro avg       0.87      0.84      0.84        45
weighted avg       0.87      0.84      0.84        45



## Seleção de parâmetros com o Grid-Search 

In [8]:
# Fresh, unfitted kNN estimator handed to the grid search
model = KNeighborsClassifier()

# Candidate values for each hyperparameter; every combination is evaluated
parameters = {'n_neighbors': [11, 9, 7, 5, 3, 1],
              'metric': ["euclidean", "manhattan"]}

grid = GridSearchCV(estimator=model,         # kNN
                    param_grid=parameters,   # dict of values to try (key-value pairs)
                    scoring='accuracy',      # evaluation metric
                    cv=5)                    # number of cross-validation folds

grid.fit(X_train, y_train)

# Predict on the test split once and reuse the result below
y_pred = grid.predict(X_test)

print("Melhor parametro:", grid.best_params_)
# Performance on the test split
print(classification_report(y_test, y_pred))

Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 7}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.70      0.93      0.80        15
           2       0.90      0.60      0.72        15

    accuracy                           0.84        45
   macro avg       0.87      0.84      0.84        45
weighted avg       0.87      0.84      0.84        45



In [6]:
# Tabulate the per-combination cross-validation results recorded by the grid search
pd.DataFrame.from_dict(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003127,0.006255,0.002417,0.00296,euclidean,11,"{'metric': 'euclidean', 'n_neighbors': 11}",0.845202,0.754335,0.617934,0.727096,0.714286,0.73177,0.073024,5
1,0.001041,0.001294,0.007036,0.007367,euclidean,9,"{'metric': 'euclidean', 'n_neighbors': 9}",0.845202,0.754335,0.617934,0.792593,0.714286,0.74487,0.076788,2
2,0.00047,0.00094,0.005244,0.004653,euclidean,7,"{'metric': 'euclidean', 'n_neighbors': 7}",0.785621,0.706716,0.805556,0.902778,0.611111,0.762356,0.098078,1
3,0.000606,0.001212,0.003297,0.006562,euclidean,5,"{'metric': 'euclidean', 'n_neighbors': 5}",0.742929,0.708333,0.805556,0.750446,0.664957,0.734444,0.04668,4
4,0.0,0.0,0.005978,0.00739,euclidean,3,"{'metric': 'euclidean', 'n_neighbors': 3}",0.708333,0.619048,0.688889,0.809524,0.569231,0.679005,0.082022,11
5,0.0,0.0,0.006089,0.006238,euclidean,1,"{'metric': 'euclidean', 'n_neighbors': 1}",0.708333,0.575666,0.708333,0.611111,0.750446,0.670778,0.065991,12
6,0.0,0.0,0.004036,0.004233,manhattan,11,"{'metric': 'manhattan', 'n_neighbors': 11}",0.751984,0.754335,0.688889,0.640058,0.714286,0.70991,0.042624,8
7,0.001263,0.001547,0.005877,0.005819,manhattan,9,"{'metric': 'manhattan', 'n_neighbors': 9}",0.800758,0.760684,0.617934,0.792593,0.714286,0.737251,0.066946,3
8,0.001613,0.003225,0.004634,0.005855,manhattan,7,"{'metric': 'manhattan', 'n_neighbors': 7}",0.695707,0.708333,0.688889,0.850267,0.619048,0.712449,0.075591,7
9,0.000326,0.000652,0.004328,0.003565,manhattan,5,"{'metric': 'manhattan', 'n_neighbors': 5}",0.688889,0.714286,0.805556,0.750446,0.611111,0.714057,0.064695,6


In [12]:
def grid_Search_kfold(skf, features=None, target=None):
    """Evaluate a grid-searched kNN classifier with nested cross-validation.

    For every outer split produced by ``skf``, a ``GridSearchCV`` (5-fold inner
    cross-validation, accuracy scoring) selects ``n_neighbors`` and ``metric``
    using only the outer-training rows; the fitted search is then scored on the
    outer-test rows. The best parameters and the accuracy of each fold are
    printed, followed by the mean and standard deviation over all folds.

    Parameters
    ----------
    skf : cross-validation splitter
        Object exposing ``split(X, y)`` that yields positional train/test
        indices (e.g. ``StratifiedKFold``).
    features : pandas.DataFrame, optional
        Feature matrix, indexed positionally via ``.iloc``. Defaults to the
        notebook-level ``X``.
    target : pandas.Series or single-column pandas.DataFrame, optional
        Class labels aligned with ``features``. Defaults to the notebook-level
        ``y``.

    Returns
    -------
    numpy.ndarray
        Accuracy on the outer-test rows of each fold, in split order.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working
    if features is None:
        features = X
    if target is None:
        target = y

    model = KNeighborsClassifier()
    parameters = {'n_neighbors': [11, 9, 7, 5, 3, 1],
                  'metric': ["euclidean", "manhattan"]}

    score_list = []
    for fold, (train, test) in enumerate(skf.split(features, target)):
        X_train, X_test = features.iloc[train], features.iloc[test]
        # Flatten labels to 1-D so a single-column DataFrame target does not
        # trigger scikit-learn's DataConversionWarning on fit
        y_train = np.ravel(target.iloc[train])
        y_test = np.ravel(target.iloc[test])

        grid = GridSearchCV(estimator=model,         # kNN
                            param_grid=parameters,   # dict of values to try
                            scoring='accuracy',      # evaluation metric
                            cv=5)                    # inner cross-validation folds
        grid.fit(X_train, y_train)

        # Score the fold once and reuse the value for printing and aggregation
        fold_accuracy = accuracy_score(y_test, grid.predict(X_test))

        print("Melhor parametro:", grid.best_params_)
        print("Fold %d: %.3f" % (fold, fold_accuracy))
        score_list.append(fold_accuracy)

    score = np.array(score_list)
    print("\n Acurácia média (desvio): %.3f +- (%.3f)" % (score.mean(), score.std()))
    return score

# Outer evaluation loop: 5 shuffled, stratified folds with a fixed seed.
# The trailing semicolon keeps the cell from echoing any return value.
grid_Search_kfold(skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42));

Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 5}
Fold 0: 0.833
Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 7}
Fold 1: 0.733
Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 9}
Fold 2: 0.733
Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 5}
Fold 3: 0.800
Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 11}
Fold 4: 0.700

 Acurácia média (desvio): 0.760 +- (0.049)
