# Avaliando algoritmos de aprendizagem

## Grid-search

***

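O Grid-search testa exaustivamente todas as combinações de uma grade de hiperparâmetros, avaliando cada uma por validação cruzada, para encontrar a configuração do modelo que produz as melhores previsões segundo a métrica escolhida.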

[Documentação do GridSearchCV (scikit-learn)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

## Importando bibliotecas

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

# Learning-model classes
from sklearn.neighbors import KNeighborsClassifier

# Model evaluation functions
from sklearn.metrics import classification_report, f1_score

import warnings
# Silence every warning for the rest of the session. This keeps cell output
# clean, but it also hides warnings that could point at real problems.
warnings.filterwarnings('ignore')

In [2]:
# Load the Iris CSV straight from the course repository.
IRIS_CSV_URL = "https://raw.githubusercontent.com/johnattandouglas/monitoria-ml/main/Datasets/Iris.csv"
dataset = pd.read_csv(IRIS_CSV_URL)

# Encode the class labels as integers (used later to visualise decision regions).
species_codes, species_labels = pd.factorize(dataset['Species'])
dataset['Species'] = species_codes


### Separando o conjunto de dados

In [3]:
# Use only two features: SepalLengthCm and SepalWidthCm.
X = dataset.loc[:, ["SepalLengthCm", "SepalWidthCm"]]

# Select the target as a 1-D Series rather than a single-column DataFrame,
# which is the shape scikit-learn estimators and metrics expect for labels.
y = dataset.loc[:, "Species"]

# Stratified 70/30 train/test split. The fixed seed makes the split (and every
# score computed from it) reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

## Treinamento do modelo com os parâmetros default 

In [4]:
# Baseline: a k-NN classifier built with its default parameters (k=5),
# fitted on the training split.
model = KNeighborsClassifier().fit(X_train, y_train)

# Check how the baseline performs on the held-out test split.
baseline_predictions = model.predict(X_test)
print(classification_report(y_test, baseline_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.60      0.60      0.60        15
           2       0.60      0.60      0.60        15

    accuracy                           0.73        45
   macro avg       0.73      0.73      0.73        45
weighted avg       0.73      0.73      0.73        45



## Seleção de parâmetros com o Grid-Search 

In [5]:
# Hyperparameter search for k-NN: every combination in `parameters` is scored
# with 5-fold cross-validation on the training split only.
model = KNeighborsClassifier()

parameters = {'n_neighbors': [11, 9, 7, 5, 3, 1],
              'metric': ["euclidean", "manhattan"]}

grid = GridSearchCV(estimator=model,          # k-NN
                    param_grid=parameters,    # dictionary of values to try
                    scoring='f1_macro',       # evaluation metric
                    cv=5)                     # cross-validation folds

grid.fit(X_train, y_train)

# Predict once on the test split and reuse the result for the report
# (previously the prediction was computed twice and the first one discarded).
y_pred = grid.predict(X_test)

print("Melhor parametro:", grid.best_params_)
# Performance of the selected configuration on the test split.
print(classification_report(y_test, y_pred))

Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 5}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.60      0.60      0.60        15
           2       0.60      0.60      0.60        15

    accuracy                           0.73        45
   macro avg       0.73      0.73      0.73        45
weighted avg       0.73      0.73      0.73        45



In [6]:
# Tabulate the cross-validation results of the grid search, one row per
# parameter combination that was evaluated.
pd.DataFrame.from_dict(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000968,0.00086,0.004501,0.003452,euclidean,11,"{'metric': 'euclidean', 'n_neighbors': 11}",0.706716,0.805556,0.952137,0.760684,0.85641,0.8163,0.083997,5
1,0.000226,0.000452,0.002921,0.004124,euclidean,9,"{'metric': 'euclidean', 'n_neighbors': 9}",0.655556,0.805556,0.952137,0.760684,0.904762,0.815739,0.105188,6
2,0.00066,0.00084,0.004228,0.006513,euclidean,7,"{'metric': 'euclidean', 'n_neighbors': 7}",0.754335,0.805556,0.904762,0.714286,0.904762,0.81674,0.077475,3
3,0.003127,0.006254,0.0,0.0,euclidean,5,"{'metric': 'euclidean', 'n_neighbors': 5}",0.754335,0.805556,0.904762,0.85641,0.809524,0.826117,0.050899,1
4,0.0,0.0,0.006249,0.007653,euclidean,3,"{'metric': 'euclidean', 'n_neighbors': 3}",0.805556,0.760684,0.904762,0.85641,0.760684,0.817619,0.056085,2
5,0.000696,0.000577,0.005181,0.005442,euclidean,1,"{'metric': 'euclidean', 'n_neighbors': 1}",0.805556,0.583883,0.708333,0.664957,0.714286,0.695403,0.07214,11
6,0.000387,0.000474,0.004511,0.005696,manhattan,11,"{'metric': 'manhattan', 'n_neighbors': 11}",0.706716,0.805556,0.904762,0.760684,0.85641,0.806825,0.069574,7
7,0.001914,0.002907,0.001857,0.002318,manhattan,9,"{'metric': 'manhattan', 'n_neighbors': 9}",0.655556,0.805556,0.904762,0.714286,0.904762,0.796984,0.100147,10
8,0.0,0.0,0.005496,0.006761,manhattan,7,"{'metric': 'manhattan', 'n_neighbors': 7}",0.706716,0.805556,0.952137,0.714286,0.904762,0.816691,0.0988,4
9,0.000361,0.000721,0.004296,0.008496,manhattan,5,"{'metric': 'manhattan', 'n_neighbors': 5}",0.714286,0.805556,0.904762,0.805556,0.760684,0.798168,0.063107,8


In [7]:
def grid_Search_kfold(skf, features=None, target=None):
    """Evaluate a grid-searched k-NN with nested cross-validation.

    For every outer fold produced by ``skf``, a ``GridSearchCV`` (5 inner
    folds, macro F1 scoring) is fitted on the outer training part, and the
    resulting model is scored with macro F1 on the outer held-out part.
    The best parameters and score of each fold are printed, followed by the
    mean and standard deviation over all folds.

    Parameters
    ----------
    skf : cross-validation splitter
        Object exposing ``split(X, y)`` that yields positional
        ``(train_indices, test_indices)`` pairs, e.g. ``StratifiedKFold``.
    features : pandas.DataFrame, optional
        Feature table, indexed positionally via ``.iloc``. Defaults to the
        notebook-level ``X`` so existing calls keep working.
    target : pandas.Series or single-column pandas.DataFrame, optional
        Class labels aligned with ``features``. Defaults to the
        notebook-level ``y``.

    Returns
    -------
    numpy.ndarray
        Macro F1 score of each outer fold, in fold order.
    """
    # Fall back to the notebook globals when no data is passed explicitly.
    features = X if features is None else features
    target = y if target is None else target

    knn = KNeighborsClassifier()
    param_grid = {'n_neighbors': [11, 9, 7, 5, 3, 1],
                  'metric': ["euclidean", "manhattan"]}

    fold_scores = []
    for fold, (train_idx, test_idx) in enumerate(skf.split(features, target)):
        X_fold_train = features.iloc[train_idx]
        X_fold_test = features.iloc[test_idx]
        # np.ravel yields 1-D labels whether `target` is a Series or a
        # single-column DataFrame.
        y_fold_train = np.ravel(target.iloc[train_idx])
        y_fold_test = np.ravel(target.iloc[test_idx])

        grid = GridSearchCV(estimator=knn,            # k-NN
                            param_grid=param_grid,    # dictionary of values to try
                            scoring='f1_macro',       # evaluation metric
                            cv=5)                     # inner cross-validation folds
        grid.fit(X_fold_train, y_fold_train)

        # Score the fold once and reuse the value for printing and aggregation.
        fold_f1 = f1_score(y_fold_test, grid.predict(X_fold_test), average="macro")

        print("Melhor parametro:", grid.best_params_)
        print("Fold %d: %.3f" % (fold, fold_f1))
        fold_scores.append(fold_f1)

    score = np.array(fold_scores)
    print("\n F1-score média (desvio): %.3f +- (%.3f)" % (score.mean(), score.std()))
    return score


# Assign the result so the cell shows only the printed report, not the array repr.
nested_cv_scores = grid_Search_kfold(StratifiedKFold(n_splits=5, random_state=42, shuffle=True))

Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 5}
Fold 0: 0.833
Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 7}
Fold 1: 0.699
Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 9}
Fold 2: 0.731
Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 5}
Fold 3: 0.800
Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 7}
Fold 4: 0.733

 F1-score média (desvio): 0.759 +- (0.049)
