# GridSearch + Validation croisée + Hyperparamètres

In [7]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Load the Wine dataset directly as a (features, labels) pair.
X, y = load_wine(return_X_y=True)

# Hold out 20% of the samples as a test set. Passing the labels to
# `stratify` makes the split stratified on them; the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Single 5-fold stratified splitter (shuffled, fixed seed). The grid
# searches in this notebook all receive this same object as `cv`.
cv = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

## Pipeline + GridSearch (KNN)

In [8]:
# Two-step pipeline: standardise the features, then classify with KNN.
# The scaler is a pipeline step, so the grid search fits it together with
# the classifier rather than on data prepared outside the search.
pipe_knn = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", KNeighborsClassifier()),
    ]
)

# `<step>__<param>` syntax: tune `n_neighbors` of the "model" step,
# trying every k from 1 to 30 inclusive.
param_grid_knn = {"model__n_neighbors": range(1, 31)}

# Exhaustive search over the grid, scored by accuracy on the shared CV folds.
grid_knn = GridSearchCV(
    estimator=pipe_knn,
    param_grid=param_grid_knn,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
)
grid_knn.fit(X_train, y_train)

# Report the selected k, its mean cross-validated accuracy, and the accuracy
# on the held-out test split.
print("Best params KNN:", grid_knn.best_params_)
print("Best CV score:", round(grid_knn.best_score_, 3))
print("Test score:", round(grid_knn.score(X_test, y_test), 3))

Best params KNN: {'model__n_neighbors': 29}
Best CV score: 0.979
Test score: 0.972


## Pipeline + GridSearch (Logistic Regression)

In [9]:
# Two-step pipeline: standardise the features, then fit a logistic
# regression. `max_iter` is raised to 5000 to give the solver more
# iterations than the library default.
pipe_lr = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=5000)),
    ]
)

# `<step>__<param>` syntax: tune `C` of the "model" step over a
# logarithmic range from 0.01 to 100.
param_grid_lr = {"model__C": [0.01, 0.1, 1, 10, 100]}

# Exhaustive search over the grid, scored by accuracy on the shared CV folds.
grid_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid=param_grid_lr,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
)
grid_lr.fit(X_train, y_train)

# Report the selected C, its mean cross-validated accuracy, and the accuracy
# on the held-out test split.
print("Best params LR:", grid_lr.best_params_)
print("Best CV score:", round(grid_lr.best_score_, 3))
print("Test score:", round(grid_lr.score(X_test, y_test), 3))

Best params LR: {'model__C': 1}
Best CV score: 0.979
Test score: 0.972


### Ce que fait réellement GridSearchCV

Pour chaque combinaison d’hyperparamètres :

- Il applique la validation croisée

- Il calcule le score moyen

- Il compare les scores

- Il sélectionne le meilleur

- Il réentraîne le modèle sur l’ensemble du jeu d’entraînement avec la meilleure configuration (comportement par défaut, `refit=True`)

**Donc :**

> GridSearchCV = exploration systématique + validation croisée automatisée.


## Comparer plusieurs modèles proprement

In [10]:
# Candidate models: display name -> (pipeline, hyperparameter grid).
models = {
    "KNN": (pipe_knn, param_grid_knn),
    "LogReg": (pipe_lr, param_grid_lr),
}

# Apply the same search protocol to every candidate (same training data,
# same CV splitter, same metric) and collect one summary entry per model.
results = {}
for model_name, (estimator, param_grid) in models.items():
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring="accuracy",
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    results[model_name] = {
        "best_params": search.best_params_,
        "cv_score": search.best_score_,
        "test_score": search.score(X_test, y_test),
    }

# Bare last expression so the notebook displays the summary dictionary.
results

{'KNN': {'best_params': {'model__n_neighbors': 29},
  'cv_score': np.float64(0.9788177339901478),
  'test_score': 0.9722222222222222},
 'LogReg': {'best_params': {'model__C': 1},
  'cv_score': np.float64(0.9790640394088669),
  'test_score': 0.9722222222222222}}

**Ici on observe :**

- Comparaison équitable

- Même jeu de données

- Même CV

- Même métrique

> C’est une vraie logique “entreprise”.

## Le message fondamental (important pour le DL)

> En ML, le cœur du travail n’est pas le modèle. C’est la méthode de validation.

### En Deep Learning, ce sera exactement pareil :

- On choisira le learning rate

- On choisira l’architecture

- On choisira le nombre de couches

- On regardera la courbe de validation

- La logique de validation, elle, est déjà introduite ici.