## 0. Setup + dataset (Adult Income)

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, cross_val_score

# Préprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Outils pipelines
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

# Modèles
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Metrics et évaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [8]:
# 1) Load a real-world dataset: Adult (Census Income), fetched from OpenML
adult = fetch_openml("adult", version=2, as_frame=True)
X = adult.data
# Binary target: 1 when the income label is ">50K", 0 otherwise
y = (adult.target == ">50K").astype(int)

# 2) 80/20 train/test split, stratified on the target and seeded
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3) Partition the columns by dtype: object/category/string columns are
#    treated as categorical, every remaining column as numeric
cat_cols = X.select_dtypes(include=["object", "category", "string"]).columns
num_cols = X.columns.difference(cat_cols)

print("Num cols:", len(num_cols), "| Cat cols:", len(cat_cols))

Num cols: 6 | Cat cols: 8


## 1. Préprocessing manuel (hors pipeline) + modèle “hors pipeline”

### 1.1 Imputer + scaler (num) / imputer + one-hot (cat), à la main

In [10]:
# --- NUMERIC: median imputation followed by standardization.
# Both transformers are fitted on the training split only; the test split is
# merely transformed with what was learned on train.
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()

X_train_num = num_scaler.fit_transform(
    num_imputer.fit_transform(X_train[num_cols])
)
X_test_num = num_scaler.transform(
    num_imputer.transform(X_test[num_cols])
)


In [11]:
# --- CATEGORICAL: most-frequent imputation followed by one-hot encoding.
# As for the numeric columns, fitting happens on the training split only.
cat_imputer = SimpleImputer(strategy="most_frequent")
ohe = OneHotEncoder(handle_unknown="ignore")

X_train_cat = ohe.fit_transform(
    cat_imputer.fit_transform(X_train[cat_cols])
)
X_test_cat = ohe.transform(
    cat_imputer.transform(X_test[cat_cols])
)


In [12]:
# --- CONCAT: stack the numeric block and the one-hot block column-wise.
# The one-hot output is sparse, so the stacked result is sparse too. CSR is
# requested explicitly: without `format`, hstack may return a COO matrix,
# which cannot be row-sliced and gets converted again by the estimator.
X_train_prepared = hstack([X_train_num, X_train_cat], format="csr")
X_test_prepared = hstack([X_test_num, X_test_cat], format="csr")

print("Shape after preprocessing:", X_train_prepared.shape)

Shape after preprocessing: (39073, 105)


### 1.2 Modèle hors pipeline

In [13]:
# Logistic regression trained on the manually prepared matrices.
# `saga` is a stochastic solver, so the seed is fixed to keep the run
# reproducible, like the train/test split seeded earlier in the notebook.
clf = LogisticRegression(max_iter=5000, solver="saga", random_state=42)
clf.fit(X_train_prepared, y_train)

# Evaluate on the held-out split, prepared with the train-fitted transformers
y_pred = clf.predict(X_test_prepared)
print("Accuracy (manual preprocessing):", accuracy_score(y_test, y_pred))

Accuracy (manual preprocessing): 0.8521854846964889


**Remarques :**

- “On doit se souvenir d’appliquer exactement les mêmes transfos au test”

- “On doit faire fit uniquement sur train”

- “C’est vite ingérable quand on fait de la CV / GridSearch”

## 2. L’état de l’art : Pipeline, make_pipeline, ColumnTransformer

### 2.1 ColumnTransformer = préprocessing “par type de colonne”

In [18]:
# Numeric branch: fill missing values with the median, then standardize.
numeric_preprocess = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
categorical_preprocess = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns to its own branch.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_preprocess, num_cols),
        ("cat", categorical_preprocess, cat_cols),
    ]
)

> **Message clé : ColumnTransformer applique en parallèle des étapes différentes selon les colonnes.**

In [21]:
# Apply the preprocessing through the ColumnTransformer:
# fitted on the training split, then reused unchanged on the test split.
X_train_prepared = preprocess.fit_transform(X_train)
X_test_prepared = preprocess.transform(X_test)

# Model. `saga` is a stochastic solver, so the seed is fixed to keep the run
# reproducible.
clf = LogisticRegression(max_iter=5000, solver="saga", random_state=42)
clf.fit(X_train_prepared, y_train)

# Prediction and evaluation. The label names the ColumnTransformer because
# this cell no longer uses the manual preprocessing.
y_pred = clf.predict(X_test_prepared)
print("Accuracy (ColumnTransformer):", accuracy_score(y_test, y_pred))

Accuracy (manual preprocessing): 0.8521854846964889


### 2.2 Pipeline (version explicite, nommée)

In [22]:
# Full pipeline with explicitly named steps: preprocessing + model.
# `saga` is a stochastic solver, so the seed is fixed to keep the run
# reproducible.
pipe_lr = Pipeline([
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=5000, solver="saga", random_state=42)),
])

# Fit on the raw training data; score on the raw test data.
pipe_lr.fit(X_train, y_train)
print("Accuracy (Pipeline):", pipe_lr.score(X_test, y_test))

Accuracy (Pipeline): 0.8521854846964889


> **Message clé : ici, tu donnes les données brutes au pipeline (et non au modèle) → il applique lui-même le préprocessing puis le modèle.**

### 2.3 make_pipeline (raccourci)
Même chose mais sans nommer toi-même les étapes :

In [8]:
# Same pipeline built with make_pipeline: step names are generated
# automatically instead of being given by hand.
# `saga` is a stochastic solver, so the seed is fixed to keep the run
# reproducible.
pipe_lr2 = make_pipeline(
    preprocess,
    LogisticRegression(max_iter=5000, solver="saga", random_state=42),
)

pipe_lr2.fit(X_train, y_train)
print("Accuracy (make_pipeline):", pipe_lr2.score(X_test, y_test))

Accuracy (make_pipeline): 0.8521854846964889


> **Message clé : make_pipeline = plus rapide à écrire, mais noms d'étapes générés automatiquement (pratique, mais moins lisible en GridSearch).**

> ### Best practice

 - Toujours mettre preprocessing dans un Pipeline

 - Utiliser ColumnTransformer pour données mixtes

 - Faire tuning / CV sur le pipeline, jamais sur données déjà transformées

## 3. Modèle dans le pipeline + plusieurs pipelines (modèles + preprocessing)

### Vrai bénéfice : 
> **Il est possible de comparer plusieurs modèles facilement, et chacun peut avoir un preprocessing adapté.**

### 3.1 Créer plusieurs pipelines

In [10]:
# RandomForest is tree-based, so scaling is not necessary; one-hot encoding is
# still used here. This variant of the preprocessing only imputes the numeric
# columns (no scaler) to illustrate a model-specific preprocessing.
preprocess_no_scaler = ColumnTransformer([
    ("num", Pipeline([("imputer", SimpleImputer(strategy="median"))]), num_cols),
    ("cat", categorical_preprocess, cat_cols),
])

# One complete pipeline (preprocessing + model) per candidate, keyed by the
# label printed in the comparison.
pipelines = {
    # `saga` is a stochastic solver: seeded for reproducibility, like the forest below
    "LogReg (scaled + OHE)": Pipeline([
        ("preprocess", preprocess),
        ("model", LogisticRegression(max_iter=5000, solver="saga", random_state=42)),
    ]),

    # KNN is distance-based => scaling matters (plus OHE for categoricals)
    "KNN (scaled + OHE)": Pipeline([
        ("preprocess", preprocess),
        ("model", KNeighborsClassifier(n_neighbors=15)),
    ]),

    "RandomForest (no scale + OHE)": Pipeline([
        ("preprocess", preprocess_no_scaler),
        ("model", RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)),
    ]),
}

### 3.2 Comparaison propre en cross-validation (recommandé)

In [11]:
# Score every candidate pipeline with the same 5-fold cross-validation on the
# training split, then report the mean and standard deviation of the accuracy.
for name in pipelines:
    pipe = pipelines[name]
    scores = cross_val_score(
        estimator=pipe,
        X=X_train,
        y=y_train,
        scoring="accuracy",
        cv=5,
    )
    print(f"{name:30s} | acc CV: {scores.mean():.3f} ± {scores.std():.3f}")

LogReg (scaled + OHE)          | acc CV: 0.851 ± 0.004
KNN (scaled + OHE)             | acc CV: 0.841 ± 0.004
RandomForest (no scale + OHE)  | acc CV: 0.852 ± 0.002


> #### Point clé :
- Comparaison des pipelines complets (donc sans fuite de données entre train et validation)
- C’est plug-and-play pour GridSearchCV