# Projet end-to-end "entreprise-like" sur Titanic

## 1) Imports et chargement des données

### Bibliothèques

In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

### Chargement des données

In [None]:
# Fetch version 1 of the "titanic" dataset from OpenML as pandas objects.
titanic = fetch_openml(name="titanic", version=1, as_frame=True)

# Binary target: 1 where the raw label equals the string "1" (survived), else 0.
y = titanic.target.eq("1").astype(int)
X = titanic.data

### Séparation train / test

In [None]:
# Hold out 20% of the rows as a test set, stratified on the target and
# seeded for reproducibility. The test part is only used in the final
# evaluation step.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

print(f"Train: {X_train.shape} Test: {X_test.shape}")

## 2) Feature engineering (simple et clair)

On ajoute quelques features simples, très “business-like” (reproductibles) :  

- family_size = sibsp + parch + 1

- is_alone

- title (titre de civilité extrait de la colonne `name`)

- cabin_known

In [None]:
def create_features(df):
    """Return a copy of ``df`` with four engineered columns appended.

    The input frame is left untouched. It must provide the columns
    ``sibsp``, ``parch``, ``name`` and ``cabin``.

    Added columns, in this order:

    * ``family_size``: ``sibsp + parch + 1`` (the passenger counts too).
    * ``is_alone``: 1 when ``family_size`` equals 1, otherwise 0.
    * ``title``: the text found between the first comma and the next
      period of ``name``; missing when the pattern does not match.
    * ``cabin_known``: 1 when ``cabin`` is not missing, otherwise 0.
    """
    party_size = df["sibsp"] + df["parch"] + 1

    # expand=False yields a Series (single capture group), which is
    # assigned as the "title" column.
    honorific = df["name"].str.extract(r",\s*([^\.]+)\.", expand=False)

    # assign() works on a copy, so the caller's frame is not modified.
    return df.assign(
        family_size=party_size,
        is_alone=party_size.eq(1).astype(int),
        title=honorific,
        cabin_known=df["cabin"].notna().astype(int),
    )

## 3) Colonnes utilisées

In [None]:
# Columns sent through the numeric branch of the preprocessing: two raw
# columns followed by the numeric/binary features added by create_features.
NUM_COLS = ["age", "fare", "family_size", "is_alone", "cabin_known"]

# Columns sent through the categorical branch, including the engineered
# "title" column.
CAT_COLS = ["sex", "pclass", "embarked", "title"]

## 4) Preprocessing

* Numérique :  

    * imputation
    
    * scaling

* Catégoriel :  

    * imputation

    * OneHot

In [None]:
# Numeric branch: fill missing values with the column median, then
# standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent value,
# then one-hot encode. Categories unseen during fit are ignored rather than
# raising an error.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch.
preprocessing = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, NUM_COLS),
        ("cat", categorical_pipeline, CAT_COLS),
    ]
)

## 5) Pipeline complet (feature engineering + preprocessing + modèle)

Version Logistic Regression :

In [None]:
# End-to-end logistic-regression pipeline: feature engineering, then
# preprocessing, then the classifier.
# NOTE: `preprocessing` is passed by reference here, not copied.
pipe_logreg = Pipeline(
    steps=[
        ("features", FunctionTransformer(func=create_features)),
        ("preprocess", preprocessing),
        ("model", LogisticRegression(max_iter=5000)),
    ]
)

Version Random Forest :

In [None]:
# End-to-end random-forest pipeline: feature engineering, then
# preprocessing, then the classifier (seeded for reproducibility).
# NOTE: `preprocessing` is passed by reference here, not copied.
pipe_rf = Pipeline(
    steps=[
        ("features", FunctionTransformer(func=create_features)),
        ("preprocess", preprocessing),
        ("model", RandomForestClassifier(random_state=42)),
    ]
)

## 6) Cross-validation

In [None]:
# Stratified 5-fold splitter, shuffled with a fixed seed so that both grid
# searches below are given the same splitter configuration.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

## 7) Optimisation des hyperparamètres

Logistic Regression

In [None]:
# Grid search over the regularization parameter C of the logistic
# regression, scored by ROC AUC on the shared CV splitter, using all
# available cores.
grid_logreg = GridSearchCV(
    estimator=pipe_logreg,
    param_grid={"model__C": [0.01, 0.1, 1, 10]},
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
)
grid_logreg.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print("Best params:", grid_logreg.best_params_)
print("Best CV score:", grid_logreg.best_score_)

Random Forest

In [None]:
# Grid search over the number of trees and the maximum depth of the random
# forest, scored by ROC AUC on the shared CV splitter, using all available
# cores.
grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid={
        "model__n_estimators": [100, 300],
        "model__max_depth": [None, 5, 10],
    },
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print("Best params:", grid_rf.best_params_)
print("Best CV score:", grid_rf.best_score_)

## 8) Comparaison des modèles

In [None]:
# Show the best_score_ of each grid search side by side (both searches use
# scoring="roc_auc").
print("LogReg CV score:", grid_logreg.best_score_)
print("RF CV score:", grid_rf.best_score_)

Choisir le meilleur :

In [None]:
# Keep the refitted estimator of the search with the higher CV score.
# Logistic regression is kept unless the random forest is strictly better,
# so ties go to logistic regression.
if not (grid_rf.best_score_ > grid_logreg.best_score_):
    best_model = grid_logreg.best_estimator_
    print("Best model: Logistic Regression")
else:
    best_model = grid_rf.best_estimator_
    print("Best model: Random Forest")

## 9) Évaluation finale sur le test set

In [None]:
# Scores used for ROC AUC: second column of the predict_proba output.
y_proba = best_model.predict_proba(X_test)[:, 1]

# Hard class predictions used for accuracy and the classification report.
y_pred = best_model.predict(X_test)

# Metrics on the held-out test set.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("ROC AUC:", roc_auc_score(y_true=y_test, y_score=y_proba))
print(classification_report(y_true=y_test, y_pred=y_pred))

## RECAP

Ce notebook reproduit les grandes étapes d’un projet de machine learning en entreprise :  

- Feature engineering

- Preprocessing

- Pipeline complet

- Cross-validation

- Optimisation des hyperparamètres

- Comparaison de modèles

- Évaluation finale sur un test indépendant