In [1]:
import pandas as pd
import joblib
import numpy as np
from pathlib import Path
from sklearn.metrics import roc_auc_score, confusion_matrix



In [2]:
# === Business score ===
def business_score(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    fn = cm[1][0]
    fp = cm[0][1]
    return 10 * fn + fp



In [3]:
# === Préparation données ===
def load_data(csv_path):
    df = pd.read_csv(csv_path)
    X = df.drop(columns=["TARGET", "SK_ID_CURR"], errors="ignore")
    y = df["TARGET"]
    return X, y



In [9]:
# === Modèles à tester ===
model_dir = Path("src/models")
model_paths = list(model_dir.glob("*.pkl"))





In [10]:
print(model_paths)

[PosixPath('src/models/LightGBM_custom.pkl'), PosixPath('src/models/LightGBM_pipeline.pkl'), PosixPath('src/models/RandomForest_smote.pkl'), PosixPath('src/models/XGBoost_custom.pkl'), PosixPath('src/models/RandomForest_pipeline.pkl'), PosixPath('src/models/XGBoost_smote.pkl'), PosixPath('src/models/XGBoost_pipeline.pkl'), PosixPath('src/models/RandomForest_custom.pkl'), PosixPath('src/models/LightGBM_smote.pkl')]


In [11]:
# === Chargement datasets ===
X_train, y_train = load_data("data/processed/train_clean.csv")
X_test, y_test = load_data("data/processed/test_clean.csv")

results = []

In [13]:
# === Évaluation de chaque modèle ===
for model_path in model_paths:
    model = joblib.load(model_path)
    name = model_path.stem

    for dataset_name, X, y in [
        ("train", X_train, y_train),
        ("test", X_test, y_test)
    ]:
        proba = model.predict_proba(X)[:, 1]
        auc = roc_auc_score(y, proba)
        y_pred = (proba >= 0.5).astype(int)
        cost = business_score(y, y_pred)

        results.append({
            "Model": name,
            "Dataset": dataset_name,
            "AUC": auc,
            "Business_Cost": cost
        })



  return x.astype(dtype, copy=copy, casting=casting)


ValueError: Input y_true contains NaN.

In [None]:
# === Résultats ===
df_results = pd.DataFrame(results)
print("\n📊 Comparatif des performances :")
print(df_results.pivot(index="Model", columns="Dataset", values=["AUC", "Business_Cost"]))


In [None]:
# === Meilleur modèle selon test set ===
best_model = df_results[df_results["Dataset"] == "test"].sort_values("Business_Cost").iloc[0]
print(f"\n🏆 Meilleur modèle : {best_model['Model']} | AUC: {best_model['AUC']:.4f} | Coût: {best_model['Business_Cost']}")
