In [6]:

import pandas as pd
import joblib
from collections import Counter
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1) Définir le répertoire ML et charger le dataset
BASE = Path().resolve() / "erp_app" / "ml"
csv_path = BASE / "factures_training_dataset.csv"
df = pd.read_csv(csv_path)

# 2) Encoder le label
df = df[df["statut_final_facture"].isin(["impayée","partielle","payée"])]
df["label"] = df["statut_final_facture"].map({"impayée":0,"partielle":1,"payée":2})

# 3) Sélection des features
num_feats = [
    "montant_total", "nb_relances", "delai_paiement",
    "nb_commandes_client", "total_achats_client", "moyenne_retard_client"
]
cat_feats = ["type_client"]

# 4) Pré-processing
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_pipe = Pipeline([
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor = ColumnTransformer([
    ("num", num_pipe, num_feats),
    ("cat", cat_pipe, cat_feats)
])

# 5) Pipeline avec class_weight pour gérer le déséquilibre
pipeline = Pipeline([
    ("pre", preprocessor),
    ("clf", RandomForestClassifier(
        class_weight="balanced_subsample",
        random_state=42
    ))
])

# 6) Split train/test
X = df[num_feats + cat_feats]
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
print("Répartition des classes (train) :", Counter(y_train))

# 7) GridSearchCV pour optimiser n_estimators et max_depth
param_grid = {
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [None, 10],
    "clf__min_samples_split": [2, 5],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(pipeline, param_grid, cv=cv,
                    scoring="f1_weighted", n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

print("Meilleur score CV :", grid.best_score_)
print("Meilleurs params   :", grid.best_params_)

# 8) Évaluation sur le test set
best = grid.best_estimator_
y_pred = best.predict(X_test)
print(classification_report(y_test, y_pred, target_names=["impayée","partielle","payée"]))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# 9) Sauvegarde du modèle
out_path = BASE / "model_risque_facture_simple.joblib"
joblib.dump(best, out_path)
print("✅ Modèle sauvegardé dans", out_path)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/m2/Documents/ESTM/erp_project/erp_app/ml/factures_training_dataset.csv'