In [1]:
import pandas as pd

df = pd.read_csv(r'../data/PaySim_historical.csv') 
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.0,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.0,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5726353,399,CASH_OUT,115032.25,C1127654098,0.0,0.00,C988464921,132609.43,247641.68,0,0
5726354,399,CASH_OUT,164763.16,C1698066957,0.0,0.00,C2147157546,583495.20,748258.35,0,0
5726355,399,CASH_OUT,108840.67,C1655030605,0.0,0.00,C1820170137,804648.41,913489.08,0,0
5726356,399,PAYMENT,44318.65,C1466491284,15222.0,0.00,M566159769,0.00,0.00,0,0


In [2]:
df_train_pipeline = df.copy()

In [3]:
# On ne garde que la première lettre : 'C' pour Client ou 'M' pour Merchant
df_train_pipeline['nameOrig'] = df_train_pipeline['nameOrig'].str[0]
df_train_pipeline['nameDest'] = df_train_pipeline['nameDest'].str[0]

#Temps : Conversion du step en heure journalière (0-23)
df_train_pipeline['hour'] = df_train_pipeline['step'] % 24

cols_to_drop = ['newbalanceOrig', 'newbalanceDest', 'isFlaggedFraud', 'step']
df_train_pipeline = df_train_pipeline.drop(cols_to_drop, axis=1, errors='ignore')

df_train_pipeline.sample(5)

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,nameDest,oldbalanceDest,isFraud,hour
1198602,PAYMENT,4936.03,C,360438.6,M,0.0,0,13
3296707,PAYMENT,7420.07,C,82364.0,M,0.0,0,12
1681252,CASH_OUT,333846.74,C,5097.0,C,8439537.12,0,15
3846367,CASH_IN,252252.3,C,294.0,C,0.0,0,18
2182361,CASH_IN,284895.19,C,10983.0,C,0.0,0,17


In [4]:
# bibliothèques nécessaires
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

# définition du type de colonnes 
num_cols = ["amount", "oldbalanceOrg", "oldbalanceDest", "hour"]
cat_cols = ["type", "nameDest", "nameOrig"]

# X = features (tout sauf isFraud), y = target (isFraud)
X = df_train_pipeline.drop('isFraud', axis=1)
y = df_train_pipeline['isFraud']

# Train Test Split
# Stratify=y est CRUCIAL : il garantit que le petit % de fraude (0.13%) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocessor
preprocessor = ColumnTransformer(
    transformers = [
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop="first"), cat_cols)
    ])

# Onc calcule le ratio pour le donner en parametre Xgboost
count_norm = (y_train == 0).sum()
count_fraud = (y_train == 1).sum()
ratio = count_norm / count_fraud

pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model",XGBClassifier(
    tree_method='hist',     # Utilise l'algorithme d'histogrammes (requis pour GPU)
    device='cuda',          # LA commande qui active ta carte NVIDIA
    scale_pos_weight=ratio, # Remplace class_weight='balanced' (plus précis sur XGB)
    random_state=42
))      
])

param_grid = {
    'model__n_estimators': [500, 1000],
    'model__max_depth': [3, 6, 9],
    'model__learning_rate': [0.05, 0.1],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
}


random_search = RandomizedSearchCV(pipeline, param_distributions= param_grid, scoring= 'f1_macro', verbose=2)

# entraînement sur les données standardisées (X_train) et les réponses sur y_train
random_search.fit(X_train, y_train)

print(f"Meilleurs paramètres : {random_search.best_params_}")
print(f"Meilleur score : {random_search.best_score_:.4f}")

best_model = random_search.best_estimator_

# prédictions
y_pred = best_model.predict(X_test)

print("--- RAPPORT DE CLASSIFICATION ---")
print(classification_report(y_test, y_pred))

print("--- SCORE AUC-ROC ---")

print(roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1]))


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=9, model__n_estimators=1000, model__subsample=0.8; total time=  20.9s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=9, model__n_estimators=1000, model__subsample=0.8; total time=  19.7s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=9, model__n_estimators=1000, model__subsample=0.8; total time=  19.6s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=9, model__n_estimators=1000, model__subsample=0.8; total time=  19.7s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=9, model__n_estimators=1000, model__subsample=0.8; total time=  19.4s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=3, model__n_estimators=500, model__subsample=0.8; total time=   7.3s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=3, mode

In [5]:
print(f"Meilleurs paramètres : {random_search.best_params_}")
print(f"Meilleur score : {random_search.best_score_:.4f}")

Meilleurs paramètres : {'model__subsample': 0.8, 'model__n_estimators': 1000, 'model__max_depth': 9, 'model__learning_rate': 0.1, 'model__colsample_bytree': 1.0}
Meilleur score : 0.8771


In [6]:
best_model = random_search.best_estimator_

# prédictions
y_pred_test = best_model.predict(X_test)
y_pred_train = best_model.predict(X_train)

print("--- RAPPORT DE CLASSIFICATION ---")
print("Test Set")
print(classification_report(y_test, y_pred_test))
print('---------------------------------------------')
print("Train Set")
print(classification_report(y_train, y_pred_train))

--- RAPPORT DE CLASSIFICATION ---
Test Set
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1144382
           1       0.72      0.86      0.78       890

    accuracy                           1.00   1145272
   macro avg       0.86      0.93      0.89   1145272
weighted avg       1.00      1.00      1.00   1145272

---------------------------------------------
Train Set
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   4577527
           1       0.89      1.00      0.94      3559

    accuracy                           1.00   4581086
   macro avg       0.95      1.00      0.97   4581086
weighted avg       1.00      1.00      1.00   4581086



In [10]:
import joblib

# Récupération du preprocessor depuis le pipeline
best_pipeline = random_search.best_estimator_
fitted_preprocessor = best_pipeline.named_steps['preprocess']


fitted_model = best_pipeline.named_steps['model']

joblib.dump(preprocessor, '../src/models/preprocessor.joblib')
joblib.dump(best_model, '../src/models/model_ml.joblib')

# Liste des colonnes
joblib.dump(fitted_preprocessor, '../src/models/preprocessor.joblib')
joblib.dump(fitted_model, '../src/models/model_ml.joblib')
joblib.dump(X_train.columns.tolist(), '../src/models/features_list.joblib')
joblib.dump(best_pipeline, '../src/models/pipeline_v1.joblib')

print("Fichiers sauvegardés !")

Fichiers sauvegardés !


In [11]:
print(X_train.columns.tolist())

['type', 'amount', 'nameOrig', 'oldbalanceOrg', 'nameDest', 'oldbalanceDest', 'hour']
