In [1]:
import pandas as pd

# Load the historical PaySim CSV through a path relative to the notebook's working directory.
df = pd.read_csv(r'../data/PaySim_historical.csv') 
# Bare last expression: the frame is rendered as the cell output.
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.0,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.0,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5726353,399,CASH_OUT,115032.25,C1127654098,0.0,0.00,C988464921,132609.43,247641.68,0,0
5726354,399,CASH_OUT,164763.16,C1698066957,0.0,0.00,C2147157546,583495.20,748258.35,0,0
5726355,399,CASH_OUT,108840.67,C1655030605,0.0,0.00,C1820170137,804648.41,913489.08,0,0
5726356,399,PAYMENT,44318.65,C1466491284,15222.0,0.00,M566159769,0.00,0.00,0,0


In [None]:
def prepare_data(df):
    """Build the feature table restricted to TRANSFER / CASH_OUT transactions.

    Steps, in order:
      1. keep only the rows whose ``type`` is ``CASH_OUT`` or ``TRANSFER``;
      2. derive ``hour`` (0-23) as ``step % 24``;
      3. encode ``type`` as a single ``type_TRANSFER`` indicator, with
         ``CASH_OUT`` as the dropped reference level;
      4. drop ``nameOrig``, ``nameDest``, ``isFlaggedFraud`` and ``step``
         when they are present.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions with at least the ``type`` and ``step`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered rows (original index preserved) with the remaining
        input columns followed by ``hour`` and ``type_TRANSFER``.
    """
    # 1. Filtering: only the two transaction types considered at risk are kept.
    # TODO (from the original author): also keep PAYMENT, DEBIT and CASH_IN.
    kept_types = ['CASH_OUT', 'TRANSFER']
    df_prep = df[df['type'].isin(kept_types)].copy()

    # 2. Time: convert the step counter into an hour of the day (0-23).
    df_prep['hour'] = df_prep['step'] % 24

    # 3. Encoding: pin the category levels before calling get_dummies so the
    # output always contains `type_TRANSFER`, even when the frame holds a single
    # transaction type. On a plain string column, drop_first=True removes the
    # only level found and the indicator column disappears.
    df_prep['type'] = pd.Categorical(df_prep['type'], categories=kept_types)
    df_prep = pd.get_dummies(df_prep, columns=['type'], drop_first=True)

    # 4. Clean-up: remove identifier-like or unwanted columns.
    # errors='ignore' keeps the call safe when a column is already absent.
    cols_to_drop = ['nameOrig', 'nameDest', 'isFlaggedFraud', 'step']
    df_prep = df_prep.drop(cols_to_drop, axis=1, errors='ignore')

    return df_prep

# Apply the preparation to the full historical frame loaded above.
df_train_final = prepare_data(df)

In [36]:
# Show 20 randomly drawn rows of the prepared frame (no seed is set, so the draw changes on each run).
df_train_final.sample(20)

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,hour,type_TRANSFER
416180,61669.64,20046.0,0.0,68240.05,129909.68,0,18,False
3943251,197475.18,0.0,0.0,370989.66,568464.84,0,22,False
5405895,305433.45,0.0,0.0,1056381.98,1361815.43,0,17,False
4495358,250123.05,142045.0,0.0,228984.54,479107.59,0,13,False
1395261,201192.33,50046.0,0.0,37084394.28,37285586.61,0,19,False
3963122,232431.24,116754.0,0.0,205769.3,438200.53,0,9,False
3074597,4827.48,0.0,0.0,144794.8,149622.28,0,19,False
1110964,149415.04,0.0,0.0,729488.29,878903.33,0,10,False
5271215,173712.5,3594.0,0.0,0.0,173712.5,0,12,False
3800609,28294.2,0.0,0.0,55406.91,83701.11,0,17,True


In [37]:
# bibliothèques nécessaires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline

# X = features (every column except isFraud), y = target (isFraud)
X = df_train_final.drop('isFraud', axis=1)
y = df_train_final['isFraud']

# Train / test split.
# stratify=y keeps the fraud proportion identical in both partitions. This matters
# because frauds are very rare: in the recorded run the held-out set contains
# 890 frauds out of 500,319 rows (about 0.18%).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scaling and classification are chained in a single Pipeline: the scaler is
# fitted on the training rows only and the same transformation is applied
# automatically at prediction time.
#   n_estimators=100        : number of trees in the forest
#   class_weight='balanced' : compensates for the class imbalance
#   random_state=42         : same result on every run
#   n_jobs=-1               : use every available CPU core
model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("classifier", RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1)),
])

# Fit on the raw training features; the pipeline handles the scaling.
model.fit(X_train, y_train)

# Hard predictions for the report, fraud probabilities for the ROC AUC.
y_pred = model.predict(X_test)
y_score = model.predict_proba(X_test)[:, 1]

print("--- RAPPORT DE CLASSIFICATION ---")
print(classification_report(y_test, y_pred))

print("--- SCORE AUC-ROC ---")
# ROC AUC of the predicted fraud probability (the closer to 1, the better).
print(roc_auc_score(y_test, y_score))


--- RAPPORT DE CLASSIFICATION ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    499429
           1       0.98      0.77      0.86       890

    accuracy                           1.00    500319
   macro avg       0.99      0.88      0.93    500319
weighted avg       1.00      1.00      1.00    500319

--- SCORE AUC-ROC ---
0.9850118576987953


In [38]:
# Columns removed after encoding. 'nameOrig' and 'nameDest' are listed as well,
# hence errors='ignore' on the drop below.
cols_to_drop = ['nameOrig', 'nameDest', 'newbalanceOrig', 'newbalanceDest', 'isFlaggedFraud', 'step']

# Build the feature frame from the raw data in a single chain:
#   - hour     : step converted to an hour of the day (0-23)
#   - nameOrig / nameDest : only the first letter is kept
#     ('C' for Client or 'M' for Merchant)
#   - type, nameOrig, nameDest : one-hot encoded as boolean columns,
#     first level dropped
df_test = (
    df.assign(
        hour=df['step'] % 24,
        nameOrig=df['nameOrig'].str[0],
        nameDest=df['nameDest'].str[0],
    )
    .pipe(pd.get_dummies, columns=['type', 'nameOrig', 'nameDest'], drop_first=True)
    .drop(cols_to_drop, axis=1, errors='ignore')
)

df_test

Unnamed: 0,amount,oldbalanceOrg,oldbalanceDest,isFraud,hour,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,nameDest_M
0,9839.64,170136.0,0.00,0,1,False,False,True,False,True
1,1864.28,21249.0,0.00,0,1,False,False,True,False,True
2,181.00,181.0,0.00,1,1,False,False,False,True,False
3,181.00,181.0,21182.00,1,1,True,False,False,False,False
4,11668.14,41554.0,0.00,0,1,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...
5726353,115032.25,0.0,132609.43,0,15,True,False,False,False,False
5726354,164763.16,0.0,583495.20,0,15,True,False,False,False,False
5726355,108840.67,0.0,804648.41,0,15,True,False,False,False,False
5726356,44318.65,15222.0,0.00,0,15,False,False,True,False,True


In [39]:
# Preview the first five rows of the encoded frame.
df_test.head()

Unnamed: 0,amount,oldbalanceOrg,oldbalanceDest,isFraud,hour,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,nameDest_M
0,9839.64,170136.0,0.0,0,1,False,False,True,False,True
1,1864.28,21249.0,0.0,0,1,False,False,True,False,True
2,181.0,181.0,0.0,1,1,False,False,False,True,False
3,181.0,181.0,21182.0,1,1,True,False,False,False,False
4,11668.14,41554.0,0.0,0,1,False,False,True,False,True


In [40]:
# bibliothèques nécessaires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline

# X = features (every column except isFraud), y = target (isFraud)
X = df_test.drop('isFraud', axis=1)
y = df_test['isFraud']

# Train / test split.
# stratify=y preserves the fraud proportion in both partitions. With every
# transaction type kept, frauds are extremely rare: the recorded held-out set
# contains 890 frauds out of 1,145,272 rows (about 0.08%).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# One Pipeline holds both the scaler and the classifier, so the scaler only
# ever learns from the training rows and is re-applied consistently when
# predicting.
#   n_estimators=100        : number of trees in the forest
#   class_weight='balanced' : compensates for the class imbalance
#   random_state=42         : same result on every run
#   n_jobs=-1               : use every available CPU core
model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("classifier", RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1)),
])

# Fit on the raw training features; scaling happens inside the pipeline.
model.fit(X_train, y_train)

# Hard predictions for the report, fraud probabilities for the ROC AUC.
y_pred = model.predict(X_test)
y_score = model.predict_proba(X_test)[:, 1]

print("--- RAPPORT DE CLASSIFICATION ---")
print(classification_report(y_test, y_pred))

print("--- SCORE AUC-ROC ---")
# ROC AUC of the predicted fraud probability (the closer to 1, the better).
print(roc_auc_score(y_test, y_score))

--- RAPPORT DE CLASSIFICATION ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1144382
           1       0.93      0.64      0.76       890

    accuracy                           1.00   1145272
   macro avg       0.97      0.82      0.88   1145272
weighted avg       1.00      1.00      1.00   1145272

--- SCORE AUC-ROC ---
0.9739213495124468


In [2]:
# Work on a copy so that the raw frame `df` is left unchanged.
df_train_pipeline = df.copy()

In [3]:
# Columns excluded from the features fed to the pipeline.
cols_to_drop = ['newbalanceOrig', 'newbalanceDest', 'isFlaggedFraud', 'step']

# The frame is rebuilt from the raw `df` in one expression instead of being
# mutated in place. The in-place version dropped `step`, so executing the cell
# a second time failed with a KeyError; this version can be re-run safely.
#   - nameOrig / nameDest : only the first letter is kept
#     ('C' for Client or 'M' for Merchant)
#   - hour : step converted to an hour of the day (0-23)
df_train_pipeline = (
    df.assign(
        nameOrig=df['nameOrig'].str[0],
        nameDest=df['nameDest'].str[0],
        hour=df['step'] % 24,
    )
    .drop(cols_to_drop, axis=1, errors='ignore')
)

df_train_pipeline.sample(5)

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,nameDest,oldbalanceDest,isFraud,hour
5333243,CASH_OUT,70609.49,C,4366.39,C,151632.26,0,14
2576650,CASH_OUT,170157.0,C,0.0,C,310742.97,0,15
4289193,CASH_OUT,41277.69,C,0.0,C,286180.79,0,19
1935903,CASH_IN,96134.66,C,23309833.38,C,186220.36,0,9
5515318,CASH_IN,37701.49,C,50319.0,C,131360.03,0,20


In [None]:
# bibliothèques nécessaires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Column groups handled by the preprocessor
num_cols = ["amount", "oldbalanceOrg", "oldbalanceDest", "hour"]
cat_cols = ["type", "nameDest", "nameOrig"]

# X = features (every column except isFraud), y = target (isFraud)
X = df_train_pipeline.drop('isFraud', axis=1)
y = df_train_pipeline['isFraud']

# Train / test split.
# stratify=y preserves the fraud proportion in both partitions; frauds are
# extremely rare here (890 of the 1,145,272 held-out rows in the recorded run,
# about 0.08%).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocessor: scale the numeric columns, one-hot encode the categorical ones
# (first level dropped).
preprocessor = ColumnTransformer(
    transformers = [
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop="first"), cat_cols)
    ])

# Negative / positive ratio of the training labels, passed to XGBoost as
# scale_pos_weight.
count_norm = (y_train == 0).sum()
count_fraud = (y_train == 1).sum()
ratio = count_norm / count_fraud

pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model",XGBClassifier(
    tree_method='hist',     # histogram-based tree construction
    device='cuda',          # train on the NVIDIA GPU (a CUDA device must be available)
    scale_pos_weight=ratio, # imbalance handling, in place of class_weight='balanced'
    random_state=42
))      
])

# Search space; keys are prefixed with the pipeline step name ("model__").
param_grid = {
    'model__n_estimators': [500, 1000],
    'model__max_depth': [3, 6, 9],
    'model__learning_rate': [0.05, 0.1],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
}

# n_iter=10 spells out the number of sampled candidates (the recorded log shows
# 10 candidates x 5 folds). random_state fixes which candidates are drawn so the
# search is reproducible from one run to the next.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions= param_grid,
    n_iter=10,
    scoring= 'f1_macro',
    random_state=42,
    verbose=2,
)

# Fit on the raw training features: scaling and encoding are performed inside
# the pipeline, separately for each cross-validation fold.
random_search.fit(X_train, y_train)

print(f"Meilleurs paramètres : {random_search.best_params_}")
print(f"Meilleur score : {random_search.best_score_:.4f}")

best_model = random_search.best_estimator_

# Hard predictions for the report, fraud probabilities for the ROC AUC.
y_pred = best_model.predict(X_test)
y_score = best_model.predict_proba(X_test)[:, 1]

print("--- RAPPORT DE CLASSIFICATION ---")
print(classification_report(y_test, y_pred))

print("--- SCORE AUC-ROC ---")
# ROC AUC of the predicted fraud probability (the closer to 1, the better).
print(roc_auc_score(y_test, y_score))


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=3, model__n_estimators=500, model__subsample=0.8; total time=   8.4s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=3, model__n_estimators=500, model__subsample=0.8; total time=   9.0s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=3, model__n_estimators=500, model__subsample=0.8; total time=  13.2s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=3, model__n_estimators=500, model__subsample=0.8; total time=   8.1s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=3, model__n_estimators=500, model__subsample=0.8; total time=   7.9s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.1, model__max_depth=6, model__n_estimators=1000, model__subsample=0.8; total time=  18.8s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.1, model__max_depth=6, model__n_e

In [7]:
# Report the best hyper-parameter combination and its cross-validated score
# (the search was configured with scoring='f1_macro').
print(f"Meilleurs paramètres : {random_search.best_params_}")
print(f"Meilleur score : {random_search.best_score_:.4f}")

Meilleurs paramètres : {'model__subsample': 1.0, 'model__n_estimators': 1000, 'model__max_depth': 9, 'model__learning_rate': 0.1, 'model__colsample_bytree': 0.8}
Meilleur score : 0.8616


In [8]:
# Best pipeline found by the randomized search.
best_model = random_search.best_estimator_

# Predictions on the held-out test set.
y_pred = best_model.predict(X_test)

print("--- RAPPORT DE CLASSIFICATION ---")
print(classification_report(y_test, y_pred))


--- RAPPORT DE CLASSIFICATION ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1144382
           1       0.63      0.87      0.73       890

    accuracy                           1.00   1145272
   macro avg       0.82      0.93      0.87   1145272
weighted avg       1.00      1.00      1.00   1145272



In [10]:
import os

import joblib


# Destination of the serialized pipeline. The historical location remains the
# default, so existing runs behave exactly as before. Set the FRAUD_MODEL_PATH
# environment variable to write the model somewhere else (another machine, CI)
# without editing the notebook.
MODEL_PATH = os.environ.get(
    'FRAUD_MODEL_PATH',
    r'C:\Users\frede\Vs_Code\projet_fraude_cb\model\model_ml.joblib',
)

# joblib.dump returns the list of files written, which is shown as the cell output.
joblib.dump(best_model, MODEL_PATH)

['C:\\Users\\frede\\Vs_Code\\projet_fraude_cb\\model\\model_ml.joblib']