#Préparation de l'environnement

In [40]:
# Connexion au drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
# Version utilisée pour entrainer les modèles
!pip install scikit-learn==1.6.0



In [42]:
# Librairies système et utilitaires
import os
import sys
import time
import datetime
from joblib import dump

# Librairies de manipulation de données
import numpy as np
import pandas as pd

# Librairies de visualisation
import shap

# Librairies Machine Learning - Scikit-learn
import sklearn
from sklearn import set_config
from sklearn.metrics import (
    confusion_matrix,
    make_scorer,
    roc_auc_score,
    accuracy_score
)
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV
)
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
import lightgbm
from lightgbm import LGBMClassifier

# Librairies pour le traitement du déséquilibre des classes
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import joblib

In [43]:
lightgbm_model_file_path = "/content/drive/MyDrive/openclassrooms/project_list/project_8/model/lightgbm_model.joblib"
best_pl = joblib.load(lightgbm_model_file_path)

df_train = pd.read_parquet('/content/drive/MyDrive/openclassrooms/project_list/project_8/data/train_data_raw.pq')
df_test = pd.read_parquet('/content/drive/MyDrive/openclassrooms/project_list/project_8/data/test_data_raw.pq')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


# Shap values for test

In [31]:
print(df_train.shape) #uniquement comme reference pour l'explainer SHAP
print(df_test.shape)
print(best_pl.steps)

(307507, 32)
(48744, 32)
[('imputer', SimpleImputer(strategy='median')), ('scaler', MinMaxScaler()), ('model', LGBMClassifier(is_unbalance='True', learning_rate=0.01, max_depth=7,
               n_estimators=1000, num_leaves=20, random_state=42))]


**INITIALISER L'EXPLAINER SHAP AVEC LES DONNEES D'ENTRAINEMENT EQUILIBRE**

In [23]:
### INDIQUER LA DISTRIBUTION TRAIN DANS LE SHAP pour l'initilisation
# Préparer la donnée de référence train
y_train = df_train['TARGET']
X_train = df_train.drop(columns = ['TARGET', 'SK_ID_CURR'])

# Equilibrer les classes pour la distribution
rus_model = RandomUnderSampler(random_state=42)
X_balanced, y_balanced = rus_model.fit_resample(X_train, y_train)
print('Dimension après équilibre de classe', X_balanced.shape)

# Configurer le modèle
# Entrainement du modèle avec les données train
X_processed = best_pl.named_steps['imputer'].transform(X_balanced)
X_processed = best_pl.named_steps['scaler'].transform(X_processed)

# Créer un objet explainer qui va prendre un modèle et une distribution initial du jeu utilisé pour l'entrainement de lightGBM
explainer = shap.Explainer(best_pl.named_steps['model'],
                           X_processed)

Dimension après équilibre de classe (49650, 30)


**FIT AVEC LES VALEURS TESTS**

In [20]:
#### ENTRAINEMENT SHAP SUR LES DONNEES TEST
# On entraine les valeurs shap sur les données test
# Préparer la donnée pour l'entrainement
y_test = df_test['TARGET']
X_test = df_test.drop(columns = ['TARGET', 'SK_ID_CURR'])
X_processed_test = best_pl.named_steps['imputer'].transform(X_test)
X_processed_test = best_pl.named_steps['scaler'].transform(X_processed_test)

In [None]:
shap_values_test = explainer(X_processed_test, check_additivity=False)

# Exportation des shap values test
test_path = "/content/drive/MyDrive/openclassrooms/project_list/project_8/data/shap_values_test.joblib"
joblib.dump(shap_values_test, test_path)

**RECUPERER LE DF_TEST AVEC LES PREDICTIONS**

In [44]:
df_test.drop(columns='TARGET', inplace=True)
df_test['TARGET'] = best_pl.predict(X_test)
df_test



Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,CC_CNT_DRAWINGS_CURRENT_MAX,BURO_DAYS_CREDIT_MEAN,CC_AMT_BALANCE_MEAN,BURO_CREDIT_ACTIVE_Closed_MEAN,DAYS_BIRTH,PREV_NAME_CONTRACT_STATUS_Refused_MEAN,...,BURO_STATUS_C_MEAN_MEAN,PREV_CODE_REJECT_REASON_SCOFR_MEAN,DAYS_LAST_PHONE_CHANGE,CODE_GENDER,APPROVED_DAYS_DECISION_MIN,REFUSED_HOUR_APPR_PROCESS_START_MIN,DAYS_ID_PUBLISH,REG_CITY_NOT_WORK_CITY,SK_ID_CURR,TARGET
0,0.159520,0.789654,0.752614,,,-735.000000,,0.571429,-19241,0.000,...,0.441240,0.0,-1740.0,1,-1740.0,,-812,0,100001,0.0
1,0.432962,0.291656,0.564990,,,-190.666667,,0.333333,-18064,0.000,...,0.128205,0.0,0.0,0,-757.0,,-1623,0,100005,1.0
2,0.610991,0.699787,,0.255556,7.0,-1737.500000,18159.919219,1.000000,-20038,0.000,...,0.397036,0.0,-856.0,0,-1999.0,,-3503,0,100013,0.0
3,0.612704,0.509677,0.525734,0.045455,12.0,-1401.750000,8085.058163,0.583333,-13976,0.000,...,0.362245,0.0,-1805.0,1,-1805.0,,-4208,0,100028,0.0
4,,0.425687,0.202145,,,,,,-13040,0.000,...,,0.0,-821.0,0,-821.0,,-4262,1,100038,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.643026,0.648575,,,,-727.200000,,0.600000,-19970,0.000,...,0.393340,0.0,-684.0,1,-683.0,,-3399,0,456221,0.0
48740,,0.684596,,,,,,,-11186,0.000,...,,0.0,0.0,1,-2451.0,,-3003,1,456222,0.0
48741,0.283712,0.632770,0.733503,,,-539.000000,,0.800000,-15922,0.000,...,0.514412,0.0,-838.0,1,-838.0,,-1504,0,456223,0.0
48742,0.595456,0.445701,0.373090,,,-1755.823529,,0.647059,-13968,0.400,...,0.442340,0.0,-2308.0,0,-2308.0,16.0,-1364,1,456224,0.0


In [51]:
# # Concaténer
# df_concat = pd.concat([df_train, df_test], ignore_index=True)
# df_concat
# Trop long à entrainer pour SHAP

Unnamed: 0,TARGET,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,CC_CNT_DRAWINGS_CURRENT_MAX,BURO_DAYS_CREDIT_MEAN,CC_AMT_BALANCE_MEAN,BURO_CREDIT_ACTIVE_Closed_MEAN,DAYS_BIRTH,...,NAME_EDUCATION_TYPE_Higher education,BURO_STATUS_C_MEAN_MEAN,PREV_CODE_REJECT_REASON_SCOFR_MEAN,DAYS_LAST_PHONE_CHANGE,CODE_GENDER,APPROVED_DAYS_DECISION_MIN,REFUSED_HOUR_APPR_PROCESS_START_MIN,DAYS_ID_PUBLISH,REG_CITY_NOT_WORK_CITY,SK_ID_CURR
0,1.0,0.139376,0.262949,0.083037,,,-874.000000,,0.750000,-9461,...,False,0.175426,0.0,-1134.0,0,-606.0,,-2120,0,100002
1,0.0,,0.622246,0.311267,,,-1400.750000,,0.750000,-16765,...,True,,0.0,-828.0,1,-2341.0,,-291,0,100003
2,0.0,0.729567,0.555912,,,,-867.000000,,1.000000,-19046,...,False,,0.0,-815.0,0,-815.0,,-2531,0,100004
3,0.0,,0.650442,,,0.0,,0.00000,,-19005,...,False,,0.0,-617.0,1,-617.0,15.0,-2437,0,100006
4,0.0,,0.322738,,,,-1149.000000,,1.000000,-19932,...,False,,0.0,-1106.0,0,-2357.0,,-3458,1,100007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356246,0.0,0.643026,0.648575,,,,-727.200000,,0.600000,-19970,...,False,0.393340,0.0,-684.0,1,-683.0,,-3399,0,456221
356247,0.0,,0.684596,,,,,,,-11186,...,False,,0.0,0.0,1,-2451.0,,-3003,1,456222
356248,0.0,0.283712,0.632770,0.733503,,,-539.000000,,0.800000,-15922,...,False,0.514412,0.0,-838.0,1,-838.0,,-1504,0,456223
356249,0.0,0.595456,0.445701,0.373090,,,-1755.823529,,0.647059,-13968,...,True,0.442340,0.0,-2308.0,0,-2308.0,16.0,-1364,1,456224


In [46]:
# Exportation
df_test_file_path = "/content/drive/MyDrive/openclassrooms/project_list/project_8/data/df_test_processed.pq"
df_test.to_parquet(df_test_file_path)

In [49]:
df_test[df_test['SK_ID_CURR']==100038].index[0]

4