Chargement des librairies

In [1]:
import warnings
import gc
import pandas as pd
import numpy as np
from numpy import argmax
import pandas_flavor as pf
import tqdm
import re
from time import time
from prettytable import PrettyTable

import pickle
from collections import Counter
from sklearn.metrics import f1_score, fbeta_score, make_scorer
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, recall_score, precision_score, confusion_matrix

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'


warnings.filterwarnings("ignore")


Chargement des données

In [2]:
data = pd.read_pickle('../../gen_data/data_to_train.pkl')
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [3]:
@pf.register_dataframe_method
def add_row(df, row):
    df.loc[len(df)] = row

Fonction d'initialisation du dataframe pour le stockage des résultats

In [4]:
def init_table():

    scores_df = pd.DataFrame([], columns=["model name", "step", "time",
                             "roc AUC score", "accuracy", "F1-score","F2-score", "precision", "recall"])
    return scores_df


In [5]:
def pretty_print(df):
    table_scores = PrettyTable()
    table_scores.field_names = df.columns
    table_scores.add_rows(df.values)
    print(table_scores)


Fonction pour extraire un sample équilibré des données

In [6]:
def get_sample_for_testing(data,ratio):
    data_0 = data[data.TARGET == 0]
    data_1 = data[data.TARGET == 1]
    data_0 = data_0.sample(int(round(len(data_0)*ratio, 0)))
    data_1 = data_1.sample(int(round(len(data_1)*ratio, 0)))
    data = data_1.append(data_0)
    del data_0, data_1
    gc.collect()
    return data

# Modélisation 

Scorer utilisé : fbeta_score avec beta = 2 pour donner plus de poids à la classe positive qui est minoritaire. 

In [60]:
ftwo_scorer = make_scorer(fbeta_score, beta=2)

Fonction de gain métier

In [61]:
# à maximiser
def gain(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # Par exemple
    gain =  2* tn - 10*fn
    return gain

In [7]:

def evaluate_and_log(model_name, model_pipeline,step, time, x_test, y_test, scores_df):
    test_pred = model_pipeline.predict(x_test)
    test_pred_proba = model_pipeline.predict_proba(x_test)

    auc_score = roc_auc_score(y_test, test_pred_proba[:, 1])
    accuracy = accuracy_score(y_test, test_pred)
    F1_score = f1_score(y_test, test_pred)
    F2_score = fbeta_score(y_test, test_pred, beta=2)
    precision = precision_score(y_test, test_pred)
    recall = recall_score(y_test, test_pred)

    scores_df.add_row([model_name, step, time, auc_score,accuracy, F1_score, F2_score, precision, recall])
    return scores_df

In [8]:
def plot_roc_curve(model_pipeline, x_test, y_test):
    test_pred_proba = model_pipeline.predict_proba(x_test)
 
    false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, test_pred_proba[:,1])
    fig = px.scatter(x = false_positive_rate, y = true_positive_rate)
    fig.show()

In [48]:
def plot_confusion_matrix(y_test, test_pred):
    conf_mat = confusion_matrix(y_test, test_pred)
    fig = px.imshow(conf_mat,text_auto=True,
                labels=dict(x="Prédiction", y="Données réelles"),
                x=['0', '1'],
                y=['0', '1']
               )
    fig.update_xaxes(side="top")
    fig.update_layout(height=400, width=400)
    fig.show()

In [62]:
def evaluate_model(model_pipeline, x_test, y_test):
    # prediction
    test_pred = model_pipeline.predict(x_test)
    test_pred_proba = model_pipeline.predict_proba(x_test)
    print('Roc auc score : {:.4f}'.format(
        roc_auc_score(y_test, test_pred_proba[:, 1])))
    print('F1-score : {:.4f}'.format(f1_score(y_test, test_pred)))
    print('F2-score : {:.4f}'.format(fbeta_score(y_test, test_pred, beta=2)))
    print('Accuracy :{:.4f}'.format(accuracy_score(y_test, test_pred)))
    print('Precision :{:.4f}'.format(precision_score(y_test, test_pred)))
    print('Recall : {:.4f}'.format( recall_score(y_test, test_pred)))
    print('Gain : {:.4f}'.format(gain(y_test, test_pred)))
    plot_confusion_matrix(y_test, test_pred)
    print('Confusion matrix:\n', confusion_matrix(y_test, test_pred))


## Tests de plusieurs algorithmes sur un sous-ensemble des données 

On prend une partie des données pour accélérer la pré-analyse.

In [13]:
data_sample = get_sample_for_testing(data, 0.2)
y_sample = data_sample[['TARGET']]
X_sample = data_sample.drop(columns=['SK_ID_CURR','TARGET'])
features = X_sample.columns

### Entraînement et évaluation plusieurs classifieurs binaires

Fonction permettant de comparer plusieurs modèles afin de choisir le meilleur à optimiser. 

In [18]:
def train_and_evaluate(X, y, with_smote=False, worf = False):
    scores_df = init_table()
    scale_pos_weight = Counter(y['TARGET'])[0]/Counter(y['TARGET'])[1]
    
    classifiers = [
            ('Dummy classifier', DummyClassifier(strategy="stratified")),
            ('Logistic Regression', LogisticRegression(class_weight='balanced')),
            ('RandomForest', RandomForestClassifier(class_weight='balanced')),
            ('XGBoost', XGBClassifier(scale_pos_weight=scale_pos_weight)),
            ('Light GBM', LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight))
            ]
    if with_smote :
        if worf :  
             classifiers = [
                 ('Dummy classifier', DummyClassifier(strategy="stratified")),
                ('Logistic Regression', LogisticRegression()),
                ('XGBoost', XGBClassifier()),
                ('Light GBM', LGBMClassifier(objective='binary'))
                ]
        else : 
             classifiers = [
                 ('Dummy classifier', DummyClassifier(strategy="stratified")),
                ('Logistic Regression', LogisticRegression()),
                ('RandomForest', RandomForestClassifier()),
                ('XGBoost', XGBClassifier()),
                ('Light GBM', LGBMClassifier(objective='binary'))
                ]
    else  : 
        if worf : 
             classifiers =  [
                ('Dummy classifier', DummyClassifier(strategy="stratified")),
                ('Logistic Regression', LogisticRegression(class_weight='balanced')),
                ('XGBoost', XGBClassifier(scale_pos_weight=scale_pos_weight)),
                ('Light GBM', LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight))
             ]

    skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)

    for clf_name, clf in tqdm.tqdm(classifiers):
        print(clf_name)
        print('===============================')
        # Entraîner le classifieur sur les données d'entraînement
        pipeline = Pipeline(steps=[
            ('scaler', RobustScaler()),
            ('classifier', clf)
        ]
        )

        for i, (train_index, test_index) in enumerate(skfolds.split(X, y)):
            start = time()
            X_train = X.iloc[train_index]
            y_train = y.iloc[train_index]
            X_test = X.iloc[test_index]
            y_test = y.iloc[test_index]
            if with_smote:
                over_only = SMOTE()
                print('Sampling')
            
                # transform the dataset

                X_train_re, y_train_re = over_only.fit_resample(
                    X_train, y_train)
                curr_clf = pipeline.fit(X_train_re, y_train_re)
            else:
                curr_clf = pipeline.fit(X_train, y_train)

            duration = time()-start
    
            print(clf_name + ' -- fold n°' + str(i))
            print('-------------------------------')
            scores_df = evaluate_and_log(
                clf_name, curr_clf, 'train', duration, X_train, y_train, scores_df)
            scores_df = evaluate_and_log(
                clf_name, curr_clf, 'test', duration, X_test, y_test, scores_df)
  
        print('===============================')
       
    return scores_df


Avec oversampling de l'ensemble d'entraînement

In [82]:
scores_df = train_and_evaluate(X_sample, y_sample, with_smote = True)
mean_scores_df = scores_df.groupby(by=['model name', 'step']).agg(
    {'time': sum, 'roc AUC score': 'mean', 'accuracy': 'mean', 'F1-score': 'mean', 'F2-score': 'mean', 'precision': 'mean', 'recall': 'mean'})

mean_scores_df.to_pickle('../../gen_data/partial_data_class_smote_scores_df.pkl')
mean_scores_df.to_excel('../../gen_data/results.xlsx', sheet_name='partial_data_class_smote_scores_df')
mean_scores_df

  0%|          | 0/5 [00:00<?, ?it/s]

Dummy classifier
Sampling
Dummy classifier -- fold n°0
-------------------------------
Sampling
Dummy classifier -- fold n°1
-------------------------------
Sampling
Dummy classifier -- fold n°2
-------------------------------
Sampling
Dummy classifier -- fold n°3
-------------------------------
Sampling
Dummy classifier -- fold n°4
-------------------------------


 20%|██        | 1/5 [00:09<00:39,  9.82s/it]

Logistic Regression
Sampling
Logistic Regression -- fold n°0
-------------------------------
Sampling
Logistic Regression -- fold n°1
-------------------------------
Sampling
Logistic Regression -- fold n°2
-------------------------------
Sampling
Logistic Regression -- fold n°3
-------------------------------
Sampling
Logistic Regression -- fold n°4
-------------------------------


 40%|████      | 2/5 [00:30<00:49, 16.37s/it]

RandomForest
Sampling
RandomForest -- fold n°0
-------------------------------
Sampling
RandomForest -- fold n°1
-------------------------------
Sampling
RandomForest -- fold n°2
-------------------------------
Sampling
RandomForest -- fold n°3
-------------------------------
Sampling
RandomForest -- fold n°4
-------------------------------


 60%|██████    | 3/5 [05:51<05:10, 155.24s/it]

XGBoost
Sampling
XGBoost -- fold n°0
-------------------------------
Sampling
XGBoost -- fold n°1
-------------------------------
Sampling
XGBoost -- fold n°2
-------------------------------
Sampling
XGBoost -- fold n°3
-------------------------------
Sampling
XGBoost -- fold n°4
-------------------------------


 80%|████████  | 4/5 [07:53<02:22, 142.25s/it]

Light GBM
Sampling
Light GBM -- fold n°0
-------------------------------
Sampling
Light GBM -- fold n°1
-------------------------------
Sampling
Light GBM -- fold n°2
-------------------------------
Sampling
Light GBM -- fold n°3
-------------------------------
Sampling
Light GBM -- fold n°4
-------------------------------


100%|██████████| 5/5 [08:34<00:00, 102.97s/it]






ModuleNotFoundError: No module named 'openpyxl'

In [83]:
#mean_scores_df.to_excel('../../gen_data/results.xlsx', sheet_name='partial_data_class_smote_scores_df')

Le modèle Random Forest est très long à s'exécuter et ses résultats sur l'apprentissage montre une grosse tendance à l'overfitting avec ses paramètres par défaut. Aussi, je l'exclus de la comparaison que je vais maintenant effectuer sans faire d'oversampling avec SMOTE mais en utilisant les paramètres dédiés des modèles (class_weight et scale_pos_weight) 

In [84]:
scores_df = train_and_evaluate(X_sample, y_sample, worf=True)
mean_scores_df = scores_df.groupby(by=['model name', 'step']).agg(
    {'time': sum, 'roc AUC score': 'mean', 'accuracy': 'mean', 'F1-score': 'mean', 'F2-score': 'mean', 'precision': 'mean', 'recall': 'mean'})

mean_scores_df.to_pickle('../../gen_data/partial_data_class_parameters_no_smote_scores_df.pkl')
mean_scores_df.to_excel('../../gen_data/results.xlsx', sheet_name='partial_data_class_parameters_no_smote_scores_df')
mean_scores_df

  0%|          | 0/4 [00:00<?, ?it/s]

Dummy classifier
Dummy classifier -- fold n°0
-------------------------------
Dummy classifier -- fold n°1
-------------------------------
Dummy classifier -- fold n°2
-------------------------------
Dummy classifier -- fold n°3
-------------------------------
Dummy classifier -- fold n°4
-------------------------------


 25%|██▌       | 1/4 [00:05<00:16,  5.38s/it]

Logistic Regression
Logistic Regression -- fold n°0
-------------------------------
Logistic Regression -- fold n°1
-------------------------------
Logistic Regression -- fold n°2
-------------------------------
Logistic Regression -- fold n°3
-------------------------------
Logistic Regression -- fold n°4
-------------------------------


 50%|█████     | 2/4 [00:16<00:17,  8.97s/it]

XGBoost
XGBoost -- fold n°0
-------------------------------
XGBoost -- fold n°1
-------------------------------
XGBoost -- fold n°2
-------------------------------
XGBoost -- fold n°3
-------------------------------
XGBoost -- fold n°4
-------------------------------


 75%|███████▌  | 3/4 [01:05<00:26, 26.95s/it]

Light GBM
Light GBM -- fold n°0
-------------------------------
Light GBM -- fold n°1
-------------------------------
Light GBM -- fold n°2
-------------------------------
Light GBM -- fold n°3
-------------------------------
Light GBM -- fold n°4
-------------------------------


100%|██████████| 4/4 [01:19<00:00, 19.87s/it]






Unnamed: 0_level_0,Unnamed: 1_level_0,time,roc AUC score,accuracy,F1-score,F2-score,precision,recall
model name,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Dummy classifier,test,2.192361,0.499984,0.850411,0.07608,0.076227,0.07587,0.076334
Dummy classifier,train,2.192361,0.501571,0.85194,0.078668,0.078474,0.079003,0.078348
Light GBM,test,9.955592,0.743662,0.754008,0.277555,0.405383,0.18195,0.585096
Light GBM,train,9.955592,0.912872,0.796194,0.411926,0.606193,0.268511,0.884189
Logistic Regression,test,8.812297,0.744588,0.686709,0.25837,0.41053,0.159712,0.675932
Logistic Regression,train,8.812297,0.752519,0.689287,0.26311,0.417795,0.162708,0.68711
XGBoost,test,44.485301,0.705863,0.819518,0.266107,0.335125,0.198131,0.405237
XGBoost,train,44.485301,0.980366,0.900589,0.611958,0.786321,0.446839,0.970745


Données complètes

In [85]:
#identifiers = data[['SK_ID_CURR']]
y = data[['TARGET']]
X = data.drop(columns=['SK_ID_CURR','TARGET'])
features = X.columns

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=100, stratify = y)

In [86]:
scores_df = train_and_evaluate(X, y, worf=True)
mean_scores_df = scores_df.groupby(by=['model name', 'step']).agg(
    {'time': sum, 'roc AUC score': 'mean', 'accuracy': 'mean', 'F1-score': 'mean', 'F2-score': 'mean', 'precision': 'mean', 'recall': 'mean'})

mean_scores_df.to_pickle('../../gen_data/all_data_class_parameters_no_smote_scores_df.pkl')
mean_scores_df.to_excel('../../gen_data/results.xlsx', sheet_name='all_data_class_parameters_no_smote_scores_df')
mean_scores_df

  0%|          | 0/4 [00:00<?, ?it/s]

Dummy classifier
Dummy classifier -- fold n°0
-------------------------------
Dummy classifier -- fold n°1
-------------------------------
Dummy classifier -- fold n°2
-------------------------------
Dummy classifier -- fold n°3
-------------------------------
Dummy classifier -- fold n°4
-------------------------------


 25%|██▌       | 1/4 [00:23<01:09, 23.18s/it]

Logistic Regression
Logistic Regression -- fold n°0
-------------------------------
Logistic Regression -- fold n°1
-------------------------------
Logistic Regression -- fold n°2
-------------------------------
Logistic Regression -- fold n°3
-------------------------------
Logistic Regression -- fold n°4
-------------------------------


 50%|█████     | 2/4 [01:16<01:22, 41.11s/it]

XGBoost
XGBoost -- fold n°0
-------------------------------
XGBoost -- fold n°1
-------------------------------
XGBoost -- fold n°2
-------------------------------
XGBoost -- fold n°3
-------------------------------
XGBoost -- fold n°4
-------------------------------


 75%|███████▌  | 3/4 [05:57<02:30, 150.48s/it]

Light GBM
Light GBM -- fold n°0
-------------------------------
Light GBM -- fold n°1
-------------------------------
Light GBM -- fold n°2
-------------------------------
Light GBM -- fold n°3
-------------------------------
Light GBM -- fold n°4
-------------------------------


100%|██████████| 4/4 [06:59<00:00, 104.93s/it]






Unnamed: 0_level_0,Unnamed: 1_level_0,time,roc AUC score,accuracy,F1-score,F2-score,precision,recall
model name,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Dummy classifier,test,9.433001,0.498573,0.851794,0.080208,0.080107,0.080381,0.08004
Dummy classifier,train,9.433001,0.499701,0.851538,0.079191,0.07912,0.07931,0.079074
Light GBM,test,41.039085,0.763452,0.706596,0.272734,0.426064,0.170481,0.68149
Light GBM,train,41.039085,0.808792,0.71665,0.298584,0.466674,0.186579,0.747049
Logistic Regression,test,38.48471,0.751517,0.68874,0.262374,0.416746,0.162222,0.68572
Logistic Regression,train,38.48471,0.752988,0.689085,0.263178,0.418019,0.162721,0.687805
XGBoost,test,260.940971,0.748593,0.746894,0.278178,0.41133,0.180692,0.604109
XGBoost,train,260.940971,0.877306,0.778602,0.373164,0.553422,0.241866,0.816304


## Optimisation du modèle LGBMClassifier avec GridSearchCV

In [None]:
scale_pos_weight = Counter(y_train['TARGET'])[0]/Counter(y_train['TARGET'])[1]


print("Starting LightGBM. Train shape: {}, Validation set shape: {}".format(
        X_train.shape, X_valid.shape))
print("Train counting: {}, Validation counting: {}".format(
        Counter(y_train['TARGET']), Counter(y_valid['TARGET'])))

search_params = {
    'lgbm__learning_rate': [0.01, 0.1, 0.2],
    'lgbm__min_child_samples' : [20, 50, 100],
    'lgbm__min_split_gain': [0.01, 0.05, 0.1],
    'lgbm__reg_alpha': [0, 0.01, 0.02],
    'lgbm__reg_lambda': [0, 0.1, 0.2]
}

fixed_params = {
   # 'lgbm__learning_rate': [0.1],
    'lgbm__num_leaves': [31],
    'lgbm__n_estimators': [500],
    'lgbm__subsample': [0.5],
    'lgbm__colsample_bytree': [0.5],
   # 'lgbm__reg_alpha': [0],
   # 'lgbm__reg_lambda': [0.1]
}

param_grid = {**search_params, **fixed_params}


classifier_pipe = Pipeline(steps=(['scaler', RobustScaler()],
                                ['lgbm', LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight)]))

kfolds = StratifiedKFold(5, shuffle=True, random_state=42)

#Grid search cross-validation

grid_cv = GridSearchCV(classifier_pipe,
                    param_grid,
                    scoring= ftwo_scorer,
                    cv=kfolds,
                    n_jobs=1,
                    return_train_score=True,
                    verbose=10
                    )

grid_cv.fit(X_train,y_train)

print(f"BEST SCORE: {grid_cv.best_score_}")
best_model = grid_cv.best_estimator_
print(grid_cv.best_params_)

# Sauvegarde du meilleur modèle

filename = '../../gen_data/final_model.sav'
pickle.dump(best_model, open(filename, 'wb'))

In [None]:
filename = '../../gen_data/final_model.sav'
best_model = pickle.load(open(filename, 'rb'))

Résultats du modèle optimisé

In [63]:
print('Training results for best model')
print('-------------------------------')
evaluate_model(best_model, X_train, y_train)
print('===============================')
print('Validation results for best model')
print('-------------------------------')
evaluate_model(best_model, X_valid, y_valid)


Training results for best model
-------------------------------
Roc auc score : 0.8962
F1-score : 0.3954
F2-score : 0.5806
Accuracy :0.7916
Precision :0.2582
Recall : 0.8442
Gain : 325004.0000


Confusion matrix:
 [[177972  48176]
 [  3094  16766]]
Validation results for best model
-------------------------------
Roc auc score : 0.7571
F1-score : 0.2867
F2-score : 0.4198
Accuracy :0.7558
Precision :0.1876
Recall : 0.6081
Gain : 67466.0000


Confusion matrix:
 [[43463 13075]
 [ 1946  3019]]


## Recherche du seuil de probabilité permettant de maximiser le gain

Représentation graphique

In [29]:



def plot_gain_scores(threshold_array, gain_scores, f2_scores,  max_threshold) :
    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(
        go.Scatter(x=threshold_array, y=f2_scores, name="f2-score"),
        secondary_y=False,
    )

    fig.add_vline(x = max_threshold)

    fig.add_trace(
        go.Scatter(x=threshold_array, y=gain_scores, name="gain"),
        secondary_y=True,
    )

    fig.update_layout(
        title_text="Gain versus f2-score"
    )

    # Set x-axis title
    fig.update_xaxes(title_text="Seuil de probabilité")

    # Set y-axes titles
    fig.update_yaxes(title_text="f2-score", secondary_y=False)
    fig.update_yaxes(title_text="gain", secondary_y=True)

    fig.show()

In [65]:
def to_labels(pos_probs, threshold):
 return (pos_probs >= threshold).astype('int')
 
y_pred_proba = best_model.predict_proba(X_valid)[::,1]
y_pred = best_model.predict(X_valid)

threshold_array = np.linspace(0, 1, 100)

gain_scores = [gain(y_valid, to_labels(y_pred_proba, t)) for t in threshold_array]
f2_scores = [fbeta_score(y_valid, to_labels(y_pred_proba, t), beta=2) for t in threshold_array]
accuracy_scores = [accuracy_score(y_valid, to_labels(y_pred_proba, t)) for t in threshold_array]

# récupération du meilleur seuil (maximisation du gain)

maxgain_ix = argmax(gain_scores)
best_threshold = threshold_array[maxgain_ix]
max_gain = gain_scores[maxgain_ix]

print('Seuil=%.3f, gain maximum=%.5f' % (best_threshold, max_gain))

Seuil=0.646, gain maximum=71276.00000


In [67]:
print(gain(y_train, y_train))
gain(y_valid, y_valid)

452296

In [31]:
plot_gain_scores(threshold_array, gain_scores, f2_scores, best_threshold)

On recalcule les scores avec le nouveau seuil de probabilité.

In [56]:
def evaluate_model_with_threshold(model_pipeline,x_test, y_test, threshold):
    test_pred_proba = model_pipeline.predict_proba(x_test)
    test_pred_th = to_labels(test_pred_proba, threshold)[::,1]
    print('Roc auc score : {:.4f}'.format(roc_auc_score(y_test, test_pred_th)))
    print('F1-score : {:.4f}'.format(f1_score(y_test, test_pred_th)))
    print('F2-score : {:.4f}'.format(fbeta_score(y_test, test_pred_th, beta=2)))
    print('Accuracy :{:.4f}'.format(accuracy_score(y_test, test_pred_th)))
    print('Precision :{:.4f}'.format(precision_score(y_test, test_pred_th)))
    print('Recall : {:.4f}'.format( recall_score(y_test, test_pred_th)))
    print('Gain : {:.4f}'.format(gain(y_test, test_pred_th)))
    plot_confusion_matrix(y_test, test_pred_th)
    print('Confusion matrix:\n', confusion_matrix(y_test, test_pred_th))

In [87]:
print('Train results for best model with new threshold')
print('-------------------------------')
evaluate_model_with_threshold(best_model,X_train, y_train, best_threshold)
print('Validation results for best model with new threshold')
print('-------------------------------')
evaluate_model_with_threshold(best_model,X_valid, y_valid, best_threshold)


Train results for best model with new threshold
-------------------------------
Roc auc score : 0.7621
F1-score : 0.4652
F2-score : 0.5446
Accuracy :0.8860
Precision :0.3743
Recall : 0.6145
Gain : 334934.0000


Confusion matrix:
 [[205747  20401]
 [  7656  12204]]
Validation results for best model with new threshold
-------------------------------
Roc auc score : 0.6453
F1-score : 0.3062
F2-score : 0.3536
Accuracy :0.8557
Precision :0.2503
Recall : 0.3944
Gain : 71276.0000


Confusion matrix:
 [[50673  5865]
 [ 3007  1958]]


# Importance globale des features

Le modèle Light GBM permet de récupérer l'attribut feature_importances_

In [88]:

feature_importance_df = pd.DataFrame()
feature_importance_df['importance'] = best_model['lgbm'].feature_importances_
feature_importance_df.index = features
feature_importance_df = feature_importance_df.sort_values(
    by='importance', ascending=False)

most_important_features = list(feature_importance_df.nlargest(20, columns=['importance']).index)

Fonction de visualisation des features les plus influentes, à l'échelle globale

In [89]:
def show_global_importance(feature_importance_df, num_features):
    df = feature_importance_df.nlargest(num_features, columns=['importance'])
    fig = px.bar(df, orientation='h')
    fig.update_yaxes(title='Importance')
    fig.update_xaxes(title='Feature')
    fig.update_traces(showlegend=False)
    fig.update_layout(
    title="Importance globale des features",
    font_size=11,
    height=800,
    width=600)
    fig.show()

In [90]:
show_global_importance(feature_importance_df, 20)

On peut aussi visualiser les influences locales respectives sur un sous-ensemble de données

In [91]:
small_sample = get_sample_for_testing(data,0.01)
small_sample.shape

(3075, 102)

In [92]:
y_small_sample = small_sample[['TARGET']]
X_small_sample = small_sample.drop(columns=['SK_ID_CURR','TARGET'])
features = X_sample.columns

Light GBM permet grâce à un paramètre (pred_contrib) de calculer les valeurs SHAP de chaque features, par individu. 

In [93]:

shap_values= best_model.predict(X_small_sample.values,pred_contrib=True)
shap_df = pd.DataFrame(shap_values[:,0:len(features)], columns=features)
shap_best_df = shap_df[most_important_features]


Visualisation des valeurs shap par individu

In [94]:
def plot_bee_chart(shap_best_df) :

    df = pd.melt(shap_best_df, value_vars=shap_best_df.columns).rename(columns={
        "variable": "features",
        "value": "shap_value"
    })
    fig = px.scatter(df, y="features", x="shap_value", color='shap_value')
    fig.update_traces(marker_size=3)
    fig.update_layout(
        title="Influences locales des features pour chaque point",
        font_size=11,
        height=800,
        width=800)
    fig.show()

In [95]:
plot_bee_chart(shap_best_df) 

In [96]:
! jupyter nbconvert --to html training_and_optimisation.ipynb

[NbConvertApp] Converting notebook training_and_optimisation.ipynb to html
[NbConvertApp] Writing 8479130 bytes to training_and_optimisation.html
