Chargement des librairies

In [31]:
import pandas as pd
import numpy as np
from numpy import arange, argmin, argmax
import pandas_flavor as pf
import tqdm
import re
from random import random
from time import time
from prettytable import PrettyTable

import pickle
from collections import Counter
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, confusion_matrix

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier




import plotly.graph_objects as go
from plotly.subplots import make_subplots

import plotly.express as px

import gc
import warnings

warnings.filterwarnings("ignore")


Chargement des données

In [16]:
# Load the prepared training table and strip from every column name any run of
# characters other than ASCII letters, digits and underscores.
# NOTE(review): presumably needed because a downstream model rejects special
# characters in feature names - confirm.
data = (
    pd.read_pickle('data_to_train.pkl')
    .rename(columns=lambda col: re.sub('[^A-Za-z0-9_]+', '', col))
)


In [17]:
@pf.register_dataframe_method
def add_row(df, row):
    """Append ``row`` to ``df`` in place under the integer label ``len(df)``.

    Registered through ``pandas_flavor`` so it can be called as
    ``some_df.add_row(row)``. Nothing is returned; the frame is mutated.
    Assumes ``df`` uses a default 0..n-1 index, so that ``len(df)`` is a label
    not yet in use.
    """
    next_label = len(df)
    df.loc[next_label] = row


Fonction de stockage des résultats dans des tables

In [18]:

def init_tables():
    """Create the two empty score containers used to log model results.

    Returns
    -------
    tuple
        ``(scores_df, table_scores)``: an empty ``DataFrame`` and an empty
        ``PrettyTable`` that share the same eight column names.
    """
    score_columns = ["model name", "step", "time",
                     "roc AUC score", "accuracy", "F2-score", "precision", "recall"]

    scores_df = pd.DataFrame([], columns=score_columns)

    table_scores = PrettyTable()
    table_scores.field_names = list(score_columns)

    return scores_df, table_scores


In [19]:
def get_sample_for_testing(data, ratio, random_state=None):
    """Return a random subsample of ``data`` that keeps the TARGET class balance.

    The rows with ``TARGET == 0`` and ``TARGET == 1`` are sampled separately,
    each at the fraction ``ratio``, then stacked with the positive rows first.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing a ``TARGET`` column with values 0 and 1.
    ratio : float
        Fraction of each class to keep (between 0 and 1).
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible draws. The
        default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Sampled rows; original index labels are preserved.
    """
    negatives = data[data.TARGET == 0]
    positives = data[data.TARGET == 1]

    n_negatives = int(round(len(negatives) * ratio, 0))
    n_positives = int(round(len(positives) * ratio, 0))

    sampled_negatives = negatives.sample(n_negatives, random_state=random_state)
    sampled_positives = positives.sample(n_positives, random_state=random_state)

    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # result (positive rows first, then negative rows).
    return pd.concat([sampled_positives, sampled_negatives])

# Modélisation et optimisation

In [20]:

def evaluate_and_log(model_name, model_pipeline,step, time, x_test, y_test, scores_df, table_scores):
    """Score a fitted pipeline on one dataset and append the result to both tables.

    Parameters
    ----------
    model_name, step, time
        Values stored verbatim in the first three columns of the logged row
        (``time`` is a duration supplied by the caller, not the ``time``
        function imported at the top of the notebook).
    model_pipeline
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    x_test, y_test
        Features and true labels to score against.
    scores_df, table_scores
        Containers created by ``init_tables``; both are extended in place.

    Returns
    -------
    tuple
        The same ``(scores_df, table_scores)`` objects, after the row is added.
    """
    predicted_labels = model_pipeline.predict(x_test)
    positive_proba = model_pipeline.predict_proba(x_test)[:, 1]

    # Same column order as the tables: AUC, accuracy, F2, precision, recall.
    metric_values = [
        roc_auc_score(y_test, positive_proba),
        accuracy_score(y_test, predicted_labels),
        fbeta_score(y_test, predicted_labels, beta=2),
        precision_score(y_test, predicted_labels),
        recall_score(y_test, predicted_labels),
    ]
    row = [model_name, step, time] + metric_values

    scores_df.add_row(row)
    table_scores.add_row(list(row))
    return scores_df, table_scores

In [21]:
def evaluate_model(model_pipeline, x_test, y_test):
    """Print ROC AUC, F2, accuracy, precision, recall and the confusion matrix.

    ``model_pipeline`` must be fitted and expose ``predict`` and
    ``predict_proba``. ROC AUC is computed from the second ``predict_proba``
    column; every other metric uses the labels returned by ``predict``.
    Nothing is returned.
    """
    test_pred = model_pipeline.predict(x_test)
    test_pred_proba = model_pipeline.predict_proba(x_test)

    # (format string, lazily evaluated metric), printed in this order.
    report_lines = (
        ('Roc auc score : {:.4f}', lambda: roc_auc_score(y_test, test_pred_proba[:, 1])),
        ('F2-score : {:.4f}', lambda: fbeta_score(y_test, test_pred, beta=2)),
        ('Accuracy :{:.4f}', lambda: accuracy_score(y_test, test_pred)),
        ('Precision :{:.4f}', lambda: precision_score(y_test, test_pred)),
        ('Recall : {:.4f}', lambda: recall_score(y_test, test_pred)),
    )
    for template, compute in report_lines:
        print(template.format(compute()))

    print('Confusion matrix:\n', confusion_matrix(y_test, test_pred))

In [22]:
def to_labels(pos_probs, threshold):
    """Turn probabilities into 0/1 labels: 1 where ``pos_probs >= threshold``."""
    above_threshold = pos_probs >= threshold
    return above_threshold.astype('int')

def evaluate_model_with_threshold(model_pipeline,x_test, y_test, threshold):
    """Print evaluation metrics using a custom probability threshold.

    Parameters
    ----------
    model_pipeline
        Fitted estimator exposing ``predict_proba``.
    x_test, y_test
        Features and true labels to score against.
    threshold : float
        A sample is labelled positive when its positive-class probability is
        greater than or equal to this value.

    Nothing is returned; the metrics and confusion matrix are printed.
    """
    positive_proba = model_pipeline.predict_proba(x_test)[:, 1]
    test_pred_th = to_labels(positive_proba, threshold)

    # ROC AUC ranks samples by score, so it must be fed the probabilities;
    # feeding it the thresholded 0/1 labels collapses the curve to one point.
    print('Roc auc score : {:.4f}'.format(roc_auc_score(y_test, positive_proba)))
    print('F2-score : {:.4f}'.format(fbeta_score(y_test, test_pred_th, beta=2)))
    print('Accuracy :{:.4f}'.format(accuracy_score(y_test, test_pred_th)))
    print('Precision :{:.4f}'.format(precision_score(y_test, test_pred_th)))
    print('Recall : {:.4f}'.format( recall_score(y_test, test_pred_th)))
    print('Confusion matrix:\n', confusion_matrix(y_test, test_pred_th))

Scorer utilisé : fbeta_score avec beta = 2, qui donne plus de poids au rappel (recall) qu'à la précision, afin de mieux détecter la classe positive qui est minoritaire.

In [23]:
# F-beta scorer with beta=2 (recall weighted more heavily than precision);
# passed as the `scoring` argument of the grid search further down.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

## Tests de plusieurs algorithmes sur un sous-ensemble des données 

On prend la moitié des données uniquement pour accélérer l'analyse

In [24]:
# Keep half of each class, then separate the target from the features.
# SK_ID_CURR is dropped from the features along with TARGET.
data_sample = get_sample_for_testing(data, ratio=0.5)
y_sample = data_sample.loc[:, ['TARGET']]
X_sample = data_sample.drop(['SK_ID_CURR', 'TARGET'], axis=1)
features = X_sample.columns

### Dummy Classifier

In [25]:
# Stratified 80/20 hold-out split of the half-size sample.
(X_sample_train, X_sample_valid,
 y_sample_train, y_sample_valid) = train_test_split(
    X_sample,
    y_sample,
    test_size=0.20,
    random_state=42,
    stratify=y_sample,
)

In [26]:
# Baseline: labels drawn at random following the training class distribution.
# The draw is seeded so the baseline figures can be reproduced after a
# kernel restart.
dummy_clf = DummyClassifier(strategy="stratified", random_state=42)
dummy_clf.fit(X_sample_train,y_sample_train)
y_dummy_pred = dummy_clf.predict(X_sample_valid)

print('Training Results for Dummy Classification ')
print('-------------------------------')
evaluate_model(dummy_clf,X_sample_train, y_sample_train)
print('===============================')
print('Validation Results for Dummy Classification ')
print('-------------------------------')
evaluate_model(dummy_clf,X_sample_valid, y_sample_valid)

Training Results for Dummy Classification 
-------------------------------
Roc auc score : 0.4976
F2-score : 0.0812
Accuracy :0.8512
Precision :0.0808
Recall : 0.0813
Confusion matrix:
 [[103891   9183]
 [  9123    807]]
Validation Results for Dummy Classification 
-------------------------------
Roc auc score : 0.4978
F2-score : 0.0879
Accuracy :0.8512
Precision :0.0865
Recall : 0.0882
Confusion matrix:
 [[25956  2313]
 [ 2263   219]]


### Entraînement et évaluation de plusieurs classifieurs binaires

Fonction pour comparer quelques modèles, sans optimisation, pour choisir le meilleur modèle à optimiser. 

In [60]:
def train_and_evaluate(X, y, with_smote=False, worf = False):
    """Cross-validate a fixed set of untuned classifiers and log their scores.

    Each classifier is placed after a ``RobustScaler`` in a pipeline and fitted
    on every split of a shuffled 5-fold ``StratifiedKFold`` (seed 100). After
    each fit the pipeline is scored through ``evaluate_and_log`` on the fold's
    original training rows (step ``'train'``) and on its held-out rows (step
    ``'test'``). The logged time covers fold slicing, optional resampling and
    fitting.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, sliced positionally with ``.iloc``.
    y : pandas.DataFrame
        Table holding a ``TARGET`` column.
    with_smote : bool, default False
        When true, each training fold is oversampled with ``SMOTE`` before the
        fit. Scoring still uses the non-resampled training rows.
    worf : bool, default False
        When equal to ``True``, use Logistic Regression, XGBoost and LightGBM
        without class weighting. Otherwise use class-weighted Logistic
        Regression, RandomForest, XGBoost and LightGBM.
        NOTE(review): the name presumably means "without random forest" - confirm.

    Returns
    -------
    tuple
        ``(scores_df, table_scores)`` with one row per (classifier, fold, step).
    """
    scores_df, table_scores = init_tables()

    # Negative/positive ratio, used to weight the positive class in the boosters.
    target_counts = Counter(y['TARGET'])
    scale_pos_weight = target_counts[0]/target_counts[1]

    # Kept as an equality test so the branch is taken exactly as before.
    if worf == True:
        classifiers = [
            ('Logistic Regression', LogisticRegression()),
            ('XGBoost', XGBClassifier()),
            ('Light GBM', LGBMClassifier(objective='binary'))
        ]
    else:
        classifiers = [
            ('Logistic Regression', LogisticRegression(class_weight='balanced')),
            ('RandomForest', RandomForestClassifier(class_weight='balanced')),
            ('XGBoost', XGBClassifier(scale_pos_weight=scale_pos_weight)),
            ('Light GBM', LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight))
        ]

    skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)

    for clf_name, clf in tqdm.tqdm(classifiers):
        print(clf_name)
        print('===============================')
        pipeline = Pipeline(steps=[('scaler', RobustScaler()), ('classifier', clf)])

        for fold_no, (train_index, test_index) in enumerate(skfolds.split(X, y)):
            start = time()
            X_train, y_train = X.iloc[train_index], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index], y.iloc[test_index]

            # Data actually used for fitting; replaced by the resampled fold
            # when SMOTE is requested.
            X_fit, y_fit = X_train, y_train
            if with_smote:
                over_only = SMOTE()
                print('Before sampling')
                print(Counter(y_train['TARGET']))
                X_fit, y_fit = over_only.fit_resample(X_train, y_train)
                print('After sampling')
                print(Counter(y_fit['TARGET']))

            curr_clf = pipeline.fit(X_fit, y_fit)
            duration = time()-start

            print(f'{clf_name} -- fold n°{fold_no}')
            print('-------------------------------')
            for step, X_eval, y_eval in (('train', X_train, y_train),
                                         ('test', X_test, y_test)):
                scores_df, table_scores = evaluate_and_log(
                    clf_name, curr_clf, step, duration, X_eval, y_eval, scores_df, table_scores)

        print('===============================')

    return scores_df, table_scores


In [61]:
# Compare the class-weighted classifiers on the half-size sample, without SMOTE.
scores_df, table_scores = train_and_evaluate(X_sample, y_sample, with_smote=False)

  0%|          | 0/4 [00:00<?, ?it/s]

Logistic Regression
Logistic Regression -- fold n°0
-------------------------------
Logistic Regression -- fold n°1
-------------------------------
Logistic Regression -- fold n°2
-------------------------------
Logistic Regression -- fold n°3
-------------------------------
Logistic Regression -- fold n°4
-------------------------------


 25%|██▌       | 1/4 [00:21<01:04, 21.48s/it]

RandomForest
RandomForest -- fold n°0
-------------------------------
RandomForest -- fold n°1
-------------------------------
RandomForest -- fold n°2
-------------------------------
RandomForest -- fold n°3
-------------------------------
RandomForest -- fold n°4
-------------------------------


 50%|█████     | 2/4 [06:42<07:45, 232.87s/it]

XGBoost
XGBoost -- fold n°0
-------------------------------
XGBoost -- fold n°1
-------------------------------
XGBoost -- fold n°2
-------------------------------
XGBoost -- fold n°3
-------------------------------
XGBoost -- fold n°4
-------------------------------


 75%|███████▌  | 3/4 [08:41<03:00, 180.98s/it]

Light GBM
Light GBM -- fold n°0
-------------------------------
Light GBM -- fold n°1
-------------------------------
Light GBM -- fold n°2
-------------------------------
Light GBM -- fold n°3
-------------------------------
Light GBM -- fold n°4
-------------------------------


100%|██████████| 4/4 [09:09<00:00, 137.38s/it]






In [62]:
# Per (model, step) summary over the folds: total fit time and mean of each metric.
# The aggregation is named with the string 'sum' rather than the builtin `sum`,
# which recent pandas versions deprecate as an `agg` argument.
mean_scores_df = scores_df.groupby(by=['model name', 'step']).agg(
    {'time': 'sum', 'roc AUC score': 'mean', 'accuracy': 'mean', 'F2-score': 'mean', 'precision': 'mean', 'recall': 'mean'})

mean_scores_df.to_pickle('half_data_class_weight_scores_df.pkl')
mean_scores_df

Unnamed: 0_level_0,Unnamed: 1_level_0,time,roc AUC score,accuracy,F2-score,precision,recall
model name,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Light GBM,test,18.621716,0.756476,0.718481,0.418557,0.172017,0.652271
Light GBM,train,18.621716,0.841155,0.738269,0.502148,0.20584,0.784463
Logistic Regression,test,16.593978,0.749323,0.689005,0.416249,0.162145,0.684418
Logistic Regression,train,16.593978,0.752444,0.690007,0.418828,0.16323,0.688265
RandomForest,test,327.466592,0.719735,0.9193,0.002515,0.509945,0.002014
RandomForest,train,327.466592,1.0,0.999954,0.999549,1.0,0.999436
XGBoost,test,110.645997,0.731977,0.768177,0.389792,0.183456,0.542298
XGBoost,train,110.645997,0.920286,0.817286,0.623469,0.290292,0.874356


In [64]:
# Per (model, step) spread over the folds: total fit time and standard deviation
# of each metric. 'sum' is passed as a string because recent pandas versions
# deprecate the builtin `sum` as an `agg` argument.
std_scores_df = scores_df.groupby(by=['model name', 'step']).agg(
    {'time': 'sum', 'roc AUC score': 'std', 'accuracy': 'std', 'F2-score': 'std', 'precision': 'std', 'recall': 'std'})

std_scores_df.to_pickle('half_data_class_weight_std_scores_df.pkl')
std_scores_df

Unnamed: 0_level_0,Unnamed: 1_level_0,time,roc AUC score,accuracy,F2-score,precision,recall
model name,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Light GBM,test,18.621716,0.004813485,0.001803,0.007131,0.002918,0.011191
Light GBM,train,18.621716,0.001384787,0.002038,0.003228,0.001782,0.00395
Logistic Regression,test,16.593978,0.004498444,0.0036,0.003328,0.001732,0.006118
Logistic Regression,train,16.593978,0.00092907,0.001414,0.00116,0.000758,0.000705
RandomForest,test,327.466592,0.008410017,0.000159,0.001705,0.193685,0.001366
RandomForest,train,327.466592,5.5511150000000004e-17,2.3e-05,0.000232,0.0,0.00029
XGBoost,test,110.645997,0.006399186,0.00347,0.008788,0.004513,0.012255
XGBoost,train,110.645997,0.002217879,0.001958,0.004889,0.003034,0.005187


Le modèle Random Forest est le plus long à s'exécuter. Ses résultats sur l'apprentissage montrent une forte tendance à l'overfitting avec ses paramètres par défaut. XGBoost a aussi une légère tendance à l'overfitting et sa durée d'exécution n'est pas négligeable comparée à celle de LightGBM.

Je vais maintenant comparer les résultats obtenus sur l'apprentissage avec l'ensemble complet, pour la régression logistique et Light GBM.

In [67]:
# Full dataset: separate the target from the features (SK_ID_CURR is dropped),
# then make a stratified 80/20 train/validation split.
y = data.loc[:, ['TARGET']]
X = data.drop(['SK_ID_CURR', 'TARGET'], axis=1)
features = X.columns

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=100,
    stratify=y,
)

Régression Logistique

In [50]:
# Baseline logistic regression: robust scaling followed by a class-weighted model.
pipe1 = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('model', LogisticRegression(class_weight='balanced')),
])

default_logreg_clf = pipe1.fit(X_train, y_train)

In [51]:
# Report the baseline logistic regression on the training and validation splits.
print('Training results for Base Log Reg',
      '-------------------------------', sep='\n')
evaluate_model(default_logreg_clf, X_train, y_train)
print('===============================',
      'Validation results for Base Log Reg',
      '-------------------------------', sep='\n')
evaluate_model(default_logreg_clf, X_valid, y_valid)

Training results for Base Log Reg
-------------------------------
Roc auc score : 0.7521
F2-score : 0.4183
Accuracy :0.6896
Precision :0.1630
Recall : 0.6877
Confusion matrix:
 [[155992  70156]
 [  6202  13658]]
Validation results for Base Log Reg
-------------------------------
Roc auc score : 0.7532
F2-score : 0.4168
Accuracy :0.6899
Precision :0.1626
Recall : 0.6844
Confusion matrix:
 [[39034 17504]
 [ 1567  3398]]


## Light GBM - Avant optimisation

In [52]:
# Ratio of negative to positive training rows, used to weight the positive class.
train_target_counts = Counter(y_train['TARGET'])
scale_pos_weight = train_target_counts[0]/train_target_counts[1]
scale_pos_weight

11.38710976837865

In [53]:
# Baseline LightGBM: robust scaling followed by a booster whose positive class
# is weighted by the negative/positive ratio computed above.
pipe = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('model', LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight)),
])

default_lgbm_clf = pipe.fit(X_train, y_train)

In [54]:
# Report the baseline LightGBM on the training and validation splits.
print('Training results for Base LGBM',
      '-------------------------------', sep='\n')
evaluate_model(default_lgbm_clf, X_train, y_train)
print('===============================',
      'Validation results for Base LGBM',
      '-------------------------------', sep='\n')
evaluate_model(default_lgbm_clf, X_valid, y_valid)

Training results for Base LGBM
-------------------------------
Roc auc score : 0.8080
F2-score : 0.4665
Accuracy :0.7148
Precision :0.1859
Recall : 0.7494
Confusion matrix:
 [[160968  65180]
 [  4977  14883]]
Validation results for Base LGBM
-------------------------------
Roc auc score : 0.7649
F2-score : 0.4272
Accuracy :0.7053
Precision :0.1704
Recall : 0.6852
Confusion matrix:
 [[39979 16559]
 [ 1563  3402]]


## Optimisation de LGBM avec GridSearch

class lightgbm.LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100, subsample_for_bin=200000, objective=None, class_weight=None, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=None, importance_type='split', **kwargs)


 nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,

GridSearchCV(estimator: Any, param_grid: Any, *, scoring: Any | None = None, n_jobs: Any | None = None, refit: bool = True, cv: Any | None = None, verbose: int = 0, pre_dispatch: str = "2*n_jobs", error_score: float = np.nan, return_train_score: bool = False)

In [74]:
print("Starting LightGBM. Train shape: {}, Validation set shape: {}".format(
        X_train.shape, X_valid.shape))
print("Train counting: {}, Validation counting: {}".format(
        Counter(y_train['TARGET']), Counter(y_valid['TARGET'])))

# The 'classifier' step is only a placeholder: each grid below substitutes its
# own estimator through the "classifier" key.
classifier_pipe = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('classifier', LogisticRegression(class_weight='balanced')),
])

# Two sub-grids: 5 logistic-regression candidates and 3 * 4 * 3 = 36 LightGBM
# candidates.
classifier_param_grid = [
    {
        "classifier": [LogisticRegression(class_weight='balanced')],
        "classifier__C": [100, 10, 1, 0.1, 0.01],
    },
    {
        "classifier": [LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight)],
        "classifier__learning_rate": [0.02, 0.05, 0.1],
        "classifier__max_depth": [8, 10, 12, 24],
        "classifier__n_estimators": [100, 1000, 10000],
    },
]

folds = StratifiedKFold(5, shuffle=True, random_state=42)

# Exhaustive search scored with the F2 scorer.
grid_cv = GridSearchCV(classifier_pipe,
                    classifier_param_grid,
                    scoring= ftwo_scorer,
                    cv=folds,
                    n_jobs=1,
                    return_train_score=True,
                    verbose=10)

grid_cv.fit(X_train,y_train)

print(f"BEST SCORE: {grid_cv.best_score_}")
final_classifier_1 = grid_cv.best_estimator_
# grid_cv.score applies the search's F2 scorer, so this figure is comparable
# with BEST SCORE; the pipeline's own .score would report accuracy instead.
print(f"VALIDATION_SCORE: {grid_cv.score(X_valid, y_valid)}")
print(f"\n\nBEST CLASSIFIER: {final_classifier_1}")

print(grid_cv.best_params_)
# The refitted best pipeline can be saved and reused for predictions or scoring.
best_model = grid_cv.best_estimator_

filename = 'final_model_2.sav'
# Context manager so the file is flushed and closed once the dump completes.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

Starting LightGBM. Train shape: (246008, 100), Validation set shape: (61503, 100)
Train counting: Counter({0.0: 226148, 1.0: 19860}), Validation counting: Counter({0.0: 56538, 1.0: 4965})
Fitting 5 folds for each of 41 candidates, totalling 205 fits
[CV 1/5; 1/41] START classifier=LogisticRegression(class_weight='balanced'), classifier__C=100
[CV 1/5; 1/41] END classifier=LogisticRegression(class_weight='balanced'), classifier__C=100;, score=(train=0.421, test=0.410) total time=   6.5s
[CV 2/5; 1/41] START classifier=LogisticRegression(class_weight='balanced'), classifier__C=100
[CV 2/5; 1/41] END classifier=LogisticRegression(class_weight='balanced'), classifier__C=100;, score=(train=0.418, test=0.419) total time=   5.8s
[CV 3/5; 1/41] START classifier=LogisticRegression(class_weight='balanced'), classifier__C=100
[CV 3/5; 1/41] END classifier=LogisticRegression(class_weight='balanced'), classifier__C=100;, score=(train=0.420, test=0.416) total time=   5.7s
[CV 4/5; 1/41] START classi

KeyboardInterrupt: 

In [68]:
# Report the selected model on the training and validation splits.
print('Training results for best model',
      '-------------------------------', sep='\n')
evaluate_model(best_model, X_train, y_train)
print('===============================',
      'Validation results for best model',
      '-------------------------------', sep='\n')
evaluate_model(best_model, X_valid, y_valid)


Training results for best model
-------------------------------
Roc auc score : 0.7525
F2-score : 0.4187
Accuracy :0.6897
Precision :0.1631
Recall : 0.6885
Confusion matrix:
 [[155990  70158]
 [  6186  13674]]
Validation results for best model
-------------------------------
Roc auc score : 0.7516
F2-score : 0.4150
Accuracy :0.6897
Precision :0.1619
Recall : 0.6812
Confusion matrix:
 [[39036 17502]
 [ 1583  3382]]


## Recherche du seuil de probabilité permettant de maximiser le gain

Fonctions de calculs du gain et de représentation graphique de la courbe

In [None]:
# Apply a threshold to positive-class probabilities to create 0/1 labels.
# (Same definition as the `to_labels` given earlier in this notebook.)
def to_labels(pos_probs, threshold):
    """Return 1 where ``pos_probs >= threshold`` and 0 elsewhere, as integers."""
    above_threshold = pos_probs >= threshold
    return above_threshold.astype('int')

# Quantity to maximise.
def gain(y_true, y_pred):
    """Return the gain of a set of binary predictions.

    The gain is ``2 * TN - 10 * FN``: each true negative earns 2 and each
    false negative costs 10 (example weights).

    Parameters
    ----------
    y_true, y_pred
        True and predicted binary labels (0 or 1).
    """
    # labels=[0, 1] forces a 2x2 matrix even when only one class is present,
    # so the four-way unpacking below cannot fail on such inputs.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return 2 * tn - 10 * fn

def plot_gain_scores(threshold_array, gain_scores, precision_scores, recall_scores) :
    """Plot precision, recall and gain against the probability threshold.

    Precision and recall share the primary y-axis; gain is drawn on a
    secondary y-axis. The figure is displayed and nothing is returned.
    """
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # (trace name, y values, drawn on the secondary axis?)
    curves = (
        ("precision", precision_scores, False),
        ("recall", recall_scores, False),
        ("gain", gain_scores, True),
    )
    for curve_name, curve_values, on_secondary in curves:
        fig.add_trace(
            go.Scatter(x=threshold_array, y=curve_values, name=curve_name),
            secondary_y=on_secondary,
        )

    fig.update_layout(title_text="Gain versus précision/recall")
    fig.update_xaxes(title_text="Seuil de probabilité")
    fig.update_yaxes(title_text="scores", secondary_y=False)
    fig.update_yaxes(title_text="gain", secondary_y=True)

    fig.show()

In [None]:
# NOTE(review): the threshold is tuned on the full X, which includes the rows
# the model was trained on - confirm this is intended.
y_pred_proba = best_model.predict_proba(X)[::,1]
y_pred = best_model.predict(X)

threshold_array = np.linspace(0, 1, 100)

# Sweep the thresholds, building the labels once per threshold and scoring
# them with every metric.
gain_scores, precision_scores, recall_scores, accuracy_scores = [], [], [], []
for t in threshold_array:
    labels_t = to_labels(y_pred_proba, t)
    gain_scores.append(gain(y, labels_t))
    precision_scores.append(precision_score(y, labels_t))
    recall_scores.append(recall_score(y, labels_t))
    accuracy_scores.append(accuracy_score(y, labels_t))

# Best threshold is the one that maximises the gain.
maxgain_ix = argmax(gain_scores)
best_threshold = threshold_array[maxgain_ix]
max_gain = gain_scores[maxgain_ix]

print('Seuil=%.3f, gain maximum=%.5f' % (best_threshold, max_gain))

In [None]:
# Precision and recall on the primary axis, gain on the secondary axis.
plot_gain_scores(threshold_array, gain_scores, precision_scores, recall_scores)

On recalcule les scores avec le nouveau seuil de probabilité.

In [None]:
# NOTE(review): scored on the training split only; the validation split
# (X_valid, y_valid) is not evaluated with the new threshold here.
evaluate_model_with_threshold(best_model,X_train, y_train, best_threshold)

# Importance globale des features

In [None]:
# NOTE(review): the grid-search cell of this notebook writes 'final_model_2.sav';
# 'final_model.sav' is presumably produced elsewhere - confirm it is the
# intended model file.
filename = 'final_model.sav'
# pickle.load can execute arbitrary code: only load model files you produced
# yourself. The context manager closes the file once loading is done.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

Le modèle Light GBM permet de récupérer l'attribut feature_importances_

In [None]:

# One row per feature, sorted from the most to the least important.
# The importances are read from the pipeline step named 'lgbm'.
feature_importance_df = pd.DataFrame(
    {'importance': model['lgbm'].feature_importances_},
    index=features,
).sort_values(by='importance', ascending=False)

# Names of the 20 most important features, reused for the SHAP plot below.
most_important_features = feature_importance_df.nlargest(
    20, columns=['importance']).index.tolist()

Fonction de visualisation des features les plus influentes, à l'échelle globale

In [None]:
def show_global_importance(feature_importance_df, num_features):
    """Show a horizontal bar chart of the most important features.

    Parameters
    ----------
    feature_importance_df : pandas.DataFrame
        Frame indexed by feature name with an ``importance`` column.
    num_features : int
        Number of top features to display.

    The figure is displayed and nothing is returned.
    """
    top_features = feature_importance_df.nlargest(num_features, columns=['importance'])
    fig = px.bar(top_features, orientation='h')
    # With horizontal bars the importance values run along x and the feature
    # names (the frame's index) along y, so the titles are assigned accordingly.
    fig.update_xaxes(title='Importance')
    fig.update_yaxes(title='Feature')
    fig.update_traces(showlegend=False)
    fig.update_layout(
        title="Importance globale des features",
        font_size=11,
        height=800,
        width=600)
    fig.show()

In [None]:
# Display the 20 most important features.
show_global_importance(feature_importance_df, 20)

On peut aussi visualiser les influences locales respectives sur un sous-ensemble de données

In [None]:
# 1% class-stratified sample. The sampling helper defined in this notebook is
# `get_sample_for_testing`; no `get_small_sample_for_testing` is defined here.
small_sample = get_sample_for_testing(data,0.01)
small_sample.shape

In [None]:
y_small_sample = small_sample[['TARGET']]
X_small_sample = small_sample.drop(columns=['SK_ID_CURR','TARGET'])
# Take the feature names from the frame built in this cell rather than from
# X_sample, which belongs to an earlier section of the notebook.
features = X_small_sample.columns

Light GBM permet, grâce à un paramètre (pred_contrib), de calculer les valeurs SHAP de chaque feature, par individu.

In [None]:

# Per-row feature contributions from the model.
# NOTE(review): the output presumably has one extra trailing column (the
# expected value), which the slice below discards - confirm against the
# LightGBM documentation.
shap_values = model.predict(X_small_sample.values, pred_contrib=True)
shap_df = pd.DataFrame(shap_values[:, :len(features)], columns=features)
# Keep only the globally most important features for plotting.
shap_best_df = shap_df.loc[:, most_important_features]


Visualisation des valeurs shap par individu

In [None]:
def plot_bee_chart(shap_best_df) :
    """Scatter every SHAP value, one horizontal strip per feature.

    ``shap_best_df`` has one column per feature and one row per individual.
    It is reshaped to long format (columns ``features`` and ``shap_value``),
    then plotted with the SHAP value on x (also used for colour) and the
    feature name on y. The figure is displayed and nothing is returned.
    """
    long_df = shap_best_df.melt(value_vars=shap_best_df.columns)
    long_df = long_df.rename(columns={"variable": "features", "value": "shap_value"})

    fig = px.scatter(long_df, x="shap_value", y="features", color='shap_value')
    fig.update_traces(marker_size=3)
    fig.update_layout(
        title="Influences locales des features pour chaque point",
        font_size=11,
        height=800,
        width=800,
    )
    fig.show()

In [None]:
# SHAP values of the most important features, one point per individual and feature.
plot_bee_chart(shap_best_df) 