Chargement des librairies

In [1]:
import pandas as pd
import numpy as np
from numpy import arange, argmin, argmax
import pandas_flavor as pf
import tqdm
import re
from random import random
from time import time
from prettytable import PrettyTable

import pickle
from collections import Counter
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, confusion_matrix

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier




import plotly.graph_objects as go
from plotly.subplots import make_subplots

import plotly.express as px

import gc
import warnings

warnings.filterwarnings("ignore")


Chargement des données

In [2]:
data = pd.read_pickle('../../gen_data/data_to_train.pkl')
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))


In [3]:
@pf.register_dataframe_method
def add_row(df, row):
    df.loc[len(df)] = row

Fonction d'initialisation des tables pour le stockage des résultats

In [4]:
def init_tables():
    table_scores = PrettyTable()
    scores_df = pd.DataFrame([], columns=["model name", "step", "time",
                             "roc AUC score", "accuracy", "F2-score", "precision", "recall"])
    table_scores.field_names = ["model name", "step", "time",
                                "roc AUC score", "accuracy",  "F2-score", "precision", "recall"]
    return scores_df, table_scores


Fonction pour extraire un sample équilibré des données

In [5]:
def get_sample_for_testing(data,ratio):
    data_0 = data[data.TARGET == 0]
    data_1 = data[data.TARGET == 1]
    data_0 = data_0.sample(int(round(len(data_0)*ratio, 0)))
    data_1 = data_1.sample(int(round(len(data_1)*ratio, 0)))
    data = data_1.append(data_0)
    del data_0, data_1
    gc.collect()
    return data

# Modélisation et optimisation

In [6]:

def evaluate_and_log(model_name, model_pipeline,step, time, x_test, y_test, scores_df, table_scores):
    test_pred = model_pipeline.predict(x_test)
    test_pred_proba = model_pipeline.predict_proba(x_test)

    auc_score = roc_auc_score(y_test, test_pred_proba[:, 1])
    accuracy = accuracy_score(y_test, test_pred)
    F2_score = fbeta_score(y_test, test_pred, beta=2)
    precision = precision_score(y_test, test_pred)
    recall = recall_score(y_test, test_pred)

    scores_df.add_row([model_name, step, time, auc_score,accuracy, F2_score, precision, recall])
    table_scores.add_row([model_name, step, time, auc_score,accuracy, F2_score, precision, recall])
    #print('Confusion matrix:\n', confusion_matrix(y_test, test_pred))
    return scores_df, table_scores

In [7]:
def evaluate_model(model_pipeline, x_test, y_test):
    # prediction
    test_pred = model_pipeline.predict(x_test)
    test_pred_proba = model_pipeline.predict_proba(x_test)
    print('Roc auc score : {:.4f}'.format(
        roc_auc_score(y_test, test_pred_proba[:, 1])))
    print('F2-score : {:.4f}'.format(fbeta_score(y_test, test_pred, beta=2)))
    print('Accuracy :{:.4f}'.format(accuracy_score(y_test, test_pred)))
    print('Precision :{:.4f}'.format(precision_score(y_test, test_pred)))
    print('Recall : {:.4f}'.format( recall_score(y_test, test_pred)))
    print('Confusion matrix:\n', confusion_matrix(y_test, test_pred))

In [8]:
def to_labels(pos_probs, threshold):
 return (pos_probs >= threshold).astype('int')

def evaluate_model_with_threshold(model_pipeline,x_test, y_test, threshold):
    test_pred_proba = model_pipeline.predict_proba(x_test)
    test_pred_th = to_labels(test_pred_proba, threshold)[::,1]
    print('Roc auc score : {:.4f}'.format(roc_auc_score(y_test, test_pred_th)))
    print('F2-score : {:.4f}'.format(fbeta_score(y_test, test_pred_th, beta=2)))
    print('Accuracy :{:.4f}'.format(accuracy_score(y_test, test_pred_th)))
    print('Precision :{:.4f}'.format(precision_score(y_test, test_pred_th)))
    print('Recall : {:.4f}'.format( recall_score(y_test, test_pred_th)))
    print('Confusion matrix:\n', confusion_matrix(y_test, test_pred_th))

Scorer utilisé : fbeta_score avec beta = 2 pour donner plus de poids à la classe positive qui est minoritaire. 

In [9]:
ftwo_scorer = make_scorer(fbeta_score, beta=2)

## Tests de plusieurs algorithmes sur un sous-ensemble des données 

On prend la moitié des données uniquement pour accélérer l'analyse

In [10]:
data_sample = get_sample_for_testing(data, 0.5)
y_sample = data_sample[['TARGET']]
X_sample = data_sample.drop(columns=['SK_ID_CURR','TARGET'])
features = X_sample.columns

### Dummy Classifier

In [11]:
X_sample_train, X_sample_valid, y_sample_train, y_sample_valid = train_test_split(
    X_sample, y_sample, test_size=0.20, random_state=42, stratify=y_sample)

In [12]:
dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(X_sample_train,y_sample_train)
y_dummy_pred = dummy_clf.predict(X_sample_valid)

print('Training Results for Dummy Classification ')
print('-------------------------------')
evaluate_model(dummy_clf,X_sample_train, y_sample_train)
print('===============================')
print('Validation Results for Dummy Classification ')
print('-------------------------------')
evaluate_model(dummy_clf,X_sample_valid, y_sample_valid)

Training Results for Dummy Classification 
-------------------------------
Roc auc score : 0.4999
F2-score : 0.0812
Accuracy :0.8528
Precision :0.0822
Recall : 0.0810
Confusion matrix:
 [[104093   8981]
 [  9126    804]]
Validation Results for Dummy Classification 
-------------------------------
Roc auc score : 0.5003
F2-score : 0.0757
Accuracy :0.8527
Precision :0.0772
Recall : 0.0753
Confusion matrix:
 [[26033  2236]
 [ 2295   187]]


### Entraînement et évaluation plusieurs classifieurs binaires

Fonction pour comparer quelques modèles, sans optimisation, afin de choisir le meilleur modèle à optimiser. 

In [13]:
def train_and_evaluate(X, y, with_smote=False, worf = False):
    scores_df, table_scores = init_tables()
    scale_pos_weight = Counter(y['TARGET'])[0]/Counter(y['TARGET'])[1]
    
    classifiers = [
            ('Logistic Regression', LogisticRegression(class_weight='balanced')),
            ('RandomForest', RandomForestClassifier(class_weight='balanced')),
            ('XGBoost', XGBClassifier(scale_pos_weight=scale_pos_weight)),
            ('Light GBM', LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight))
            ]
    if with_smote & worf : 
        classifiers = [
            ('Logistic Regression', LogisticRegression()),
            ('XGBoost', XGBClassifier()),
            ('Light GBM', LGBMClassifier(objective='binary'))
            ]
    if ((with_smote == False) and (worf == True)) : 
        classifiers =  [
            ('Logistic Regression', LogisticRegression(class_weight='balanced')),
            ('XGBoost', XGBClassifier(scale_pos_weight=scale_pos_weight)),
            ('Light GBM', LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight))
            ]

    skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)

    for clf_name, clf in tqdm.tqdm(classifiers):
        print(clf_name)
        print('===============================')
        # Entraîner le classifieur sur les données d'entraînement
        pipeline = Pipeline(steps=[
            ('scaler', RobustScaler()),
            ('classifier', clf)
        ]
        )

        for i, (train_index, test_index) in enumerate(skfolds.split(X, y)):
            start = time()
            X_train = X.iloc[train_index]
            y_train = y.iloc[train_index]
            X_test = X.iloc[test_index]
            y_test = y.iloc[test_index]
            if with_smote:
                over_only = SMOTE()
                print('Before sampling')
                print(Counter(y_train['TARGET']))

                # transform the dataset

                X_train_re, y_train_re = over_only.fit_resample(
                    X_train, y_train)
                print('After sampling')
                print(Counter(y_train_re['TARGET']))
                curr_clf = pipeline.fit(X_train_re, y_train_re)
            else:
                curr_clf = pipeline.fit(X_train, y_train)

            duration = time()-start
    
            print(clf_name + ' -- fold n°' + str(i))
            print('-------------------------------')
            scores_df, table_scores = evaluate_and_log(
                clf_name, curr_clf, 'train', duration, X_train, y_train, scores_df, table_scores)
            scores_df, table_scores = evaluate_and_log(
                clf_name, curr_clf, 'test', duration, X_test, y_test, scores_df, table_scores)
  
        print('===============================')
       
    return scores_df, table_scores


Le modèle Random Forest est très long à s'exécuter et ses résultats sur l'apprentissage montre une grosse tendance à l'overfitting avec ses paramètres par défaut. Aussi, je l'exclue de la recherche.

In [14]:
scores_df, table_scores = train_and_evaluate(X_sample, y_sample, with_smote=True, worf=True)

  0%|          | 0/3 [00:00<?, ?it/s]

Logistic Regression
Before sampling
Counter({0.0: 113075, 1.0: 9929})
After sampling
Counter({1.0: 113075, 0.0: 113075})
Logistic Regression -- fold n°0
-------------------------------
Before sampling
Counter({0.0: 113075, 1.0: 9929})
After sampling
Counter({1.0: 113075, 0.0: 113075})
Logistic Regression -- fold n°1
-------------------------------
Before sampling
Counter({0.0: 113074, 1.0: 9930})
After sampling
Counter({1.0: 113074, 0.0: 113074})
Logistic Regression -- fold n°2
-------------------------------
Before sampling
Counter({0.0: 113074, 1.0: 9930})
After sampling
Counter({1.0: 113074, 0.0: 113074})
Logistic Regression -- fold n°3
-------------------------------
Before sampling
Counter({0.0: 113074, 1.0: 9930})
After sampling
Counter({1.0: 113074, 0.0: 113074})
Logistic Regression -- fold n°4
-------------------------------


 33%|███▎      | 1/3 [00:51<01:42, 51.04s/it]

XGBoost
Before sampling
Counter({0.0: 113075, 1.0: 9929})
After sampling
Counter({1.0: 113075, 0.0: 113075})
XGBoost -- fold n°0
-------------------------------
Before sampling
Counter({0.0: 113075, 1.0: 9929})
After sampling
Counter({1.0: 113075, 0.0: 113075})
XGBoost -- fold n°1
-------------------------------
Before sampling
Counter({0.0: 113074, 1.0: 9930})
After sampling
Counter({1.0: 113074, 0.0: 113074})
XGBoost -- fold n°2
-------------------------------
Before sampling
Counter({0.0: 113074, 1.0: 9930})
After sampling
Counter({1.0: 113074, 0.0: 113074})
XGBoost -- fold n°3
-------------------------------
Before sampling
Counter({0.0: 113074, 1.0: 9930})
After sampling
Counter({1.0: 113074, 0.0: 113074})
XGBoost -- fold n°4
-------------------------------


 67%|██████▋   | 2/3 [04:59<02:47, 167.05s/it]

Light GBM
Before sampling
Counter({0.0: 113075, 1.0: 9929})
After sampling
Counter({1.0: 113075, 0.0: 113075})
Light GBM -- fold n°0
-------------------------------
Before sampling
Counter({0.0: 113075, 1.0: 9929})
After sampling
Counter({1.0: 113075, 0.0: 113075})
Light GBM -- fold n°1
-------------------------------
Before sampling
Counter({0.0: 113074, 1.0: 9930})
After sampling
Counter({1.0: 113074, 0.0: 113074})
Light GBM -- fold n°2
-------------------------------
Before sampling
Counter({0.0: 113074, 1.0: 9930})
After sampling
Counter({1.0: 113074, 0.0: 113074})
Light GBM -- fold n°3
-------------------------------
Before sampling
Counter({0.0: 113074, 1.0: 9930})
After sampling
Counter({1.0: 113074, 0.0: 113074})
Light GBM -- fold n°4
-------------------------------


100%|██████████| 3/3 [05:58<00:00, 119.44s/it]






In [15]:
mean_scores_df = scores_df.groupby(by=['model name', 'step']).agg(
    {'time': sum, 'roc AUC score': 'mean', 'accuracy': 'mean', 'F2-score': 'mean', 'precision': 'mean', 'recall': 'mean'})

mean_scores_df.to_pickle('../../gen_data/half_data_class_with_smote_scores_df.pkl')
mean_scores_df

Unnamed: 0_level_0,Unnamed: 1_level_0,time,roc AUC score,accuracy,F2-score,precision,recall
model name,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Light GBM,test,49.479225,0.750906,0.918903,0.031383,0.461949,0.02546
Light GBM,train,49.479225,0.79424,0.920253,0.042961,0.605224,0.034865
Logistic Regression,test,45.684061,0.746111,0.689825,0.410694,0.160659,0.672414
Logistic Regression,train,45.684061,0.748884,0.691122,0.413501,0.161896,0.676362
XGBoost,test,239.488193,0.738615,0.917414,0.063054,0.410094,0.052046
XGBoost,train,239.488193,0.885532,0.929487,0.174873,0.883814,0.145666


In [16]:
std_scores_df = scores_df.groupby(by=['model name', 'step']).agg(
    {'time': sum, 'roc AUC score': 'std', 'accuracy': 'std', 'F2-score': 'std', 'precision': 'std', 'recall': 'std'})

std_scores_df.to_pickle('../../gen_data/half_data_class_with_smote_std_scores_df.pkl')
std_scores_df

Unnamed: 0_level_0,Unnamed: 1_level_0,time,roc AUC score,accuracy,F2-score,precision,recall
model name,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Light GBM,test,49.479225,0.003405,0.00033,0.003588,0.032819,0.002956
Light GBM,train,49.479225,0.001172,7.3e-05,0.0028,0.003446,0.002304
Logistic Regression,test,45.684061,0.005647,0.008109,0.006112,0.003933,0.007364
Logistic Regression,train,45.684061,0.002662,0.006779,0.003888,0.002992,0.004636
XGBoost,test,239.488193,0.004237,0.000479,0.001138,0.019491,0.000958
XGBoost,train,239.488193,0.003427,0.000349,0.004555,0.009786,0.003932


In [17]:
scores_df, table_scores = train_and_evaluate(X_sample, y_sample, with_smote=False, worf=True)

  0%|          | 0/3 [00:00<?, ?it/s]

Logistic Regression
Logistic Regression -- fold n°0
-------------------------------
Logistic Regression -- fold n°1
-------------------------------
Logistic Regression -- fold n°2
-------------------------------
Logistic Regression -- fold n°3
-------------------------------
Logistic Regression -- fold n°4
-------------------------------


 33%|███▎      | 1/3 [00:24<00:48, 24.32s/it]

XGBoost
XGBoost -- fold n°0
-------------------------------
XGBoost -- fold n°1
-------------------------------
XGBoost -- fold n°2
-------------------------------
XGBoost -- fold n°3
-------------------------------
XGBoost -- fold n°4
-------------------------------


 67%|██████▋   | 2/3 [02:09<01:11, 71.94s/it]

Light GBM
Light GBM -- fold n°0
-------------------------------
Light GBM -- fold n°1
-------------------------------
Light GBM -- fold n°2
-------------------------------
Light GBM -- fold n°3
-------------------------------
Light GBM -- fold n°4
-------------------------------


100%|██████████| 3/3 [02:37<00:00, 52.42s/it]






In [18]:
mean_scores_df = scores_df.groupby(by=['model name', 'step']).agg(
    {'time': sum, 'roc AUC score': 'mean', 'accuracy': 'mean', 'F2-score': 'mean', 'precision': 'mean', 'recall': 'mean'})

mean_scores_df.to_pickle('../../gen_data/half_data_class_without_smote_scores_df.pkl')
mean_scores_df

Unnamed: 0_level_0,Unnamed: 1_level_0,time,roc AUC score,accuracy,F2-score,precision,recall
model name,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Light GBM,test,18.99457,0.758914,0.720165,0.422262,0.173809,0.657107
Light GBM,train,18.99457,0.842296,0.739516,0.502642,0.206522,0.783496
Logistic Regression,test,18.758992,0.751658,0.69069,0.416691,0.162751,0.68321
Logistic Regression,train,18.758992,0.754639,0.691218,0.419551,0.163798,0.688185
XGBoost,test,96.899516,0.736341,0.771981,0.392226,0.186352,0.541977
XGBoost,train,96.899516,0.924669,0.823102,0.633461,0.298332,0.880862


In [19]:
std_scores_df = scores_df.groupby(by=['model name', 'step']).agg(
    {'time': sum, 'roc AUC score': 'std', 'accuracy': 'std', 'F2-score': 'std', 'precision': 'std', 'recall': 'std'})

std_scores_df.to_pickle('../../gen_data/half_data_class_without_smote_std_scores_df.pkl')
std_scores_df

Unnamed: 0_level_0,Unnamed: 1_level_0,time,roc AUC score,accuracy,F2-score,precision,recall
model name,step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Light GBM,test,18.99457,0.003099,0.002719,0.005004,0.002032,0.009322
Light GBM,train,18.99457,0.000481,0.000198,0.000855,0.00033,0.001435
Logistic Regression,test,18.758992,0.002601,0.00286,0.004076,0.00174,0.007598
Logistic Regression,train,18.758992,0.00054,0.000783,0.000654,0.000285,0.001726
XGBoost,test,96.899516,0.004019,0.003737,0.006266,0.002647,0.012021
XGBoost,train,96.899516,0.00432,0.003728,0.009022,0.005876,0.009281


Je vais maintenant comparer les résultats obtenus sur l'apprentissage avec l'ensemble complet, pour la régression logistique et Light GBM.

In [20]:
#identifiers = data[['SK_ID_CURR']]
y = data[['TARGET']]
X = data.drop(columns=['SK_ID_CURR','TARGET'])
features = X.columns

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=100, stratify = y)

Régression Logistique

In [21]:
pipe1 = Pipeline([
                 ('scaler', RobustScaler()),
                 ('model', LogisticRegression(class_weight='balanced'))])
               
default_logreg_clf = pipe1.fit(X_train, y_train)

In [22]:
print('Training results for Base Log Reg')
print('-------------------------------')
evaluate_model(default_logreg_clf, X_train, y_train)
print('===============================')
print('Validation results for Base Log Reg')
print('-------------------------------')
evaluate_model(default_logreg_clf, X_valid, y_valid)

Training results for Base Log Reg
-------------------------------
Roc auc score : 0.7534
F2-score : 0.4191
Accuracy :0.6902
Precision :0.1634
Recall : 0.6886
Confusion matrix:
 [[156116  70032]
 [  6185  13675]]
Validation results for Base Log Reg
-------------------------------
Roc auc score : 0.7506
F2-score : 0.4147
Accuracy :0.6896
Precision :0.1618
Recall : 0.6808
Confusion matrix:
 [[39030 17508]
 [ 1585  3380]]


## Light GBM - Avant optimisation

In [23]:
scale_pos_weight = Counter(y_train['TARGET'])[0]/Counter(y_train['TARGET'])[1]
scale_pos_weight

11.38710976837865

In [24]:
pipe = Pipeline([
                 ('scaler', RobustScaler()),
                 ('model', LGBMClassifier(objective='binary', scale_pos_weight = scale_pos_weight))])

default_lgbm_clf = pipe.fit(X_train, y_train)

In [25]:
print('Training results for Base LGBM')
print('-------------------------------')
evaluate_model(default_lgbm_clf, X_train, y_train)
print('===============================')
print('Validation results for Base LGBM')
print('-------------------------------')
evaluate_model(default_lgbm_clf, X_valid, y_valid)

Training results for Base LGBM
-------------------------------
Roc auc score : 0.8085
F2-score : 0.4673
Accuracy :0.7162
Precision :0.1866
Recall : 0.7490
Confusion matrix:
 [[161310  64838]
 [  4985  14875]]
Validation results for Base LGBM
-------------------------------
Roc auc score : 0.7612
F2-score : 0.4252
Accuracy :0.7054
Precision :0.1699
Recall : 0.6814
Confusion matrix:
 [[40004 16534]
 [ 1582  3383]]


## Optimisation de LGBM avec GridSearch

class lightgbm.LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100, subsample_for_bin=200000, objective=None, class_weight=None, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=None, importance_type='split', **kwargs)

GridSearchCV(estimator: Any, param_grid: Any, *, scoring: Any | None = None, n_jobs: Any | None = None, refit: bool = True, cv: Any | None = None, verbose: int = 0, pre_dispatch: str = "2*n_jobs", error_score: float = np.nan, return_train_score: bool = False)

In [35]:
print("Starting LightGBM. Train shape: {}, Validation set shape: {}".format(
        X_train.shape, X_valid.shape))
print("Train counting: {}, Validation counting: {}".format(
        Counter(y_train['TARGET']), Counter(y_valid['TARGET'])))

classifier_pipe = Pipeline(steps=(['scaler', RobustScaler()],
                                ['lgbm', LGBMClassifier(objective='binary', scale_pos_weight=scale_pos_weight)]))

search_params = {
   # 'lgbm__learning_rate': [0.1, 0.05],
   # 'lgbm__num_leaves': [31, 63, 127],
   # 'lgbm__n_estimators': [200, 300, 500],
   # 'lgbm__subsample': [0.5, 0.8, 1.0],
   # 'lgbm__colsample_bytree': [0.5, 0.8, 1.0],
    'lgbm__reg_alpha': [0.1, 0.5, 1.0],
    'lgbm__reg_lambda': [0.1, 0.5, 1.0]
}

fixed_params = {
    'lgbm__learning_rate': [0.1],
    'lgbm__num_leaves': [31],
    'lgbm__n_estimators': [300],
    'lgbm__subsample': [0.5],
    'lgbm__colsample_bytree': [0.5],
    #'lgbm__reg_alpha': [0.1, 0.5, 1.0],
    #'lgbm__reg_lambda': [0.1, 0.5, 1.0]
}

param_grid = {**search_params, **fixed_params}

folds = StratifiedKFold(5, shuffle=True, random_state=42)
#Grid search
grid_cv = GridSearchCV(classifier_pipe,
                    param_grid,
                    scoring= ftwo_scorer,
                    cv=folds,
                    n_jobs=1,
                    return_train_score=True,
                    verbose=10
                    )

grid_cv.fit(X_train,y_train)

print(f"BEST SCORE: {grid_cv.best_score_}")
best_model = grid_cv.best_estimator_
print(grid_cv.best_params_)
    # model can be saved, used for predictions or scoring
best_model = grid_cv.best_estimator_

filename = '../../gen_data/final_model.sav'
pickle.dump(best_model, open(filename, 'wb'))

Starting LightGBM. Train shape: (246008, 100), Validation set shape: (61503, 100)
Train counting: Counter({0.0: 226148, 1.0: 19860}), Validation counting: Counter({0.0: 56538, 1.0: 4965})
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5; 1/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.1, lgbm__n_estimators=300, lgbm__num_leaves=31, lgbm__reg_alpha=0.1, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 1/5; 1/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.1, lgbm__n_estimators=300, lgbm__num_leaves=31, lgbm__reg_alpha=0.1, lgbm__reg_lambda=0.1, lgbm__subsample=0.5;, score=(train=0.548, test=0.415) total time=  12.1s
[CV 2/5; 1/9] START lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.1, lgbm__n_estimators=300, lgbm__num_leaves=31, lgbm__reg_alpha=0.1, lgbm__reg_lambda=0.1, lgbm__subsample=0.5
[CV 2/5; 1/9] END lgbm__colsample_bytree=0.5, lgbm__learning_rate=0.1, lgbm__n_estimators=300, lgbm__num_leaves=31, lgbm__reg_alpha=0.1, lgbm__reg_lambda=0.1,

KeyboardInterrupt: 

In [36]:
print('Training results for best model')
print('-------------------------------')
evaluate_model(best_model, X_train, y_train)
print('===============================')
print('Validation results for best model')
print('-------------------------------')
evaluate_model(best_model, X_valid, y_valid)


Training results for best model
-------------------------------
Roc auc score : 0.8592
F2-score : 0.5271
Accuracy :0.7578
Precision :0.2225
Recall : 0.8014
Confusion matrix:
 [[170519  55629]
 [  3944  15916]]
Validation results for best model
-------------------------------
Roc auc score : 0.7591
F2-score : 0.4214
Accuracy :0.7326
Precision :0.1781
Recall : 0.6399
Confusion matrix:
 [[41880 14658]
 [ 1788  3177]]


## Recherche du seuil de probabilité permettant de maximiser le gain

Fonctions de calculs du gain et de représentation graphique de la courbe

In [37]:
# apply threshold to positive probabilities to create labels
def to_labels(pos_probs, threshold):
 return (pos_probs >= threshold).astype('int')

# à maximiser
def gain(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # Par exemple
    gain =  2* tn - 10*fn
    return gain

def plot_gain_scores(threshold_array, gain_scores, accuracy_scores, recall_scores) :
    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(
        go.Scatter(x=threshold_array, y=accuracy_scores, name="accuracy"),
        secondary_y=False,
    )

    fig.add_trace(
        go.Scatter(x=threshold_array, y=recall_scores, name="recall"),
        secondary_y=False,
    )

    fig.add_trace(
        go.Scatter(x=threshold_array, y=gain_scores, name="gain"),
        secondary_y=True,
    )

    fig.update_layout(
        title_text="Gain versus accuracy et recall"
    )

    # Set x-axis title
    fig.update_xaxes(title_text="Seuil de probabilité")

    # Set y-axes titles
    fig.update_yaxes(title_text="scores", secondary_y=False)
    fig.update_yaxes(title_text="gain", secondary_y=True)

    fig.show()

In [38]:
y_pred_proba = best_model.predict_proba(X)[::,1]
y_pred = best_model.predict(X)

threshold_array = np.linspace(0, 1, 100)

gain_scores = [gain(y, to_labels(y_pred_proba, t)) for t in threshold_array]
recall_scores = [recall_score(y, to_labels(y_pred_proba, t)) for t in threshold_array]
accuracy_scores = [accuracy_score(y, to_labels(y_pred_proba, t)) for t in threshold_array]

# récupération du meilleur seuil (maximisation du gain)

maxgain_ix = argmax(gain_scores)
best_threshold = threshold_array[maxgain_ix]
max_gain = gain_scores[maxgain_ix]

print('Seuil=%.3f, gain maximum=%.5f' % (best_threshold, max_gain))

Seuil=0.626, gain maximum=388420.00000


In [39]:
plot_gain_scores(threshold_array, gain_scores, accuracy_scores,recall_scores)

On recalcule les scores avec le nouveau seuil de probabilité.

In [None]:
evaluate_model_with_threshold(best_model,X_train, y_train, best_threshold)

# Importance globale des features

In [None]:
filename = 'final_model.sav'
model = pickle.load(open(filename, 'rb'))

Le modèle Light GBM permet de récupérer l'attribut feature_importances_

In [None]:

feature_importance_df = pd.DataFrame()
feature_importance_df['importance'] = model['lgbm'].feature_importances_
feature_importance_df.index = features
feature_importance_df = feature_importance_df.sort_values(
    by='importance', ascending=False)

most_important_features = list(feature_importance_df.nlargest(20, columns=['importance']).index)

Fonction de visualisation des features les plus influentes, à l'échelle globale

In [None]:
def show_global_importance(feature_importance_df, num_features):
    df = feature_importance_df.nlargest(num_features, columns=['importance'])
    fig = px.bar(df, orientation='h')
    fig.update_yaxes(title='Importance')
    fig.update_xaxes(title='Feature')
    fig.update_traces(showlegend=False)
    fig.update_layout(
    title="Importance globale des features",
    font_size=11,
    height=800,
    width=600)
    fig.show()

In [None]:
show_global_importance(feature_importance_df, 20)

On peut aussi visualiser les influences locales respectives sur un sous-ensemble de données

In [None]:
small_sample = get_small_sample_for_testing(data,0.01)
small_sample.shape

In [None]:
y_small_sample = small_sample[['TARGET']]
X_small_sample = small_sample.drop(columns=['SK_ID_CURR','TARGET'])
features = X_sample.columns

Light GBM permet grâce à un paramètre (pred_contrib) de calculer les valeurs SHAP de chaque features, par individu. 

In [None]:

shap_values= model.predict(X_small_sample.values,pred_contrib=True)
shap_df = pd.DataFrame(shap_values[:,0:len(features)], columns=features)
shap_best_df = shap_df[most_important_features]


Visualisation des valeurs shap par individu

In [None]:
def plot_bee_chart(shap_best_df) :

    df = pd.melt(shap_best_df, value_vars=shap_best_df.columns).rename(columns={
        "variable": "features",
        "value": "shap_value"
    })
    fig = px.scatter(df, y="features", x="shap_value", color='shap_value')
    fig.update_traces(marker_size=3)
    fig.update_layout(
        title="Influences locales des features pour chaque point",
        font_size=11,
        height=800,
        width=800)
    fig.show()

In [None]:
plot_bee_chart(shap_best_df) 