## Interpretability 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
from sklearn.model_selection import GridSearchCV
import pickle 

import lightgbm as lgbm

# Root of the project checkout. The default below is Eva's machine; set the
# DEFI_IA_PATH environment variable to point elsewhere without editing the notebook.
#Path Eva : 'C:/Users/evaet/Documents/5A/defi_IA/' 
#Path Julie : '/home/julie/Documents/cours/5A/IAF/defi_IA'
PATH_PROJECT = os.environ.get('DEFI_IA_PATH', 'C:/Users/evaet/Documents/5A/defi_IA/')
PATH_IMAGE = os.path.join(PATH_PROJECT,'images')
PATH_UTILITIES = os.path.join(PATH_PROJECT,'code/utilities')

# Move into the utilities folder before the project-local imports that follow
# (presumably so those modules are found from the working directory — confirm).
# Raises FileNotFoundError if PATH_UTILITIES does not exist.
os.chdir(PATH_UTILITIES)

import data_loading as DL
import data_preparation_for_models as DP
import predictions_analysis as PA
from download_prediction import download_pred_Xtest

In [8]:
import eli5
from eli5.sklearn import PermutationImportance
import seaborn as sns

In [2]:
# LightGBM model
def Model_LGBM(X_train,Y_train,param_opt):
    """Fit a LightGBM regressor and print how long the fit took.

    Parameters
    ----------
    X_train
        Training features, passed unchanged to ``LGBMRegressor.fit``.
    Y_train
        Training targets, passed unchanged to ``LGBMRegressor.fit``.
    param_opt : dict
        Keyword arguments forwarded to the ``lgbm.LGBMRegressor`` constructor.

    Returns
    -------
    The fitted ``lgbm.LGBMRegressor`` instance.
    """
    # The timer starts before the estimator is built, so construction is included.
    started_at = time.perf_counter()
    model = lgbm.LGBMRegressor(**param_opt)
    model.fit(X_train, Y_train)
    elapsed = time.perf_counter() - started_at
    print("Temps execution en sec :", elapsed)
    return model

# Hyper-parameters forwarded to lgbm.LGBMRegressor through Model_LGBM.
# NOTE(review): presumably the result of an earlier hyper-parameter search — confirm.
param_opt = dict(
    n_estimators=1967,
    learning_rate=0.28964787379300905,
    num_leaves=540,
    max_depth=12,
    min_data_in_leaf=200,
    max_bin=243,
    lambda_l1=0,
    lambda_l2=30,
)

# Load the data set, then build the train / validation / test inputs with the
# project helpers.
data, Y, var_quant, var_quali, var_quali_to_encode = DL.main_load_data()
(
    X_train,
    X_vali,
    X_train_renorm,
    Y_train,
    X_vali_renorm,
    Y_vali,
    X_test_renorm,
) = DP.main_prepare_train_vali_data(
    data, Y, var_quant, var_quali, var_quali_to_encode
)

# Fit on the "_renorm" training features returned by the preparation helper.
lgbmOpt = Model_LGBM(X_train_renorm, Y_train, param_opt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_quant["stock_mod"]=X_quant["stock"].map(lambda x: sqrt(x))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_quant["stock_mod"]=X_quant["stock"].map(lambda x: sqrt(x))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas

Temps execution en sec : 200.0403675


In [3]:
# Score of the fitted model on the validation split (for scikit-learn style
# regressors, `score` is the R² coefficient).
score_vali = lgbmOpt.score(X_vali_renorm, Y_vali)
print("Score lgbm :", score_vali)

Score lgbm : 0.9746958551620154


## Model agnostic methods 

### Feature importance 

In [15]:
# Label under which this model's results are stored.
name = "lgbm"
# Top-ranked features per model, keyed by the model label above.
features_importance_dict = dict()
# Column labels of the validation features; reused by the explainers further down.
feature_names = X_vali_renorm.columns

In [16]:
# Permutation importance of the already-fitted model, measured on the validation split.
# random_state pins the shuffles so the ranking is the same on every run.
# NOTE: this re-scores the model once per feature and per iteration, so it is slow
# on the full validation set.
permutation_import = PermutationImportance(lgbmOpt, random_state=42).fit(X_vali_renorm,Y_vali)

KeyboardInterrupt: 

In [None]:
# Rank every feature by permutation importance, highest first.
features_importance = pd.DataFrame(
    {'Feature_name': feature_names, 'Importance': permutation_import.feature_importances_}
).sort_values(by=['Importance'], ascending=False)

# Keep the five best-ranked features for this model.
features_importance_dict[name] = features_importance.iloc[:5]

# Plot from the same top-5 table that supplies the columns, on an explicit axes.
fig, ax = plt.subplots()
sns.barplot(data=features_importance_dict[name], x="Feature_name", y="Importance", ax=ax)
ax.set_title(f"Top 5 features by permutation importance ({name})")
ax.tick_params(axis="x", rotation=45)
plt.show()

### PDP and ICE plots

We apply this method only to the most important features identified in the previous section, "Feature importance".

In [None]:
!pip install pdpbox > /dev/null 2>&1

In [23]:
from pdpbox import pdp, get_dataset, info_plots

# Feature whose partial dependence is examined.
feature = "mobile"

# Partial dependence of the model's prediction on `feature`, computed on the
# validation split.
pdp_feat = pdp.pdp_isolate(
    model=lgbmOpt,
    dataset=X_vali_renorm,
    model_features=feature_names,
    feature=feature,
)

# plot_lines=True overlays the individual (ICE) lines on the average curve;
# frac_to_plot=0.5 draws only half of them.
pdp.pdp_plot(pdp_feat, feature, plot_lines=True, frac_to_plot=0.5)
plt.show()

ModuleNotFoundError: No module named 'pdpbox'

The flatter the curve, the less influence the feature has on the model's predictions.
It is also possible to visualize the combined effect of two features.

In [None]:
# An interaction plot needs two different features: "mobile" is paired with the
# best-ranked other feature of the permutation-importance table
# (`features_importance`, built in the "Feature importance" section).
first_feature = "mobile"
second_feature = next(
    str(candidate)
    for candidate in features_importance["Feature_name"]
    if candidate != first_feature
)
features_to_plot = [first_feature, second_feature]

# Joint partial dependence of the prediction on the two features.
inter1 = pdp.pdp_interact(
    model=lgbmOpt,
    dataset=X_vali_renorm,
    model_features=feature_names,
    features=features_to_plot,
)
pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=features_to_plot, plot_type='contour')
plt.show()

### SHAP 

In [None]:
!pip install shap > /dev/null 2>&1

In [None]:
import shap
shap.initjs() 

# NOTE(review): not used in this cell; presumably the row meant for a
# single-example plot — confirm or remove.
idx = 1

# Number of validation rows to explain. KernelExplainer is model-agnostic and
# very slow, so SHAP values are computed for a small sample only.
N_SHAP_SAMPLES = 20
X_vali_shap = X_vali_renorm.iloc[:N_SHAP_SAMPLES, :]

# VERY SLOW.
# Summarise the training set with 10 k-means centroids and use that summary as the
# background data: passing the full training set would make every explanation
# evaluate the model on all training rows.
X_train_summary = shap.kmeans(X_train_renorm,10)
explainer = shap.KernelExplainer(lgbmOpt.predict, X_train_summary)
shap_values = explainer.shap_values(X_vali_shap)

# Summary plot over the explained rows.
shap.summary_plot(shap_values, X_vali_shap)

### LIME 

- https://algotech.netlify.app/blog/interpreting-black-box-regression-model-with-lime/
- https://marcotcr.github.io/lime/tutorials/Using%2Blime%2Bfor%2Bregression.html

In [None]:
!pip install lime > /dev/null 2>&1

In [None]:
import lime
import lime.lime_tabular

# Position, within X_vali_renorm, of the validation row to explain.
index = 0

# The explainer is built from the validation features as a NumPy array;
# random_state makes the sampled neighbourhood, and so the explanation, reproducible.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_vali_renorm.values,
    feature_names=list(feature_names),
    mode="regression",
    random_state=42,
)

# The row is passed as a 1-D NumPy array rather than a pandas Series.
# `top_labels` is omitted because it only selects classes in classification mode.
exp = explainer.explain_instance(
    X_vali_renorm.iloc[index].values,
    lgbmOpt.predict,
    num_features=5,
)
exp.show_in_notebook(show_table=True, show_all=True)

## Model specific methods 

### Vanilla gradient back-propagation
This method applies to neural networks only.