In [3]:
# !pip3 install shap

In [18]:


import pandas as pd
import numpy as np
from tabulate import tabulate 


import shap
from math import exp
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import plotly.graph_objects as go


#######################################
### Simulated causal chain: KIDS -> MCD -> SICK

###############
##  Hazard of going to McDonald's (too much): one value per visit-frequency label
MCD_HAZARD = {
    '0 NEVER': 0.025,
    '1 SOMETIMES': 0.05,
    '2 OFTEN': 0.10,
    '3 EVERY DINNER': 0.20,
}

########
## Kids demographics: probability of each KIDS label (values sum to 1)
KIDS_DIST = {
    '0 KIDS': 0.50,
    '1 KIDS': 0.15,
    '2 KIDS': 0.25,
    '3 OR MORE KIDS': 0.10,
}

##############
### KIDS -> MCD: for each KIDS label, the probability of each McDonald's
### visit-frequency label (every row of the table sums to 1)
MCD_DIST = {
    kids_label: dict(zip(('0 NEVER', '1 SOMETIMES', '2 OFTEN', '3 EVERY DINNER'), row))
    for kids_label, row in (
        # KIDS label        NEVER  SOMETIMES  OFTEN  EVERY DINNER
        ('0 KIDS',         (0.50,  0.30,      0.15,  0.05)),
        ('1 KIDS',         (0.45,  0.30,      0.20,  0.05)),
        ('2 KIDS',         (0.35,  0.30,      0.25,  0.10)),
        ('3 OR MORE KIDS', (0.20,  0.30,      0.30,  0.20)),
    )
}
# %%
#####################################
##data generation
def np_k(dict: dict):
    """Return the keys of ``dict`` as a list, in the mapping's iteration order."""
    # NOTE: the parameter name shadows the builtin `dict` inside this function;
    # it is left unchanged so the signature stays the same for callers.
    return [key for key in dict.keys()]
def np_v(dict: dict):
    """Return the values of ``dict`` as a list, in the mapping's iteration order."""
    # NOTE: the parameter name shadows the builtin `dict` inside this function;
    # it is left unchanged so the signature stays the same for callers.
    return [value for value in dict.values()]
# Seed NumPy's global RNG so that every np.random draw that follows in a
# top-to-bottom run (the KIDS sample here and the later np.random.choice /
# np.random.uniform calls) is reproducible on Restart & Run All.
SEED = 42
np.random.seed(SEED)

N = 10000  # number of simulated rows
# Sample a KIDS label per row according to the KIDS_DIST probabilities.
df = pd.DataFrame({'KIDS': np.random.choice(np_k(KIDS_DIST), size=N, p=np_v(KIDS_DIST))})

# %%
def mcd(kids):
    """Draw one McDonald's visit-frequency label for the given KIDS label.

    The label is sampled with ``np.random.choice`` using the probabilities
    stored in ``MCD_DIST[kids]``.
    """
    freq_probs = MCD_DIST[kids]
    labels = np_k(freq_probs)
    weights = np_v(freq_probs)
    # choice() is asked for an array of one sample; unwrap its single element.
    sample = np.random.choice(labels, 1, p=weights)
    return sample[0]
# Draw a McDonald's visit-frequency label for every row from its KIDS label.
df['MCD'] = df['KIDS'].map(mcd)
# %%
# A row is sick when the hazard of its MCD label exceeds a uniform(0, 1) draw.
# All draws are taken in one vectorized call (one per row) instead of invoking
# a Python lambda and the RNG separately for each row.
df['IsSICK'] = df['MCD'].map(MCD_HAZARD) > np.random.uniform(size=len(df))

print(df.shape)
df.head()

(10000, 3)


Unnamed: 0,KIDS,MCD,IsSICK
0,2 KIDS,0 NEVER,False
1,3 OR MORE KIDS,2 OFTEN,False
2,2 KIDS,1 SOMETIMES,False
3,2 KIDS,3 EVERY DINNER,False
4,3 OR MORE KIDS,1 SOMETIMES,False


In [19]:
# Row count and mean of IsSICK for each MCD label.
# The aggregation is named as the string 'mean': passing the NumPy callable
# np.mean to groupby aggregation triggers a FutureWarning in recent pandas.
df.groupby('MCD').aggregate({'MCD': 'count', 'IsSICK': ['mean']})

Unnamed: 0_level_0,MCD,IsSICK
Unnamed: 0_level_1,count,mean
MCD,Unnamed: 1_level_2,Unnamed: 2_level_2
0 NEVER,4182,0.023673
1 SOMETIMES,3022,0.052614
2 OFTEN,2031,0.095027
3 EVERY DINNER,765,0.20915


In [20]:
# Row count and mean of IsSICK for each KIDS label.
# The aggregation is named as the string 'mean': passing the NumPy callable
# np.mean to groupby aggregation triggers a FutureWarning in recent pandas.
df.groupby('KIDS').aggregate({'KIDS': 'count', 'IsSICK': ['mean']})

Unnamed: 0_level_0,KIDS,IsSICK
Unnamed: 0_level_1,count,mean
KIDS,Unnamed: 1_level_2,Unnamed: 2_level_2
0 KIDS,4984,0.054374
1 KIDS,1471,0.057104
2 KIDS,2511,0.068897
3 OR MORE KIDS,1034,0.080271


In [21]:
# KIDS x MCD table: mean of IsSICK and row count in every cell.
# 'mean' is given by name because passing np.mean as an aggfunc triggers a
# FutureWarning in recent pandas.
pd.pivot_table(df, index='KIDS', columns='MCD', aggfunc={'IsSICK': 'mean', 'KIDS': 'count'})

Unnamed: 0_level_0,IsSICK,IsSICK,IsSICK,IsSICK,KIDS,KIDS,KIDS,KIDS
MCD,0 NEVER,1 SOMETIMES,2 OFTEN,3 EVERY DINNER,0 NEVER,1 SOMETIMES,2 OFTEN,3 EVERY DINNER
KIDS,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0 KIDS,0.02503,0.057487,0.084544,0.232283,2477,1496,757,254
1 KIDS,0.024279,0.044811,0.11838,0.164179,659,424,321,67
2 KIDS,0.018605,0.051862,0.101538,0.208835,860,752,650,249
3 OR MORE KIDS,0.026882,0.042857,0.082508,0.194872,186,350,303,195


In [23]:
# Integer codes for the string labels; the leading digit of each label is its code.
kids_dict = {
    '0 KIDS' : 0,
    '1 KIDS' : 1,
    '2 KIDS' : 2,
    '3 OR MORE KIDS' : 3,
}

mcd_dict = {
    '0 NEVER': 0,
    '1 SOMETIMES': 1,
    '2 OFTEN': 2,
    '3 EVERY DINNER': 3,
}

issick_dict = {
    False : 0,
    True : 1,
}

# Replace the labels column by column (nested-dict form of DataFrame.replace),
# then cast explicitly to integers. The explicit cast matters: relying on
# `replace` to silently downcast object columns to int is deprecated in recent
# pandas, and object-dtype columns are not valid numeric model inputs.
# Re-running this cell is harmless: already-encoded values are left unchanged.
df = df.replace({"KIDS": kids_dict, "MCD": mcd_dict, "IsSICK": issick_dict})
df = df.astype({"KIDS": "int64", "MCD": "int64", "IsSICK": "int64"})
df.head()

Unnamed: 0,KIDS,MCD,IsSICK
0,2,0,0
1,3,2,0
2,2,1,0
3,2,3,0
4,3,1,0


## **XGBOOST**

In [24]:
# Select features and target BY NAME. Positional slicing (all-but-last /
# last column) breaks as soon as this cell is re-run, because the cell appends
# a 'P' column to df: the features would then include IsSICK (label leakage)
# and the target would become P.
FEATURES = ['KIDS', 'MCD']
TARGET = 'IsSICK'
X = df[FEATURES]
Y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
model = XGBClassifier()
model.fit(X_train, y_train)

# NOTE: the positive class is rare in this simulation (see the sickness rates
# above), so accuracy mostly reflects the majority class; AUC is the more
# informative number here.
accuracy = accuracy_score(y_test, model.predict(X_test))
print("Accuracy on test data: %.2f%%" % (accuracy * 100.0))
auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("AUC on test data", round(auc, 2))

# Predicted probability of the positive class for every row (train and test).
prba = pd.DataFrame(model.predict_proba(X)[:, 1], columns=['P'], index=X.index)
# assign() overwrites an existing 'P' column, so re-running the cell does not
# produce duplicated P_x / P_y columns the way an index merge would.
df = df.assign(P=prba['P'])
df.head()

Accuracy on test data: 93.64%
AUC on test data 0.69


Unnamed: 0,KIDS,MCD,IsSICK,P
0,2,0,0,0.018696
1,3,2,0,0.096072
2,2,1,0,0.050346
3,2,3,0,0.211609
4,3,1,0,0.041198


## **SHAP**

In [25]:
explainer = shap.TreeExplainer(model)
# expected_value may come back as a scalar or as a length-1 array depending on
# the shap version / model output; np.ravel makes the [0] lookup valid for both.
baseValue = np.ravel(explainer.expected_value)[0]

# One SHAP value per (row, feature), in the model's raw margin (log-odds) space.
_shap = pd.DataFrame(explainer.shap_values(X), columns=X.columns)
_shap['baseValue'] = baseValue
# Base value + per-feature contributions = the model's raw margin for the row.
_shap['sumShap'] = _shap[list(X.columns) + ['baseValue']].sum(axis=1)
# Logistic function, vectorized over the column (computed in float64, like the
# previous per-row math.exp version); should match the P column predicted by the model.
_shap['P'] = 1 / (1 + np.exp(-_shap['sumShap'].astype('float64')))
_shap.head()

ntree_limit is deprecated, use `iteration_range` or model slicing instead.


Unnamed: 0,KIDS,MCD,baseValue,sumShap,P
0,-0.11467,-1.047719,-2.798203,-3.960592,0.018696
1,-0.158761,0.715312,-2.798203,-2.241652,0.096072
2,0.008449,-0.147433,-2.798203,-2.937187,0.050346
3,0.009694,1.473256,-2.798203,-1.315253,0.211609
4,-0.23553,-0.113574,-2.798203,-3.147308,0.041197


## **SHAP interaction values (IV)**

In [26]:
# Interaction values unpacked below as (observations, features, features).
_shap_iv = explainer.shap_interaction_values(X)
m, n = _shap_iv.shape[:2]
feature_names = list(X.columns)

# Long format: one row per (observation, feature); the feature columns hold
# that feature's interaction with every feature (the diagonal is the main effect).
shap_iv = np.column_stack((np.repeat(np.arange(m), n), _shap_iv.reshape(m * n, -1)))
shap_iv = pd.DataFrame(shap_iv, columns=['observations'] + feature_names)
shap_iv['observations'] = shap_iv['observations'].astype(int)
# Feature names repeat once per observation: exactly m * n labels, no
# oversized intermediate array to truncate.
shap_iv.insert(1, 'features', np.tile(feature_names, m))

# Row sum = SHAP value of that feature for that observation.
shap_iv['sumIV'] = shap_iv[feature_names].sum(axis=1)
shap_iv['baseValue'] = baseValue
# Per-observation total of all interaction values plus the base value gives the
# raw margin; 'sum' is passed by name because the builtin `sum` in transform()
# triggers a FutureWarning in recent pandas.
shap_iv['sumShap'] = (
    shap_iv.groupby('observations')['sumIV'].transform('sum') + shap_iv['baseValue']
)
# Logistic function, vectorized over the column instead of a per-row apply.
shap_iv['P'] = 1 / (1 + np.exp(-shap_iv['sumShap'].astype('float64')))
shap_iv.head(9)

ntree_limit is deprecated, use `iteration_range` or model slicing instead.


Unnamed: 0,observations,features,KIDS,MCD,sumIV,baseValue,sumShap,P
0,0,KIDS,-0.018935,-0.095735,-0.11467,-2.798203,-3.960592,0.018696
1,0,MCD,-0.095735,-0.951984,-1.047719,-2.798203,-3.960592,0.018696
2,1,KIDS,-0.296772,0.138011,-0.158761,-2.798203,-2.241652,0.096072
3,1,MCD,0.138011,0.5773,0.715312,-2.798203,-2.241652,0.096072
4,2,KIDS,-0.018935,0.027383,0.008449,-2.798203,-2.937187,0.050346
5,2,MCD,0.027383,-0.174816,-0.147433,-2.798203,-2.937187,0.050346
6,3,KIDS,-0.018934,0.028629,0.009694,-2.798203,-1.315253,0.211609
7,3,MCD,0.028629,1.444627,1.473256,-2.798203,-1.315253,0.211609
8,4,KIDS,-0.296772,0.061242,-0.23553,-2.798203,-3.147308,0.041197
