### **DATA GENERATION**

In [42]:
#!/usr/bin/env python3 

import pandas as pd
import numpy as np
from tabulate import tabulate 


import shap
from math import exp
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import plotly.graph_objects as go


#######################################
### KIDS -> MCD -> SICK

# Probability of getting sick, keyed by how often the household eats at
# McDonald's. Each step up in frequency doubles the hazard.
MCD_HAZARD = {
    '0 NEVER': 0.025,
    '1 SOMETIMES': 0.05,
    '2 OFTEN': 0.10,
    '3 EVERY DINNER': 0.20,
}

# Household demographics: share of the population per number-of-kids bucket.
# The shares are used as sampling probabilities, so they must sum to 1.
KIDS_DIST = {
    '0 KIDS': 0.50,
    '1 KIDS': 0.15,
    '2 KIDS': 0.25,
    '3 OR MORE KIDS': 0.10,
}

# KIDS -> MCD: conditional distribution of McDonald's frequency given the
# number of kids in the household. Each row lists the probabilities for
# NEVER / SOMETIMES / OFTEN / EVERY DINNER, in that order, and sums to 1.
MCD_DIST = {
    kids: dict(zip(('0 NEVER', '1 SOMETIMES', '2 OFTEN', '3 EVERY DINNER'), shares))
    for kids, shares in (
        ('0 KIDS',         (0.50, 0.30, 0.15, 0.05)),
        ('1 KIDS',         (0.45, 0.30, 0.20, 0.05)),
        ('2 KIDS',         (0.35, 0.30, 0.25, 0.10)),
        ('3 OR MORE KIDS', (0.20, 0.30, 0.30, 0.20)),
    )
}
# %%
#####################################
##data generation
def np_k(dict: dict) -> list:
    """Return the keys of ``dict`` as a plain list, in the mapping's own order."""
    return [key for key in dict.keys()]
def np_v(dict: dict) -> list:
    """Return the values of ``dict`` as a plain list, in the mapping's own order."""
    return [value for value in dict.values()]
# Number of synthetic households to simulate.
N = 100000

# Seed NumPy's global generator before the first draw. Every sampling step in
# this notebook uses np.random.*, so without a seed the tables cannot be
# reproduced after a kernel restart.
np.random.seed(0)

# Draw each household's KIDS bucket from the marginal distribution KIDS_DIST.
df = pd.DataFrame({
    'KIDS': np.random.choice(np_k(KIDS_DIST), size=N, p=np_v(KIDS_DIST)),
})

# %%
def mcd(kids):
    """Sample one McDonald's-frequency label for a household.

    ``kids`` is a key of ``MCD_DIST``; the label is drawn from the
    conditional distribution stored under that key.
    """
    frequency_dist = MCD_DIST[kids]
    labels = np_k(frequency_dist)
    probabilities = np_v(frequency_dist)
    draw = np.random.choice(labels, 1, p=probabilities)
    return draw[0]
# Sample each household's McDonald's frequency conditional on its KIDS bucket.
df['MCD'] = df.KIDS.map(mcd)
# %%
# A household is sick when a uniform(0, 1) draw falls below the hazard of its
# MCD level. Look the hazards up with a dict map and draw all uniforms in one
# call instead of running a Python lambda once per row.
df['IsSICK'] = df.MCD.map(MCD_HAZARD) > np.random.uniform(size=len(df))

# df = df.reset_index().rename(columns={'index': 'ID'})


  from .autonotebook import tqdm as notebook_tqdm


### **DATA**

In [36]:
# Report the frame's (rows, columns) and preview the first records.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(n=5)

(100000, 3)


Unnamed: 0,KIDS,MCD,IsSICK
0,0 KIDS,0 NEVER,False
1,1 KIDS,0 NEVER,False
2,0 KIDS,3 EVERY DINNER,False
3,2 KIDS,0 NEVER,False
4,0 KIDS,3 EVERY DINNER,False


In [37]:
# Row count and observed sickness rate per McDonald's frequency.
# The aggregation is named by string: passing the np.mean callable to
# groupby.aggregate is deprecated in recent pandas and emits a FutureWarning.
df.groupby('MCD').aggregate({'MCD': 'count', 'IsSICK': ['mean']})

Unnamed: 0_level_0,MCD,IsSICK
Unnamed: 0_level_1,count,mean
MCD,Unnamed: 1_level_2,Unnamed: 2_level_2
0 NEVER,42317,0.025356
1 SOMETIMES,30226,0.048402
2 OFTEN,19774,0.097047
3 EVERY DINNER,7683,0.204738


In [38]:
# Row count and observed sickness rate per KIDS bucket.
# The aggregation is named by string: passing the np.mean callable to
# groupby.aggregate is deprecated in recent pandas and emits a FutureWarning.
df.groupby('KIDS').aggregate({'KIDS': 'count', 'IsSICK': ['mean']})

Unnamed: 0_level_0,KIDS,IsSICK
Unnamed: 0_level_1,count,mean
KIDS,Unnamed: 1_level_2,Unnamed: 2_level_2
0 KIDS,50035,0.051284
1 KIDS,14961,0.054141
2 KIDS,25034,0.070145
3 OR MORE KIDS,9970,0.08987


In [39]:
# Sickness rate and cell counts for every KIDS x MCD combination.
# 'mean' is passed by name rather than as the np.mean callable, which recent
# pandas deprecates for aggfunc and reports with a FutureWarning.
pd.pivot_table(df, index='KIDS', columns='MCD', aggfunc={'IsSICK': 'mean', 'KIDS': 'count'})

Unnamed: 0_level_0,IsSICK,IsSICK,IsSICK,IsSICK,KIDS,KIDS,KIDS,KIDS
MCD,0 NEVER,1 SOMETIMES,2 OFTEN,3 EVERY DINNER,0 NEVER,1 SOMETIMES,2 OFTEN,3 EVERY DINNER
KIDS,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0 KIDS,0.025693,0.045914,0.097029,0.205468,24987,15028,7606,2414
1 KIDS,0.025488,0.04935,0.096381,0.171958,6709,4539,2957,756
2 KIDS,0.024798,0.053302,0.098199,0.209782,8670,7617,6273,2474
3 OR MORE KIDS,0.023065,0.047009,0.095303,0.209907,1951,3042,2938,2039


### **Label Encoding**

In [49]:
# Integer codes for the ordinal KIDS labels (code == position in the list).
kids_dict = {
    label: code
    for code, label in enumerate(['0 KIDS', '1 KIDS', '2 KIDS', '3 OR MORE KIDS'])
}

# Integer codes for the ordinal MCD labels (code == position in the list).
mcd_dict = {
    label: code
    for code, label in enumerate(['0 NEVER', '1 SOMETIMES', '2 OFTEN', '3 EVERY DINNER'])
}

# Boolean target -> 0/1.
issick_dict = {flag: int(flag) for flag in (False, True)}

# Apply the per-column replacements one column at a time, in the same order
# as the columns appear in the frame.
for column, codes in (("KIDS", kids_dict), ("MCD", mcd_dict), ("IsSICK", issick_dict)):
    df = df.replace({column: codes})
df.head()

Unnamed: 0,KIDS,MCD,IsSICK
0,0,0,0
1,1,0,0
2,0,3,0
3,2,0,0
4,0,3,0


### **XGBOOST**

In [51]:
# Select features and target by name rather than by column position. This
# cell appends a 'P' column to df, so a positional split (all-but-last /
# last) would, on a second run, train on IsSICK and use P as the target.
X = df[['KIDS', 'MCD']]
Y = df['IsSICK']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

model = XGBClassifier()
model.fit(X_train, y_train)

# NOTE: only about 6% of rows are positive, so accuracy is dominated by the
# majority class; AUC is the more informative number here.
accuracy = accuracy_score(y_test, model.predict(X_test))
print("Accuracy on test data: %.2f%%" % (accuracy * 100.0))
auc= roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("AUC on test data", round(auc,2))

# Predicted probability of the positive class for every row. The values are
# attached with assign(), which overwrites an existing 'P' column, so
# re-running the cell does not produce P_x / P_y duplicates the way an index
# merge would.
prba = pd.DataFrame(model.predict_proba(X)[:, 1], columns=['P'])
df = df.assign(P=prba['P'].to_numpy())
df.head()

Accuracy on test data: 94.08%
AUC on test data 0.7


Unnamed: 0,KIDS,MCD,IsSICK,P
0,0,0,0,0.026167
1,1,0,0,0.025055
2,0,3,0,0.207656
3,2,0,0,0.024628
4,0,3,0,0.207656


In [52]:
explainer = shap.TreeExplainer(model)

# Per-row, per-feature SHAP contributions in the model's raw (log-odds) space.
shap_matrix = explainer.shap_values(X)

# Read the base value only AFTER shap_values() has run, and accept either a
# scalar or a one-element array. The previously recorded run read
# expected_value[0] first and got about -1.8e-05, which made the
# reconstructed P (~0.30) disagree with model.predict_proba (~0.026).
# NOTE(review): the root cause is inferred from that output; confirm that P
# below now matches df['P'] on your shap/xgboost versions.
baseValue = float(np.ravel(explainer.expected_value)[0])

_shap = pd.DataFrame(shap_matrix, columns = X.columns)
_shap['baseValue'] = baseValue
# Raw margin = base value + sum of the feature contributions.
_shap['sumShap'] = _shap.sum(axis=1)
# Logistic link turns the margin back into a probability (vectorised).
_shap['P'] = 1.0 / (1.0 + np.exp(-_shap['sumShap']))
_shap.head()

ntree_limit is deprecated, use `iteration_range` or model slicing instead.


Unnamed: 0,KIDS,MCD,baseValue,sumShap,P
0,0.008336,-0.838816,-1.8e-05,-0.830499,0.30354
1,-0.012998,-0.862043,-1.8e-05,-0.875059,0.294203
2,0.000281,1.446859,-1.8e-05,1.447122,0.809555
3,-0.000692,-0.891985,-1.8e-05,-0.892695,0.290554
4,0.000281,1.446859,-1.8e-05,1.447122,0.809555
