#### **Import Libraries**

In [58]:
# !pip3 install shap
import shap
import numpy as np
import pandas as pd
from math import exp
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import plotly.graph_objects as go
# Display every row of a DataFrame instead of truncating long outputs.
pd.options.display.max_rows = None

#### **DATA DESCRIPTION**



**SUMMARY**
Download the dataset: https://raw.githubusercontent.com/yazid-mekhtoub/notes/main/titanic.csv

This is a classic dataset used in many data mining tutorials and demos -- perfect for getting started with exploratory analysis and building binary classification models to predict survival.

**Features**

* Embarked - Port of embarkation, integer-coded (1 = Cherbourg; 2 = Queenstown; 3 = Southampton; 4 = Unknown)

* Class - Passenger class (1 = 1st; 2 = 2nd; 3 = 3rd)
* Age - Age
* Fare - Passenger fare
* isFemale - Sex (1 = female; 0 = male)
* Survived - Survival (0 = No; 1 = Yes); this is the prediction target



In [76]:
# Read the Titanic table from its raw GitHub URL and preview the first rows.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/yazid-mekhtoub/notes/main/titanic.csv'
df = pd.read_csv(TITANIC_CSV_URL)
df.head()

Unnamed: 0,Embarked,Class,Age,Fare,isFemale,Survived
0,3,3,22.0,7.25,0,0
1,1,1,38.0,71.2833,1,1
2,3,3,26.0,7.925,1,1
3,3,1,35.0,53.1,1,1
4,3,3,35.0,8.05,0,0


#### **XGBOOST**

In [64]:
# Select features and target by name rather than by position: this cell appends
# a `P` column to `df`, so "every column but the last" would use `P` as the
# target (and leak `Survived` into the features) if the cell were run twice.
TARGET = 'Survived'
FEATURES = [col for col in df.columns if col not in (TARGET, 'P')]
X = df[FEATURES]
Y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

model = XGBClassifier()
model.fit(X_train, y_train)

accuracy = accuracy_score(y_test, model.predict(X_test))
print("Accuracy on test data: %.2f%%" % (accuracy * 100.0))
auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("AUC on test data", round(auc, 2))

# Predicted survival probability for every passenger (train and test rows alike).
prba = pd.DataFrame(model.predict_proba(X)[:, 1], columns=['P'])
# `assign` overwrites an existing `P` column, keeping the cell idempotent;
# an index merge would instead produce `P_x` / `P_y` on a second run.
df = df.assign(P=prba['P'].to_numpy())
df.head()

Accuracy on test data: 84.75%
AUC on test data 0.88


Unnamed: 0,Embarked,Class,Age,Fare,isFemale,Survived,P
0,3,3,22.0,7.25,0,0,0.023519
1,1,1,38.0,71.2833,1,1,0.9993
2,3,3,26.0,7.925,1,1,0.798186
3,3,1,35.0,53.1,1,1,0.997935
4,3,3,35.0,8.05,0,0,0.036672


### **SHAP**

In [65]:
explainer = shap.TreeExplainer(model)

# `expected_value` is a length-1 array in some shap releases and a bare scalar
# in others; flattening first makes the lookup work with both.
baseValue = np.ravel(explainer.expected_value)[0]

# One row per passenger, one column per feature (SHAP values in log-odds units).
_shap = pd.DataFrame(explainer.shap_values(X), columns=X.columns)
_shap['baseValue'] = baseValue

# Base value + per-feature SHAP values = the model's raw margin for that row.
_shap['sumShap'] = _shap.sum(axis=1)

# Logistic transform of the margin, vectorised over the whole column
# (computed in float64, as the previous row-wise math.exp version was).
_shap['P'] = 1 / (1 + np.exp(-_shap['sumShap'].astype(float)))
_shap.head()

ntree_limit is deprecated, use `iteration_range` or model slicing instead.


Unnamed: 0,Embarked,Class,Age,Fare,isFemale,baseValue,sumShap,P
0,-0.280397,-0.481697,-0.239425,-0.529779,-1.583931,-0.610898,-3.726127,0.023519
1,1.187934,2.333913,0.060382,0.99002,3.302827,-0.610898,7.264178,0.9993
2,-0.436465,-0.892553,0.582206,0.889678,1.843027,-0.610898,1.374995,0.798186
3,-0.291159,2.798014,0.737292,0.533108,3.014283,-0.610898,6.180641,0.997935
4,-0.155388,-0.843223,-0.988544,-0.012749,-0.657566,-0.610898,-3.268369,0.036672


#### **SHAP INTERACTIONS**

In [66]:
# Interaction tensor indexed as (observation, feature, feature).
_shap_iv = explainer.shap_interaction_values(X)
m, n, r = _shap_iv.shape
feature_names = list(X.columns)

# Long format: one row per (observation, feature) pair, one column per
# interacting feature. Cast to float64 so the row sums are accumulated in
# double precision.
shap_iv = pd.DataFrame(_shap_iv.reshape(m * n, r).astype(float), columns=feature_names)
shap_iv.insert(0, 'observations', np.repeat(np.arange(m), n))
# Tile the feature names exactly once per observation (no oversized temporary).
shap_iv.insert(1, 'features', np.tile(feature_names, m))

# Summing a feature's row of interactions gives that feature's SHAP value.
shap_iv['sumIV'] = shap_iv[feature_names].sum(axis=1)
shap_iv['baseValue'] = baseValue

# Per-observation total of all feature contributions plus the base value.
# The string alias 'sum' avoids the deprecated "builtin callable" path.
shap_iv['sumShap'] = shap_iv.groupby('observations')['sumIV'].transform('sum')
shap_iv['sumShap'] = shap_iv['sumShap'] + shap_iv['baseValue']

# Logistic transform of the margin, vectorised over the whole column.
shap_iv['P'] = 1 / (1 + np.exp(-shap_iv['sumShap']))
shap_iv.head(9)

ntree_limit is deprecated, use `iteration_range` or model slicing instead.


Unnamed: 0,observations,features,Embarked,Class,Age,Fare,isFemale,sumIV,baseValue,sumShap,P
0,0,Embarked,-0.239703,-0.041012,-0.062952,0.023292,0.039978,-0.280397,-0.610898,-3.726128,0.023519
1,0,Class,-0.041012,-0.834557,-0.012095,0.277122,0.128845,-0.481697,-0.610898,-3.726128,0.023519
2,0,Age,-0.062952,-0.012095,-0.754226,0.972753,-0.382905,-0.239425,-0.610898,-3.726128,0.023519
3,0,Fare,0.023292,0.277122,0.972753,-1.539878,-0.263068,-0.529779,-0.610898,-3.726128,0.023519
4,0,isFemale,0.039978,0.128845,-0.382905,-0.263068,-1.10678,-1.583931,-0.610898,-3.726128,0.023519
5,1,Embarked,0.640301,-0.197166,0.221129,0.588638,-0.064968,1.187934,-0.610898,7.264178,0.9993
6,1,Class,-0.197166,1.283048,0.24257,0.405866,0.599594,2.333913,-0.610898,7.264178,0.9993
7,1,Age,0.221129,0.24257,-1.252303,0.846007,0.00298,0.060382,-0.610898,7.264178,0.9993
8,1,Fare,0.588638,0.405866,0.846007,-1.500147,0.649656,0.99002,-0.610898,7.264178,0.9993


#### **AvsB**

In [70]:
# Each filter maps a column name to a condition string; the conditions of one
# dict are AND-ed together when the cohort is selected.

# Cohort A: male passengers (isFemale == 0) whose port of embarkation is known
# (code 4 means "Unknown").
A_filters = {"isFemale": "== 0", "Embarked": "!= 4"}

# Cohort B: female passengers (isFemale == 1) whose port of embarkation is known.
B_filters = {"isFemale": "== 1", "Embarked": "!= 4"}

def p_actual(df, filters):
  """Return the observed survival rate and the mean residual of a cohort.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns ``Survived`` and ``P`` plus every column
      named in ``filters``. It is not modified.
  filters : dict[str, str]
      Column name -> condition string (e.g. ``{"isFemale": "== 1"}``).
      All conditions are AND-ed. An empty dict selects every row.

  Returns
  -------
  tuple[float, float]
      ``(mean of Survived, mean of Survived - P)`` over the selected rows.
      Both are NaN when no row matches.
  """
  # Builds e.g. "`isFemale` == 1 & `Embarked` != 4" for DataFrame.query.
  expr = " & ".join(f"`{key}` {value}" for key, value in filters.items())
  # query("") raises, so an empty filter set falls back to the whole frame.
  cohort = df.query(expr) if expr else df

  survived = cohort['Survived']
  # Series arithmetic creates a new object: nothing is written back into the
  # filtered slice, which is what used to trigger SettingWithCopyWarning.
  residual = survived - cohort['P']

  return float(survived.mean()), float(residual.mean())


In [71]:
# Cohort A: observed survival rate and mean (actual - predicted) residual.
a_stats = p_actual(df, A_filters)
a_actual, a_residual = a_stats
print(*a_stats)

0.18890814558058924 0.0027474124910450335



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [72]:
# Cohort B: observed survival rate and mean (actual - predicted) residual.
b_stats = p_actual(df, B_filters)
b_actual, b_residual = b_stats
print(*b_stats)


0.7403846153846154 -0.012564652314326033



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [73]:
def avsb(df, _shap, A_filters, B_filters):
  """Per-feature difference in mean SHAP value between two cohorts (B minus A).

  Parameters
  ----------
  df : pandas.DataFrame
      Frame the filters are evaluated against with ``DataFrame.query``.
  _shap : pandas.DataFrame
      SHAP values, one column per feature, indexed with the same labels as
      ``df`` (rows are looked up with ``.loc``).
  A_filters, B_filters : dict[str, str]
      Column name -> condition string (e.g. ``{"isFemale": "== 1"}``).
      Conditions within a dict are AND-ed. An empty dict selects every row.

  Returns
  -------
  pandas.DataFrame
      Columns ``features`` and ``avg_shap_diff``, one row per column of
      ``_shap``.
  """

  def cohort_mean(filters):
    # Builds e.g. "`isFemale` == 1 & `Embarked` != 4" for DataFrame.query.
    expr = " & ".join(f"`{key}` {value}" for key, value in filters.items())
    # query("") raises, so an empty filter set falls back to every row.
    rows = df.query(expr).index if expr else df.index
    return _shap.loc[rows].mean()

  mean_a = cohort_mean(A_filters)
  mean_b = cohort_mean(B_filters)

  diff = mean_b.subtract(mean_a).to_frame(name='avg_shap_diff').reset_index()
  return diff.rename({'index': 'features'}, axis=1)


# Pass only the per-feature SHAP columns. Selecting them by name (instead of
# dropping "the last three columns") keeps this correct if helper columns are
# added to or removed from `_shap`.
d = avsb(df, _shap[list(X.columns)], A_filters, B_filters)
d
  


Unnamed: 0,features,avg_shap_diff
0,Embarked,0.060696
1,Class,0.555159
2,Age,0.370547
3,Fare,0.273478
4,isFemale,3.761539


In [74]:
def dx(df):
    """Build a waterfall chart decomposing the gap between cohort A and cohort B.

    The gap between the cohorts' mean predicted probabilities,
    ``(b_actual - b_residual) - (a_actual - a_residual)``, is shared out
    across the features in proportion to each feature's ``avg_shap_diff``.
    The chart shows, in percentage points: A's observed rate, one bar per
    feature (largest contribution first), the two residual corrections, and
    B's observed rate as the closing total.

    Reads the module-level names ``a_actual``, ``a_residual``, ``b_actual``
    and ``b_residual``; they must be defined before this function is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``features`` and ``avg_shap_diff``.
        It is not modified.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    red = '255, 49, 49'
    green = '126, 217, 87'

    # Difference in mean predicted probability between B and A.
    alloc = b_actual - a_actual - b_residual + a_residual
    print(b_actual, a_actual, b_residual, a_residual)

    # Work on a fresh frame so the caller's frame gets no scratch columns and
    # nothing is assigned onto a slice (the source of SettingWithCopyWarning).
    share = df['avg_shap_diff'] / df['avg_shap_diff'].sum()
    bars = pd.DataFrame({
        'features': df['features'],
        'contribution': share * 100 * alloc,
        'measure': 'relative',
    }).sort_values(by=['contribution'], ascending=False)

    # Frame the feature bars with the start value, both residuals and the total.
    values = bars.values.tolist()
    values.insert(0, ['A', a_actual * 100, 'absolute'])
    values.append(['A Residuals', -a_residual * 100, 'relative'])
    values.append(['B Residuals', b_residual * 100, 'relative'])
    values.append(['B', b_actual * 100, 'total'])

    chart = pd.DataFrame(values, columns=bars.columns)
    print(chart)

    fig = go.Figure()
    fig.add_trace(go.Waterfall(
        width=[0.8] * len(chart),
        x=chart['features'],
        y=chart['contribution'],
        measure=chart['measure'],
        base=0,
        cliponaxis=False,
        textfont=dict(
            family="verdana, arial, sans-serif",
            size=10,
            color="rgb(148, 144, 144)"
        ),
        textposition='outside',
        decreasing={"marker": {"color": f"rgba({red}, 0.7)", "line": {"color": f"rgba({red}, 1)", "width": 2}}},
        increasing={"marker": {"color": f"rgba({green}, 0.7)", "line": {"color": f"rgba({green}, 1)", "width": 2}}},
        totals={"marker": {"color": "rgba(12, 192, 223, 0.7)", "line": {"color": "rgba(12, 192, 223, 1)", "width": 2}}},
        connector={"line": {"color": "rgba(217, 217, 217, 1)", "width": 1}},
    ))

    fig.update_layout(height=400)

    # Shade the two residual bars, which sit just before the closing total.
    end = 0.5 + len(chart) - 2
    start = 0.5 + len(chart) - 4
    fig.add_vrect(x0=start, x1=end,
                  fillcolor="pink", opacity=0.15, line_color="pink")
    fig.update_annotations(font=dict(size=14, color="rgb(148, 144, 144)"))
    fig.update_yaxes(ticksuffix="%")
    return fig
# Render the A-vs-B waterfall; the figure is the cell's displayed result.
waterfall_fig = dx(d)
waterfall_fig

0.7403846153846154 0.18890814558058924 -0.012564652314326033 0.0027474124910450335
      features  contribution   measure
0            A     18.890815  absolute
1     isFemale     42.458065  relative
2        Class      6.266308  relative
3          Age      4.182524  relative
4         Fare      3.086865  relative
5     Embarked      0.685100  relative
6  A Residuals     -0.274741  relative
7  B Residuals     -1.256465  relative
8            B     74.038462     total



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
