#### **Import Libraries**

In [None]:
import shap
import numpy as np
import pandas as pd
from math import exp
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import plotly.graph_objects as go

#### **DATA DESCRIPTION**



**SUMMARY**
Download the dataset here: https://raw.githubusercontent.com/yazid-mekhtoub/notes/7fcd1004be1d6e923a26e4905c409157ffc8de36/titanic.csv

This is a classic dataset used in many data mining tutorials and demos -- perfect for getting started with exploratory analysis and building binary classification models to predict survival.

**Features**

* embarked - Port of Embarkation ({"Cherbourg": 1, "Queenstown": 2 , "Southampton": 3, "Unknown": 4})
* cabin - Cabin ({"A" : 1, "B" : 2, "C": 3, "D" : 4, "E": 5, "F" : 6, "G" : 7, "T": 8, "Unknown":9})
* class - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
* nsiblings - Number of Siblings/Spouses Aboard
* nparents - Number of Parents/Children Aboard
* age - Age
* fare - Passenger Fare
* isfemale - Sex (female = 1, male = 0)
* survived - Survival (0 = No; 1 = Yes)



In [None]:
# Read the pre-encoded Titanic table straight from GitHub and preview its first rows.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/yazid-mekhtoub/notes/main/titanic.csv'
df = pd.read_csv(TITANIC_CSV_URL)
df.head()


Unnamed: 0,embarked,cabin,class,nsiblings,nparents,age,fare,isfemale,survived
0,3,2,1,0,0,29.0,211.3375,1,1
1,3,3,1,1,2,0.9167,151.55,0,1
2,3,3,1,1,2,2.0,151.55,1,0
3,3,3,1,1,2,30.0,151.55,0,0
4,3,3,1,1,2,25.0,151.55,1,0


#### **XGBOOST**

In [None]:
# Pick the target and the features by name rather than by position: this cell
# appends a `P` column to `df`, so a positional split would train on `survived`
# and use `P` as the target if the cell were ever run a second time.
TARGET = 'survived'
feature_cols = [col for col in df.columns if col not in (TARGET, 'P')]
X = df[feature_cols]
Y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
model = XGBClassifier()
model.fit(X_train, y_train)

accuracy = accuracy_score(y_test, model.predict(X_test))
print("Accuracy on test data: %.2f%%" % (accuracy * 100.0))
auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("AUC on test data", round(auc, 2))

# Predicted survival probability for every passenger (train and test rows alike).
# `assign` overwrites an existing `P` instead of producing P_x / P_y duplicates.
df = df.assign(P=model.predict_proba(X)[:, 1])
df.head()

Accuracy on test data: 77.68%
AUC on test data 0.83


Unnamed: 0,embarked,cabin,class,nsiblings,nparents,age,fare,isfemale,survived,P
0,3,2,1,0,0,29.0,211.3375,1,1,0.99259
1,3,3,1,1,2,0.9167,151.55,0,1,0.504928
2,3,3,1,1,2,2.0,151.55,1,0,0.177388
3,3,3,1,1,2,30.0,151.55,0,0,0.012379
4,3,3,1,1,2,25.0,151.55,1,0,0.227893


### **SHAP**

In [None]:
explainer = shap.TreeExplainer(model)

# NOTE(review): depending on the installed shap version, `expected_value` may be
# a length-1 array or a plain scalar; `np.ravel` accepts both, whereas a bare
# `[0]` fails on a scalar. Confirm against the pinned shap release.
baseValue = float(np.ravel(explainer.expected_value)[0])

# One row per passenger, one column per feature (values are in log-odds units,
# as shown by the sigmoid applied below).
_shap = pd.DataFrame(explainer.shap_values(X), columns=X.columns)
_shap['baseValue'] = baseValue

# Base value + all feature contributions = the model's raw score for the row.
_shap['sumShap'] = _shap.sum(axis=1)

# Vectorised sigmoid: turns the raw score back into a probability without a
# Python-level loop, and saturates to 0/1 instead of raising on extreme scores.
_shap['P'] = 1 / (1 + np.exp(-_shap['sumShap']))
_shap.head()

ntree_limit is deprecated, use `iteration_range` or model slicing instead.


Unnamed: 0,embarked,cabin,class,nsiblings,nparents,age,fare,isfemale,baseValue,sumShap,P
0,-0.160233,1.011798,1.91459,0.185203,0.060362,-0.116607,-0.149309,2.738255,-0.586539,4.897521,0.99259
1,-0.169474,0.059934,0.794226,0.384468,-0.146875,2.689516,-2.373389,-0.632153,-0.586539,0.019713,0.504928
2,-0.211721,0.019928,1.234602,-0.416915,-0.283196,0.395973,-2.660688,0.974411,-0.586539,-1.534145,0.177388
3,-0.161587,-0.112153,0.7326,0.037763,-0.522173,-0.462926,-1.832059,-1.472224,-0.586539,-4.379298,0.012379
4,-0.248174,0.050092,1.649513,-0.663814,-0.297954,-0.668617,-2.259589,1.804836,-0.586539,-1.220245,0.227893


#### **SHAP INTERACTIONS**

In [None]:
# Interaction values come back as a (rows, features, features) cube; flatten it
# into a long table with one row per (observation, feature) pair.
_shap_iv = explainer.shap_interaction_values(X)
m, n, _ = _shap_iv.shape
feature_names = list(X.columns)

# Cast to float64 so the sums below are accumulated in double precision.
shap_iv = pd.DataFrame(_shap_iv.reshape(m * n, -1).astype(float), columns=feature_names)
shap_iv.insert(0, 'observations', np.repeat(np.arange(m), n))
# The feature list repeats once per observation; tiling it exactly `m` times
# avoids building an array n times larger than needed and slicing it back.
shap_iv.insert(1, 'features', np.tile(feature_names, m))

# Row sum = the feature's total SHAP value (main effect + all its interactions).
shap_iv['sumIV'] = shap_iv[feature_names].sum(axis=1)
shap_iv['baseValue'] = baseValue

# Per-observation total plus the base value gives the raw model score.
shap_iv['sumShap'] = (
    shap_iv.groupby('observations')['sumIV'].transform('sum') + shap_iv['baseValue']
)
shap_iv['P'] = 1 / (1 + np.exp(-shap_iv['sumShap']))
shap_iv.head(9)

ntree_limit is deprecated, use `iteration_range` or model slicing instead.


Unnamed: 0,observations,features,embarked,cabin,class,nsiblings,nparents,age,fare,isfemale,sumIV,baseValue,sumShap,P
0,0,embarked,-0.254735,0.002727,0.03712,0.007582,0.011938,-0.064956,0.095867,0.004223,-0.160233,-0.586539,4.897521,0.99259
1,0,cabin,0.002727,1.474634,-0.302366,-0.156733,-0.031637,0.012884,0.257453,-0.245163,1.011798,-0.586539,4.897521,0.99259
2,0,class,0.03712,-0.302366,1.160303,0.054813,-0.043987,0.080346,0.297382,0.630979,1.91459,-0.586539,4.897521,0.99259
3,0,nsiblings,0.007582,-0.156733,0.054813,-0.019713,-0.001888,-0.013721,0.0556,0.259262,0.185203,-0.586539,4.897521,0.99259
4,0,nparents,0.011938,-0.031637,-0.043987,-0.001888,-0.064614,0.037023,0.139929,0.013598,0.060362,-0.586539,4.897521,0.99259
5,0,age,-0.064956,0.012884,0.080345,-0.013721,0.037023,-0.230432,0.119222,-0.056973,-0.116607,-0.586539,4.897521,0.99259
6,0,fare,0.095868,0.257453,0.297382,0.0556,0.139929,0.119222,-1.492611,0.377848,-0.149309,-0.586539,4.897521,0.99259
7,0,isfemale,0.004223,-0.245164,0.630978,0.259262,0.013598,-0.056973,0.377848,1.754484,2.738255,-0.586539,4.897521,0.99259
8,1,embarked,-0.217292,-0.011635,0.025183,-0.011689,-0.03037,0.032683,0.034626,0.009018,-0.169474,-0.586539,0.019712,0.504928


#### **AvsB**

In [None]:
# A FILTER: male passengers (isfemale == 0) with a known port of embarkation
# (embarked code 4 means "Unknown").
A_filters = {
    "isfemale":"== 0",
    "embarked": "!= 4"
}

# B FILTER: female passengers (isfemale == 1) with a known port of embarkation.
B_filters = {
    "isfemale":"== 1",
    "embarked": "!= 4"
}

def p_actual( df, filters):
  """Return the observed survival rate and mean residual of a filtered group.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``survived`` column, a ``P`` column and every column
      named in ``filters``. The frame is not modified.
  filters : dict
      Maps a column name to a comparison written as a string (for example
      ``{"isfemale": "== 1"}``). All conditions are combined with ``&``.
      An empty dict selects every row.

  Returns
  -------
  tuple of float
      ``(mean of survived, mean of survived - P)`` over the selected rows.
  """
  condition = " & ".join(f"`{key}` {value}" for key, value in filters.items())

  # DataFrame.query rejects an empty expression, so "no filters" means all rows.
  subset = df.query(condition) if condition else df

  # Compute the residual as its own Series: assigning a new column onto the
  # query result is what triggered pandas' SettingWithCopyWarning.
  residual = subset['survived'] - subset['P']

  return float(subset['survived'].mean()), float(residual.mean())


In [None]:
# Group A: observed survival rate and mean (survived - P) residual.
a_actual, a_residual = p_actual(df=df, filters=A_filters)
print(a_actual, a_residual)

0.19121140142517815 0.0014726374213096538




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Group B: observed survival rate and mean (survived - P) residual.
b_actual, b_residual = p_actual(df=df, filters=B_filters)
print(b_actual, b_residual)


0.7262931034482759 -0.005478454032684034




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
def avsb(df, _shap, A_filters, B_filters):
  """Compare the average per-feature SHAP values of two filtered groups.

  Rows are selected from ``df`` with each filter set, the matching rows of
  ``_shap`` (looked up by index label) are averaged column by column, and the
  group A averages are subtracted from the group B averages.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame the filters are evaluated against.
  _shap : pandas.DataFrame
      SHAP values sharing ``df``'s index, one column per feature.
  A_filters, B_filters : dict
      Column name -> comparison string, combined with ``&``.

  Returns
  -------
  pandas.DataFrame
      Columns ``features`` and ``avg_shap_diff`` (B minus A), one row per
      column of ``_shap``.
  """

  def group_mean(filters):
    # Turn {"col": "== 1", ...} into "`col` == 1 & ..." for DataFrame.query.
    expression = " & ".join(f"`{name}` {condition}" for name, condition in filters.items())
    selected = df.query(expression).index
    return _shap.loc[selected].mean()

  mean_a = group_mean(A_filters)
  mean_b = group_mean(B_filters)

  difference = (mean_b - mean_a).to_frame(name='avg_shap_diff')
  result = difference.reset_index()
  return result.rename(columns={'index': 'features'})


# The last three columns of `_shap` (baseValue, sumShap, P) are bookkeeping,
# not features, so they are left out of the group comparison.
d = avsb(df, _shap.iloc[:, :-3], A_filters, B_filters)
d
  


Unnamed: 0,features,avg_shap_diff
0,embarked,0.068461
1,cabin,0.167147
2,class,0.444781
3,nsiblings,-0.032792
4,nparents,0.070286
5,age,0.261668
6,fare,0.190081
7,isfemale,3.613874


In [None]:
def dx(df):
    """Plot a waterfall chart bridging group A's survival rate to group B's.

    The gap between the two groups' mean predicted probabilities is shared out
    across the features in proportion to each feature's ``avg_shap_diff``. Two
    residual bars then reconcile the predictions with the observed rates, so
    the bars add up from A's observed rate to B's observed rate.

    Reads the module-level ``a_actual``, ``a_residual``, ``b_actual`` and
    ``b_residual`` values, so those must be defined before calling.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per feature with ``features`` and ``avg_shap_diff`` columns.
        The frame is not modified.

    Returns
    -------
    plotly.graph_objects.Figure
        The waterfall figure, with every bar expressed in percent.
    """
    red = '255, 49, 49'
    green = '126, 217, 87'

    # (actual - residual) is a group's mean predicted probability, so this is
    # the B-minus-A gap in mean predictions that the features have to explain.
    alloc = b_actual - a_actual - b_residual + a_residual

    # Build the bars in a fresh frame. Writing columns onto `df`, or onto a
    # column slice of it, changed the caller's data and raised pandas'
    # SettingWithCopyWarning.
    proportion = (df['avg_shap_diff'] / df['avg_shap_diff'].sum()) * 100
    steps = pd.DataFrame({
        'features': df['features'],
        'contribution': proportion * alloc,
        'measure': 'relative',
    }).sort_values(by=['contribution'], ascending=False)

    # Frame the feature bars with the starting rate, the residuals and the total.
    values = steps.values.tolist()
    values.insert(0,   ['A',  a_actual*100,  'absolute'])
    values.append(['A Residuals', - a_residual*100,  'relative'])
    values.append(['B Residuals', b_residual*100, 'relative'])
    values.append(['B',  b_actual*100,  'total'])

    waterfall = pd.DataFrame(values, columns=steps.columns)
    print(waterfall)

    fig = go.Figure()
    fig.add_trace(go.Waterfall(
        width=[0.8] * len(waterfall),
        x=waterfall['features'],
        y=waterfall['contribution'],
        measure=waterfall['measure'],
        base=0,
        cliponaxis=False,
        textfont=dict(
            family="verdana, arial, sans-serif",
            size=10,
            color="rgb(148, 144, 144)"
        ),
        textposition='outside',
        decreasing={"marker":{"color":f"rgba({red}, 0.7)",  "line":{"color":f"rgba({red}, 1)","width":2}}},
        increasing={"marker":{"color":f"rgba({green}, 0.7)","line":{"color":f"rgba({green}, 1)", "width":2}}},
        totals={"marker":{"color":"rgba(12, 192, 223, 0.7)", "line":{"color":"rgba(12, 192, 223, 1)", "width":2}}},
        connector={"line":{"color":"rgba(217, 217, 217, 1)", "width":1}},
    ))
    fig.update_layout(height=400)

    # Shade the two residual bars, which sit just before the final "B" total.
    end = 0.5 + len(waterfall) - 2
    start = 0.5 + len(waterfall) - 4
    fig.add_vrect(x0=start, x1=end,
                  fillcolor="pink", opacity=0.15, line_color="pink")
    fig.update_annotations(font=dict(size=14, color="rgb(148, 144, 144)"))
    fig.update_yaxes(ticksuffix="%")
    return fig
# Render the A-to-B waterfall for the per-feature SHAP differences in `d`.
dx(df=d)

0.7262931034482759 0.19121140142517815 -0.005478454032684034 0.0014726374213096538
       features  contribution   measure
0             A     19.121140  absolute
1      isfemale     40.949841  relative
2         class      5.039944  relative
3           age      2.965036  relative
4          fare      2.153867  relative
5         cabin      1.893990  relative
6      nparents      0.796429  relative
7      embarked      0.775751  relative
8     nsiblings     -0.371579  relative
9   A Residuals     -0.147264  relative
10  B Residuals     -0.547845  relative
11            B     72.629310     total




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

