### **DATA DESCRIPTION**

In [2]:
# %%capture
# ! pip3 install numpy
# !pip3 install shap
import shap
import numpy as np
import pandas as pd
from math import exp
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score



  from .autonotebook import tqdm as notebook_tqdm


### **DATA DESCRIPTION**

#### **SUMMARY**
##### DOWNLOAD HERE https://raw.githubusercontent.com/yazid-mekhtoub/notes/7fcd1004be1d6e923a26e4905c409157ffc8de36/titanic.csv
This is a classic dataset used in many data mining tutorials and demos -- perfect for getting started with exploratory analysis and building binary classification models to predict survival.


#### **Features**
* embarked - Port of Embarkation ({"Cherbourg": 1, "Queenstown": 2 , "Southampton": 3, "Unknown": 4})
* cabin - Cabin ({"A" : 1, "B" : 2, "C": 3, "D" : 4, "E": 5, "F" : 6, "G" : 7, "T": 8, "Unknown":9})
* class - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
* nsiblings - Number of Siblings/Spouses Aboard
* nparents - Number of Parents/Children Aboard
* age - Age
* fare - Passenger Fare
* isfemale - Sex (female = 1, male = 0)
* survival - Survival (0 = No; 1 = Yes)

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/yazid-mekhtoub/notes/7fcd1004be1d6e923a26e4905c409157ffc8de36/titanic.csv')
df.head()

Unnamed: 0,embarked,cabin,class,nsiblings,nparents,age,fare,isfemale,survived
0,3,2,1,0,0,29.0,211.3375,1,1
1,3,3,1,1,2,0.9167,151.55,0,1
2,3,3,1,1,2,2.0,151.55,1,0
3,3,3,1,1,2,30.0,151.55,0,0
4,3,3,1,1,2,25.0,151.55,1,0


In [4]:
X = df[df.columns[:-4]]
Y =df[df.columns[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [5]:
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
prba = pd.DataFrame(model.predict_proba(X)[:, 1], columns=['P'])
prba = pd.merge(df, prba, left_index=True, right_index=True)
prba

Accuracy: 71.56%


Unnamed: 0,embarked,cabin,class,nsiblings,nparents,age,fare,isfemale,survived,P
0,3,2,1,0,0,29.000000,211.3375,1,1,0.616944
1,3,3,1,1,2,0.916700,151.5500,0,1,0.125274
2,3,3,1,1,2,2.000000,151.5500,1,0,0.125274
3,3,3,1,1,2,30.000000,151.5500,0,0,0.125274
4,3,3,1,1,2,25.000000,151.5500,1,0,0.125274
...,...,...,...,...,...,...,...,...,...,...
1303,1,9,3,1,0,14.500000,14.4542,1,0,0.292933
1304,1,9,3,1,0,29.881135,14.4542,1,0,0.292933
1305,1,9,3,0,0,26.500000,7.2250,0,0,0.237169
1306,1,9,3,0,0,27.000000,7.2250,0,0,0.237169


### **XGBOOST**

### **SHAP**

In [8]:
explainer = shap.TreeExplainer(model)
baseValue = explainer.expected_value

_shap = explainer.shap_values(X)


In [68]:
_shap = pd.DataFrame(_shap, columns = X.columns)
_shap['baseValue'] = baseValue

_shap['sumShap'] = _shap[_shap.columns].sum(axis=1)

_shap['P']=_shap.apply(lambda x: 1/(1+ exp(-x['sumShap'])), axis=1)
_shap

Unnamed: 0,embarked,cabin,class,nsiblings,nparents,age,fare,isfemale,baseValue,sumShap,P
0,-0.160233,1.011798,1.914590,0.185203,0.060362,-0.116607,-0.149309,2.738255,0.0,5.484060,0.995865
1,-0.169474,0.059934,0.794226,0.384468,-0.146875,2.689516,-2.373389,-0.632153,0.0,0.606252,0.647085
2,-0.211721,0.019928,1.234602,-0.416915,-0.283196,0.395973,-2.660688,0.974411,0.0,-0.947606,0.279367
3,-0.161587,-0.112153,0.732600,0.037763,-0.522173,-0.462926,-1.832059,-1.472224,0.0,-3.792759,0.022037
4,-0.248174,0.050092,1.649513,-0.663814,-0.297954,-0.668617,-2.259589,1.804836,0.0,-0.633706,0.346671
...,...,...,...,...,...,...,...,...,...,...,...
1303,0.494830,-0.232626,-1.054342,-0.242887,-0.044920,0.814286,-2.045167,1.705281,0.0,-0.605544,0.353076
1304,0.688937,-0.188332,-1.072667,-0.210593,-0.061541,0.028810,-2.118265,1.292354,0.0,-1.641296,0.162289
1305,1.053066,-0.334240,-0.265969,0.098390,-0.104043,0.084100,-1.159122,-1.695199,0.0,-2.323016,0.089235
1306,1.045470,-0.331963,-0.265969,0.096143,-0.098764,0.289828,-1.185728,-1.693108,0.0,-2.144092,0.104885


In [60]:
_shap_iv = explainer.shap_interaction_values(X)
m,n,r = _shap_iv.shape
shap_iv= np.column_stack((np.repeat(np.arange(m),n),_shap_iv.reshape(m*n,-1)))
shap_iv = pd.DataFrame(shap_iv, columns = ['observations']+list(X.columns))
shap_iv.insert(1, 'features', np.tile(list(X.columns), len(shap_iv))[:len(shap_iv)])
shap_iv['sumIV'] = shap_iv[shap_iv.columns[2:]].sum(axis=1)
shap_iv['baseValue'] = baseValue[0]
shap_iv['sumShap'] = shap_iv.groupby('observations').sumIV.transform(sum)
shap_iv['sumShap'] = shap_iv['sumShap']+shap_iv['baseValue']
shap_iv['P']=shap_iv.apply(lambda x: 1/(1+ exp(-x['sumShap'])), axis=1)
shap_iv.head(9)


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


Unnamed: 0,observations,features,embarked,cabin,class,nsiblings,nparents,age,fare,isfemale,sumIV,baseValue,sumShap,P
0,0.0,embarked,-0.254735,0.002727,0.03712,0.007582,0.011938,-0.064956,0.095867,0.004223,-0.160233,0.0,5.48406,0.995865
1,0.0,cabin,0.002727,1.474634,-0.302366,-0.156733,-0.031637,0.012884,0.257453,-0.245163,1.011798,0.0,5.48406,0.995865
2,0.0,class,0.03712,-0.302366,1.160303,0.054813,-0.043987,0.080346,0.297382,0.630979,1.91459,0.0,5.48406,0.995865
3,0.0,nsiblings,0.007582,-0.156733,0.054813,-0.019713,-0.001888,-0.013721,0.0556,0.259262,0.185203,0.0,5.48406,0.995865
4,0.0,nparents,0.011938,-0.031637,-0.043987,-0.001888,-0.064614,0.037023,0.139929,0.013598,0.060362,0.0,5.48406,0.995865
5,0.0,age,-0.064956,0.012884,0.080345,-0.013721,0.037023,-0.230432,0.119222,-0.056973,-0.116607,0.0,5.48406,0.995865
6,0.0,fare,0.095868,0.257453,0.297382,0.0556,0.139929,0.119222,-1.492611,0.377848,-0.149309,0.0,5.48406,0.995865
7,0.0,isfemale,0.004223,-0.245164,0.630978,0.259262,0.013598,-0.056973,0.377848,1.754484,2.738255,0.0,5.48406,0.995865
8,1.0,embarked,-0.217292,-0.011635,0.025183,-0.011689,-0.03037,0.032683,0.034626,0.009018,-0.169474,0.0,0.606252,0.647085
