### **Install libraries and download data**

In [None]:
%%capture
# Install the shap package; %%capture hides the output of every command in this cell.
!pip3 install shap
# Download the file with the given id (presumably a Google Drive file id — confirm)
# and save it as housePrice.csv, the file that the "Read Data" cell loads.
!gdown --id 1QJl06QYcXUST_qlnaJuP9MxG0J_N2ZXZ -O housePrice.csv


#### **Import Libraries**

In [None]:
import shap
import numpy as np
import pandas as pd
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from math import exp

#### **Read Data**

In [None]:
# Load the CSV saved by the download cell and preview the first rows.
csv_path = 'housePrice.csv'
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,countyName,city,zipCode,oceanProxi,longitude,latitude,totalRooms,greaterAve
0,Alameda,Berkeley,94705,NEAR BAY,-122.23,37.88,880.0,1
1,Alameda,Berkeley,94705,NEAR BAY,-122.22,37.86,7099.0,1
2,Alameda,Berkeley,94705,NEAR BAY,-122.26,37.86,5161.0,1
3,Alameda,Berkeley,94705,NEAR BAY,-122.26,37.86,3497.0,1
4,Alameda,Berkeley,94705,NEAR BAY,-122.26,37.86,3774.0,1


#### **Label Encoding for Categorical Variables**

In [None]:
# Replace each categorical column with integer codes, in place.
# The shared encoder `le` is re-fitted on every column, so after the loop it
# only holds the mapping of the last column listed here.
categorical_cols = ['countyName', 'city', 'oceanProxi']
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])
df.head()

Unnamed: 0,countyName,city,zipCode,oceanProxi,longitude,latitude,totalRooms,greaterAve
0,0,64,94705,3,-122.23,37.88,880.0,1
1,0,64,94705,3,-122.22,37.86,7099.0,1
2,0,64,94705,3,-122.26,37.86,5161.0,1
3,0,64,94705,3,-122.26,37.86,3497.0,1
4,0,64,94705,3,-122.26,37.86,3774.0,1


#### **Train Test Split**

In [None]:
# Features are the first six columns of df; the target is its last column.
# NOTE(review): with the columns shown by df.head(), this positional slice
# leaves out totalRooms — confirm that is intended.
feature_cols = df.columns[:6]
target_col = df.columns[-1]
X = df[feature_cols]
Y = df[target_col]

# Hold out a quarter of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0
)

#### **Fit Xgboost and Predict Probabilities**

In [None]:
# Fit a classifier with default settings on the training split.
model = XGBClassifier()
model.fit(X_train, y_train)

# Report accuracy on the test split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Score every row of X (not only the test split) and keep the probability of
# class 1 as column 'P', joined to df on the row index.
positive_proba = model.predict_proba(X)[:, 1]
prba = pd.DataFrame({'P': positive_proba})
prba = pd.merge(df, prba, left_index=True, right_index=True)
prba.head()

Accuracy: 88.68%


Unnamed: 0,countyName,city,zipCode,oceanProxi,longitude,latitude,totalRooms,greaterAve,P
0,0,64,94705,3,-122.23,37.88,880.0,1,0.977501
1,0,64,94705,3,-122.22,37.86,7099.0,1,0.935843
2,0,64,94705,3,-122.26,37.86,5161.0,1,0.861546
3,0,64,94705,3,-122.26,37.86,3497.0,1,0.861546
4,0,64,94705,3,-122.26,37.86,3774.0,1,0.861546


### **Shap TreeExplainer**

In [None]:
# Build a tree explainer for the fitted model and take its base value, i.e.
# the constant that is added to the per-feature SHAP values of every row.
explainer = shap.TreeExplainer(model)

# `expected_value` can be either a scalar or a length-1 array (this varies
# with the shap version and with whether shap_values() has already been
# called on the explainer), so indexing it with [0] directly can raise on a
# scalar. Flattening first accepts both forms and makes the cell re-runnable.
baseValue = float(np.ravel(explainer.expected_value)[0])
baseValue

-0.48184107405586474

#### **Shap Table**

In [None]:
# Compute SHAP values for every row of the feature frame X. The following
# cells wrap the result in a DataFrame with one column per feature of X, add
# the base value, and turn the row sums into probabilities.
_shap = explainer.shap_values(X)

ntree_limit is deprecated, use `iteration_range` or model slicing instead.


#### **Adding the Base Value**

In [None]:
# Label the SHAP values with the feature names and carry the explainer's base
# value alongside them as a constant column.
_shap = pd.DataFrame(_shap, columns=X.columns).assign(baseValue=baseValue)
_shap.head()


Unnamed: 0,countyName,city,zipCode,oceanProxi,longitude,latitude,baseValue
0,-0.178044,0.594361,0.732444,-0.046902,2.221071,0.930451,-0.481841
1,-0.200185,0.566032,0.462696,-0.046902,2.003898,0.376417,-0.481841
2,-0.273366,0.547654,0.409422,-0.046902,1.559384,0.113836,-0.481841
3,-0.273366,0.547654,0.409422,-0.046902,1.559384,0.113836,-0.481841
4,-0.273366,0.547654,0.409422,-0.046902,1.559384,0.113836,-0.481841


#### **Sum All Shap Values**

In [None]:
# Row total = per-feature SHAP values + base value.
# Derived columns are excluded from the sum so the cell is idempotent:
# re-running it must not fold an existing 'sumShap' (or the later 'P') back
# into the total. errors='ignore' covers the first run, when neither exists.
_shap['sumShap'] = (
    _shap.drop(columns=['sumShap', 'P'], errors='ignore').sum(axis=1)
)
_shap.head()

Unnamed: 0,countyName,city,zipCode,oceanProxi,longitude,latitude,baseValue,sumShap
0,-0.178044,0.594361,0.732444,-0.046902,2.221071,0.930451,-0.481841,3.77154
1,-0.200185,0.566032,0.462696,-0.046902,2.003898,0.376417,-0.481841,2.680116
2,-0.273366,0.547654,0.409422,-0.046902,1.559384,0.113836,-0.481841,1.828188
3,-0.273366,0.547654,0.409422,-0.046902,1.559384,0.113836,-0.481841,1.828188
4,-0.273366,0.547654,0.409422,-0.046902,1.559384,0.113836,-0.481841,1.828188


#### **Convert the Summed SHAP Values (Log-Odds) into Probabilities**

In [None]:
# Logistic (sigmoid) transform of the summed SHAP values: P = 1 / (1 + e^(-s)).
# Vectorized over the whole column instead of a row-wise apply; np.exp also
# saturates to inf (giving P = 0) for very negative sums, where math.exp
# would raise OverflowError.
_shap['P'] = 1 / (1 + np.exp(-_shap['sumShap']))
_shap.head()

Unnamed: 0,countyName,city,zipCode,oceanProxi,longitude,latitude,baseValue,sumShap,P
0,-0.178044,0.594361,0.732444,-0.046902,2.221071,0.930451,-0.481841,3.77154,0.977501
1,-0.200185,0.566032,0.462696,-0.046902,2.003898,0.376417,-0.481841,2.680116,0.935843
2,-0.273366,0.547654,0.409422,-0.046902,1.559384,0.113836,-0.481841,1.828188,0.861546
3,-0.273366,0.547654,0.409422,-0.046902,1.559384,0.113836,-0.481841,1.828188,0.861546
4,-0.273366,0.547654,0.409422,-0.046902,1.559384,0.113836,-0.481841,1.828188,0.861546


#### **Shap Interactions Table**

In [None]:
# SHAP interaction values, unpacked below as (observations, features, features).
_shap_iv = explainer.shap_interaction_values(X)
m, n, r = _shap_iv.shape
feature_names = list(X.columns)

# Flatten to long form: one row per (observation, feature) pair, one column
# per interacting feature. float64 keeps the downstream sums in double
# precision regardless of the dtype shap returns.
shap_iv = pd.DataFrame(
    _shap_iv.reshape(m * n, -1), columns=feature_names, dtype='float64'
)

# Observation ids stay integers (each id repeated once per feature), and the
# feature labels are tiled exactly m times — no oversized temporary array.
shap_iv.insert(0, 'observations', np.repeat(np.arange(m), n))
shap_iv.insert(1, 'features', np.tile(feature_names, m))
shap_iv.head(8)

Unnamed: 0,observations,features,countyName,city,zipCode,oceanProxi,longitude,latitude
0,0.0,countyName,-0.291024,0.04367,0.044796,-0.037315,0.038646,0.023184
1,0.0,city,0.04367,0.448507,0.094688,-0.005304,0.023442,-0.010642
2,0.0,zipCode,0.044796,0.094688,0.902128,-0.070458,-0.297791,0.059082
3,0.0,oceanProxi,-0.037315,-0.005304,-0.070458,0.171098,-0.075536,-0.029386
4,0.0,longitude,0.038646,0.023442,-0.297791,-0.075536,1.811658,0.720653
5,0.0,latitude,0.023184,-0.010641,0.059082,-0.029386,0.720653,0.16756
6,1.0,countyName,-0.274295,0.04333,0.038738,-0.037315,0.006394,0.022963
7,1.0,city,0.04333,0.483782,0.066151,-0.005304,0.053581,-0.075508


#### **Adding the Sum Shap**

In [None]:
# Sum each row's interaction values across the feature columns only.
# Selecting the feature columns by name (rather than "everything from the
# third column on") keeps the cell idempotent: on a re-run the derived
# columns 'sumIV', 'baseValue' and 'sumShap' are not added into the sum.
shap_iv['sumIV'] = shap_iv[list(X.columns)].sum(axis=1)
shap_iv['baseValue'] = baseValue
shap_iv.head()


Unnamed: 0,observations,features,countyName,city,zipCode,oceanProxi,longitude,latitude,sumIV,baseValue
0,0.0,countyName,-0.291024,0.04367,0.044796,-0.037315,0.038646,0.023184,-0.178044,-0.481841
1,0.0,city,0.04367,0.448507,0.094688,-0.005304,0.023442,-0.010642,0.594361,-0.481841
2,0.0,zipCode,0.044796,0.094688,0.902128,-0.070458,-0.297791,0.059082,0.732444,-0.481841
3,0.0,oceanProxi,-0.037315,-0.005304,-0.070458,0.171098,-0.075536,-0.029386,-0.046902,-0.481841
4,0.0,longitude,0.038646,0.023442,-0.297791,-0.075536,1.811658,0.720653,2.221071,-0.481841


In [None]:
# Total per observation: add up the per-feature sums (sumIV) within each
# observation and broadcast the total back to all of its rows, then add the
# base value. The string alias 'sum' is used because passing the builtin
# `sum` to transform is deprecated in recent pandas releases.
shap_iv['sumShap'] = shap_iv.groupby('observations')['sumIV'].transform('sum')
shap_iv['sumShap'] = shap_iv['sumShap'] + shap_iv['baseValue']
shap_iv.head(7)

Unnamed: 0,observations,features,countyName,city,zipCode,oceanProxi,longitude,latitude,sumIV,baseValue,sumShap
0,0.0,countyName,-0.291024,0.04367,0.044796,-0.037315,0.038646,0.023184,-0.178044,-0.481841,3.77154
1,0.0,city,0.04367,0.448507,0.094688,-0.005304,0.023442,-0.010642,0.594361,-0.481841,3.77154
2,0.0,zipCode,0.044796,0.094688,0.902128,-0.070458,-0.297791,0.059082,0.732444,-0.481841,3.77154
3,0.0,oceanProxi,-0.037315,-0.005304,-0.070458,0.171098,-0.075536,-0.029386,-0.046902,-0.481841,3.77154
4,0.0,longitude,0.038646,0.023442,-0.297791,-0.075536,1.811658,0.720653,2.221071,-0.481841,3.77154
5,0.0,latitude,0.023184,-0.010641,0.059082,-0.029386,0.720653,0.16756,0.930451,-0.481841,3.77154
6,1.0,countyName,-0.274295,0.04333,0.038738,-0.037315,0.006394,0.022963,-0.200185,-0.481841,2.680116


In [None]:
# Logistic (sigmoid) transform of the per-observation total, vectorized over
# the column. The assignment has to be live for the 'P' column shown in this
# cell's output to exist after a fresh Restart & Run All.
shap_iv['P'] = 1 / (1 + np.exp(-shap_iv['sumShap']))
shap_iv.head(7)

Unnamed: 0,observations,features,countyName,city,zipCode,oceanProxi,longitude,latitude,sumIV,baseValue,sumShap,P
0,0.0,countyName,-0.291024,0.04367,0.044796,-0.037315,0.038646,0.023184,-0.178044,-0.481841,3.77154,0.977501
1,0.0,city,0.04367,0.448507,0.094688,-0.005304,0.023442,-0.010642,0.594361,-0.481841,3.77154,0.977501
2,0.0,zipCode,0.044796,0.094688,0.902128,-0.070458,-0.297791,0.059082,0.732444,-0.481841,3.77154,0.977501
3,0.0,oceanProxi,-0.037315,-0.005304,-0.070458,0.171098,-0.075536,-0.029386,-0.046902,-0.481841,3.77154,0.977501
4,0.0,longitude,0.038646,0.023442,-0.297791,-0.075536,1.811658,0.720653,2.221071,-0.481841,3.77154,0.977501
5,0.0,latitude,0.023184,-0.010641,0.059082,-0.029386,0.720653,0.16756,0.930451,-0.481841,3.77154,0.977501
6,1.0,countyName,-0.274295,0.04333,0.038738,-0.037315,0.006394,0.022963,-0.200185,-0.481841,2.680116,0.935843
