# Feed-forward Neural-Network Interpretability

Testing SHAP interpretability with a neural network; the results will be evaluated for overlap with the LightGBM model.

### Setup

In [1]:
# Load IPython's autoreload extension so that imported modules are re-imported
# when a cell is re-run; mode 2 reloads all modules before executing code,
# which picks up edits to the local `utils` package without a kernel restart.
%load_ext autoreload
%autoreload 2

### Load Configuration

In [3]:
"""
Loads common configuration parameters
"""
from os import getcwd
from pathlib import PurePath

import utils.configuration_manager as configuration_manager

# config.ini is looked up in the current working directory
config_path = PurePath(getcwd()) / 'config.ini'
config = configuration_manager.Config(config_path)

# Location of the input data (a parquet directory is assumed)
input_path = config.input_path
print('Input path: ' + input_path)

# Directory used for result storage
output_directory = config.output_directory
print('Output path: ' + output_directory)

Loading configuration from: /home/justin/Code/interpretability_experiment/config.ini
Input path: data/2018_Yellow_Taxi_Trip_Data_float64
Output path: output


### Start local Dask Client

In [4]:
from dask.distributed import Client, LocalCluster

# Reuse the client from a previous run of this cell when there is one,
# otherwise start a local cluster and connect a new client to it.
try:
    # On a fresh kernel `client` is undefined, so this lookup raises NameError
    # and control falls through to the handler that creates the cluster.
    if client:
        print('Restarting client')
        client.restart()
except Exception as client_error:
    # `Exception` (rather than a bare `except:`) keeps the best-effort fallback
    # but no longer swallows KeyboardInterrupt / SystemExit.
    if not isinstance(client_error, NameError):
        # An existing client could not be restarted: say why before replacing it.
        print('Could not reuse existing client:', repr(client_error))
    # Alternative with a per-worker memory cap:
    #     cluster = LocalCluster(dashboard_address=':20100', memory_limit='4G')
    cluster = LocalCluster(dashboard_address=':20100')
    print('Setting new client')
    client = Client(cluster)
    print(client)
client

Setting new client
<Client: 'tcp://127.0.0.1:37885' processes=5 threads=10, memory=25.61 GB>


0,1
Client  Scheduler: tcp://127.0.0.1:37885  Dashboard: http://127.0.0.1:20100/status,Cluster  Workers: 5  Cores: 10  Memory: 25.61 GB


### Dask dataframe loader

In [5]:
import dask.dataframe as dd
import fastparquet

In [6]:
# Open the parquet dataset at `input_path` (from config.ini) as a Dask DataFrame
ddf = dd.read_parquet(input_path)

In [7]:
# Preview the first rows to check the available columns
ddf.head()

Unnamed: 0_level_0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,2018-12-03 09:58:01,2018-12-03 10:14:17,1.0,1.2,1,N,186,161,1,11.0,0.0,0.5,2.95,0.0,0.3,14.75
1,2,2018-12-03 09:41:32,2018-12-03 10:20:08,1.0,12.03,1,N,138,162,1,39.0,0.0,0.5,9.11,5.76,0.3,54.67
2,2,2018-12-03 08:54:36,2018-12-03 08:59:35,2.0,0.86,1,N,151,166,1,5.5,0.0,0.5,1.26,0.0,0.3,7.56
3,2,2018-12-03 09:02:08,2018-12-03 09:07:16,2.0,1.09,1,N,166,238,1,6.0,0.0,0.5,1.36,0.0,0.3,8.16
4,2,2018-12-03 09:10:10,2018-12-03 09:21:32,2.0,1.78,1,N,238,75,1,9.5,0.0,0.5,2.06,0.0,0.3,12.36


### Dependencies for PyTorch

In [8]:
#Import
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from datetime import datetime

### Preprocessing

In [9]:
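# Note: total_amount is excluded from the input list because it includes the tip
# (the target); total_amount_wo_tip (total_amount - tip_amount) is used instead.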

In [17]:
# Categorical model inputs
categorical_columns = [
    'VendorID',
    'RatecodeID',
    'PULocationID',
    'DOLocationID',
    'payment_type',
]

# Numerical model inputs
numerical_variables = [
    'passenger_count',
    'trip_distance',
    'fare_amount',
    'extra',
    'mta_tax',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount_wo_tip',
]

# Full input list: categoricals first, then numericals
input_columns = [*categorical_columns, *numerical_variables]

In [18]:
# Name of the column used as the prediction target
target = 'tip_amount'

In [19]:
"""
We'll subtract the tip_amount from the total_amount to prevent any leakage, 
using a new total_amount_wo_tip column.
"""
# Adds the column to `ddf` in place; it has to exist before the
# 'total_amount_wo_tip' entry of the input column list is selected from `ddf`.
ddf['total_amount_wo_tip'] = ddf['total_amount'] - ddf['tip_amount']

In [25]:
"""
Extract a manageable dataset from Dask 
"""

from utils.helpful_functions import concatenate

# Upper bound on the number of Dask partitions materialised into pandas.
MAX_PARTITIONS = 40

columns_to_keep = input_columns + [target]

# Build the column projection once instead of on every loop iteration.
ddf_selected = ddf[columns_to_keep]

# Never ask for more partitions than the dataset has: get_partition rejects an
# out-of-range index, so a smaller dataset would otherwise abort the cell.
n_partitions = min(MAX_PARTITIONS, ddf.npartitions)

# Compute each partition into a pandas DataFrame.
# (Optional memory saving that was tried here: cast `numerical_variables`
#  to np.float16 on each partition before appending.)
dfs = [ddf_selected.get_partition(i).compute() for i in range(n_partitions)]

# Combine the per-partition frames into one training frame
df_train = concatenate(dfs)

In [27]:
# print(df_train.describe())

In [28]:
# Separate the sampled frame into model inputs and the target column
X_train, y_train = df_train[input_columns], df_train[target]

In [29]:
# Plain-text preview of the first input rows
print(X_train.head())

      VendorID RatecodeID PULocationID DOLocationID payment_type  \
index                                                              
0            1          1          186          161            1   
1            2          1          138          162            1   
2            2          1          151          166            1   
3            2          1          166          238            1   
4            2          1          238           75            1   

       passenger_count  trip_distance  fare_amount  extra  mta_tax  \
index                                                                
0                  1.0           1.20         11.0    0.0      0.5   
1                  1.0          12.03         39.0    0.0      0.5   
2                  2.0           0.86          5.5    0.0      0.5   
3                  2.0           1.09          6.0    0.0      0.5   
4                  2.0           1.78          9.5    0.0      0.5   

       tolls_amount  improvement

In [31]:
# Fill nulls in categoricals, if there are 
# def fill_categorical_nulls(df, categories):  
#     for category in categories: 
#         df[category].fillna('Unknown', inplace=True)

### Training and Validation split

In [None]:
# Hold out part of the sampled data for validation.
# The earlier version of this cell referenced names that are never defined in
# this notebook (stacked_df, X, train, test) and a hard-coded row offset, so it
# could not run; it is replaced by a seeded split of X_train / y_train.
VALIDATION_FRACTION = 0.2  # share of rows held out for validation
SPLIT_SEED = 42            # fixed seed so the split is reproducible

X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)

# The two parts must add back up to the original sample
assert len(X_fit) + len(X_val) == len(X_train)

# Notice that the shapes add up to the original
print("train shape: ", X_fit.shape, "original: ", X_train.shape)
print("validation shape: ", X_val.shape, "original: ", X_train.shape)

### ShAP Deep Explainer

In [None]:
# https://www.kaggle.com/ceshine/feature-importance-from-a-pytorch-model
%%time
e = shap.DeepExplainer(
        model, 
        torch.from_numpy(
            x_train[np.random.choice(np.arange(len(x_train)), 10000, replace=False)]
        ).to(DEVICE))

In [None]:
%%time
x_samples = x_train[np.random.choice(np.arange(len(x_train)), 300, replace=False)]
print(len(x_samples))
shap_values = e.shap_values(
    torch.from_numpy(x_samples).to(DEVICE)
)

In [None]:
import pandas as pd

# Absolute SHAP values, aggregated over axis 0 below
abs_shap = np.abs(shap_values)

df = pd.DataFrame({
    "mean_abs_shap": np.mean(abs_shap, axis=0),
    "stdev_abs_shap": np.std(abs_shap, axis=0),
    "name": features,
})

# Ten entries with the largest mean absolute SHAP value
df.sort_values("mean_abs_shap", ascending=False).head(10)

In [None]:
# SHAP summary plot for the sampled rows.
# NOTE(review): `shap` and `features` are not defined in the visible part of
# this notebook - confirm they exist before this cell runs.
shap.summary_plot(shap_values, features=x_samples, feature_names=features)