# Feed-forward Neural-Network Interpretability

Testing SHAP interpretability with a neural network; the goal is to evaluate how much its explanations overlap with those of the LightGBM model.

### Setup

In [None]:
# Auto-reload imported modules before each cell runs, so edits to local code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

### Load Configuration

In [1]:
"""
Loads common configuration parameters
"""
import src.configuration_manager as configuration_manager
from pathlib import PurePath
from os import getcwd

# config.ini is looked up in the current working directory
config_path = PurePath(getcwd()) / 'config.ini'
config = configuration_manager.Config(config_path)

# Input location; a parquet directory is assumed
input_path = config.input_path
print('Input path: ' + input_path)

# Directory where results are stored
output_directory = config.output_directory
print('Output path: ' + output_directory)

Loading configuration from: /home/justin/Code/interpretability_experiment/config.ini
Input path: data/2018_Yellow_Taxi_Trip_Data_float64
Output path: output


### Start local Dask Client

In [2]:
from dask.distributed import Client, LocalCluster

# Reuse the client when this cell is re-run in a live kernel; otherwise start a local cluster.
# Only NameError (first run: `client` is not defined yet) is treated as "no client".
# A bare `except:` would also swallow genuine restart failures and KeyboardInterrupt.
try:
    client
except NameError:
    client = None

if client:
    print('Restarting client')
    client.restart()
else:
    # To limit worker memory, pass e.g. memory_limit='4G' to LocalCluster
    cluster = LocalCluster(dashboard_address=':20100')
    print('Setting new client')
    client = Client(cluster)
    print(client)
client

Setting new client
<Client: 'tcp://127.0.0.1:34571' processes=5 threads=10, memory=28.68 GB>


0,1
Client  Scheduler: tcp://127.0.0.1:34571  Dashboard: http://127.0.0.1:20100/status,Cluster  Workers: 5  Cores: 10  Memory: 28.68 GB


### Dask dataframe loader

In [3]:
import dask.dataframe as dd
import fastparquet

In [4]:
# Open the parquet dataset at input_path as a Dask dataframe
ddf = dd.read_parquet(input_path)

In [5]:
# Preview the first rows to check the loaded columns
ddf.head()

Unnamed: 0_level_0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,2018-12-03 09:58:01,2018-12-03 10:14:17,1.0,1.2,1,N,186,161,1,11.0,0.0,0.5,2.95,0.0,0.3,14.75
1,2,2018-12-03 09:41:32,2018-12-03 10:20:08,1.0,12.03,1,N,138,162,1,39.0,0.0,0.5,9.11,5.76,0.3,54.67
2,2,2018-12-03 08:54:36,2018-12-03 08:59:35,2.0,0.86,1,N,151,166,1,5.5,0.0,0.5,1.26,0.0,0.3,7.56
3,2,2018-12-03 09:02:08,2018-12-03 09:07:16,2.0,1.09,1,N,166,238,1,6.0,0.0,0.5,1.36,0.0,0.3,8.16
4,2,2018-12-03 09:10:10,2018-12-03 09:21:32,2.0,1.78,1,N,238,75,1,9.5,0.0,0.5,2.06,0.0,0.3,12.36


### Loading dependencies

In [6]:
# Imports 
import pandas as pd
import numpy as np
from torchvision import models
from datetime import datetime

### Preprocessing

In [7]:
# Note: total_amount is excluded from the inputs because it contains the tip (the target); total_amount_wo_tip is used instead

In [8]:
# Inputs handled as categories
categorical_columns = [
    'VendorID', 'RatecodeID', 'PULocationID', 'DOLocationID', 'payment_type',
]

# Continuous inputs; total_amount_wo_tip stands in for total_amount
numerical_variables = [
    'passenger_count', 'trip_distance', 'fare_amount', 'extra',
    'mta_tax', 'tolls_amount', 'improvement_surcharge', 'total_amount_wo_tip',
]

# Model inputs: categorical columns first, then the numerical ones
input_columns = [*categorical_columns, *numerical_variables]

In [9]:
# Column the model is trained to predict
target = 'tip_amount'

In [10]:
"""
We'll subtract the tip_amount from the total_amount to prevent any leakage, 
using a new total_amount_wo_tip column.
"""
# Pre-tip total: taking the target (tip_amount) out of total_amount keeps it from leaking into the inputs
ddf['total_amount_wo_tip'] = ddf['total_amount'].sub(ddf['tip_amount'])

In [11]:
"""
Extract a manageable dataset from Dask 
"""

from src.utils.helpful_functions import concatenate

columns_to_keep = input_columns + [target]

# Load at most the first 40 partitions into memory. Capping at ddf.npartitions avoids
# requesting a partition that does not exist when the dataset has fewer than 40.
n_partitions_to_load = min(40, ddf.npartitions)

# Select the columns once instead of rebuilding the projection for every partition
ddf_selected = ddf[columns_to_keep]

# Materialise each partition as a pandas dataframe
dfs = [ddf_selected.get_partition(i).compute() for i in range(n_partitions_to_load)]

df_train = concatenate(dfs)

In [12]:
# print(df_train.describe())

In [13]:
# Split the extracted frame into model inputs (X) and the regression target (y)
X, y = df_train[input_columns], df_train[target]

In [14]:
# Preview the input columns before encoding
X.head()

Unnamed: 0_level_0,VendorID,RatecodeID,PULocationID,DOLocationID,payment_type,passenger_count,trip_distance,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount_wo_tip
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1,1,186,161,1,1.0,1.2,11.0,0.0,0.5,0.0,0.3,11.8
1,2,1,138,162,1,1.0,12.03,39.0,0.0,0.5,5.76,0.3,45.56
2,2,1,151,166,1,2.0,0.86,5.5,0.0,0.5,0.0,0.3,6.3
3,2,1,166,238,1,2.0,1.09,6.0,0.0,0.5,0.0,0.3,6.8
4,2,1,238,75,1,2.0,1.78,9.5,0.0,0.5,0.0,0.3,10.3


In [15]:
"""
This dataset has already been cleaned for nulls, but in other cases we will need to fill the values here
"""
# Fill nulls in the categorical columns, if there are any
# def fill_categorical_nulls(df, categories):  
#     for category in categories: 
#         df[category].fillna('Unknown', inplace=True)

'\nThis dataset has already been cleaned for nulls, but in other cases we will need to fill the values here\n'

### Label Encoder

In [16]:
"""
Try LabelEncoder replacement of Pandas categorical encoder for compatibility
"""
from sklearn.preprocessing import LabelEncoder
# NOTE(review): `le` is not referenced by any later cell visible here; the encoding loop
# further down instantiates its own LabelEncoder per column.
le = LabelEncoder()

In [17]:
"""
Coded representation of categoricals
"""
# Work on a copy so X keeps its categorical dtypes
coded_df = X.copy()
cat_columns = coded_df.select_dtypes(include=['category']).columns
# Swap every categorical column for its integer category codes, stored as int64
coded_df[cat_columns] = (
    coded_df[cat_columns]
    .apply(lambda column: column.cat.codes)
    .astype(np.int64)
)

In [18]:
# Check that the categorical columns are now integer-coded
print(coded_df.dtypes)

VendorID                   int64
RatecodeID                 int64
PULocationID               int64
DOLocationID               int64
payment_type               int64
passenger_count          float64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tolls_amount             float64
improvement_surcharge    float64
total_amount_wo_tip      float64
dtype: object


In [19]:
# Re-encode each categorical column with scikit-learn's LabelEncoder.
# A fresh encoder is fitted per column and not kept, so the mapping is not retained.
for column_name in cat_columns:
    column_encoder = LabelEncoder()
    coded_df[column_name] = column_encoder.fit_transform(coded_df[column_name])

### Training and Validation split

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the encoded rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    coded_df,
    y,
    test_size=0.10,
    random_state=42,
)
X_train.head()

Unnamed: 0_level_0,VendorID,RatecodeID,PULocationID,DOLocationID,payment_type,passenger_count,trip_distance,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount_wo_tip
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
149604,0,0,52,144,0,0.0,1.8,12.0,1.0,0.5,0.0,0.3,13.8
327099,1,0,59,198,0,1.0,1.67,7.0,0.0,0.5,0.0,0.3,7.8
332450,1,0,184,11,0,2.0,0.88,6.5,0.0,0.5,0.0,0.3,7.3
85655,0,0,11,2,0,3.0,2.4,10.0,0.5,0.5,0.0,0.3,11.3
80990,1,0,154,58,1,1.0,1.83,10.5,0.0,0.5,0.0,0.3,11.3


### Create embeddings for categorical data

From: https://www.usfca.edu/data-institute/certificates/fundamentals-deep-learning lesson 2

In [21]:

# Map each categorical column to its number of categories, keeping only the columns
# with more than 2 categories (the others will not need embeddings)
embedded_cols = {
    name: n_levels
    for name, n_levels in (
        (name, len(series.cat.categories))
        for name, series in X[categorical_columns].items()
    )
    if n_levels > 2
}
embedded_cols

{'VendorID': 3,
 'RatecodeID': 7,
 'PULocationID': 262,
 'DOLocationID': 261,
 'payment_type': 4}

In [22]:
# Names of the embedded columns (a view over the dict's keys)
embedded_col_names = embedded_cols.keys()
# Columns left over after removing the embedded ones, i.e. the numerical columns
X.shape[1] - len(embedded_cols)

8

In [23]:
# One (n_categories, embedding_dim) pair per embedded column.
# The embedding dimension is roughly half the number of categories, capped at 50.
embedding_sizes = [
    (n_categories, min(50, (n_categories + 1) // 2))
    for n_categories in embedded_cols.values()
]
embedding_sizes

[(3, 2), (7, 4), (262, 50), (261, 50), (4, 2)]

### Format the embedding columns for training

In [24]:
from src.nn.prepare_embed import PrepareDataset
# Wrap each split as a dataset for training. embedded_col_names tells PrepareDataset which
# columns are embedded — presumably it separates them from the numerical inputs
# (confirm in src/nn/prepare_embed.py).
train_ds = PrepareDataset(X_train, y_train, embedded_col_names)
valid_ds = PrepareDataset(X_val, y_val, embedded_col_names)

### Set up device management for PyTorch

In [25]:
from src.nn import device_loader
# Pick the compute device through the project helper (the recorded run reported "Running on CPU")
device = device_loader.get_default_device()

Running on CPU


### Define Model

In [26]:
"""
Initiate model
"""
from src.nn import feed_forward_nn_embed

# Build the network from the embedding sizes and the number of numerical inputs, then switch
# it to double precision (presumably to match the float64 input columns — confirm)
model = feed_forward_nn_embed.FFModel(embedding_sizes, len(numerical_variables)).double()
# Place the model on the selected device; the cell displays whatever to_device returns
device_loader.to_device(model, device)

FFModel(
  (embeddings): ModuleList(
    (0): Embedding(3, 2)
    (1): Embedding(7, 4)
    (2): Embedding(262, 50)
    (3): Embedding(261, 50)
    (4): Embedding(4, 2)
  )
  (lin1): Linear(in_features=116, out_features=200, bias=True)
  (lin2): Linear(in_features=200, out_features=70, bias=True)
  (lin3): Linear(in_features=70, out_features=1, bias=True)
  (bn1): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(70, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (embedding_dropout): Dropout(p=0.6, inplace=False)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [27]:
# Optimizer factory
import torch.optim as torch_optim


def get_optimizer(model, lr = 0.001, wd = 0.0):
    """Build an Adam optimizer over the trainable parameters of `model`.

    Args:
        model: module whose parameters are optimized; parameters with
            requires_grad == False are left out.
        lr: learning rate passed to Adam.
        wd: weight decay passed to Adam.

    Returns:
        The configured torch.optim.Adam instance.
    """
    trainable_parameters = [p for p in model.parameters() if p.requires_grad]
    return torch_optim.Adam(trainable_parameters, lr=lr, weight_decay=wd)

### Training

In [28]:
from torch.utils.data import DataLoader

# Rows per mini-batch, shared by the training and validation loaders
batch_size = 10000

train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
# NOTE(review): the validation loader is shuffled as well; confirm that is intended
valid_dl = DataLoader(valid_ds, shuffle=True, batch_size=batch_size)

In [29]:
import logging

# Only show ERROR and above from the "distributed.utils_perf" logger
# (presumably to keep dask's performance warnings out of the training output — confirm)
logger = logging.getLogger("distributed.utils_perf")
logger.setLevel("ERROR")

In [None]:
from src.nn import feed_forward_nn_embed

# Pair each DataLoader with the selected device (presumably so every batch is moved to it
# when iterated — confirm in src/nn/device_loader.py)
train_loader = device_loader.DeviceDataLoader(train_dl, device)
valid_loader = device_loader.DeviceDataLoader(valid_dl, device)

def train_loop(model, epochs, lr=0.01, wd=0.0):
    """Train `model` for `epochs` epochs, printing training and validation loss after each.

    A single optimizer is created with get_optimizer(model, lr, wd) and reused for
    every epoch. Batches come from the notebook-level train_loader and valid_loader.

    Args:
        model: network to train (updated in place by feed_forward_nn_embed.fit).
        epochs: number of passes over train_loader.
        lr: learning rate forwarded to get_optimizer.
        wd: weight decay forwarded to get_optimizer.
    """
    optimizer = get_optimizer(model, lr=lr, wd=wd)
    for epoch in range(epochs):
        print('Epoch: {}'.format(epoch))

        # One pass over the training data
        epoch_training_loss = feed_forward_nn_embed.fit(model, optimizer, train_loader)
        print("training loss: ", epoch_training_loss)

        # Evaluate on the held-out validation data
        epoch_validation_loss = feed_forward_nn_embed.calculate_mse(model, valid_loader)
        print("validation loss: ", epoch_validation_loss)

%time train_loop(model, epochs=10, lr=0.01, wd=0.00001)

Epoch: 0
training loss:  3.146651703893024
validation loss:  4.090722125393046
Epoch: 1
training loss:  3.074193536297103
validation loss:  3.130762161241635
Epoch: 2
training loss:  3.0644010865351055
validation loss:  2.391836751611148
Epoch: 3
training loss:  3.0593918585266953
validation loss:  4.757984898995612
Epoch: 4
training loss:  3.0519061561351037
validation loss:  4.2127385596009725
Epoch: 5


The validation error diverges after one epoch, so the model is quickly overfitting the data. We could experiment with the learning rate, but it's possible that the function we are trying to model is too simple for the NN. Training is stopped at epoch 1 to avoid overfitting, since the purpose of this notebook is to examine model interpretability with SHAP for the NN.

### SHAP Deep Explainer

Since training the network to a given MSE takes considerably longer here than gradient boosting does, NN interpretability for this tabular data may not be worth considering.

In [31]:
import shap

In [32]:
# Approach adapted from https://www.kaggle.com/ceshine/feature-importance-from-a-pytorch-model
# Background sample for the explainer: the first unshuffled validation batch, wrapped with
# DeviceDataLoader (as is done for training) so the tensors, not the explainer, are placed on `device`.
# The previous `.to(device)` was chained onto the DeepExplainer object itself; `.to` is a
# tensor/module method and is not part of the explainer API.
background_loader = device_loader.DeviceDataLoader(
    DataLoader(valid_ds, batch_size=batch_size), device
)
background_batch = next(iter(background_loader))
# NOTE(review): the recorded run failed inside this constructor with
# "IndexError: tuple index out of range". If a batch from PrepareDataset also carries the
# target, it presumably has to be dropped so that only the model's inputs are passed —
# confirm against src/nn/prepare_embed.py and FFModel.forward.
# NOTE(review): the model contains Dropout and BatchNorm layers; confirm it is in eval mode
# before computing explanations.
explainer = shap.DeepExplainer(model, background_batch)

IndexError: tuple index out of range

In [None]:
%%time
# Draw 300 random rows without replacement and compute their SHAP values.
# NOTE(review): this cell looks copied from the referenced Kaggle notebook and relies on names
# that no earlier visible cell defines: `x_train` (the split above is `X_train`, a DataFrame),
# `e` (the explainer above is bound to `explainer`), `torch` (never imported under that name)
# and `DEVICE` (the device above is `device`). It needs adapting before it can run.
x_samples = x_train[np.random.choice(np.arange(len(x_train)), 300, replace=False)]
print(len(x_samples))
shap_values = e.shap_values(
    torch.from_numpy(x_samples).to(DEVICE)
)

In [None]:
import pandas as pd
# Per-feature mean and standard deviation of the absolute SHAP values; top 10 by mean shown.
# NOTE(review): `features` is not defined in any visible cell — presumably the list of input
# column names; confirm. `shap_values` comes from the sampling cell above.
df = pd.DataFrame({
    "mean_abs_shap": np.mean(np.abs(shap_values), axis=0), 
    "stdev_abs_shap": np.std(np.abs(shap_values), axis=0), 
    "name": features
})
df.sort_values("mean_abs_shap", ascending=False)[:10]

In [None]:
# NOTE(review): depends on `shap_values` and `x_samples` from the sampling cell and on
# `features`, which no visible cell defines.
shap.summary_plot(shap_values, features=x_samples, feature_names=features)