# Feed-forward Neural-Network Interpretability

Testing ShAP interpretability with a neural network, going to evaluate overlap with the lightgbm model

### Setup

In [None]:
# Extension reloader to import a function again when re-running cell 
%load_ext autoreload
%autoreload 2

### Load Configuration

In [None]:
"""
Loads common configuration parameters
"""
import utils.configuration_manager as configuration_manager
from pathlib import PurePath
from os import getcwd

config_path = PurePath(getcwd(),'config.ini')
config = configuration_manager.Config(config_path)

# Assumes parquet directory as input
input_path = config.input_path
print('Input path: '+ input_path)

# For result storage
output_directory = config.output_directory
print('Output path: ' + output_directory)

### Start local Dask Client

In [3]:
from dask.distributed import Client, LocalCluster
try:
    if client:
        print('Restarting client')
        client.restart()
except:
#     cluster = LocalCluster(dashboard_address=':20100', memory_limit='4G')
    cluster = LocalCluster(dashboard_address=':20100')
    print('Setting new client')
    client = Client(cluster)
    print(client)
client

Setting new client
<Client: 'tcp://127.0.0.1:42637' processes=5 threads=10, memory=25.61 GB>


0,1
Client  Scheduler: tcp://127.0.0.1:42637  Dashboard: http://127.0.0.1:20100/status,Cluster  Workers: 5  Cores: 10  Memory: 25.61 GB


### Dask dataframe loader

In [4]:
import dask.dataframe as dd
import fastparquet

In [5]:
ddf = dd.read_parquet(input_path)

In [7]:
ddf.head()

Unnamed: 0_level_0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,2018-12-03 09:58:01,2018-12-03 10:14:17,1.0,1.2,1,N,186,161,1,11.0,0.0,0.5,2.95,0.0,0.3,14.75
1,2,2018-12-03 09:41:32,2018-12-03 10:20:08,1.0,12.03,1,N,138,162,1,39.0,0.0,0.5,9.11,5.76,0.3,54.67
2,2,2018-12-03 08:54:36,2018-12-03 08:59:35,2.0,0.86,1,N,151,166,1,5.5,0.0,0.5,1.26,0.0,0.3,7.56
3,2,2018-12-03 09:02:08,2018-12-03 09:07:16,2.0,1.09,1,N,166,238,1,6.0,0.0,0.5,1.36,0.0,0.3,8.16
4,2,2018-12-03 09:10:10,2018-12-03 09:21:32,2.0,1.78,1,N,238,75,1,9.5,0.0,0.5,2.06,0.0,0.3,12.36


### Dependencies for PyTorch

In [8]:
#Import
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from datetime import datetime

### Preprocessing

In [9]:
#  Note - total_amount is excluded from the input list 

In [10]:
categorical_columns =['VendorID', 
                      'RatecodeID', 
                      'PULocationID', 
                      'DOLocationID', 
                      'payment_type']

numerical_variables = ['passenger_count', 
                       'trip_distance', 
                       'fare_amount', 
                       'extra', 
                       'mta_tax', 
                       'tolls_amount', 
                       'improvement_surcharge', 
                       'total_amount_wo_tip']

input_columns = categorical_columns + numerical_variables

In [11]:
target = 'tip_amount'

In [12]:
"""
We'll subtract the tip_amount from the total_amount to prevent any leakage, 
using a new total_amount_wo_tip column.
"""
ddf['total_amount_wo_tip'] = ddf['total_amount'] - ddf['tip_amount']

In [13]:
"""
Extract a manageable dataset from Dask 
"""

from utils.helpful_functions import concatenate

columns_to_keep = input_columns + [target]

dfs = []

# Load and append to Pandas dataframe
for i in range(40):
    ddf_partition = ddf[columns_to_keep].get_partition(i)
    df_temp = ddf_partition.compute()
#     df_temp[numerical_variables] = df_temp[numerical_variables].astype(np.float16)
    dfs.append(df_temp)

df_train = concatenate(dfs)

In [14]:
# print(df_train.describe())

In [15]:
X = df_train[input_columns]
y = df_train[target]

In [20]:
X.head()

Unnamed: 0_level_0,VendorID,RatecodeID,PULocationID,DOLocationID,payment_type,passenger_count,trip_distance,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount_wo_tip
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1,1,186,161,1,1.0,1.2,11.0,0.0,0.5,0.0,0.3,11.8
1,2,1,138,162,1,1.0,12.03,39.0,0.0,0.5,5.76,0.3,45.56
2,2,1,151,166,1,2.0,0.86,5.5,0.0,0.5,0.0,0.3,6.3
3,2,1,166,238,1,2.0,1.09,6.0,0.0,0.5,0.0,0.3,6.8
4,2,1,238,75,1,2.0,1.78,9.5,0.0,0.5,0.0,0.3,10.3


In [21]:
# Fill nulls in categoricals, if there are 
# def fill_categorical_nulls(df, categories):  
#     for category in categories: 
#         df[category].fillna('Unknown', inplace=True)

### Training and Validation split

In [23]:
# Separating dataset back to what it was
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=42)
X_train.head()

Unnamed: 0_level_0,VendorID,RatecodeID,PULocationID,DOLocationID,payment_type,passenger_count,trip_distance,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount_wo_tip
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
149604,1,1,164,249,1,0.0,1.8,12.0,1.0,0.5,0.0,0.3,13.8
327099,2,1,170,68,1,1.0,1.67,7.0,0.0,0.5,0.0,0.3,7.8
332450,2,1,79,114,1,2.0,0.88,6.5,0.0,0.5,0.0,0.3,7.3
85655,1,1,114,100,1,3.0,2.4,10.0,0.5,0.5,0.0,0.3,11.3
80990,2,1,43,161,2,1.0,1.83,10.5,0.0,0.5,0.0,0.3,11.3


### Create embeddings for categorical data

In [28]:
# Filter those categories that do not have more than 2 values (these will not need embeddings)
embedded_cols = {n: len(col.cat.categories) for n,col in X[categorical_columns].items() \
                 if len(col.cat.categories) > 2}
embedded_cols

{'VendorID': 3,
 'RatecodeID': 7,
 'PULocationID': 262,
 'DOLocationID': 261,
 'payment_type': 4}

In [29]:
embedded_col_names = embedded_cols.keys()
len(X.columns) - len(embedded_cols) # number of numerical columns

8

In [30]:
# Get the size of the embeddings
# Reduce the size of the representation by roughly half or to 50 in a larger category
embedding_sizes = [(n_categories, min(50, (n_categories+1)//2)) for _,n_categories in embedded_cols.items()]
embedding_sizes

[(3, 2), (7, 4), (262, 50), (261, 50), (4, 2)]

### Integrate embedding columns for format for training

In [49]:
# Define dataset for Pytorch
class PrepareDataset(Dataset):
    def __init__(self, X, y, embedded_column_names):
        X = X.copy()
        # Categorical columns
        self.X1 = X.loc[:,embedded_column_names].copy().values.astype(np.int64) 
        # Numerical columns
        self.X2 = X.drop(columns = embedded_column_names).copy().values.astype(np.float32) 
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X1[idx], self.X2[idx], self.y[idx]

In [53]:
# Expand inputs with embedding 
train_ds = PrepareDataset(X_train, y_train, embedded_col_names)
valid_ds = PrepareDataset(X_val, y_val, embedded_col_names)

### Set up device management for PyTorch

In [54]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        print('Running with GPU/Cuda')
        return torch.device('cuda')
    else:
        print('Running on CPU')
        return torch.device('cpu')

def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [55]:
device = get_default_device()

Running on CPU


### Define Model

In [56]:
class FFModel(nn.Module):
    def __init__(self, embedding_sizes, n_cont):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings) #length of all embeddings combined
        self.n_emb, self.n_cont = n_emb, n_cont
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 200)
        self.lin2 = nn.Linear(200, 70)
        self.lin3 = nn.Linear(70, 5)
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(200)
        self.bn3 = nn.BatchNorm1d(70)
        self.embedding_dropout = nn.Dropout(0.6)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.embedding_dropout(x)
        x2 = self.bn1(x_cont)
        x = torch.cat([x, x2], 1)
        x = F.relu(self.lin1(x))
        x = self.dropout(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.dropout(x)
        x = self.bn3(x)
        x = self.lin3(x)
        return x

In [57]:
"""
Initiate model
"""
model = FFModel(embedding_sizes, 1)
to_device(model, device)

FFModel(
  (embeddings): ModuleList(
    (0): Embedding(3, 2)
    (1): Embedding(7, 4)
    (2): Embedding(262, 50)
    (3): Embedding(261, 50)
    (4): Embedding(4, 2)
  )
  (lin1): Linear(in_features=109, out_features=200, bias=True)
  (lin2): Linear(in_features=200, out_features=70, bias=True)
  (lin3): Linear(in_features=70, out_features=5, bias=True)
  (bn1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(70, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (embedding_dropout): Dropout(p=0.6, inplace=False)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [58]:
# Setting optimizer
def get_optimizer(model, lr = 0.001, wd = 0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

optimizer = get_optimizer(model)

In [63]:
def validation_loss(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x1, x2, y in valid_dl:
        current_batch_size = y.shape[0]
        out = model(x1, x2)
        loss = F.cross_entropy(out, y)
        sum_loss += current_batch_size*(loss.item())
        total += current_batch_size
        pred = torch.max(out, 1)[1]
        correct += (pred == y).float().sum().item()
    print("valid loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))
    return sum_loss/total, correct/total

In [64]:
def train_model(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x1, x2, y in train_dl:
        batch = y.shape[0]
        output = model(x1, x2)
        loss = F.cross_entropy(output, y)   
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

### Training

In [65]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size,shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size,shuffle=True)

In [66]:
train_dl = DeviceDataLoader(train_dl, device)
valid_dl = DeviceDataLoader(valid_dl, device)

In [68]:
def train_loop(model, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr = lr, wd = wd)
    for i in range(epochs): 
        loss = train_model(model, optim, train_dl)
        print("training loss: ", loss)
        validation_loss(model, valid_dl)

In [69]:
train_loop(model, epochs=10, lr=0.05, wd=0.00001)

KeyError: 11948173

In [45]:
# model.train()
# epoch = 20
# for epoch in range(epoch):
#     optimizer.zero_grad()
#     # Forward pass
#     y_pred = model(X_train)
#     # Compute Loss
#     loss = criterion(y_pred.squeeze(), y_train)
   
#     print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
#     # Backward pass
#     loss.backward()
#     optimizer.step()

TypeError: forward() missing 1 required positional argument: 'x_cont'

In [None]:
def train_loop(model, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr = lr, wd = wd)
    for i in range(epochs): 
        loss = train_model(model, optim, train_dl)
        print("training loss: ", loss)
        validation_loss(model, valid_dl)

### ShAP Deep Explainer

In [None]:
# https://www.kaggle.com/ceshine/feature-importance-from-a-pytorch-model
%%time
e = shap.DeepExplainer(
        model, 
        torch.from_numpy(
            x_train[np.random.choice(np.arange(len(x_train)), 10000, replace=False)]
        ).to(DEVICE))

In [None]:
%%time
x_samples = x_train[np.random.choice(np.arange(len(x_train)), 300, replace=False)]
print(len(x_samples))
shap_values = e.shap_values(
    torch.from_numpy(x_samples).to(DEVICE)
)

In [None]:
import pandas as pd
df = pd.DataFrame({
    "mean_abs_shap": np.mean(np.abs(shap_values), axis=0), 
    "stdev_abs_shap": np.std(np.abs(shap_values), axis=0), 
    "name": features
})
df.sort_values("mean_abs_shap", ascending=False)[:10]

In [None]:
shap.summary_plot(shap_values, features=x_samples, feature_names=features)