# LightGBM interpretability

LightGBM offers a scalable solution that accomodates tabular and categorical inputs out of the box. 

### Setup

In [1]:
# Extension reloader to import a function again when re-running cell 
%load_ext autoreload
%autoreload 2

### Load Configuration

In [2]:
"""
Loads common configuration parameters
"""
import utils.configuration_manager as configuration_manager
from pathlib import PurePath
from os import getcwd

config_path = PurePath(getcwd(),'config.ini')
config = configuration_manager.Config(config_path)

# Assumes parquet directory as input
input_path = config.input_path
print('Input path: '+ input_path)

# For result storage
output_directory = config.output_directory
print('Output path: ' + output_directory)

Loading configuration from: /home/justin/Code/interpretability_experiment/config.ini
Input path: data/2018_Yellow_Taxi_Trip_Data
Output path: output


### Start local Dask Client

In [3]:
from dask.distributed import Client, LocalCluster
try:
    if client:
        print('Restarting client')
        client.restart()
except:
#     cluster = LocalCluster(dashboard_address=':20100', memory_limit='4G')
    cluster = LocalCluster(dashboard_address=':20100')
    print('Setting new client')
    client = Client(cluster)
    print(client)
client

Setting new client
<Client: 'tcp://127.0.0.1:39311' processes=5 threads=10, memory=26.87 GB>


0,1
Client  Scheduler: tcp://127.0.0.1:39311  Dashboard: http://127.0.0.1:20100/status,Cluster  Workers: 5  Cores: 10  Memory: 26.87 GB


### Dask dataframe loader

In [4]:
import dask.dataframe as dd
import fastparquet

In [5]:
ddf = dd.read_parquet(input_path)

In [6]:
ddf.head()

Unnamed: 0_level_0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,2018-12-03 09:58:01,2018-12-03 10:14:17,1.0,1.2,1,N,186,161,1,11.0,0.0,0.5,2.95,0.0,0.3,14.75
1,2,2018-12-03 09:41:32,2018-12-03 10:20:08,1.0,12.03,1,N,138,162,1,39.0,0.0,0.5,9.11,5.76,0.3,54.67
2,2,2018-12-03 08:54:36,2018-12-03 08:59:35,2.0,0.86,1,N,151,166,1,5.5,0.0,0.5,1.26,0.0,0.3,7.56
3,2,2018-12-03 09:02:08,2018-12-03 09:07:16,2.0,1.09,1,N,166,238,1,6.0,0.0,0.5,1.36,0.0,0.3,8.16
4,2,2018-12-03 09:10:10,2018-12-03 09:21:32,2.0,1.78,1,N,238,75,1,9.5,0.0,0.5,2.06,0.0,0.3,12.36


In [7]:
ddf.dtypes

VendorID                       category
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                     category
store_and_fwd_flag               object
PULocationID                   category
DOLocationID                   category
payment_type                   category
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
dtype: object

### Define what we are trying to model

Let's try to create an estimator for the passenger tip, which I suspect is something pretty difficult to guess outright. 

Then we can look into other questions like whether the passenger count might affect the tip as a kind of social pressure. 

In [8]:
columns = [x for x in ddf.columns.values]
print(columns)

['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount']


In [20]:
# Note - total_amount is excluded from the input list 
input_columns = ['VendorID', 
                 'passenger_count', 'trip_distance', 'RatecodeID', 
                 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
                 'mta_tax', 'tolls_amount', 'improvement_surcharge', 'total_amount_wo_tip']

In [21]:
target = 'tip_amount'

In [22]:
"""
We'll subtract the tip_amount from the total_amount to prevent any leakage, 
using a new total_amount_wo_tip column.
"""
ddf['total_amount_wo_tip'] = ddf['total_amount'] - ddf['tip_amount']

In [23]:
# print(ddf.head())

### LightGBM Model

In [24]:
"""
LightGBM parameters
"""
import lightgbm as lgb
from dask_ml.model_selection import train_test_split

ts = 'tpep_pickup_datetime'

In [33]:
lgb_params = {
    'task': 'train',
    'boosting_type': 'goss',
    'objective': 'binary',
    'metric':'binary_logloss',
    'metric': {'l2', 'auc'},
    'num_leaves': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbose': None,
    'max_depth':12,
    'min_data_in_leaf':100,
    'alpha':0.5}

def train_model(df_block):
    X = df_block[input_columns]
    y = df_block[target]
    
    # Make training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.8)
    
#     # Transform to pandas dataframe for LightGBM support 
#     X_train, X_test, y_train, y_test = pandafy(X_train, X_test, y_train, y_test)
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test)

    #https://www.kaggle.com/mlisovyi/beware-of-categorical-features-in-lgbm
    # https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27
    lgb_model = lgb.train(lgb_params,
                          lgb_train,
                          num_boost_round=100,
                          valid_sets=lgb_eval,
                          early_stopping_rounds=5)
    return lgb_model


In [34]:
columns_to_keep = input_columns + [target]
ddf_partition = ddf[columns_to_keep].get_partition(0)
df_partition = ddf_partition.compute()

In [35]:
print(df_partition.dtypes)

VendorID                 category
passenger_count           float64
trip_distance             float64
RatecodeID               category
PULocationID             category
DOLocationID             category
payment_type             category
fare_amount               float64
extra                     float64
mta_tax                   float64
tolls_amount              float64
improvement_surcharge     float64
total_amount_wo_tip       float64
tip_amount                float64
dtype: object


In [36]:
train_model(df_partition)

[1]	valid_0's l2: 8.91285	valid_0's auc: 0.967491
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l2: 8.82042	valid_0's auc: 0.968817
[3]	valid_0's l2: 8.74074	valid_0's auc: 0.968955
[4]	valid_0's l2: 8.67121	valid_0's auc: 0.969753
[5]	valid_0's l2: 8.61024	valid_0's auc: 0.97042
[6]	valid_0's l2: 8.60203	valid_0's auc: 0.970806
[7]	valid_0's l2: 8.54869	valid_0's auc: 0.971008
[8]	valid_0's l2: 8.50152	valid_0's auc: 0.971086
[9]	valid_0's l2: 8.4605	valid_0's auc: 0.971025
[10]	valid_0's l2: 8.42342	valid_0's auc: 0.971099
[11]	valid_0's l2: 8.392	valid_0's auc: 0.970997
[12]	valid_0's l2: 8.36378	valid_0's auc: 0.971104
[13]	valid_0's l2: 8.33765	valid_0's auc: 0.971212
[14]	valid_0's l2: 8.31512	valid_0's auc: 0.971262
[15]	valid_0's l2: 8.29429	valid_0's auc: 0.97124
[16]	valid_0's l2: 8.27532	valid_0's auc: 0.971322
[17]	valid_0's l2: 8.25903	valid_0's auc: 0.971305
[18]	valid_0's l2: 8.24406	valid_0's auc: 0.971395
[19]	valid_0's l2: 8.2296	valid_0's 

<lightgbm.basic.Booster at 0x7fcab03a1650>

In [None]:
"""
Create Pandas dataframes for model interpretability testing
"""
# X_train.compute()
# X_test
# y_train.compute()
# y_test 

In [19]:
lgb_train = lgb.Dataset(X_train, y_train)

In [20]:
#https://www.kaggle.com/mlisovyi/beware-of-categorical-features-in-lgbm
# https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27
lgb_model = lgb.train(lgb_params,
                      lgb_train,
                      num_boost_round=30,
                      valid_sets=lgb_eval,
                      early_stopping_rounds=5)

NameError: name 'lgb_eval' is not defined

### Train Regressor

In [None]:
# Not working
# TODO find out if there is some way to support categoricals directly
lgb_regressor.fit(X_train, y_train) 

### Test 

In [None]:
dy_predict = lgb_regressor.predict(X_test)

In [None]:
print(dy_predict.shape)
# Something has gone terribly wrong!

In [None]:
"""
https://github.com/dask/dask-lightgbm/blob/master/system_tests/test_fit_predict.py
"""
# The dask_ml.metrics.r2_score method fails with dataframes so we compute the R2 score ourselves

numerator = ((y - dy_predict) ** 2).sum()
denominator = ((y - y.mean()) ** 2).sum()
r2_score = 1 - numerator / denominator
r2_score = r2_score.compute()
print(r2_score)