# LightGBM Dask Test

LightGBM offers a scalable solution that accommodates tabular and categorical inputs out of the box.

### Setup

In [1]:
# Load IPython's autoreload extension and enable mode 2, so edits to imported
# modules (e.g. the local utils package) are picked up when a cell is re-run.
%load_ext autoreload
%autoreload 2

### Load Configuration

In [2]:
"""
Loads common configuration parameters
"""
import utils.configuration_manager as configuration_manager
from pathlib import PurePath
from os import getcwd

# Resolve config.ini relative to the current working directory.
config_path = PurePath(getcwd()) / 'config.ini'
config = configuration_manager.Config(config_path)

# Location of the input data; a parquet directory is assumed.
input_path = config.input_path
print('Input path: ' + input_path)

# Directory used for storing results.
output_directory = config.output_directory
print('Output path: ' + output_directory)

Loading configuration from: /home/justin/Code/interpretability_experiment/config.ini
Input path: data/2018_Yellow_Taxi_Trip_Data
Output path: output


### Start local Dask Client

In [3]:
from dask.distributed import Client, LocalCluster
# Reuse the Dask client from a previous run of this cell when one exists,
# otherwise start a new local cluster.
try:
    # On a fresh kernel `client` is undefined, so this lookup raises NameError
    # and control falls through to the handler below.
    if client:
        print('Restarting client')
        client.restart()
except Exception:
    # Catch Exception instead of using a bare `except:` so KeyboardInterrupt
    # and SystemExit still propagate. Any other failure (undefined client,
    # failed restart) falls back to a brand-new cluster, as before.
    # Alternative with a per-worker memory cap:
    #     cluster = LocalCluster(dashboard_address=':20100', memory_limit='4G')
    cluster = LocalCluster(dashboard_address=':20100')
    print('Setting new client')
    client = Client(cluster)
    print(client)
client

Setting new client
<Client: 'tcp://127.0.0.1:38781' processes=5 threads=10, memory=25.97 GB>


0,1
Client  Scheduler: tcp://127.0.0.1:38781  Dashboard: http://127.0.0.1:20100/status,Cluster  Workers: 5  Cores: 10  Memory: 25.97 GB


### Dask dataframe loader

In [4]:
import dask.dataframe as dd
import fastparquet

In [5]:
# Open the parquet dataset at the configured input path as a Dask dataframe.
ddf = dd.read_parquet(input_path)

In [6]:
# Preview the first rows to sanity-check the loaded columns.
ddf.head()

Unnamed: 0_level_0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,2018-12-03 09:58:01,2018-12-03 10:14:17,1.0,1.2,1,N,186,161,1,11.0,0.0,0.5,2.95,0.0,0.3,14.75
1,2,2018-12-03 09:41:32,2018-12-03 10:20:08,1.0,12.03,1,N,138,162,1,39.0,0.0,0.5,9.11,5.76,0.3,54.67
2,2,2018-12-03 08:54:36,2018-12-03 08:59:35,2.0,0.86,1,N,151,166,1,5.5,0.0,0.5,1.26,0.0,0.3,7.56
3,2,2018-12-03 09:02:08,2018-12-03 09:07:16,2.0,1.09,1,N,166,238,1,6.0,0.0,0.5,1.36,0.0,0.3,8.16
4,2,2018-12-03 09:10:10,2018-12-03 09:21:32,2.0,1.78,1,N,238,75,1,9.5,0.0,0.5,2.06,0.0,0.3,12.36


In [7]:
# Inspect the dtype of every column before choosing model inputs.
ddf.dtypes

VendorID                       category
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                     category
store_and_fwd_flag               object
PULocationID                   category
DOLocationID                   category
payment_type                   category
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
dtype: object

### Define what we are trying to model

Let's try to create an estimator for the passenger tip, which I suspect is something pretty difficult to guess outright. 

Then we can look into other questions like whether the passenger count might affect the tip as a kind of social pressure. 

In [8]:
# Collect the dataframe's column labels into a plain list and show them.
columns = list(ddf.columns.values)
print(columns)

['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount']


In [9]:
# Model inputs.
# - total_amount is deliberately left out; total_amount_wo_tip (total minus
#   tip, derived in a later cell) is used in its place.
# - Categorical columns are left out since these don't seem to be natively
#   supported with dask dfs.
input_columns = [
    'passenger_count',
    'trip_distance',
    'fare_amount',
    'extra',
    'mta_tax',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount_wo_tip',
]

In [10]:
# Name of the column the regressor is trained to predict.
target = 'tip_amount'

In [11]:
"""
We'll subtract the tip_amount from the total_amount to prevent any leakage, 
using a new total_amount_wo_tip column.
"""
# Adds the derived column to ddf in place; input_columns refers to it by name.
ddf['total_amount_wo_tip'] = ddf['total_amount'] - ddf['tip_amount']

In [12]:
# print(ddf.head())

### Train-test split

In [13]:
# Feature matrix: only the columns listed in input_columns.
X = ddf[input_columns]
# Regression target: the column named by `target`.
y = ddf[target]

In [14]:
from dask_ml.model_selection import train_test_split
# Split features and target into train/test partitions with a fixed seed.
# NOTE(review): test_size=.9 holds out 90% of the rows and trains on only 10%.
# Confirm this is intentional (e.g. to keep the fit fast) and not meant to be .1.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.9,
    random_state=42,
)



In [15]:
# Show the dtypes of the training features.
print(X_train.dtypes)

passenger_count          float64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tolls_amount             float64
improvement_surcharge    float64
total_amount_wo_tip      float64
dtype: object


### Setup Dask LightGBM Model

In [16]:
"""
LightGBM parameters
"""
import dask_lightgbm.core as lgb

# Candidate LightGBM parameters.
# NOTE(review): this dictionary is not referenced by any other cell visible in
# this notebook; the regressor is constructed with its own keyword arguments.
lgb_params = {
    'task': 'train',
    'boosting_type': 'goss',
    # NOTE(review): 'binary' is a classification objective, while this notebook
    # fits a regressor on tip_amount. Confirm before actually using this dict.
    'objective': 'binary',
    # Single 'metric' entry. A preceding duplicate key ('binary_logloss') was
    # silently overwritten by this one, so it was dead and has been dropped.
    'metric': {'l2', 'auc'},
    'num_leaves': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbose': None,
    # 'num_iteration': 100,
    'max_depth': 12,
    'min_data_in_leaf': 100,
    'alpha': 0.5,
}

In [17]:
# lgb.LGBMRegressor?

In [18]:
# Build the Dask LightGBM regressor with tree_learner='data' and a deliberately
# small model: 10 estimators, num_leaves=10.
# NOTE(review): lgb_params defined above is not passed in here — confirm intended.
lgb_regressor = lgb.LGBMRegressor(tree_learner='data', n_estimators=10, num_leaves=10)

### Train Regressor

In [21]:
# Fit on the numeric-only training split.
# NOTE(review): categorical inputs are presumably what was "not working" here;
# the recorded output shows this numeric-only fit returning a fitted model — confirm.
# TODO find out if there is some way to support categoricals directly
lgb_regressor.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1,
              local_listen_port=12400,
              machines='127.0.0.1:12400,127.0.0.1:12401,127.0.0.1:12402,127.0.0.1:12403,127.0.0.1:12404',
              max_depth=-1, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=10, n_jobs=-1, num_leaves=10,
              num_machines=5, num_threads=2, objective=None, random_state=None,
              reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0, time_out=120,
              tree_learner='data')

### Test 

In [22]:
# Predict the target for the held-out test features.
dy_predict = lgb_regressor.predict(X_test)

In [34]:
# dy_predict was produced from X_test, so its shape is compared against the
# matching held-out target y_test rather than the full target y.
print(y_test.shape)
print(dy_predict.shape)

TypeError: 'tuple' object is not callable

In [None]:
"""
https://github.com/dask/dask-lightgbm/blob/master/system_tests/test_fit_predict.py
"""
# The dask_ml.metrics.r2_score method fails with dataframes so we compute the R2 score ourselves:
#     R^2 = 1 - sum((y_true - y_pred)^2) / sum((y_true - mean(y_true))^2)
# dy_predict was produced from X_test, so it has to be scored against y_test.
# Using the full target y would pair the predictions with rows they were not
# made for.

numerator = ((y_test - dy_predict) ** 2).sum()
denominator = ((y_test - y_test.mean()) ** 2).sum()
r2_score = 1 - numerator / denominator
# Everything above is a lazy Dask expression; compute() evaluates it.
r2_score = r2_score.compute()
print(r2_score)