# LightGBM Dask Test

LightGBM offers a scalable solution that accommodates tabular and categorical inputs out of the box. Here I'll check how well Dask works with LightGBM directly.

### Setup

In [4]:
# Load IPython's autoreload extension and enable mode 2, so that edits to
# imported modules are picked up when a cell is re-run, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Configuration

In [5]:
"""
Loads common configuration parameters
"""
from os import getcwd
from pathlib import Path, PurePath

from src.utils.configuration_manager import Config

# config.ini is looked up in the current working directory of the kernel.
config_path = PurePath(getcwd()) / 'config.ini'
config = Config(config_path)

Loading configuration from: /home/justin/Code/interpretability_experiment/config.ini


### Figures directory

In [6]:
# Figures produced by this notebook go into their own sub-directory of the
# configured figures folder; it is created (with parents) if it is missing.
figures_dir = Path(config.figures_dir) / 'dask_lightgbm_shap'
figures_dir.mkdir(exist_ok=True, parents=True)

### Start local Dask Client

In [7]:
from dask.distributed import Client, LocalCluster

# Re-use the client left over from a previous run of this cell if there is one,
# otherwise start a fresh local cluster. The check is explicit (instead of a bare
# `except:` around a reference to `client`) so that genuine failures, e.g. an error
# raised by client.restart() or a KeyboardInterrupt, are not silently swallowed
# and turned into a second cluster.
if globals().get('client') is not None:
    print('Restarting client')
    client.restart()
else:
    # memory_limit is applied per worker (the recorded run shows 5 workers / 25 GB).
    # Drop memory_limit to fall back to Dask's default memory settings.
    cluster = LocalCluster(dashboard_address=':20100', memory_limit='5G')
    print('Setting new client')
    client = Client(cluster)
    print(client)
client

Setting new client
<Client: 'tcp://127.0.0.1:43137' processes=5 threads=10, memory=25.00 GB>


0,1
Client  Scheduler: tcp://127.0.0.1:43137  Dashboard: http://127.0.0.1:20100/status,Cluster  Workers: 5  Cores: 10  Memory: 25.00 GB


### Dask dataframe loader

In [8]:
from src.preprocessing.dataset_manager import DatasetManager
# Project helper built from the shared configuration; the cells below use it to
# write the dataset and to fetch the training and test sets.
dataset_manager = DatasetManager(config)

In [9]:
# Write the training/test sets without overwriting; in the recorded run the sets
# already existed, so this returned almost immediately. %time reports the cost.
%time dataset_manager.write_dataset(overwrite=False)

Not overwriting existing training and test sets
CPU times: user 600 µs, sys: 130 µs, total: 730 µs
Wall time: 1.15 ms


In [10]:
# Training features and targets as provided by the project's DatasetManager.
X_train, y_train = dataset_manager.get_training_set()

### Additional Preprocessing Step

In [11]:
# Timestamp columns handed to preprocessing_pipeline, which derives categorical
# columns from them and then drops the originals.
datetime_columns_to_transform = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']

In [12]:
def preprocessing_pipeline(ddf, datetime_columns_to_transform=None):
    """
    Preprocessing pipeline that transforms ddf before training.

    Adds the categorical columns that `add_datetime_cat` derives from the given
    datetime columns, then drops the original datetime columns. All other
    columns are passed through unchanged.

    Parameters
    ----------
    ddf : dataframe
        Input frame (a Dask dataframe in this notebook).
    datetime_columns_to_transform : list of str, optional
        Names of the datetime columns to convert. Defaults to no columns.

    Returns
    -------
    dataframe
        The transformed frame without the original datetime columns.
    """
    from src.preprocessing.datetime_to_cat import add_datetime_cat

    # Use None as the default instead of a shared mutable list.
    if datetime_columns_to_transform is None:
        datetime_columns_to_transform = []

    # The names of the newly created categorical columns are not needed here.
    ddf, _new_categorical_columns = add_datetime_cat(ddf, datetime_columns_to_transform)
    return ddf.drop(datetime_columns_to_transform, axis=1)

In [13]:
# NOTE: this rebinds X_train to its transformed version, so the cell is not idempotent.
# On a second run the datetime columns have already been dropped; reload the
# training set first if this cell has to be re-run.
X_train = preprocessing_pipeline(X_train, datetime_columns_to_transform)

In [14]:
# Preview the transformed features (original datetime columns replaced by derived ones).
X_train.head()

Unnamed: 0_level_0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount_wo_tip,tpep_pickup_datetime_hourslot,tpep_pickup_datetime_day_of_week,tpep_dropoff_datetime_hourslot,tpep_dropoff_datetime_day_of_week
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,1.0,1.2,1,186,161,1,11.0,0.0,0.5,0.0,0.3,11.8,9,0,10,0
1,2,1.0,12.03,1,138,162,1,39.0,0.0,0.5,5.76,0.3,45.56,9,0,10,0
2,2,2.0,0.86,1,151,166,1,5.5,0.0,0.5,0.0,0.3,6.3,8,0,8,0
3,2,2.0,1.09,1,166,238,1,6.0,0.0,0.5,0.0,0.3,6.8,9,0,9,0
5,2,2.0,1.32,1,238,141,1,10.0,0.0,0.5,0.0,0.3,10.8,9,0,9,0


In [15]:
# Name of the column to predict.
target = 'tip_fraction'
# Keep only the target column. NOTE: y_train is rebound here, so re-running this
# cell indexes the already-reduced series; reload the training set first.
y_train = y_train[target]
y_train.head()

# Earlier variant that selected features/target from a single frame (kept for reference):
# X = ddf[input_columns]
# y = ddf[target]

index
0    0.250000
1    0.199956
2    0.200000
3    0.200000
5    0.200000
Name: tip_fraction, dtype: float64

In [16]:
# from dask_ml.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, \
#                                                     test_size=.9, \
#                                                     random_state=42)

In [17]:
# print(X_train.dtypes)

### Setup Dask LightGBM Model

In [23]:
"""
LightGBM parameters
"""
import dask_lightgbm.core as lgb

In [24]:
# LightGBM training parameters. 'boosting_type' is deliberately not set, so the
# library default applies.
# NOTE(review): per the LightGBM parameter docs, `alpha` is only used by the
# huber/quantile objectives and `bagging_fraction` only takes effect together
# with a non-zero `bagging_freq` - confirm whether these entries are intended.
lgb_params = dict(
    task='train',
    objective='regression_l1',
    metric={'l1'},
    num_leaves=50,
    learning_rate=0.005,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    verbose=-1,
    max_depth=12,
    min_data_in_leaf=100,
    alpha=0.5,
    tree_learner='feature',
)

In [25]:
# Unpack the parameter dict into keyword arguments. Passed positionally, the whole
# dict would be bound to the constructor's first parameter (boosting_type) and
# none of the settings in lgb_params would be applied.
lgb_regressor = lgb.LGBMRegressor(**lgb_params)

In [26]:

def get_categorical_indices(ddf):
    """
    Return the positional indices of the columns of `ddf` whose dtype is 'category'.

    The indices are listed in the order in which the categorical columns appear
    in the frame. An empty list is returned when there are none.
    """
    all_columns = ddf.columns
    category_names = ddf.select_dtypes(['category']).columns
    return list(map(all_columns.get_loc, category_names))

def train_model(lgb_regressor, X_ddf, y_ddf):
    """
    Fit `lgb_regressor` on the given features and target.

    The regressor is fitted on the full `X_ddf` / `y_ddf`; no hold-out split is
    made here. Columns of dtype 'category' are passed to LightGBM by position
    through `categorical_feature`.

    Parameters
    ----------
    lgb_regressor : estimator
        Object exposing a `fit(X, y, **kwargs)` method (the Dask LightGBM
        regressor in this notebook).
    X_ddf : dataframe
        Feature frame.
    y_ddf : series
        Target values aligned with `X_ddf`.

    Returns
    -------
    Whatever `lgb_regressor.fit` returns.
    """
    categorical_col_indices = get_categorical_indices(X_ddf)

    # NOTE(review): early_stopping_rounds is passed without any evaluation set, and
    # num_boost_round / verbose_eval look like arguments of lightgbm.train rather
    # than of the estimator's fit() - confirm against the installed versions.
    lgb_model = lgb_regressor.fit(
        X_ddf,
        y_ddf,
        num_boost_round=10,
        verbose_eval=1,
        early_stopping_rounds=500,
        categorical_feature=categorical_col_indices,
    )
    return lgb_model

### Train Regressor

In [27]:
# Fit the regressor on the preprocessed training data. The recorded run of this
# cell was interrupted manually (see the KeyboardInterrupt output).
lightgbm_model = train_model(lgb_regressor, X_train, y_train)



KeyboardInterrupt: 



In [None]:
# Not working
# TODO find out if there is some way to support categoricals directly
# lgb_regressor.fit(X_train, 
#                   y_train, 
#                  ) 

### Test 

In [None]:
# Load the held-out test set and apply the same preprocessing as for X_train.
X_test, y_test = dataset_manager.get_test_set()
X_test = preprocessing_pipeline(X_test, datetime_columns_to_transform)

# Predictions for the test features (lazy Dask collection, presumably - TODO confirm).
dy_predict = lgb_regressor.predict(X_test)

In [None]:
# Sanity check on the shape of the predictions.
print(dy_predict.shape)
# Something has gone terribly wrong!
# TODO: record the observed shape and what was expected, so the problem can be tracked down.

In [None]:
"""
https://github.com/dask/dask-lightgbm/blob/master/system_tests/test_fit_predict.py
"""
# The dask_ml.metrics.r2_score method fails with dataframes so we compute the R2 score ourselves

# `y` is never defined in this notebook (its only assignment is commented out),
# so use the test targets. Assumes get_test_set() returns the targets in the
# same layout as get_training_set(), i.e. with a `target` column - TODO confirm.
y_true = y_test[target]

# R2 = 1 - SS_res / SS_tot
numerator = ((y_true - dy_predict) ** 2).sum()
denominator = ((y_true - y_true.mean()) ** 2).sum()
r2_score = 1 - numerator / denominator
# Everything above is lazy; compute() triggers the actual evaluation.
r2_score = r2_score.compute()
print(r2_score)