Reference — LightGBM parameter-tuning guide (section "For Better Accuracy"): https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html#for-better-accuracy

In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path
from dotenv import load_dotenv

from src.boosting_models import evaluate_lightgbm

load_dotenv()

True

In [2]:
%%time
dataset_dir = Path(os.environ['dataset_dir'])
full_data = pd.read_feather(dataset_dir / 'train32.feather')
sample_data = pd.read_csv(dataset_dir / 'train_sample.csv', index_col=0)

CPU times: user 4.2 s, sys: 11.6 s, total: 15.8 s
Wall time: 6.51 s


In [3]:
# Choose the frame the rest of the notebook works on: here, the CSV sample.
# NOTE(review): full_data is loaded as well — presumably it can be swapped in here for a full run; confirm.
data = sample_data

In [4]:
# Index the rows by 'row_id'.
# The guard keeps this cell safe to re-run: once 'row_id' has become the index it
# is no longer a column, so an unguarded set_index('row_id') would raise KeyError
# on the second execution. If 'row_id' is missing entirely, set_index still raises.
if data.index.name != 'row_id':
    data = data.set_index('row_id')

In [5]:
# Chronological hold-out: rows with time_id below 1000 are used for fitting,
# rows with time_id of 1000 or more are held out for evaluation.
in_train_period = data['time_id'] < 1000
in_test_period = data['time_id'] >= 1000
train = data[in_train_period]
test = data[in_test_period]

# Features are every column except 'target'; the label frames keep 'time_id' next to 'target'.
label_columns = ['time_id', 'target']
x_train = train.drop(columns='target')
y_train = train[label_columns]
x_test = test.drop(columns='target')
y_test = test[label_columns]

In [6]:
# LightGBM configuration: regression objective with the DART booster.
params = dict(
    objective='regression',
    boosting='dart',
    num_iterations=100,
    num_leaves=31,          # fewer leaves -> lower variance
    max_bin=63,             # fewer bins -> lower variance
    learning_rate=0.1,      # a lower rate with more iterations -> lower variance
    min_data_in_leaf=256,   # 2**8
)

In [7]:
# 'time_id' is removed from both feature frames before they are handed to the evaluator.
train_features = x_train.drop(columns="time_id")
test_features = x_test.drop(columns="time_id")

# The training labels are passed as the bare 'target' series, while y_test is passed
# whole (it holds both 'time_id' and 'target').
# NOTE(review): presumably the evaluator needs time_id on the test side — confirm in src.boosting_models.
rmse, pearson = evaluate_lightgbm(train_features, test_features, y_train.target, y_test, params)
print(f"RMSE: {rmse}, Pearson: {pearson}")



[LightGBM] [Info] Total Bins 22152
[LightGBM] [Info] Number of data: 24447, number of used features: 301
[LightGBM] [Info] Start training from score -0.021726
RMSE: 0.9247614889943098, Pearson: 0.050723329596191025
