In [1]:
import time

import polars as pl
import lightgbm as lgb

import xgboost as xgb
from catboost import CatBoostRegressor

from src.util.constants import DATA_PATH
from src.util.common import mean_grouped_spearman_correlation

In [2]:
df_train_list = []
df_validate_list = []

for fold in range(2):
    df_train: pl.DataFrame = pl.read_parquet(f"{DATA_PATH}/folds/df_train_{fold}.parquet")
    df_validate: pl.DataFrame = pl.read_parquet(f"{DATA_PATH}/folds/df_validate_{fold}.parquet")

    df_train_list.append(df_train)
    df_validate_list.append(df_validate)
    del df_train, df_validate

In [3]:
num_boost_round = 500
max_depth = 5
learning_rate = .05

# LightGBM

LightGBM does not have out-of-the-box GPU support on Apple Silicon, unfortunately.

In [None]:
parameters = {
    "device_type": "cpu",
    "objective": "regression",
    "n_jobs": 12,
    "subsample_freq": 1,
    "verbosity": -1,
    "learning_rate": learning_rate,
    "max_depth": max_depth
}

for fold in range(2):
    df_train = df_train_list[fold]
    df_validate = df_validate_list[fold]
    feature_names = [x for x in df_train.columns if 'feature' in x]

    start_time = time.time()
    model = lgb.LGBMRegressor(
        n_estimators=num_boost_round,
        **parameters
    )

    # noinspection PyTypeChecker
    model.fit(
        X=df_train[feature_names].to_numpy(),
        y=df_train['target'].to_numpy()
    )

    corr = mean_grouped_spearman_correlation(
        pl.Series(model.predict(df_validate[feature_names].to_numpy())),
        df_validate['target'],
        df_validate['era']
    )

    print(f"Runtime on cpu for fold {fold}: {time.time() - start_time:.2f} seconds")
    print(f"Correlation on cpu for fold {fold}: {corr:.4f}")

In [None]:
# Runtime on cpu for fold 0: 1537.36 seconds
# Correlation on cpu for fold 0: 0.0286
# Runtime on cpu for fold 1: 2230.56 seconds
# Correlation on cpu for fold 1: 0.0232

# XGBoost

XGBoost does not support GPU on Apple Silicon either.

In [None]:
parameters = {
    "device_type": "cpu",
    "nthread": 12,
    "objective": "reg:squarederror",
    "verbosity": 0,
    "learning_rate": learning_rate,
    "max_depth": max_depth,
    "num_round": num_boost_round
}

for fold in range(2):
    df_train = df_train_list[fold]
    df_validate = df_validate_list[fold]
    feature_names = [x for x in df_train.columns if 'feature' in x]

    start_time = time.time()
    model = xgb.XGBRegressor(
        **parameters
    )

    model.fit(
        X=df_train[feature_names],
        y=df_train['target']
    )

    corr = mean_grouped_spearman_correlation(
        pl.Series(model.predict(df_validate[feature_names])),
        df_validate['target'],
        df_validate['era']
    )

    print(f"Runtime on CPU for fold {fold}: {time.time() - start_time:.2f} seconds")
    print(f"Correlation on CPU for fold {fold}: {corr:.4f}")

Runtime on cpu for fold 0: 435.00 seconds
Correlation on cpu for fold 0: 0.0229
Runtime on cpu for fold 1: 571.16 seconds
Correlation on cpu for fold 1: 0.0178
Runtime on cuda for fold 0: 477.23 seconds
Correlation on cuda for fold 0: 0.0229


Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x11c62db90>>
Traceback (most recent call last):
  File "/Users/jonas-data-science/Library/Caches/pypoetry/virtualenvs/nmr-DUJvlELt-py3.11/lib/python3.11/site-packages/xgboost/core.py", line 585, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 


In [None]:
# Runtime on cpu for fold 0: 435.00 seconds
# Correlation on cpu for fold 0: 0.0229
# Runtime on cpu for fold 1: 571.16 seconds
# Correlation on cpu for fold 1: 0.0178

# CatBoost

In [5]:
parameters = {
    "task_type": "CPU",
    "thread_count": 12,
    "loss_function": "RMSE",
    "verbose": False,
    "learning_rate": learning_rate,
    "depth": max_depth,
    "iterations": num_boost_round,
    "allow_writing_files": False  # prevents creation of catboost_info folder
}

for fold in range(2):
    df_train = df_train_list[fold]
    df_validate = df_validate_list[fold]
    feature_names = [x for x in df_train.columns if 'feature' in x]

    start_time = time.time()
    model = CatBoostRegressor(
        **parameters
    )

    model.fit(
        X=df_train[feature_names].to_pandas(),
        y=df_train['target'].to_pandas()
    )

    corr = mean_grouped_spearman_correlation(
        pl.Series(model.predict(df_validate[feature_names].to_pandas())),
        df_validate['target'],
        df_validate['era']
    )

    print(f"Runtime on CPU for fold {fold}: {time.time() - start_time:.2f} seconds")
    print(f"Correlation on CPU for fold {fold}: {corr:.4f}")

Runtime on CPU for fold 0: 1449.13 seconds
Correlation on CPU for fold 0: 0.0275
Runtime on CPU for fold 1: 2017.71 seconds
Correlation on CPU for fold 1: 0.0225


CatBoostError: catboost/libs/train_lib/trainer_env.cpp:9: Environment for task type [GPU] not found

XGBoost is by far the fastest. We will use it to select features and then again compare performance between LightGBM and XGBoost.