In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [4]:
def read_parquet_partitions(base_path: str, partitions: int = 10) -> pd.DataFrame:
    """Read consecutive partition files and stack them into a single DataFrame.

    For each partition id in ``range(partitions)`` the file
    ``{base_path}/partition_id={id}/part-0.parquet`` is loaded, in ascending
    id order.

    Args:
        base_path: Directory containing the ``partition_id=<n>`` sub-directories.
        partitions: How many partition ids to read, starting from 0.

    Returns:
        All partitions concatenated row-wise, with the index renumbered
        from 0 (``ignore_index=True``).
    """
    frames = [
        pd.read_parquet(f'{base_path}/partition_id={idx}/part-0.parquet')
        for idx in range(partitions)
    ]
    return pd.concat(frames, ignore_index=True)

# Load the training set: reads the default 10 partitions under
# 'kaggle/data/train.parquet' and concatenates them into one DataFrame.
data = read_parquet_partitions('kaggle/data/train.parquet')

In [5]:
# Keep only rows with no missing value in any column (DataFrame.dropna defaults).
# The original index labels are kept, so `df`'s index is no longer contiguous.
df = data.dropna()

In [6]:
# Positional hold-out split: the last `valid_ratio` share of rows becomes the
# validation set, everything before it is used for training.
# NOTE(review): presumably rows are in time order so this is a temporal split - confirm.
valid_ratio = 0.10
df_length = len(df)
split_at = df_length - int(df_length * valid_ratio)

train_data = df.iloc[:split_at]
valid_data = df.iloc[split_at:]

train_data.shape, valid_data.shape

((31833740, 92), (3537082, 92))

In [7]:
# Model inputs: the two id columns followed by every column whose name contains 'feature'.
feature_mask = df.columns.str.contains('feature')
feature_col = ['time_id', 'symbol_id'] + df.columns[feature_mask].tolist()

In [8]:
def calculate_r2(y_true, y_pred, weights):
    """Sample-weighted, zero-mean R² score.

    Computes ``1 - sum(w * (y - y_hat)**2) / sum(w * y**2)``. The denominator
    is the weighted raw second moment of ``y_true`` (the target is not
    centred around its mean).

    All inputs are converted to float64 NumPy arrays before any arithmetic.
    This keeps the computation strictly positional: pandas Series whose index
    labels differ are no longer aligned by label (which silently produced NaN
    terms), and plain Python sequences are accepted as well.

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, same length as ``y_true``.
        weights: Array-like of per-sample weights, same length as ``y_true``.

    Returns:
        The score as ``np.float64``. If the weighted sum of squared targets
        is zero the division yields ``nan``/``-inf`` (NumPy semantics).
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    weights = np.asarray(weights, dtype=np.float64)

    residual_ss = np.sum(weights * (y_true - y_pred) ** 2)
    total_ss = np.sum(weights * y_true ** 2)
    return 1 - residual_ss / total_ss

In [9]:
# LightGBM settings shared by every training run in this notebook.
LGB_PARAMS = dict(
    objective='regression_l2',  # squared-error regression
    metric='rmse',              # evaluation metric logged and used for early stopping
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,               # LightGBM: values <= 0 mean no depth limit
    random_state=42,
    device='gpu',               # requires a GPU-enabled LightGBM build
)

# Callbacks passed to lgb.train: log metrics every 50 rounds and stop once the
# validation metric has not improved for 100 consecutive rounds.
verbose_eval_callback = lgb.log_evaluation(period=50)
early_stopping_callback = lgb.early_stopping(stopping_rounds=100)

### 0. model

In [10]:
# Baseline: datasets carry features and the 'responder_6' label only (no sample weights).
# The feature frames are passed inline so no extra reference to the large copies is kept.
train_ds = lgb.Dataset(data=train_data[feature_col], label=train_data['responder_6'])
valid_ds = lgb.Dataset(data=valid_data[feature_col], label=valid_data['responder_6'])

# Train for up to 1000 rounds; both sets are evaluated each round and reported
# under the names 'train' and 'valid'.
model = lgb.train(
    params=LGB_PARAMS,
    train_set=train_ds,
    num_boost_round=1000,
    valid_sets=[train_ds, valid_ds],
    valid_names=['train', 'valid'],
    callbacks=[early_stopping_callback, verbose_eval_callback],
)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 19736
[LightGBM] [Info] Number of data points in the train set: 31833740, number of used features: 81
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 81 dense feature groups (2550.16 MB) transferred to GPU in 1.963214 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -0.004831
Training until validation scores don't improve for 100 rounds
[50]	train's rmse: 0.825286	valid's rmse: 0.754016
[100]	train's rmse: 0.823233	valid's rmse: 0.753832
[150]	train's rmse: 0.821707	valid's rmse: 0.753828
[200]	train's rmse: 0.820662	valid's rmse: 0.753799
[250]	train's rmse: 0.819706	valid's rmse: 0.753804
Early stopping, best iteration is:
[183]	train's rmse: 0.821047	valid's rmse: 0.75378


In [11]:
# Predict on the validation rows and score them with the weighted zero-mean R².
y_valid_pred = model.predict(valid_data[feature_col])
r2_score = calculate_r2(
    y_true=valid_data['responder_6'],
    y_pred=y_valid_pred,
    weights=valid_data['weight'],
)

In [12]:
# Display the validation score computed by calculate_r2 in the previous cell.
r2_score

np.float64(0.004748522828299517)

### 1. weight 반영 모델

In [13]:
# Dataset 생성 시 weight 파라미터 추가
train_ds = lgb.Dataset(
    train_data.loc[:, feature_col],
    label=train_data.loc[:, 'responder_6'],
    weight=train_data['weight']  # 가중치 추가
)
valid_ds = lgb.Dataset(
    valid_data.loc[:, feature_col],
    label=valid_data.loc[:, 'responder_6'],
    weight=valid_data['weight']  # 가중치 추가
)

model = lgb.train(
    LGB_PARAMS,
    train_ds,
    num_boost_round=1000,
    valid_sets=[train_ds, valid_ds],
    valid_names=['train', 'valid'],
    callbacks=[early_stopping_callback, verbose_eval_callback],
)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 19736
[LightGBM] [Info] Number of data points in the train set: 31833740, number of used features: 81
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 81 dense feature groups (2550.16 MB) transferred to GPU in 1.953941 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -0.006465
Training until validation scores don't improve for 100 rounds
[50]	train's rmse: 0.785878	valid's rmse: 0.742483
[100]	train's rmse: 0.783749	valid's rmse: 0.742247
[150]	train's rmse: 0.781923	valid's rmse: 0.742165
[200]	train's rmse: 0.780667	valid's rmse: 0.742134
[250]	train's rmse: 0.779569	valid's rmse: 0.74212
[300]	train's rmse: 0.778581	valid's rmse: 0.74214
[350]	train's rmse: 0.777555	valid's rmse

In [14]:
# Predict on the validation rows with the current `model` and score them with
# the weighted zero-mean R².
y_valid_pred = model.predict(valid_data[feature_col])
r2_score = calculate_r2(
    y_true=valid_data['responder_6'],
    y_pred=y_valid_pred,
    weights=valid_data['weight'],
)

In [15]:
# Display the validation score computed by calculate_r2 in the previous cell.
r2_score

np.float64(0.0050985918093915394)