In [1]:

import numpy as np
import os
import gc
import pickle
import warnings

import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor

import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import joblib

#import kaggle_evaluation.jane_street_inference_server

# Settings: silence all Python warnings and let pandas display every column of wide frames.
warnings.simplefilter("ignore")
pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Column used as the regression label by the training code in this notebook.
TARGET = 'responder_6'

# Base feature columns: feature_00 ... feature_78.
FEAT_COLS_CAT = ["feature_%02d" % idx for idx in range(79)]

# The same base features followed by the one-step lag column of each of the
# nine responders (responder_0_lag_1 ... responder_8_lag_1).
FEAT_COLS_LGB = ["feature_%02d" % idx for idx in range(79)] + [
    "responder_%d_lag_1" % idx for idx in range(9)
]

In [3]:
def calculate_r2(y_true, y_pred, weights):
    """Sample-weighted, zero-mean R^2 score.

    Computes ``1 - sum(w * (y_true - y_pred)**2) / sum(w * y_true**2)``.
    Unlike ``sklearn.metrics.r2_score`` the denominator uses the raw
    ``y_true**2`` (no mean subtraction), so an all-zero prediction scores 0.

    Parameters
    ----------
    y_true, y_pred, weights : array-like of equal length
        Targets, predictions and per-sample weights. Lists, NumPy arrays and
        pandas Series are accepted; values are compared positionally.

    Returns
    -------
    numpy.float64
        The weighted R^2. If ``sum(w * y_true**2)`` is zero the result is not
        finite (NumPy division semantics: ``nan`` or ``-inf``).
    """
    # Coerce to plain float arrays: lists would otherwise fail on `-`/`*`, and
    # pandas Series with different indexes would be index-aligned (producing
    # NaNs) instead of being compared element by element.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    weights = np.asarray(weights, dtype=np.float64)

    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * y_true ** 2)
    return 1 - numerator / denominator

In [4]:
import polars as pl

def load_data(date_id_range=None, time_id_range=None, columns=None, return_type='pl',
              path='/kaggle/input/js24-preprocessing-create-lags/training.parquet'):
    """Load (a slice of) the training parquet file.

    The query is built lazily and collected once at the end, so the row
    filters and the column selection are part of the query plan instead of
    being applied to a fully materialised copy of the whole file.

    Parameters
    ----------
    date_id_range : tuple(start, end) or None
        Inclusive ``date_id`` bounds; ``None`` keeps every date.
    time_id_range : tuple(start, end) or None
        Inclusive ``time_id`` bounds; ``None`` keeps every time step.
    columns : list of str or None
        Columns to return (in this order); ``None`` keeps all columns.
    return_type : str
        ``'pd'`` returns a pandas DataFrame; any other value returns a
        polars DataFrame.
    path : str
        Parquet file to read.
        TODO (original author's note): switch to my own dataset, the one
        without lag features.

    Returns
    -------
    polars.DataFrame or pandas.DataFrame
    """
    query = pl.scan_parquet(path)

    if date_id_range is not None:
        start_date, end_date = date_id_range
        query = query.filter((pl.col("date_id") >= start_date) & (pl.col("date_id") <= end_date))

    if time_id_range is not None:
        start_time, end_time = time_id_range
        query = query.filter((pl.col("time_id") >= start_time) & (pl.col("time_id") <= end_time))

    # Select last so the filters above may reference columns that are not
    # part of the requested output.
    if columns is not None:
        query = query.select(columns)

    data = query.collect()

    if return_type == 'pd':
        return data.to_pandas()
    return data

In [5]:
def _make_date_folds(first_day, last_day, n_splits):
    """Split the inclusive date range [first_day, last_day] into n_splits contiguous folds.

    Each fold spans ``(last_day - first_day + 1) // n_splits`` days; the final
    fold is stretched to ``last_day`` so the days left over by the integer
    division are not dropped.
    """
    fold_size = (last_day - first_day + 1) // n_splits
    folds = [(first_day + i * fold_size, first_day + (i + 1) * fold_size - 1)
             for i in range(n_splits)]
    folds[-1] = (folds[-1][0], last_day)
    return folds


def train_lgb_kfold_single(total_days=1699, n_splits=5, save_model=True, save_path='modellgb/',
                           num_dates=1699, max_valid_days=1200):
    """Train one LightGBM model per fold on the most recent days of the data.

    The last ``min(total_days, max_valid_days)`` date ids (ending at
    ``num_dates - 1``) are cut into ``n_splits`` contiguous folds. For each
    fold a model is trained on all other folds, early-stopped on the held-out
    fold, and scored with ``calculate_r2``.

    Parameters
    ----------
    total_days : int
        Number of most recent days to use (capped at ``max_valid_days``).
    n_splits : int
        Number of folds; must be at least 2.
    save_model : bool
        If true, dump the list of models to ``<save_path>/lgb_models.pkl``.
    save_path : str
        Directory for the saved models; created when missing.
    num_dates : int
        Number of date ids in the data set (date ids run 0 .. num_dates - 1).
    max_valid_days : int
        Upper bound on the number of days used.

    Returns
    -------
    list
        The trained LightGBM boosters, one per fold, in fold order.

    Raises
    ------
    ValueError
        If ``n_splits < 2`` or fewer days than folds are available.
    """
    if n_splits < 2:
        raise ValueError("n_splits must be at least 2 so that every fold has training data")

    valid_days = min(total_days, max_valid_days)
    if valid_days < n_splits:
        raise ValueError(
            f"need at least one day per fold: got {valid_days} day(s) for {n_splits} folds")

    if save_model:
        os.makedirs(save_path, exist_ok=True)

    last_day = num_dates - 1
    first_day = num_dates - valid_days
    folds = _make_date_folds(first_day, last_day, n_splits)

    load_columns = ["date_id", "symbol_id", "weight"] + FEAT_COLS_LGB + [TARGET]
    # The sample weight is also fed to the model as an input feature.
    model_features = FEAT_COLS_LGB + ['weight']

    # LightGBM parameters (identical for every fold).
    lgb_params = {
        'objective': 'regression_l2',
        'metric': 'rmse',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'max_depth': -1,
        'random_state': 42,
        'device': 'gpu',
    }

    lgb_models = []

    for fold_idx, valid_range in enumerate(folds):
        print(f'Fold {fold_idx}: Training LGB')
        valid_data = load_data(
            date_id_range=valid_range,
            columns=load_columns,
            return_type='pl')

        # Load the training data. The folds are contiguous, so everything
        # outside the validation fold is at most two date ranges (before and
        # after it); this avoids one full load per training fold.
        train_ranges = []
        if valid_range[0] > first_day:
            train_ranges.append((first_day, valid_range[0] - 1))
        if valid_range[1] < last_day:
            train_ranges.append((valid_range[1] + 1, last_day))

        train_data = None
        for train_range in train_ranges:
            partial_train_data = load_data(
                date_id_range=train_range,
                columns=load_columns,
                return_type='pl')
            if train_data is None:
                train_data = partial_train_data
            else:
                train_data = train_data.vstack(partial_train_data)
            del partial_train_data

        print(f"Train Data (before making ds) shape: {train_data.shape}")
        print(f"Valid Data (before making ds) shape: {valid_data.shape}")

        # Convert once to pandas, then drop the polars frames so they do not
        # stay in memory for the whole training run.
        X_train = train_data[model_features].to_pandas()
        y_train = train_data[TARGET].to_pandas()
        w_train = train_data['weight'].to_pandas()
        X_valid = valid_data[model_features].to_pandas()
        y_valid = valid_data[TARGET].to_pandas()
        w_valid = valid_data['weight'].to_pandas()
        del train_data, valid_data
        gc.collect()

        # Build the LightGBM datasets.
        train_ds = lgb.Dataset(X_train, label=y_train, weight=w_train)
        valid_ds = lgb.Dataset(X_valid, label=y_valid, weight=w_valid, reference=train_ds)

        # Callbacks: early stopping plus periodic metric logging (fresh
        # objects for every fold).
        callbacks = [
            lgb.early_stopping(100),
            lgb.log_evaluation(period=50)
        ]

        # Train the model.
        model = lgb.train(
            lgb_params,
            train_ds,
            num_boost_round=1000,
            valid_sets=[train_ds, valid_ds],
            valid_names=['train', 'valid'],
            callbacks=callbacks
        )

        lgb_models.append(model)

        # Compute the weighted R2 on the held-out fold.
        y_valid_pred = model.predict(X_valid)
        fold_r2 = calculate_r2(y_valid, y_valid_pred, w_valid)
        print(f"LGB Fold {fold_idx} validation R2 score: {fold_r2}")

        # Release this fold's data before the next fold is loaded.
        del train_ds, valid_ds, X_train, y_train, w_train, X_valid, y_valid, w_valid, y_valid_pred
        gc.collect()

    # Save the models.
    if save_model:
        joblib.dump(lgb_models, os.path.join(save_path, "lgb_models.pkl"))
        print("Saved all models to lgb_models.pkl")

    return lgb_models

In [6]:
# Run the K-fold LightGBM training with total_days=500 (other arguments at their defaults);
# the returned list of per-fold models is kept in `lgb_models`.
lgb_models = train_lgb_kfold_single(total_days=500)

Fold 0: Training LGB


Train Data (before making ds) shape: (14254768, 92)
Valid Data (before making ds) shape: (3713248, 92)
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 21987
[LightGBM] [Info] Number of data points in the train set: 14254768, number of used features: 89
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 89 dense feature groups (1250.69 MB) transferred to GPU in 1.043705 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -0.002701
Training until validation scores don't improve for 100 rounds
[50]	train's rmse: 0.810927	valid's rmse: 0.771196
[100]	train's rmse: 0.80772	valid's rmse: 0.770536
[150]	train's rmse: 0.804678	valid's rmse: 0.770276
[200]	train's rmse: 0.802237	valid's rmse: 0.770103
[250]	train's rmse: 0.800376	valid's rmse: 0.769996
[300]	train's rmse: 0.798676	valid's rmse: 0.769934
[350]	train's rmse: 0.797323	valid's rmse: 0.769904
[400]	train's rmse: 0.796026	valid's rmse: 0.769868
[450]	train's rmse: 0.7947	valid's rmse: 0.769851
[500]	train's rmse: 0.793477	valid's rmse: 0.769853
Early stopping, best iteration is:
[442]	train's rmse: 0.794971	valid's rmse: 0.769827
LGB Fold 0 validation R2 score: 0.01194556415139314
Fold 1: Training 

: 