In [4]:
import numpy as np
import os
import gc
import pickle
import warnings

import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor

import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import joblib

#import kaggle_evaluation.jane_street_inference_server

# Settings: show every DataFrame column and silence all warnings for the session.
pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [1]:

# Column the models are trained to predict.
TARGET = 'responder_6'
# Zero-padded feature column names: feature_00 through feature_78.
FEAT_COLS_CAT = ["feature_%02d" % idx for idx in range(79)]

In [9]:
def calculate_r2(y_true, y_pred, weights):
    """Return the weighted, zero-mean R^2 of a set of predictions.

    Computed as ``1 - sum(w * (y_true - y_pred)^2) / sum(w * y_true^2)``;
    the denominator uses the raw squared targets rather than deviations
    from their mean.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, element-wise compatible with ``y_true``.
        weights: Per-sample weights, element-wise compatible with ``y_true``.

    Returns:
        The weighted R^2 score as a scalar.
    """
    residual = y_true - y_pred
    weighted_sq_error = np.sum(weights * residual ** 2)
    weighted_sq_target = np.sum(weights * (y_true ** 2))
    return 1 - (weighted_sq_error / weighted_sq_target)

In [8]:
import polars as pl

def load_data(date_id_range=None, time_id_range=None, columns=None, return_type='pl',
              data_path='/kaggle/input/js24-preprocessing-create-lags/training.parquet'):
    """Load the training parquet, optionally restricted by date, time and columns.

    The parquet is scanned lazily and only collected after every filter and the
    column selection have been attached to the query, so polars can push them
    down into the reader instead of materialising the whole file first.

    Args:
        date_id_range: Optional ``(start, end)`` pair; keeps rows whose
            ``date_id`` lies in the inclusive range.
        time_id_range: Optional ``(start, end)`` pair; keeps rows whose
            ``time_id`` lies in the inclusive range.
        columns: Optional list of column names to keep. Applied after the
            filters, so ``date_id`` / ``time_id`` need not be selected.
        return_type: ``'pd'`` returns a pandas DataFrame; any other value
            returns a polars DataFrame.
        data_path: Parquet file to read. Defaults to the lagged training set.
            TODO (from the original note): switch to the author's own dataset
            without lag features.

    Returns:
        The filtered data as a polars DataFrame, or as a pandas DataFrame when
        ``return_type == 'pd'``.
    """
    query = pl.scan_parquet(data_path)

    if date_id_range is not None:
        start_date, end_date = date_id_range
        query = query.filter((pl.col("date_id") >= start_date) & (pl.col("date_id") <= end_date))

    if time_id_range is not None:
        start_time, end_time = time_id_range
        query = query.filter((pl.col("time_id") >= start_time) & (pl.col("time_id") <= end_time))

    if columns is not None:
        query = query.select(columns)

    # Materialise only now, once the full query (filters + projection) is known.
    data = query.collect()

    if return_type == 'pd':
        return data.to_pandas()
    return data

In [12]:

def train_catboost_holdout(total_days=1699, train_days=680, validation_days=170, cat_features=None, save_model=True, save_path='modelcat2/'):
    """Train a CatBoost regressor with a time-based hold-out split and report weighted R2.

    The last ``validation_days`` date_ids (ending at ``total_days - 1``) form the
    validation set; the ``train_days`` date_ids immediately before them form the
    training set. The model is fitted on the GPU with early stopping against the
    validation set, scored with ``calculate_r2`` and optionally saved to disk.

    Args:
        total_days: Number of date_ids in the dataset; the last one used is
            ``total_days - 1``.
        train_days: Number of date_ids to train on, counted backwards from the
            start of the validation window.
        validation_days: Number of trailing date_ids held out for validation.
        cat_features: Column names CatBoost should treat as categorical, or None.
        save_model: When True, dump the fitted model with joblib.
        save_path: Directory the model file is written to; created if missing.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    if save_model:
        # Create the output directory up front; exist_ok avoids the check-then-create race
        # and surfaces an unusable path before the long training run starts.
        os.makedirs(save_path, exist_ok=True)

    # Define validation and training range (both bounds inclusive).
    valid_start = total_days - validation_days
    valid_range = (valid_start, total_days - 1)  # last `validation_days` days
    # Clamp so an oversized `train_days` reports the real lower bound instead of a
    # negative date_id. NOTE(review): assumes date_id is never negative - confirm.
    train_start = max(valid_start - train_days, 0)
    train_range = (train_start, valid_start - 1)  # `train_days` days right before validation

    print(f"Validation range: {valid_range}")
    print(f"Training range: {train_range}")

    load_columns = ["date_id", "symbol_id", "weight", "time_id"] + FEAT_COLS_CAT + [TARGET]
    # NOTE(review): `weight` is used both as a model input and as the sample weight -
    # confirm this is intended.
    model_features = FEAT_COLS_CAT + ['symbol_id', 'weight', 'time_id']

    # Load straight to pandas (what CatBoost consumes) so the intermediate polars
    # frames are released inside load_data rather than kept alive next to the copies.
    valid_df = load_data(date_id_range=valid_range, columns=load_columns, return_type='pd')
    train_df = load_data(date_id_range=train_range, columns=load_columns, return_type='pd')

    print(f"Use categorical features: {cat_features}")
    print(f"Train shape: {train_df.shape}")
    print(f"Valid shape: {valid_df.shape}")

    # Train CatBoost model
    catboost_model = CatBoostRegressor(
        loss_function='RMSE',
        eval_metric='RMSE',
        iterations=1000,
        learning_rate=0.03,
        early_stopping_rounds=50,
        verbose=100,
        cat_features=cat_features,
        task_type='GPU'
    )

    # NOTE(review): the eval_set is passed without weights while training uses
    # sample_weight, so early stopping presumably tracks unweighted RMSE - confirm.
    catboost_model.fit(
        train_df[model_features],
        train_df[TARGET],
        eval_set=(valid_df[model_features], valid_df[TARGET]),
        sample_weight=train_df['weight']
    )

    # Predict on the hold-out set and compute the weighted R2.
    valid_df['catboost_pred'] = catboost_model.predict(valid_df[model_features])
    holdout_r2 = calculate_r2(valid_df[TARGET], valid_df['catboost_pred'], valid_df['weight'])
    print(f"CatBoost Hold-out validation R2 score: {holdout_r2}")

    # Save model
    if save_model:
        model_path = os.path.join(save_path, "catboost_holdout_model.pkl")  # output location
        joblib.dump(catboost_model, model_path)
        print(f"Saved model to {model_path}")

    return catboost_model


In [13]:
# Columns handed to CatBoost as categorical features.
CAT_FEATURES = [
    'feature_09',
    'feature_10',
    'feature_11',
    'symbol_id',
    'time_id',
]

# Hold out the final 170 date_ids and train on the 1529 date_ids before them.
catboost_model = train_catboost_holdout(
    total_days=1699,
    train_days=1529,
    validation_days=170,
    cat_features=CAT_FEATURES,
    save_path='modelcatholdout2/',
)

Validation range: (1529, 1698)
Training range: (0, 1528)


Use categorical features: ['feature_09', 'feature_10', 'feature_11', 'symbol_id', 'time_id']
Train shape: (40808218, 84)
Valid shape: (5860272, 84)
0:	learn: 0.8592963	test: 0.8186848	best: 0.8186848 (0)	total: 976ms	remaining: 16m 15s
100:	learn: 0.8551070	test: 0.8165467	best: 0.8165467 (100)	total: 1m 38s	remaining: 14m 35s
200:	learn: 0.8536836	test: 0.8160565	best: 0.8160565 (200)	total: 3m 18s	remaining: 13m 7s
300:	learn: 0.8527655	test: 0.8157660	best: 0.8157660 (300)	total: 4m 58s	remaining: 11m 34s
400:	learn: 0.8520024	test: 0.8155967	best: 0.8155967 (400)	total: 6m 38s	remaining: 9m 55s
500:	learn: 0.8513609	test: 0.8154919	best: 0.8154919 (500)	total: 8m 21s	remaining: 8m 19s
600:	learn: 0.8508325	test: 0.8154216	best: 0.8154212 (597)	total: 10m 2s	remaining: 6m 39s
700:	learn: 0.8502459	test: 0.8153591	best: 0.8153591 (700)	total: 11m 43s	remaining: 5m
800:	learn: 0.8497600	test: 0.8153258	best: 0.8153258 (800)	total: 13m 26s	remaining: 3m 20s
900:	learn: 0.8492200	test: 