In [1]:
import numpy as np
import os
import gc
import pickle
import warnings

import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor

import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import joblib

#import kaggle_evaluation.jane_street_inference_server

# Notebook-wide settings
warnings.filterwarnings("ignore")  # silence every warning category for the whole session
pd.set_option("display.max_columns", None)  # show all columns when a frame is displayed

## Configurations (for nn)

In [2]:
class CONFIG:
    """Static settings used by the neural-network cells of this notebook."""

    seed = 42
    target_col = "responder_6"

    # 79 `feature_XX` columns followed by the 9 `responder_X_lag_1` columns.
    # (A variant that also prepended "symbol_id" and "time_id" is disabled.)
    feature_cols = [
        *(f"feature_{idx:02d}" for idx in range(79)),
        *(f"responder_{idx}_lag_1" for idx in range(9)),
    ]

    # Locations of trained model artifacts; the NN loading cell reads entry 0.
    # Disabled alternatives:
    #   /kaggle/input/js24-train-gbdt-model-with-lags-singlemodel/result.pkl
    #   /kaggle/input/js24-trained-gbdt-model/result.pkl
    model_paths = [
        "/kaggle/input/js-xs-nn-trained-model",
    ]

## load data (for nn training - calculate cv)

In [3]:

# Disabled loader for the preprocessed validation split, parked inside a string literal
# instead of being commented out. Because the string is the cell's only expression,
# the notebook echoes it back as the cell output.
'''
valid = pl.scan_parquet(
    f"/kaggle/input/js24-preprocessing-create-lags/validation.parquet/"
).collect().to_pandas()
'''

'\nvalid = pl.scan_parquet(\n    f"/kaggle/input/js24-preprocessing-create-lags/validation.parquet/"\n).collect().to_pandas()\n'

## load model (for nn)

In [4]:
def r2_val(y_true, y_pred, sample_weight):
    """Weighted, zero-baseline R² used as the validation metric.

    Computes ``1 - avg_w((y_pred - y_true)^2) / (avg_w(y_true^2) + 1e-38)``, where
    ``avg_w`` is the ``sample_weight``-weighted average. The tiny constant keeps the
    denominator non-zero when every target is 0.
    """
    squared_residuals = (y_pred - y_true) ** 2
    weighted_error = np.average(squared_residuals, weights=sample_weight)
    weighted_energy = np.average((y_true) ** 2, weights=sample_weight)
    return 1 - weighted_error / (weighted_energy + 1e-38)


class NN(LightningModule):
    """Feed-forward regressor trained with a sample-weighted squared-error loss.

    For every entry of ``hidden_dims`` the network stacks
    ``BatchNorm1d -> [SiLU] -> [Dropout] -> Linear``; SiLU is skipped for the first
    group and Dropout is only added while ``dropouts`` has an entry for that group.
    The stack ends with ``Linear(..., 1)`` and ``Tanh``; ``forward`` scales the Tanh
    output by 5, so predictions are bounded to [-5, 5].

    NOTE: the order in which layers are appended determines the parameter names inside
    ``self.model`` (an ``nn.Sequential``), so changing it changes the ``state_dict`` keys.
    """

    def __init__(self, input_dim, hidden_dims, dropouts, lr, weight_decay):
        """Build the layer stack and store the optimiser settings.

        Args:
            input_dim: number of input features.
            hidden_dims: output width of each hidden ``Linear`` layer, in order.
            dropouts: dropout probability per hidden group; may be shorter than
                ``hidden_dims``, in which case later groups get no dropout.
            lr: learning rate passed to Adam.
            weight_decay: weight decay passed to Adam.
        """
        super().__init__()
        # Records the constructor arguments as hyperparameters of this module.
        self.save_hyperparameters()
        layers = []
        in_dim = input_dim
        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.BatchNorm1d(in_dim))
            # No activation before the first Linear layer: it sees the normalised raw inputs.
            if i > 0:
                layers.append(nn.SiLU())
            if i < len(dropouts):
                layers.append(nn.Dropout(dropouts[i]))
            layers.append(nn.Linear(in_dim, hidden_dim))
            # layers.append(nn.ReLU())
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, 1))  # output layer
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)
        self.lr = lr
        self.weight_decay = weight_decay
        # Per-batch (prediction, target, weight) tuples collected during validation.
        self.validation_step_outputs = []

    def forward(self, x):
        """Return predictions for ``x`` with the trailing size-1 dimension squeezed away."""
        return 5 * self.model(x).squeeze(-1)  # Tanh output scaled to [-5, 5]

    def training_step(self, batch):
        """Compute and log the weighted MSE for one ``(x, y, w)`` training batch."""
        x, y, w = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y, reduction='none') * w  # apply the per-sample weights
        # Plain mean over the batch; the loss is not divided by the sum of the weights.
        loss = loss.mean()
        self.log('train_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        return loss

    def validation_step(self, batch):
        """Compute and log the weighted MSE for one validation batch and keep its outputs."""
        x, y, w = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y, reduction='none') * w
        loss = loss.mean()
        self.log('val_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        # Stored so the epoch-level weighted R² can be computed over all batches at once.
        self.validation_step_outputs.append((y_hat, y, w))
        return loss

    def on_validation_epoch_end(self):
        """Log the weighted R² (``val_r_square``) over all validation batches of the epoch.

        During the trainer's sanity check nothing is logged; the collected outputs are
        cleared in either case.
        """
        y = torch.cat([x[1] for x in self.validation_step_outputs]).cpu().numpy()
        if self.trainer.sanity_checking:
            # Predictions are gathered but not used on this path.
            prob = torch.cat([x[0] for x in self.validation_step_outputs]).cpu().numpy()
        else:
            prob = torch.cat([x[0] for x in self.validation_step_outputs]).cpu().numpy()
            weights = torch.cat([x[2] for x in self.validation_step_outputs]).cpu().numpy()
            # r2_val
            val_r_square = r2_val(y, prob, weights)
            self.log("val_r_square", val_r_square, prog_bar=True, on_step=False, on_epoch=True)
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        """Return Adam plus a ReduceLROnPlateau scheduler that monitors ``val_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        # Halve the learning rate after 5 epochs without improvement of the monitored value.
        # NOTE(review): `verbose` is reported as deprecated in recent PyTorch releases —
        # confirm against the installed version.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5,
                                                               verbose=True)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',
            }
        }

    def on_train_epoch_end(self):
        """Print every logged metric, formatted to 5 decimals, at the end of a training epoch."""
        if self.trainer.sanity_checking:
            return
        epoch = self.trainer.current_epoch
        # Tensors are converted to plain Python numbers so they can be formatted.
        metrics = {k: v.item() if isinstance(v, torch.Tensor) else v for k, v in self.trainer.logged_metrics.items()}
        formatted_metrics = {k: f"{v:.5f}" for k, v in metrics.items()}
        print(f"Epoch {epoch}: {formatted_metrics}")


## Load data (Kaggle)

In [5]:
# def load_data(date_id_range=None, time_id_range=None, columns=None, return_type='pl'):
#     data_dir = '/kaggle/input/jane-street-real-time-market-data-forecasting'
#     data = pl.scan_parquet(f'{data_dir}/train.parquet')
    
#     if date_id_range is not None:
#         start_date, end_date = date_id_range
#         data = data.filter((pl.col("date_id") >= start_date) & (pl.col("date_id") <= end_date))
    
#     if time_id_range is not None:
#         start_time, end_time = time_id_range
#         data = data.filter((pl.col("time_id") >= start_time) & (pl.col("time_id") <= end_time))
    
#     if columns is not None:
#         data = data.select(columns)

#     if return_type == 'pd':
#         return data.collect().to_pandas()
#     else:
#         return data.collect()

## Load data (local)

In [6]:
# train_preprocessed = pl.scan_parquet(
#     f"/home/jupyter-chan/.cache/kagglehub/datasets/yuanweijun/js24-preprocessing-create-lags/versions/1/training.parquet/"
#     ).collect().to_pandas()
# valid_preprocessed = pl.scan_parquet(
#     f"/home/jupyter-chan/.cache/kagglehub/datasets/yuanweijun/js24-preprocessing-create-lags/versions/1/validation.parquet/"
#     ).collect().to_pandas()

In [7]:
# data = pd.concat([train_preprocessed, valid_preprocessed], ignore_index=True)
# data = pl.DataFrame(data)

In [8]:
# min_date = data.select(pl.col("date_id").min()).item()
# max_date = data.select(pl.col("date_id").max()).item()

# print(f"Min date_id: {min_date}")
# print(f"Max date_id: {max_date}")

In [9]:
# data = pl.scan_parquet(f'./data/train.parquet').collect()

In [10]:
# min_date = data.select(pl.col("date_id").min()).item()
# max_date = data.select(pl.col("date_id").max()).item()

# print(f"Min date_id: {min_date}")
# print(f"Max date_id: {max_date}")

In [11]:
def load_data(date_id_range=None, time_id_range=None, columns=None, return_type='pl',
              data_path='./data/train.parquet'):
    """Load a slice of the training parquet.

    The row filters and the column projection are applied to the lazy scan *before*
    ``collect()``, so only the requested rows and columns are materialised instead of
    reading the whole file on every call.

    Args:
        date_id_range: optional ``(start, end)`` pair; keeps rows with
            ``start <= date_id <= end`` (both ends inclusive).
        time_id_range: optional ``(start, end)`` pair; keeps rows with
            ``start <= time_id <= end`` (both ends inclusive).
        columns: optional list of column names to keep, applied after the filters.
        return_type: ``'pd'`` returns a pandas DataFrame; any other value returns a
            polars DataFrame.
        data_path: parquet file to scan. The author's note on the default says it is
            the file without lag columns.

    Returns:
        The selected data as a polars or pandas DataFrame, depending on ``return_type``.
    """
    lazy_frame = pl.scan_parquet(data_path)

    if date_id_range is not None:
        start_date, end_date = date_id_range
        lazy_frame = lazy_frame.filter(
            (pl.col("date_id") >= start_date) & (pl.col("date_id") <= end_date)
        )

    if time_id_range is not None:
        start_time, end_time = time_id_range
        lazy_frame = lazy_frame.filter(
            (pl.col("time_id") >= start_time) & (pl.col("time_id") <= end_time)
        )

    if columns is not None:
        lazy_frame = lazy_frame.select(columns)

    data = lazy_frame.collect()

    if return_type == 'pd':
        return data.to_pandas()
    return data

In [12]:
# 일단 보류
# def load_data_stacking(data_with_cat=None,date_id_range=None, time_id_range=None, columns=None, return_type='pl'):
#     data = pd.concat([train_preprocessed, valid_preprocessed], ignore_index=True)
#     data = pl.DataFrame(data)
#     data_with_cat = pl.DataFrame(data_with_cat)
    
#     if date_id_range is not None:
#         start_date, end_date = date_id_range
#         data = data.filter((pl.col("date_id") >= start_date) & (pl.col("date_id") <= end_date))
#         print('data1 shape:',data.shape)
#         data_with_cat = data_with_cat.with_columns(data["date_id"])
#         print('data2 shape:', data_with_cat.shape)
#         data = pl.concat([data,data_with_cat.select(['catboost_pred'])], how="horizontal")
#         print('data3 shape:',data.shape)
        
#     if time_id_range is not None:
#         start_time, end_time = time_id_range
#         data = data.filter((pl.col("time_id") >= start_time) & (pl.col("time_id") <= end_time))
    
#     if columns is not None:
#         data = data.select(columns)

#     if return_type == 'pd':
#         return data.to_pandas()
#     else:
#         return data

In [13]:
def calculate_r2(y_true, y_pred, weights):
    """Weighted, zero-baseline R²: ``1 - sum(w * (y - y_hat)^2) / sum(w * y^2)``.

    No guard is applied to the denominator; it is zero when every weighted target is 0.
    """
    squared_error = (y_true - y_pred) ** 2
    weighted_sse = np.sum(weights * squared_error)
    weighted_sst = np.sum(weights * (y_true ** 2))
    return 1 - (weighted_sse / weighted_sst)

In [14]:
# Prediction target and the per-model feature lists:
# CatBoost gets the 79 raw features, LightGBM additionally gets the 9 responder lags.
TARGET = 'responder_6'
FEAT_COLS_CAT = ["feature_%02d" % i for i in range(79)]
FEAT_COLS_LGB = FEAT_COLS_CAT + [f"responder_{k}_lag_1" for k in range(9)]

In [15]:
def train_lgb_kfold_single(total_days=1699, n_splits=5, save_model=True, save_path='modellgb/'):
    """Train one LightGBM model per fold on contiguous ``date_id`` blocks.

    The ``min(total_days, 1200)`` days immediately before date_id 1699 are cut into
    ``n_splits`` contiguous blocks of equal length (leftover days from the integer
    division are not used). Each block is the validation set once; the other blocks,
    stacked together, are the training set. ``weight`` is used both as an input
    feature and as the sample weight.

    Args:
        total_days: number of most recent days to use, capped at 1200.
        n_splits: number of folds; must be at least 2.
        save_model: when true, all models are dumped to ``<save_path>/lgb_models.pkl``.
        save_path: output directory, created if missing.

    Returns:
        List with one trained LightGBM booster per fold.

    Raises:
        ValueError: if ``n_splits`` is smaller than 2 (a fold would have no training data).
    """
    if n_splits < 2:
        raise ValueError(f"n_splits must be >= 2 to leave training data for every fold, got {n_splits}")

    if save_model:
        # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
        os.makedirs(save_path, exist_ok=True)

    dataset_days = 1699   # the fold window is anchored to the days before this date_id
    max_valid_days = 1200
    valid_days = min(total_days, max_valid_days)
    valid_start = dataset_days - valid_days

    fold_size = valid_days // n_splits
    folds = [(valid_start + i * fold_size, valid_start + (i + 1) * fold_size - 1) for i in range(n_splits)]

    load_columns = ["date_id", "symbol_id", "weight"] + FEAT_COLS_LGB + [TARGET]
    model_inputs = FEAT_COLS_LGB + ['weight']

    # LightGBM parameters (identical for every fold)
    LGB_PARAMS = {
        'objective': 'regression_l2',
        'metric': 'rmse',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'max_depth': -1,
        'random_state': 42,
        'device': 'gpu',
    }

    lgb_models = []

    for fold_idx, valid_range in enumerate(folds):
        train_ranges = [fold for i, fold in enumerate(folds) if i != fold_idx]
        print(f'Fold {fold_idx}: Training LGB')

        # Load the validation block
        valid_data = load_data(date_id_range=valid_range, columns=load_columns, return_type='pl')

        # Load every other block and stack them into the training set
        train_data = None
        for train_range in train_ranges:
            partial_train_data = load_data(date_id_range=train_range, columns=load_columns, return_type='pl')
            train_data = partial_train_data if train_data is None else train_data.vstack(partial_train_data)

        print(f"Train Data (before making ds) shape: {train_data.shape}")
        print(f"Valid Data (before making ds) shape: {valid_data.shape}")

        # Convert the validation inputs once; they feed both the Dataset and the final predict.
        valid_features = valid_data[model_inputs].to_pandas()
        valid_target = valid_data[TARGET].to_pandas()
        valid_weight = valid_data['weight'].to_pandas()

        # Build the LightGBM datasets
        train_ds = lgb.Dataset(train_data[model_inputs].to_pandas(),
                               label=train_data[TARGET].to_pandas(),
                               weight=train_data['weight'].to_pandas())
        valid_ds = lgb.Dataset(valid_features,
                               label=valid_target,
                               weight=valid_weight,
                               reference=train_ds)

        # Fresh callbacks per fold so no early-stopping state is shared between folds
        callbacks = [
            lgb.early_stopping(100),
            lgb.log_evaluation(period=50)
        ]

        # Train the model
        model = lgb.train(
            LGB_PARAMS,
            train_ds,
            num_boost_round=1000,
            valid_sets=[train_ds, valid_ds],
            valid_names=['train', 'valid'],
            callbacks=callbacks
        )

        lgb_models.append(model)

        # Weighted R² on the validation block (local name avoids shadowing sklearn's r2_score)
        y_valid_pred = model.predict(valid_features)
        fold_r2 = calculate_r2(valid_target, y_valid_pred, valid_weight)
        print(f"LGB Fold {fold_idx} validation R2 score: {fold_r2}")

    # Save all fold models in a single file
    if save_model:
        joblib.dump(lgb_models, os.path.join(save_path, "lgb_models.pkl"))
        print("Saved all models to lgb_models.pkl")

    return lgb_models

In [16]:
from catboost import CatBoostRegressor

def train_catboost_kfold_single(total_days=1699, n_splits=5, cat_features=None, save_model=True, save_path='modelcat/'):
    """Train one CatBoost model per fold on contiguous ``date_id`` blocks.

    The ``min(total_days, 1200)`` days immediately before date_id 1699 are cut into
    ``n_splits`` contiguous blocks of equal length (leftover days from the integer
    division are not used). Each block is the validation set once; the other blocks,
    stacked together, are the training set. The model inputs are ``FEAT_COLS_CAT``
    plus ``symbol_id`` and ``weight``; ``weight`` is also the training sample weight.

    Args:
        total_days: number of most recent days to use, capped at 1200.
        n_splits: number of folds; must be at least 2.
        cat_features: column names CatBoost should treat as categorical, or ``None``.
        save_model: when true, all models are dumped to ``<save_path>/catboost_models.pkl``.
        save_path: output directory, created if missing.

    Returns:
        ``(data_with_cat, catboost_models)``: a pandas frame holding, for every fold,
        that fold's training and validation rows with a ``catboost_pred`` column, and
        the list of trained models (one per fold).

    Raises:
        ValueError: if ``n_splits`` is smaller than 2 (a fold would have no training data).
    """
    if n_splits < 2:
        raise ValueError(f"n_splits must be >= 2 to leave training data for every fold, got {n_splits}")

    if save_model:
        # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
        os.makedirs(save_path, exist_ok=True)

    dataset_days = 1699   # the fold window is anchored to the days before this date_id
    max_valid_days = 1200
    valid_days = min(total_days, max_valid_days)
    valid_start = dataset_days - valid_days

    fold_size = valid_days // n_splits
    folds = [(valid_start + i * fold_size, valid_start + (i + 1) * fold_size - 1) for i in range(n_splits)]

    load_columns = ["date_id", "symbol_id", "weight"] + FEAT_COLS_CAT + [TARGET]
    model_inputs = FEAT_COLS_CAT + ['symbol_id', 'weight']

    catboost_models = []
    data_fractions = []

    for fold_idx, valid_range in enumerate(folds):
        train_ranges = [fold for i, fold in enumerate(folds) if i != fold_idx]
        print(f'Fold {fold_idx}: Training Catboost')

        # Load the validation block
        valid_data = load_data(date_id_range=valid_range, columns=load_columns, return_type='pl')

        # Load every other block and stack them into the training set
        train_data = None
        for train_range in train_ranges:
            partial_train_data = load_data(date_id_range=train_range, columns=load_columns, return_type='pl')
            train_data = partial_train_data if train_data is None else train_data.vstack(partial_train_data)

        catboost_model = CatBoostRegressor(
            loss_function='RMSE',
            eval_metric='RMSE',
            iterations=1000,
            learning_rate=0.03,
            early_stopping_rounds=50,
            verbose=100,
            cat_features=cat_features,
            task_type='GPU'
        )

        # CatBoost is fed pandas frames
        train_df = train_data.to_pandas()
        valid_df = valid_data.to_pandas()

        print(f"Use categorical features: {cat_features}")
        print(f"Train shape: {train_df.shape}")
        print(f"Valid shape: {valid_df.shape}")

        # Slice the model inputs once and reuse them for fit and predict.
        train_features = train_df[model_inputs]
        valid_features = valid_df[model_inputs]

        catboost_model.fit(
            train_features,
            train_df[TARGET],
            eval_set=(valid_features, valid_df[TARGET]),
            sample_weight=train_df['weight']
        )

        # Predictions for both splits, kept for later stacking.
        # NOTE(review): across the folds every row ends up in the returned frame
        # n_splits times (once as validation, otherwise as training with in-sample
        # predictions) — confirm this is what the stacking consumer expects.
        train_df['catboost_pred'] = catboost_model.predict(train_features)
        valid_df['catboost_pred'] = catboost_model.predict(valid_features)
        data_fractions.append(pd.concat([train_df, valid_df], ignore_index=True))

        # Local name avoids shadowing sklearn's r2_score import.
        fold_r2 = calculate_r2(valid_df[TARGET], valid_df['catboost_pred'], valid_df['weight'])
        print(f"Catboost Fold {fold_idx} validation R2 score: {fold_r2}")

        catboost_models.append(catboost_model)

    data_with_cat = pd.concat(data_fractions, ignore_index=True)

    # Save all fold models in a single file
    if save_model:
        joblib.dump(catboost_models, os.path.join(save_path, "catboost_models.pkl"))
        print("Saved all models to catboost_models.pkl")

    return data_with_cat, catboost_models
   

In [17]:
def train_catboost_holdout(total_days=1699, train_days=680, validation_days=170, cat_features=None, save_model=True, save_path='modelcat2/'):
    """Train a single CatBoost model with a time-ordered hold-out split.

    The last ``validation_days`` days before ``total_days`` form the validation set and
    the ``train_days`` days right before them form the training set. Inputs are
    ``FEAT_COLS_CAT`` plus ``symbol_id``, ``weight`` and ``time_id``; ``weight`` is also
    the training sample weight.

    Args:
        total_days: exclusive upper bound of the ``date_id`` range.
        train_days: length of the training window in days.
        validation_days: length of the validation window in days.
        cat_features: column names CatBoost should treat as categorical, or ``None``.
        save_model: when true, the model is dumped to
            ``<save_path>/catboost_holdout_model.pkl``.
        save_path: output directory, created if it does not exist.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    # Make sure the output directory exists before any training time is spent.
    needs_directory = save_model and not os.path.exists(save_path)
    if needs_directory:
        os.makedirs(save_path)

    # Both windows are inclusive (start, end) date_id pairs.
    valid_start = total_days - validation_days
    train_start = valid_start - train_days
    valid_range = (valid_start, total_days - 1)
    train_range = (train_start, valid_start - 1)

    print(f"Validation range: {valid_range}")
    print(f"Training range: {train_range}")

    wanted_columns = ["date_id", "symbol_id", "weight", "time_id"] + FEAT_COLS_CAT + [TARGET]
    model_inputs = FEAT_COLS_CAT + ['symbol_id', 'weight', 'time_id']

    # Validation is loaded first, then training; CatBoost is fed pandas frames.
    valid_data = load_data(date_id_range=valid_range, columns=wanted_columns, return_type='pl')
    train_data = load_data(date_id_range=train_range, columns=wanted_columns, return_type='pl')
    train_df = train_data.to_pandas()
    valid_df = valid_data.to_pandas()

    print(f"Use categorical features: {cat_features}")
    print(f"Train shape: {train_df.shape}")
    print(f"Valid shape: {valid_df.shape}")

    hyperparameters = dict(
        loss_function='RMSE',
        eval_metric='RMSE',
        iterations=1000,
        learning_rate=0.03,
        early_stopping_rounds=50,
        verbose=100,
        cat_features=cat_features,
        task_type='GPU',
    )
    catboost_model = CatBoostRegressor(**hyperparameters)

    valid_inputs = valid_df[model_inputs]
    catboost_model.fit(
        train_df[model_inputs],
        train_df[TARGET],
        eval_set=(valid_inputs, valid_df[TARGET]),
        sample_weight=train_df['weight']
    )

    # Score the hold-out window with the weighted R² helper.
    valid_df['catboost_pred'] = catboost_model.predict(valid_inputs)
    holdout_r2 = calculate_r2(valid_df[TARGET], valid_df['catboost_pred'], valid_df['weight'])
    print(f"CatBoost Hold-out validation R2 score: {holdout_r2}")

    if save_model:
        model_path = os.path.join(save_path, "catboost_holdout_model.pkl")
        joblib.dump(catboost_model, model_path)
        print(f"Saved model to {model_path}")

    return catboost_model


### First Version: Model Training

In [None]:
# Columns CatBoost should treat as categorical.
CAT_FEATURES = ['feature_09','feature_10','feature_11', 'symbol_id', 'time_id']
# catboost_models = train_catboost_kfold_single(total_days=500, cat_features=CAT_FEATURES)
# Fixed: the call previously opened with a doubled parenthesis, which is a SyntaxError.
# NOTE: the saved output under this cell shows training range (849, 1528), i.e. it
# comes from an earlier run with train_days=680, not from these arguments.
catboost_model = train_catboost_holdout(total_days=1699, train_days=1529, validation_days=170, cat_features=CAT_FEATURES, save_path='modelcatholdout2/')
# lgb_models = train_lgb_kfold_single(data_with_cat=data_with_cat, total_days=5)
# lgb_models = train_lgb_kfold_single(total_days=500)

Validation range: (1529, 1698)
Training range: (849, 1528)
Use categorical features: ['feature_09', 'feature_10', 'feature_11', 'symbol_id', 'time_id']
Train shape: (24020920, 84)
Valid shape: (6312328, 84)
0:	learn: 0.8380845	test: 0.8128596	best: 0.8128596 (0)	total: 2.29s	remaining: 38m 6s
100:	learn: 0.8335822	test: 0.8106257	best: 0.8106257 (100)	total: 3m 30s	remaining: 31m 11s
200:	learn: 0.8320450	test: 0.8101528	best: 0.8101528 (200)	total: 6m 57s	remaining: 27m 41s


### Second Version: Model Loading

In [None]:
# 이거는 catboost 아닐때

# # Load the model from the saved file

# model_path = '/kaggle/input/jsmodel-chan2'
# model_name = 'lgb_model'
# models = []
# models.append(joblib.load(f'{model_path}/{model_name}.pkl'))

# print(f"Loaded model from the saved file.")

In [None]:
# Use this cell when CatBoost models are part of the blend.

model_path = '/kaggle/input/jsmodel-chan6'
cat_file_name = 'catboost_models'
lgb_file_name = 'lgb_models'

# LightGBM is loaded first, then CatBoost; `predict` iterates over both collections.
# joblib.load unpickles arbitrary objects, so only point it at files you trust.
lgb_models, catboost_models = (
    joblib.load(f'{model_path}/{file_stem}.pkl')
    for file_stem in (lgb_file_name, cat_file_name)
)

print("Loaded model from the saved file.")

In [None]:
N_folds = 5
# Load the best checkpoint of every fold and move each model to the first GPU.
nn_models = [
    NN.load_from_checkpoint(f"{CONFIG.model_paths[0]}/nn_{fold}.model").to("cuda:0")
    for fold in range(N_folds)
]
# Display the first model's architecture as the cell output.
nn_models[0]

In [None]:
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for one batch by blending CatBoost, LightGBM and NN models.

    Args:
        test: batch to score. Must contain ``row_id``, ``date_id``, ``symbol_id``,
            ``weight`` and the ``feature_XX`` columns.
        lags: lagged responders, or ``None`` when the caller passes none for this batch.
            The most recent non-``None`` value is cached in the module-level ``lags_``
            and reused for batches that arrive without lags. Previously it was cached
            but never read, so those batches were scored with all-zero lag features.

    Returns:
        polars DataFrame with the columns ``row_id`` and ``responder_6`` (clipped to
        [-5, 5]), one row per row of ``test``.
    """
    global lags_
    if lags is not None:
        lags_ = lags
    # `lags_` does not exist until the first batch carrying lags has been seen.
    latest_lags = globals().get("lags_")

    if latest_lags is not None:
        # Keep the last lag record per (date_id, symbol_id) and attach it to the batch.
        # Joining on date_id means lags cached for another date never match; those rows
        # simply get null lag values.
        last_lags = latest_lags.group_by(["date_id", "symbol_id"], maintain_order=True).last()
        test_nn = test.join(last_lags, on=["date_id", "symbol_id"], how="left")
    else:
        # No lags seen yet: fall back to zero-valued lag features.
        test_nn = test.with_columns(
            [pl.lit(0.0).alias(f'responder_{idx}_lag_1') for idx in range(9)]
        )

    # CatBoost predictions (mean over the loaded models).
    # Every column is stringified, with missing values written as 'NaN'.
    # NOTE(review): presumably this mirrors how the CatBoost inputs were prepared at
    # training time — confirm.
    feat_cat = test[FEAT_COLS_CAT + ['symbol_id', 'weight']].to_pandas()
    feat_cat = feat_cat.fillna('NaN').astype(str)
    pred_cat = np.mean([model.predict(feat_cat) for model in catboost_models], axis=0)

    # LightGBM predictions (mean over the loaded models); uses the lag-augmented frame.
    feat_lgb = test_nn[FEAT_COLS_LGB + ['weight']].to_pandas()
    pred_lgb = np.mean([model.predict(feat_lgb) for model in lgb_models], axis=0)

    # Neural-network predictions (mean over the loaded models).
    # `.ffill()` replaces the deprecated `fillna(method='ffill')`; gaps that remain at
    # the top of a column are set to 0.
    test_input = test_nn[CONFIG.feature_cols].to_pandas().ffill().fillna(0)
    test_input = torch.FloatTensor(test_input.values).to("cuda:0")
    preds_nn = np.zeros((test_nn.shape[0],))
    with torch.no_grad():
        # No progress bar or debug prints here: this function runs once per batch.
        for nn_model in nn_models:
            nn_model.eval()
            preds_nn += nn_model(test_input).cpu().numpy() / len(nn_models)

    # Fixed blend weights: CatBoost 0.3, LightGBM 0.2, NN 0.5.
    pred = pred_cat * 0.3 + pred_lgb * 0.2 + preds_nn * 0.5

    # Clip predictions to the range [-5, 5]
    predictions = test.select('row_id').with_columns(
        pl.Series(
            name='responder_6',
            values=np.clip(pred, a_min=-5, a_max=5),
            dtype=pl.Float64
        )
    )

    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    assert list(predictions.columns) == ['row_id', 'responder_6']
    assert len(predictions) == len(test)

    return predictions

In [None]:
# The notebook's import cell has this import commented out, so without it the next
# line raises NameError. It is imported here, not at the top, so the training cells
# still run where the Kaggle evaluation package is not installed.
import kaggle_evaluation.jane_street_inference_server

inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Competition rerun: serve predictions to the hidden-test gateway.
    inference_server.serve()
else:
    # Otherwise: replay the public test and lags files through a local gateway.
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )