In [1]:
import numpy as np
import pandas as pd 
import polars as pl
import lightgbm as lgb
import joblib

import os

import warnings 
warnings.filterwarnings("ignore")

import kaggle_evaluation.jane_street_inference_server

In [2]:
def load_data(date_id_range=None, time_id_range=None, columns=None, return_type='pl'):
    data_dir = '/kaggle/input/jane-street-real-time-market-data-forecasting'
    data = pl.scan_parquet(f'{data_dir}/train.parquet')
    
    if date_id_range is not None:
        start_date, end_date = date_id_range
        data = data.filter((pl.col("date_id") >= start_date) & (pl.col("date_id") <= end_date))
    
    if time_id_range is not None:
        start_time, end_time = time_id_range
        data = data.filter((pl.col("time_id") >= start_time) & (pl.col("time_id") <= end_time))
    
    if columns is not None:
        data = data.select(columns)

    if return_type == 'pd':
        return data.collect().to_pandas()
    else:
        return data.collect()

In [3]:
def calculate_r2(y_true, y_pred, weights):
    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * (y_true ** 2))
    r2_score = 1 - (numerator / denominator)
    return r2_score

In [4]:
# def calculate_r2(y_true, y_pred, weights):
#     # pandas DataFrame을 numpy array로 변환
#     y_true = y_true.values.flatten()  # 또는 y_true.values.ravel()
#     y_pred = np.array(y_pred).flatten()  # 예측값도 1차원 배열로 변환
#     weights = np.array(weights).flatten()
    
#     numerator = np.sum(weights * (y_true - y_pred) ** 2)
#     denominator = np.sum(weights * (y_true ** 2))
#     r2_score = 1 - (numerator / denominator)
#     return r2_score

In [5]:
TARGET = 'responder_6'
FEAT_COLS = [f"feature_{i:02d}" for i in range(79)]

In [6]:
from catboost import CatBoostRegressor

def create_catboost_features_kfold(total_days=1699, n_splits=5, cat_features=None):
    max_valid_days = 1200
    valid_days = min(total_days, max_valid_days)
    valid_start = 1699 - valid_days
    
    fold_size = valid_days // n_splits
    folds = [(valid_start + i * fold_size, valid_start + (i + 1) * fold_size - 1) for i in range(n_splits)]
    
    all_data_with_preds = None
    catboost_models = []
    
    for fold_idx in range(n_splits):
        valid_range = folds[fold_idx]
        train_ranges = [folds[i] for i in range(n_splits) if i != fold_idx]
        print(f'Fold {fold_idx}: Creating CatBoost predictions')
        
        # 검증 데이터 로드
        valid_data = load_data(date_id_range=valid_range,
                             #columns=["date_id", "weight"] + FEAT_COLS + [TARGET],
                               columns=["date_id", "symbol_id", "weight"] + FEAT_COLS + [TARGET],
                             return_type='pl')
        
        # 학습 데이터 로드
        train_data = None
        for train_range in train_ranges:
            partial_train_data = load_data(date_id_range=train_range,
                                         # columns=["date_id", "weight"] + FEAT_COLS + [TARGET],
                                           columns=["date_id", "symbol_id", "weight"] + FEAT_COLS + [TARGET],
                                         return_type='pl')
            if train_data is None:
                train_data = partial_train_data
            else:
                train_data = train_data.vstack(partial_train_data)
        
        # CatBoost 모델 학습
        catboost_model = CatBoostRegressor(
            loss_function='RMSE',
            eval_metric='RMSE',
            iterations=1000,
            learning_rate=0.03,
            early_stopping_rounds=50,
            verbose=100,
            cat_features=cat_features,
            task_type='GPU'
        )
        
        # Polars to pandas conversion for CatBoost
        train_df = train_data.to_pandas()
        valid_df = valid_data.to_pandas()

        print(f"Using only categorical features: {cat_features}")
        print(f"Train shape with only cat features: {train_df[cat_features].shape}")
        print(f"Valid shape with only cat features: {valid_df[cat_features].shape}")
        
        catboost_model.fit(
            #train_df[cat_features],  # FEAT_COLS 대신 cat_features만 사용
            #train_df[FEAT_COLS],
            train_df[FEAT_COLS+['symbol_id']],
            train_df[TARGET],
            # eval_set=(valid_df[cat_features], valid_df[TARGET]),
            # eval_set=(valid_df[FEAT_COLS], valid_df[TARGET]),
            eval_set=(valid_df[FEAT_COLS+['symbol_id']], valid_df[TARGET]),
            sample_weight=train_df['weight']
        )
        
        # 예측값 생성
        # valid_df['catboost_pred'] = catboost_model.predict(valid_df[cat_features])
        # valid_df['catboost_pred'] = catboost_model.predict(valid_df[FEAT_COLS])
        valid_df['catboost_pred'] = catboost_model.predict(valid_df[FEAT_COLS+['symbol_id']])

        r2_score = calculate_r2(valid_df[TARGET], valid_df['catboost_pred'], valid_df['weight'])
        print(f"Catboost Fold {fold_idx} validation R2 score: {r2_score}")

        # symbol_id 컬럼 제거
        if 'symbol_id' in valid_df.columns:
            valid_df = valid_df.drop(columns=['symbol_id'])

        # 결과 저장
        if all_data_with_preds is None:
            all_data_with_preds = valid_df
        else:
            all_data_with_preds = pd.concat([all_data_with_preds, valid_df])

        # print(f"all_data_with_preds shape: {all_data_with_preds.shape}")
        # print(f"all_data_with_preds head: {all_data_with_preds.head()}")
        
        catboost_models.append(catboost_model)
    
    return all_data_with_preds, catboost_models
   

def train_lgb_kfold_with_catboost(total_days=1699, n_splits=5, save_model=True, save_path='models2/', cat_features=None):
    if save_model and not os.path.exists(save_path):
        os.makedirs(save_path)
    
    # CatBoost 예측값 생성
    print("Creating CatBoost features...")
    data_with_catboost, catboost_models = create_catboost_features_kfold(
        total_days=total_days,
        n_splits=n_splits,
        cat_features=CAT_FEATURES  # 카테고리컬 피처 리스트 필요
    )
    
    # CatBoost 예측값을 포함한 새로운 피처 리스트
    FEAT_COLS_WITH_CATBOOST = FEAT_COLS + ['catboost_pred']
    
    max_valid_days = 1200
    valid_days = min(total_days, max_valid_days)
    valid_start = 1699 - valid_days
    
    fold_size = valid_days // n_splits
    folds = [(valid_start + i * fold_size, valid_start + (i + 1) * fold_size - 1) for i in range(n_splits)]
    
    cv_scores = []
    lgb_models = []
    
    for fold_idx in range(n_splits):
        valid_range = folds[fold_idx]
        train_ranges = [folds[i] for i in range(n_splits) if i != fold_idx]
        print(f'Fold {fold_idx}: Training LightGBM')
        
        # 폴드별 데이터 분할
        valid_mask = (data_with_catboost['date_id'] >= valid_range[0]) & (data_with_catboost['date_id'] <= valid_range[1])
        valid_data = data_with_catboost[valid_mask]
        train_data = data_with_catboost[~valid_mask]
        
        # LightGBM 데이터셋 생성
        train_ds = lgb.Dataset(train_data[FEAT_COLS_WITH_CATBOOST],
                             label=train_data[TARGET],
                             weight=train_data['weight'])
        valid_ds = lgb.Dataset(valid_data[FEAT_COLS_WITH_CATBOOST],
                             label=valid_data[TARGET],
                             weight=valid_data['weight'],
                             reference=train_ds)
        
        # LightGBM 파라미터
        LGB_PARAMS = {
            'objective': 'regression_l2',
            'metric': 'rmse',
            'learning_rate': 0.05,
            'num_leaves': 31,
            'max_depth': -1,
            'random_state': 42,
            'device': 'gpu',
        }
        
        # 콜백 함수
        callbacks = [
            lgb.early_stopping(100),
            lgb.log_evaluation(period=50)
        ]
        
        # 모델 학습
        model = lgb.train(
            LGB_PARAMS,
            train_ds,
            num_boost_round=1000,
            valid_sets=[train_ds, valid_ds],
            valid_names=['train', 'valid'],
            callbacks=callbacks
        )
        
        lgb_models.append(model)
        
        # R2 점수 계산
        y_valid_pred = model.predict(valid_data[FEAT_COLS_WITH_CATBOOST])
        r2_score = calculate_r2(valid_data[TARGET], y_valid_pred, valid_data['weight'])
        print(f"Fold {fold_idx} validation R2 score: {r2_score}")
        
        cv_scores.append(r2_score)
    
    # 모델 저장
    if save_model:
        joblib.dump({
            'lgb_models': lgb_models,
            'catboost_models': catboost_models
        }, os.path.join(save_path, "stacking_models.pkl"))
        print("Saved all models to stacking_models.pkl")
    
    print(f"Cross-validation R2 scores: {cv_scores}")
    print(f"Mean R2 score: {np.mean(cv_scores)}, Std: {np.std(cv_scores)}")
    
    return lgb_models, catboost_models, np.mean(cv_scores), np.std(cv_scores)

In [7]:
'''
# lgb with k-fold cross-validation
def train_lgb_kfold_single(total_days=1699, n_splits=5, save_model=True, save_path='models2/'):
    if save_model and not os.path.exists(save_path):
        os.makedirs(save_path)

    # 여기 skip은 왜 있는건지 잘 모르겠음.
    # # Number of dates to skip from the beginning of the dataset
    # skip_dates = 500  # 跳过前500天
    
    max_valid_days = 1200  # 最多使用后1200天进行交叉验证
    valid_days = min(total_days, max_valid_days)  # 实际用于交叉验证的天数
    valid_start = 1699 - valid_days  # 计算交叉验证的起始日期（倒数）
    
    fold_size = valid_days // n_splits
    folds = [(valid_start + i * fold_size, valid_start + (i + 1) * fold_size - 1) for i in range(n_splits)]
    
    cv_scores = []
    model_group = []
    
    for fold_idx in range(n_splits):
        valid_range = folds[fold_idx]
        train_ranges = [folds[i] for i in range(n_splits) if i != fold_idx]
        print(f'Fold {fold_idx}: validation range {valid_range}, train parts: {train_ranges}')

        # load valid data
        valid_data = load_data(date_id_range=valid_range, 
                               columns=["date_id", "weight"] + FEAT_COLS + [TARGET], 
                               return_type='pl')
        valid_weight = valid_data['weight'].to_pandas()

        # load train data
        train_data = None
        for train_range in train_ranges:
            partial_train_data = load_data(date_id_range=train_range,
                                           columns=["date_id", "weight"] + FEAT_COLS + [TARGET], 
                                           return_type='pl')
            if train_data is None:
                train_data = partial_train_data
            else:
                train_data = train_data.vstack(partial_train_data)

        train_weight = train_data['weight'].to_pandas()

        # build LightGBM dataset
        train_ds = lgb.Dataset(train_data.select(FEAT_COLS + ['weight']).to_pandas(), 
                               label=train_data[TARGET].to_pandas(), weight=train_weight)
        valid_ds = lgb.Dataset(valid_data.select(FEAT_COLS + ['weight']).to_pandas(), 
                               label=valid_data[TARGET].to_pandas(), weight=valid_weight, reference=train_ds)

        # LightGBM parameters
        LGB_PARAMS = {
            'objective': 'regression_l2',
            'metric': 'rmse',
            'learning_rate': 0.05,
            'num_leaves': 31,
            'max_depth': -1,
            'random_state': 42,
            'device': 'gpu',
        }

        # callback functions
        early_stopping_callback = lgb.early_stopping(100)
        verbose_eval_callback = lgb.log_evaluation(period=50)

        # train model
        model = lgb.train(
            LGB_PARAMS,
            train_ds,
            num_boost_round=1000,
            valid_sets=[train_ds, valid_ds],
            valid_names=['train', 'valid'],
            callbacks=[early_stopping_callback, verbose_eval_callback],
        )

        # save model
        model_group.append(model)
        
        # predict on valid set and compute R2
        y_valid_pred = model.predict(valid_data.select(FEAT_COLS + ['weight']).to_pandas())
        r2_score = calculate_r2(valid_data[TARGET].to_pandas(), y_valid_pred, valid_weight)
        print(f"Fold {fold_idx} validation R2 score: {r2_score}")

        cv_scores.append(r2_score)

    # Model fusion: The output of all models is averaged
    print(f"Total trained models: {len(model_group)}")
    final_model = model_group[1]  # The structure of the first model is used
    print("Averaging models...")
    average_predictions = lambda data: average_models(model_group, data)
    print("Done.")
    # Save the entire model group
    if save_model:
        joblib.dump(final_model, "lgb_model.pkl")
        print("Saved the final merged model to lgb_model.pkl")
        
    print(f"Cross-validation R2 scores: {cv_scores}")
    print(f"Mean R2 score: {np.mean(cv_scores)}, Std: {np.std(cv_scores)}")

    return model, np.mean(cv_scores), np.std(cv_scores)
'''

'\n# lgb with k-fold cross-validation\ndef train_lgb_kfold_single(total_days=1699, n_splits=5, save_model=True, save_path=\'models2/\'):\n    if save_model and not os.path.exists(save_path):\n        os.makedirs(save_path)\n\n    # 여기 skip은 왜 있는건지 잘 모르겠음.\n    # # Number of dates to skip from the beginning of the dataset\n    # skip_dates = 500  # 跳过前500天\n    \n    max_valid_days = 1200  # 最多使用后1200天进行交叉验证\n    valid_days = min(total_days, max_valid_days)  # 实际用于交叉验证的天数\n    valid_start = 1699 - valid_days  # 计算交叉验证的起始日期（倒数）\n    \n    fold_size = valid_days // n_splits\n    folds = [(valid_start + i * fold_size, valid_start + (i + 1) * fold_size - 1) for i in range(n_splits)]\n    \n    cv_scores = []\n    model_group = []\n    \n    for fold_idx in range(n_splits):\n        valid_range = folds[fold_idx]\n        train_ranges = [folds[i] for i in range(n_splits) if i != fold_idx]\n        print(f\'Fold {fold_idx}: validation range {valid_range}, train parts: {train_ranges}\')\n\n    

In [8]:
# def train_lgb_holdout_single(total_days=1699, valid_days=120, save_model=True, save_path='models/'):

#     if save_model and not os.path.exists(save_path):
#         os.makedirs(save_path)

#     skip_date = 0
#     valid_start = total_days - valid_days  # Start date for validation data

#     print(f'Validation range: {valid_start} to {total_days - 1}')

#     # Load validation data
#     valid_data = load_data(
#         date_id_range=(valid_start, total_days - 1),
#         columns=["date_id", "weight"] + FEAT_COLS + [TARGET],
#         return_type='pl'
#     )
#     # valid_weight = valid_data.select("weight").to_pandas()
#     valid_weight = valid_data.select("weight").to_pandas()["weight"]

#     # Load training data
#     train_data = load_data(
#         date_id_range=(skip_date, valid_start - 1),  # Use all available data up to the validation start
#         columns=["date_id", "weight"] + FEAT_COLS + [TARGET],
#         return_type='pl'
#     )
    
#     # train_weight = train_data.select("weight").to_pandas()
#     train_weight = train_data.select("weight").to_pandas()["weight"]

#     # Build LightGBM dataset
#     train_ds = lgb.Dataset(
#         train_data.select(FEAT_COLS + ['weight']).to_pandas(),
#         label=train_data.select(TARGET).to_pandas()[TARGET], 
#         weight=train_weight
#     )
#     # train_ds = lgb.Dataset(
#     #     train_data.select(FEAT_COLS + ['weight']).to_pandas(),
#     #     label=train_data.select(TARGET).to_pandas(), 
#     #     weight=train_weight
#     # )
#     valid_ds = lgb.Dataset(
#         valid_data.select(FEAT_COLS + ['weight']).to_pandas(),
#         label=valid_data.select(TARGET).to_pandas()[TARGET], 
#         weight=valid_weight, 
#         reference=train_ds
#     )
#     # valid_ds = lgb.Dataset(
#     #     valid_data.select(FEAT_COLS + ['weight']).to_pandas(),
#     #     label=valid_data.select(TARGET).to_pandas(), 
#     #     weight=valid_weight, 
#     #     reference=train_ds
#     # )

#     # LightGBM parameters
#     LGB_PARAMS = {
#         'objective': 'regression_l2',
#         'metric': 'rmse',
#         'learning_rate': 0.05,
#         'num_leaves': 31,
#         'max_depth': -1,
#         'random_state': 42,
#         'device': 'gpu',
#     }

#     # Callback functions
#     early_stopping_callback = lgb.early_stopping(100)
#     verbose_eval_callback = lgb.log_evaluation(period=50)

#     # Train model
#     model = lgb.train(
#         LGB_PARAMS,
#         train_ds,
#         num_boost_round=1000,
#         valid_sets=[train_ds, valid_ds],
#         valid_names=['train', 'valid'],
#         callbacks=[early_stopping_callback, verbose_eval_callback],
#     )

#     # Predict on validation set and compute R2
#     y_valid_pred = model.predict(valid_data.select(FEAT_COLS + ['weight']).to_pandas())
#     r2_score = calculate_r2(valid_data.select(TARGET).to_pandas(), y_valid_pred, valid_weight)
#     print(f"Validation R2 score: {r2_score}")

#     # Save the model
#     if save_model:
#         model_path = os.path.join(save_path, "lgb_model.pkl")
#         joblib.dump(model, model_path)
#         print(f"Saved the model to {model_path}")

#     return model, r2_score


### First Version: Model Training

In [9]:
# total_days = 1699 # Total num of diff date_id = 1699
# valid_days = 120
# lgb_models, _ = train_lgb_holdout_single(total_days=total_days,valid_days=valid_days)

In [10]:
# total_days = 500 # Total num of diff date_id = 1699
# lgb_models, _, _ = train_lgb_kfold_single(total_days=total_days)

In [11]:
# CAT_FEATURES 리스트 정의 필요
# CAT_FEATURES = ['feature_09','feature_10','feature_11']  # 카테고리형 변수명 리스트
CAT_FEATURES = ['feature_09','feature_10','feature_11', 'symbol_id']

# 학습 실행
lgb_models, catboost_models, mean_r2, std_r2 = train_lgb_kfold_with_catboost(
    total_days=30,
    cat_features=CAT_FEATURES
)

Creating CatBoost features...
Fold 0: Creating CatBoost predictions
Using only categorical features: ['feature_09', 'feature_10', 'feature_11', 'symbol_id']
Train shape with only cat features: (900240, 4)
Valid shape with only cat features: (218768, 4)
0:	learn: 0.7171150	test: 0.8072837	best: 0.8072837 (0)	total: 133ms	remaining: 2m 13s
100:	learn: 0.7094131	test: 0.8053134	best: 0.8053052 (98)	total: 4.87s	remaining: 43.4s
200:	learn: 0.7044933	test: 0.8046426	best: 0.8046426 (200)	total: 9.3s	remaining: 37s
300:	learn: 0.7001660	test: 0.8043937	best: 0.8043818 (283)	total: 13.8s	remaining: 32.1s
bestTest = 0.8043091426
bestIteration = 313
Shrink model to first 314 iterations.
Catboost Fold 0 validation R2 score: 0.007226412017324213
Fold 1: Creating CatBoost predictions
Using only categorical features: ['feature_09', 'feature_10', 'feature_11', 'symbol_id']
Train shape with only cat features: (895400, 4)
Valid shape with only cat features: (223608, 4)
0:	learn: 0.7372981	test: 0.726



[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 80 dense feature groups (68.68 MB) transferred to GPU in 0.087664 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.005068
Training until validation scores don't improve for 100 rounds
[50]	train's rmse: 0.702838	valid's rmse: 0.780189
[100]	train's rmse: 0.693456	valid's rmse: 0.781541
Early stopping, best iteration is:
[33]	train's rmse: 0.706841	valid's rmse: 0.778853
Fold 0 validation R2 score: 0.004633637729981821
Fold 1: Training LightGBM
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 19465
[LightGBM] [Info] Number of data points in the train set: 895400, number of used features: 80
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM]

### Second Version: Model Loading

In [None]:
# 이거는 catboost 아닐때

# # Load the model from the saved file

# model_path = '/kaggle/input/jsmodel-chan2'
# model_name = 'lgb_model'
# models = []
# models.append(joblib.load(f'{model_path}/{model_name}.pkl'))

# print(f"Loaded model from the saved file.")

In [22]:
# 이거는 catboost 일때

model_path = '/kaggle/working/models2'
model_name = 'stacking_models'

stacking_models = joblib.load(f'{model_path}/{model_name}.pkl') #dict

lgb_models = stacking_models['lgb_models'] #lst
catboost_models = stacking_models['catboost_models'] #lst

print(f"Loaded model from the saved file.")

Loaded model from the saved file.


In [37]:
lags_ : pl.DataFrame | None = None

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction."""
    global lags_
    if lags is not None:
        lags_ = lags

    predictions = test.select(
        'row_id',
        pl.lit(0.0).alias('responder_6')
    )

    # feat = test[FEAT_COLS].to_pandas()
    # feat = test[FEAT_COLS + ['weight']].to_pandas()
    
    # model = models[0]
    # pred = model.predict(feat)
    
    # pred = [model.predict(feat) for model in models]
    # pred = np.mean(pred, axis=0)
    
    feat_cat = test[FEAT_COLS + ['symbol_id']].to_pandas()
    feat_cat = feat_cat.fillna('NaN').astype(str)
    pred_cat= [model.predict(feat_cat) for model in catboost_models]
    pred_cat = np.mean(pred_cat, axis=0)

    # feat_lgb: LightGBM 모델의 feature
    # `pred_cat`을 `test`의 feature로 추가
    feat_lgb = test[FEAT_COLS].to_pandas()
    feat_lgb['pred_cat'] = pred_cat  # pred_cat을 새로운 column으로 추가
    
    pred_lgb = [model.predict(feat_lgb) for model in lgb_models]
    pred_lgb = np.mean(pred_lgb, axis=0)

    # 최종 예측값 계산
    pred = pred_cat * 0.5 + pred_lgb * 0.5
    
    predictions = predictions.with_columns(pl.Series('responder_6', pred.ravel()))
    print(predictions)
    
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    
    assert list(predictions.columns) == ['row_id', 'responder_6']
    assert len(predictions) == len(test)
    
    return predictions

In [38]:
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )

shape: (39, 2)
┌────────┬─────────────┐
│ row_id ┆ responder_6 │
│ ---    ┆ ---         │
│ i64    ┆ f64         │
╞════════╪═════════════╡
│ 0      ┆ 0.192078    │
│ 1      ┆ 0.191906    │
│ 2      ┆ 0.194858    │
│ 3      ┆ 0.195911    │
│ 4      ┆ 0.194446    │
│ …      ┆ …           │
│ 34     ┆ 0.195014    │
│ 35     ┆ 0.193742    │
│ 36     ┆ 0.202089    │
│ 37     ┆ 0.194819    │
│ 38     ┆ 0.195359    │
└────────┴─────────────┘
