In [1]:
import polars as pl

class CONFIG:
    """Static settings for the lag-feature preprocessing notebook.

    The attributes are read directly off the class (e.g. ``CONFIG.target_col``).
    """

    # Location of the raw training parquet data.
    path_str = "/kaggle/input/jane-street-realtime-marketdata-forecasting/train.parquet"
    # Responder column from which the integer "label" column is derived.
    target_col = "responder_6"
    # Join keys followed by responder_0 .. responder_8 (the columns that get lagged).
    lag_cols_original = ["date_id", "symbol_id", *(f"responder_{i}" for i in range(9))]
    # Maps every responder column to its "<name>_lag_1" counterpart.
    lag_cols_rename = dict((f"responder_{i}", f"responder_{i}_lag_1") for i in range(9))
    # Fraction of rows used to position the train/validation split.
    valid_ratio = 0.01
    # Only rows with date_id strictly greater than this value are kept.
    start_dt = 0

In [2]:
# Lazily scan the raw training data; nothing is read until a later .collect().
raw_scan = pl.scan_parquet(CONFIG.path_str)

# Put a running UInt32 row number ("id") in front of all original columns.
with_row_id = raw_scan.select(
    pl.int_range(pl.len(), dtype=pl.UInt32).alias("id"),
    pl.all(),
)

# "label" is the target column cast to Int32.
# NOTE(review): if the target is floating point, this cast discards the
# fractional part -- confirm that is intended.
with_label = with_row_id.with_columns(
    pl.col(CONFIG.target_col).cast(pl.Int32).alias("label"),
)

# Keep only rows whose date_id is strictly greater than CONFIG.start_dt.
train = with_label.filter(pl.col("date_id") > CONFIG.start_dt)


In [3]:
# Build the lag table: for every (date_id, symbol_id) it holds the responder
# values taken from the last record of the preceding date_id.
lags = (
    train
    .select(pl.col(CONFIG.lag_cols_original))
    .rename(CONFIG.lag_cols_rename)
    # Shift date_id forward by one so the values line up with the next date_id.
    .with_columns((pl.col("date_id") + 1).alias("date_id"))
    # Groups are emitted in input order; .last() keeps each group's final record.
    .group_by(["date_id", "symbol_id"], maintain_order=True)
    .last()
)

# Left join: every row of `train` is kept and gains the *_lag_1 columns.
# NOTE: `train` is rebound here, so re-running this cell without first
# re-running the cell that defines `train` joins the lag columns a second time.
train = train.join(lags, on=["date_id", "symbol_id"], how="left")

In [4]:
# Materialise the date_id column once. It is needed both for the row count and
# for looking up the split date; collecting it twice runs the lazy plan twice.
date_id_df = train.select(pl.col("date_id")).collect()

# Total number of rows in `train`.
len_train = date_id_df.shape[0]
# Number of rows to reserve for validation, derived from the validation ratio.
valid_records = int(len_train * CONFIG.valid_ratio)
# Number of rows nominally left for the offline (training) model.
len_ofl_mdl = len_train - valid_records

# The date_id stored at row index len_ofl_mdl becomes the last training date.
# The index is clamped to the final row so that valid_records == 0 (tiny data
# or valid_ratio == 0) does not raise an out-of-bounds error; an empty `train`
# still fails here.
# NOTE(review): this lookup assumes the rows are ordered by date_id -- confirm.
split_idx = min(len_ofl_mdl, len_train - 1)
last_tr_dt = date_id_df.row(split_idx)[0]
del date_id_df  # the materialised column is no longer needed

# Report the total number of rows.
print(f"\n len_train = {len_train}")
# Report the number of offline-training rows (not the validation count).
print(f"\n len_ofl_mdl = {len_ofl_mdl}")
# Report the last date_id that belongs to the offline training set.
print(f"\n---> Last offline train date = {last_tr_dt}\n")

# Split on whole dates: everything up to and including last_tr_dt is training
# data, everything after it is validation data. Because the boundary date goes
# entirely to training, validation can hold fewer rows than valid_records.
training_data = train.filter(pl.col("date_id").le(last_tr_dt))
validation_data = train.filter(pl.col("date_id").gt(last_tr_dt))


 len_train = 47120546

 len_ofl_mdl = 46649341

---> Last offline train date = 1686



In [5]:
# Materialise each split in turn and write it out partitioned by date_id.
# Each frame is collected inside the loop body, so only one split is
# materialised at a time.
for lazy_split, out_path in (
    (training_data, "/kaggle/input/js24-preprocessing-create-lags/training.parquet"),
    (validation_data, "/kaggle/input/js24-preprocessing-create-lags/validation.parquet"),
):
    lazy_split.collect().write_parquet(out_path, partition_by="date_id")

In [13]:
%reload_ext autoreload

import joblib

# List of feature columns excluding non-feature columns
# feature_cols = [col for col in train.columns if col not in ['date_id', 'symbol_id', 'id', 'label']]
feature_cols = [col for col in train.columns if col not in ['id', 'label']]

# Compute means and standard deviations using Polars
means = training_data.select(feature_cols).mean().collect().to_dicts()[0]
stds = training_data.select(feature_cols).std().collect().to_dicts()[0]

data_stats = {'mean': means, 'std': stds}

  feature_cols = [col for col in train.columns if col not in ['id', 'label']]


In [14]:

# Show the computed statistics ({'mean': {...}, 'std': {...}}) as the cell output.
data_stats

{'mean': {'date_id': 998.970946692297,
  'time_id': 468.56892322849956,
  'symbol_id': 18.094599675284115,
  'weight': 2.0084173679351807,
  'feature_00': 0.5515734553337097,
  'feature_01': 0.010147725231945515,
  'feature_02': 0.5505484342575073,
  'feature_03': 0.5502622127532959,
  'feature_04': -0.0006238542264327407,
  'feature_05': -0.037658803164958954,
  'feature_06': -0.004995108116418123,
  'feature_07': -0.013611395843327045,
  'feature_08': 0.08088661730289459,
  'feature_09': 32.76902803154763,
  'feature_10': 4.874676467998,
  'feature_11': 170.73092937011674,
  'feature_12': -0.03819642215967178,
  'feature_13': -0.026206044480204582,
  'feature_14': -0.04054224118590355,
  'feature_15': -0.2470085620880127,
  'feature_16': -0.20144912600517273,
  'feature_17': -0.23875097930431366,
  'feature_18': -0.009079842828214169,
  'feature_19': -0.03157450631260872,
  'feature_20': -0.2107597440481186,
  'feature_21': 0.03906584903597832,
  'feature_22': 0.06577453762292862,
  

In [15]:
import os

# Directory and file the statistics are persisted to.
stats_dir = "/kaggle/input/jane-street-data-preprocessing"
stats_path = "/kaggle/input/jane-street-data-preprocessing/data_stats.pkl"

# Create the target directory if needed (no error when it already exists).
os.makedirs(stats_dir, exist_ok=True)
# Serialise the stats dict with joblib; the returned list of written file names
# is the cell's output.
joblib.dump(data_stats, stats_path)

['/kaggle/input/jane-street-data-preprocessing/data_stats.pkl']