# Adapted from https://www.kaggle.com/code/motono0223/js24-preprocessing-create-lags

In [1]:
!pip install --quiet -r requirements.txt

[0m

In [2]:
# Print the GPU / driver status table for the machine running this notebook.
!nvidia-smi

Sun Dec  8 02:21:38 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060        Off |   00000000:01:00.0  On |                  N/A |
|  0%   39C    P5             19W /  170W |     953MiB /  12288MiB |     15%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import warnings
warnings.filterwarnings('ignore')

import os, gc
import pandas as pd
import polars as pl
import random
import glob
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Force a garbage-collection pass and report the count returned by gc.collect().
collected = gc.collect()
print(f"Garbage collector: collected {collected:d} objects.")

Garbage collector: collected 0 objects.


# Configurations

In [5]:
class CONFIG:
    """Notebook-wide settings, read as class attributes (never instantiated here)."""

    # Column the integer "label" is derived from when the training data is loaded.
    target_col = "responder_6"

    # Columns copied into the lag table: the two join keys followed by responder_0..responder_8.
    lag_cols_original = ["date_id", "symbol_id", *(f"responder_{idx}" for idx in range(9))]

    # Maps every responder column (everything after the two keys) to its "_lag_1" name.
    lag_cols_rename = {name: f"{name}_lag_1" for name in lag_cols_original[2:]}

    # Fraction of rows used to size the validation split.
    valid_ratio = 0.10

    # Rows are kept only when date_id is strictly greater than this value.
    start_dt = 0

# Load training data

In [6]:
# Run switches for this notebook:
#   TEST - when True, CONFIG.start_dt is raised to 1100 so only later dates are kept.
#   SAVE - read by the final cell to decide whether the splits are written to disk.
TEST = True
SAVE = True
PATH = os.getcwd() + '/input/'

if TEST:
    # The only difference between test and full mode is the start date.
    CONFIG.start_dt = 1100

# Single lazy query shared by both modes (previously duplicated in each branch).
train_df = (
    pl.scan_parquet(PATH + 'train.parquet/partition_id=*/*.parquet')
    .select(
        pl.int_range(pl.len(), dtype=pl.UInt32).alias("id"),  # running row number over the scan
        pl.all(),
    )
    .with_columns(
        (pl.col(CONFIG.target_col) * 2).cast(pl.Int32).alias("label"),
    )
    # Strict comparison: rows with date_id == CONFIG.start_dt are excluded.
    .filter(pl.col("date_id").gt(CONFIG.start_dt))
)

In [7]:
# Display the lazy frame built with scan_parquet; .collect() has not been called on it yet.
train_df

# Create lags data from training data

In [8]:
# Build the lag table: responder values per (date_id, symbol_id), renamed to
# "*_lag_1" and re-keyed to the following date so they line up with the next day's rows.
lags = (
    train_df
    .select(pl.col(CONFIG.lag_cols_original))
    .rename(CONFIG.lag_cols_rename)
    .with_columns((pl.col("date_id") + 1).alias("date_id"))  # lagged by 1 day
    .group_by(["date_id", "symbol_id"], maintain_order=True)
    .last()  # pick up last record of previous date
)

In [9]:
# Attach the lagged responder columns to every training row; rows with no
# matching (date_id, symbol_id) in `lags` are kept because this is a left join.
# NOTE: this rebinds train_df, so re-running the cell joins an already-joined frame.
train_df = train_df.join(
    lags,
    how="left",
    on=["date_id", "symbol_id"],
)

# Split training and validation data

In [10]:
# Materialise the date_id column a single time (the lazy query, join included, was
# previously collected twice). Sorting makes the cut-off independent of whatever
# row order the joined frame happens to have.
date_ids = train_df.select(pl.col("date_id")).collect().get_column("date_id").sort()

len_train = len(date_ids)
if len_train == 0:
    raise ValueError("train_df has no rows; cannot derive a train/validation split date")

valid_records = int(len_train * CONFIG.valid_ratio)
len_ofl_mdl = len_train - valid_records # length of offline model train data

# The last training row lives at index len_ofl_mdl - 1. Indexing at len_ofl_mdl
# reads the first validation row instead and is out of bounds when valid_records == 0.
last_tr_dt = date_ids[max(len_ofl_mdl - 1, 0)]
del date_ids  # release the materialised column; only the scalars are needed below

print(f"\n len_train = {len_train}")
print(f"\n len_ofl_mdl = {len_ofl_mdl}")
print(f"\n---> Last offline train date = {last_tr_dt}\n")

# Split on whole dates so one date never straddles both sets; the training set may
# therefore hold slightly more than len_ofl_mdl rows.
training_data = train_df.filter(pl.col("date_id").le(last_tr_dt))
validation_data   = train_df.filter(pl.col("date_id").gt(last_tr_dt))


 len_train = 22104280

 len_ofl_mdl = 19893852

---> Last offline train date = 1639



In [11]:
#validation_data.show_graph() # I don't have graphviz installed lmao

# Save data as parquets

In [12]:
# Write both splits to ./input/ when SAVE is set. TEST runs write to the
# "*_TEST.parquet" paths; full runs pass partition_by="date_id" to write_parquet.
if SAVE:
    if TEST:
        train_path, valid_path = "./input/training_TEST.parquet", "./input/validation_TEST.parquet"
        write_kwargs = {}
    else:
        train_path, valid_path = "./input/training.parquet", "./input/validation.parquet"
        write_kwargs = {"partition_by": "date_id"}

    # Each lazy split is executed here, training first, then validation.
    training_data.collect().write_parquet(train_path, **write_kwargs)
    validation_data.collect().write_parquet(valid_path, **write_kwargs)