In [None]:
import polars as pl
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Read the raw training set from its Parquet file into a Polars DataFrame.
train_data = pl.read_parquet(source='../data/jane-street/train.parquet')

# Preview the first five rows; kept as the cell's last expression so the
# notebook renders it as rich output.
train_data.head(5)


In [None]:
# Two cleaning steps applied in sequence:
#   1. keep only rows with date_id >= 85 (rows with a smaller date_id are dropped);
#   2. replace missing values with 0.
# Note: fill_null replaces Polars nulls (missing values) only. Floating-point
# NaN values, if the data contains any, are a separate concept in Polars and
# are left untouched by this call.
train_data = (
    train_data
    .filter(pl.col("date_id") >= 85)
    .fill_null(0)
)


In [None]:
# Columns that must not be scaled: the fixed identifier/metadata columns
# listed here plus every column whose name starts with 'responder_'.
exclude_columns = [
    'date_id', 'time_id', 'symbol_id', 'weight', 'partition_id',
    *(name for name in train_data.columns if name.startswith('responder_')),
]

# Everything that is not excluded is treated as a feature column to scale.
# The selection is by name only -- no dtype check is performed here.
numerical_columns = [
    name for name in train_data.columns
    if name not in exclude_columns
]


In [None]:
# scikit-learn works on NumPy arrays, so extract the selected feature
# columns from the Polars frame as a single 2-D array.
numerical_data = train_data.select(numerical_columns).to_numpy()

# Fit a min-max scaler (default settings) on the feature matrix and apply it
# to that same matrix.
scaler = MinMaxScaler()
scaled_data = scaler.fit(numerical_data).transform(numerical_data)

# Wrap the scaled array back into a Polars frame carrying the original
# feature names.
scaled_df = pl.DataFrame(scaled_data, schema=numerical_columns)

# Swap the unscaled features for the scaled ones: drop the originals, then
# append the scaled columns on the right. The scaled features therefore end
# up after the remaining (non-feature) columns in the resulting frame.
train_data = train_data.drop(numerical_columns).hstack(scaled_df)


In [None]:
# Persist the cleaned, scaled frame as a Snappy-compressed Parquet file.
# train_data is a Polars DataFrame, which exposes write_parquet(); the
# pandas-style to_parquet(engine=...) call does not exist on Polars frames
# and raises AttributeError. Polars' native writer is used here; pass
# use_pyarrow=True if the file must be written through pyarrow instead.
train_data.write_parquet('../data/Encoder.parquet', compression='snappy')


In [None]:
# Report the column names of the frame that was prepared for saving, as a
# quick check of the final layout.
print(
    "Columns in saved data:",
    list(train_data.columns),
)