In [1]:
import pandas as pd
import numpy as np
import gc

# --- 1. Configuration ---
class Config:
    """Filesystem locations read and written by this script."""

    # Input parquet loaded by the main block.
    DATA_PATH_TRAIN = '/kaggle/input/drw-crypto-market-prediction/train.parquet'

    # Destinations for the engineered feature matrix (X) and the target (y).
    OUTPUT_X_PATH = '/kaggle/working/X_engineered.parquet'
    OUTPUT_Y_PATH = '/kaggle/working/y_engineered.parquet'

# --- 2. Feature Engineering Function ---
def feature_engineer(df):
    """Add order-book, trade-flow and row-wise ``X_*`` summary features to ``df``.

    The frame is modified in place and the same object is returned. Calling
    the function again on an already-engineered frame reproduces the same
    values: the derived ``X_mean``/``X_std``/``X_skew``/``X_median``/
    ``X_kurtosis`` columns are never fed back into the row statistics even
    though their names share the ``X_`` prefix.

    Args:
        df: DataFrame containing ``bid_qty``, ``ask_qty``, ``buy_qty``,
            ``sell_qty`` and ``volume``. Every other column whose name starts
            with ``X_`` is used as an input to the row-wise statistics.

    Returns:
        ``df`` itself, with 17 engineered columns added (or overwritten if
        they already exist). Every NaN in the frame -- original columns
        included -- is replaced by 0, and +/-inf values in the engineered
        columns are replaced by 0 as well.

    Raises:
        KeyError: If one of the five required quantity/volume columns is
            missing.
    """
    print("Applying feature engineering...")
    epsilon = 1e-10  # added to denominators so an all-zero book/tape divides cleanly

    # Names produced below that also match the 'X_' prefix; they must not be
    # treated as raw inputs when the function runs on an engineered frame.
    derived_stat_names = ('X_mean', 'X_std', 'X_skew', 'X_median', 'X_kurtosis')
    x_cols = [
        col for col in df.columns
        if col.startswith('X_') and col not in derived_stat_names
    ]

    bid_qty, ask_qty = df['bid_qty'], df['ask_qty']
    buy_qty, sell_qty = df['buy_qty'], df['sell_qty']
    volume = df['volume']

    # Shared building blocks, computed once.
    total_depth = bid_qty + ask_qty
    traded_qty = buy_qty + sell_qty
    book_imbalance = (bid_qty - ask_qty) / (total_depth + epsilon)
    trade_imbalance = (buy_qty - sell_qty) / (traded_qty + epsilon)

    # Materialize the X_* sub-frame a single time for all five statistics.
    x_block = df[x_cols]

    # Insertion order below is the column order appended to a fresh frame.
    engineered = {
        # Imbalances, spreads, sizes, proxies, logs, and basic interactions
        'order_book_imbalance': book_imbalance,
        'trade_imbalance': trade_imbalance,
        'quantity_spread': ask_qty - bid_qty,
        'avg_trade_size': volume / (traded_qty + epsilon),
        # NOTE(review): same formula as 'order_book_imbalance', so this column
        # carries no extra information -- confirm whether a different
        # definition was intended. Values are kept as-is for compatibility.
        'wap_proxy': book_imbalance,
        'log_volume': np.log1p(volume),
        'log_ask_qty': np.log1p(ask_qty),
        'log_bid_qty': np.log1p(bid_qty),
        'imbalance_times_volume': book_imbalance * volume,
        # Row-wise statistics over the raw X_* columns
        'X_mean': x_block.mean(axis=1),
        'X_std': x_block.std(axis=1),
        'X_skew': x_block.skew(axis=1),
        'X_median': x_block.median(axis=1),
        'X_kurtosis': x_block.kurtosis(axis=1),
        # Advanced ratios and interactions
        'total_depth': total_depth,
        'activity_intensity': volume / (total_depth + epsilon),
        'imbalance_delta': trade_imbalance - book_imbalance,
    }
    del x_block  # release the wide temporary before writing into df

    # A non-zero numerator over a zero denominator yields +/-inf, which
    # fillna() does not touch; map it to NaN so the fill below covers it.
    for name, values in engineered.items():
        df[name] = values.replace([np.inf, -np.inf], np.nan)

    # Fill NaNs across the whole frame (0/0 results, undefined skew/kurtosis
    # on too few columns, and any NaNs already present in the input).
    df.fillna(0, inplace=True)
    return df

# --- Main Execution ---
if __name__ == '__main__':
    # Read the training parquet and extend it with the engineered columns.
    print("Loading and preparing data...")
    train_df = feature_engineer(pd.read_parquet(Config.DATA_PATH_TRAIN))

    # Model inputs: every column other than 'label', stored as float32.
    features = [name for name in train_df.columns if name != 'label']
    X = train_df[features].astype(np.float32)

    # Target: the 'label' column, also stored as float32.
    y = train_df['label'].astype(np.float32)

    # Persist the feature matrix.
    print(f"Saving X with shape {X.shape} to {Config.OUTPUT_X_PATH}")
    X.to_parquet(Config.OUTPUT_X_PATH)

    # Persist the target; a Series is wrapped in a one-column frame first.
    print(f"Saving y with shape {y.shape} to {Config.OUTPUT_Y_PATH}")
    y.to_frame().to_parquet(Config.OUTPUT_Y_PATH)

    print("Feature engineering complete and files saved.")

    # Drop the large frames and ask the collector to reclaim them.
    del train_df
    del X
    del y
    gc.collect()

Loading and preparing data...
Applying feature engineering...
Saving X with shape (525886, 802) to /kaggle/working/X_engineered.parquet
Saving y with shape (525886,) to /kaggle/working/y_engineered.parquet
Feature engineering complete and files saved.
