In [1]:

# ----------------------------------------------------
# NOTEBOOK 2: LGBM TRAINING & TEST PREDICTION
# ----------------------------------------------------
import os
import pandas as pd
import numpy as np
import gc
import lightgbm as lgb
from sklearn.model_selection import KFold
from scipy.stats import pearsonr

def feature_engineer(df):
    """Append the hand-crafted order-book, trade and X-summary features.

    The frame is modified in place: 17 feature columns are added (or
    overwritten if they already exist) and, as a final step, every NaN in the
    *whole* frame -- original columns included -- is replaced by 0.

    Args:
        df: DataFrame containing the columns ``bid_qty``, ``ask_qty``,
            ``buy_qty``, ``sell_qty`` and ``volume``, plus any number of
            columns whose names start with ``X_``.

    Returns:
        The same DataFrame object that was passed in.
    """
    print("Applying feature engineering...")

    # Captured before any column is added: the summary columns created below
    # also start with "X_" and must not be counted as inputs on this pass.
    x_names = [name for name in df.columns if name.startswith('X_')]
    # Added to every denominator so that empty books / no trades give 0, not inf.
    eps = 1e-10

    bid, ask = df['bid_qty'], df['ask_qty']
    buy, sell = df['buy_qty'], df['sell_qty']
    volume = df['volume']

    depth = bid + ask      # resting quantity on both sides of the book
    traded = buy + sell    # executed quantity on both sides
    book_imbalance = (bid - ask) / (depth + eps)
    flow_imbalance = (buy - sell) / (traded + eps)

    # Imbalances, spreads, sizes, proxies, logs and a basic interaction.
    df['order_book_imbalance'] = book_imbalance
    df['trade_imbalance'] = flow_imbalance
    df['quantity_spread'] = ask - bid
    df['avg_trade_size'] = volume / (traded + eps)
    # NOTE(review): same formula as order_book_imbalance, so this column is an
    # exact duplicate -- kept as-is; confirm whether a different proxy was intended.
    df['wap_proxy'] = book_imbalance
    df['log_volume'] = np.log1p(volume)
    df['log_ask_qty'] = np.log1p(ask)
    df['log_bid_qty'] = np.log1p(bid)
    df['imbalance_times_volume'] = book_imbalance * volume

    # Row-wise summary statistics over the X_ columns. The frame is re-read
    # for every statistic, so each one sees the columns as they are right now.
    row_statistics = (
        ('X_mean', pd.DataFrame.mean),
        ('X_std', pd.DataFrame.std),
        ('X_skew', pd.DataFrame.skew),
        ('X_median', pd.DataFrame.median),
        ('X_kurtosis', pd.DataFrame.kurtosis),
    )
    for column, reducer in row_statistics:
        df[column] = reducer(df[x_names], axis=1)

    # Depth-based ratios and the gap between trade flow and book imbalance.
    df['total_depth'] = depth
    df['activity_intensity'] = volume / (depth + eps)
    df['imbalance_delta'] = flow_imbalance - book_imbalance

    # Zero-fill every remaining NaN (from NaN inputs or undefined statistics).
    df.fillna(0, inplace=True)
    return df


# --- Configuration ---
class Config:
    """Static settings for the LightGBM run: cross-validation, inputs, outputs."""

    # Cross-validation: number of folds, and the seed shared by the fold
    # splitter and the LightGBM parameters.
    N_SPLITS = 5
    RANDOM_STATE = 42

    # Inputs: pre-engineered training features and labels, plus the raw test
    # set (which is feature-engineered by this script before prediction).
    INPUT_X_PATH = '/kaggle/input/feature-engineering/X_engineered.parquet'
    INPUT_Y_PATH = '/kaggle/input/feature-engineering/y_engineered.parquet'
    DATA_PATH_TEST = '/kaggle/input/drw-crypto-market-prediction/test.parquet'

    # Outputs: one saved model per fold, out-of-fold and test predictions,
    # and the list of top-ranked feature names.
    OUTPUT_MODEL_DIR = '/kaggle/working/lgbm_models/'
    OUTPUT_OOF_PATH = '/kaggle/working/oof_lgbm.npy'
    OUTPUT_TEST_PREDS_PATH = '/kaggle/working/test_preds_lgbm.npy'
    OUTPUT_FEATURES_PATH = '/kaggle/working/sorted_features.csv'

if __name__ == '__main__':
    # Each fold writes its model file into this directory, so it must exist
    # before training starts.
    os.makedirs(Config.OUTPUT_MODEL_DIR, exist_ok=True)

    print("Loading engineered train data...")
    X = pd.read_parquet(Config.INPUT_X_PATH)
    y = pd.read_parquet(Config.INPUT_Y_PATH)['label']
    # The training columns define exactly which features the test frame must supply.
    features = X.columns.to_list()

    print("Loading and engineering test data...")
    # The raw test rows get the feature engineering applied here, then are
    # restricted to the training columns and cast to float32.
    X_test = feature_engineer(pd.read_parquet(Config.DATA_PATH_TEST))
    X_test = X_test.loc[:, features].astype(np.float32)

    print("\n--- Starting LightGBM Training & Prediction ---")

    lgb_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': 3000,
        'learning_rate': 0.01,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'num_leaves': 128,
        'verbose': -1,
        'n_jobs': -1,
        'seed': Config.RANDOM_STATE,
        'boosting_type': 'gbdt',
    }

    # NOTE(review): shuffle=True assigns rows to folds at random. If the rows
    # are time-ordered, neighbouring rows end up on both sides of each split
    # and the OOF score may be optimistic -- confirm this is intended.
    kf = KFold(n_splits=Config.N_SPLITS, shuffle=True, random_state=Config.RANDOM_STATE)

    # Accumulators filled fold by fold.
    oof_lgbm = np.zeros(len(X))              # out-of-fold prediction per training row
    test_preds_lgbm = np.zeros(len(X_test))  # running mean of the fold models' test predictions
    feature_importances = pd.DataFrame(index=features)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"LGBM Fold {fold+1}/{Config.N_SPLITS}")
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # Early stopping watches the validation fold and halts after 100
        # rounds without improvement.
        model = lgb.LGBMRegressor(**lgb_params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(100, verbose=False)],
        )

        # Persist this fold's booster.
        model_path = os.path.join(Config.OUTPUT_MODEL_DIR, f'lgbm_fold_{fold+1}.txt')
        model.booster_.save_model(model_path)
        print(f"Saved model to {model_path}")

        # Predictions are clipped to [-5, 5]; each fold contributes an equal
        # 1/N_SPLITS share to the averaged test prediction.
        oof_lgbm[val_idx] = np.clip(model.predict(X_val), -5, 5)
        test_preds_lgbm += np.clip(model.predict(X_test), -5, 5) / Config.N_SPLITS

        feature_importances[f'fold_{fold+1}'] = model.feature_importances_

    # --- Score the run and save all artifacts ---
    # Computed once all fold columns exist, so 'mean' averages the folds only.
    feature_importances['mean'] = feature_importances.mean(axis=1)
    lgbm_score = pearsonr(y, oof_lgbm)[0]
    print(f"\nLGBM OOF Pearson Correlation: {lgbm_score:.5f}")

    # Rank features by mean importance across folds and keep the leading ones.
    NUM_TOP_FEATURES = 250
    sorted_features = feature_importances.sort_values(by='mean', ascending=False).index.tolist()
    top_features = sorted_features[:NUM_TOP_FEATURES]

    # Written as one feature name per line, without header or index column.
    print(f"Saving top {NUM_TOP_FEATURES} feature list to {Config.OUTPUT_FEATURES_PATH}")
    pd.Series(top_features).to_csv(Config.OUTPUT_FEATURES_PATH, index=False, header=False)

    print(f"Saving LGBM OOF predictions to {Config.OUTPUT_OOF_PATH}")
    np.save(Config.OUTPUT_OOF_PATH, oof_lgbm)

    print(f"Saving LGBM Test predictions to {Config.OUTPUT_TEST_PREDS_PATH}")
    np.save(Config.OUTPUT_TEST_PREDS_PATH, test_preds_lgbm)

    print("LGBM training complete.")

Loading engineered train data...
Loading and engineering test data...
Applying feature engineering...

--- Starting LightGBM Training & Prediction ---
LGBM Fold 1/5
Saved model to /kaggle/working/lgbm_models/lgbm_fold_1.txt
LGBM Fold 2/5
Saved model to /kaggle/working/lgbm_models/lgbm_fold_2.txt
LGBM Fold 3/5
Saved model to /kaggle/working/lgbm_models/lgbm_fold_3.txt
LGBM Fold 4/5
Saved model to /kaggle/working/lgbm_models/lgbm_fold_4.txt
LGBM Fold 5/5
Saved model to /kaggle/working/lgbm_models/lgbm_fold_5.txt

LGBM OOF Pearson Correlation: 0.97070
Saving top 250 feature list to /kaggle/working/sorted_features.csv
Saving LGBM OOF predictions to /kaggle/working/oof_lgbm.npy
Saving LGBM Test predictions to /kaggle/working/test_preds_lgbm.npy
LGBM training complete.
