In [None]:
import polars as pl #resave only feature based files

# Define file paths
data_path = '/home/jupyter/data/jane-street/test.parquet'
output_path = '/home/jupyter/data/jane-street/test_cleaned.parquet'

# Load the dataset
print("Loading dataset...")
data = pl.read_parquet(data_path)

# Display all columns to identify responders
print("Columns in dataset:", data.columns)

# Define columns to drop (all responders)
columns_to_drop = [col for col in data.columns if "responder" in col]

# Drop the unnecessary responder columns
print(f"Dropping {len(columns_to_drop)} responder columns...")
cleaned_data = data.drop(columns_to_drop)

# Save the cleaned dataset
cleaned_data.write_parquet(output_path, compression="snappy")
print(f"Cleaned dataset saved to {output_path}.")


In [1]:
import polars as pl  #get rid of unneeded features and resave
import xgboost as xgb
import numpy as np
import optuna

# Weighted R² evaluation metric
def weighted_r2_score(y_true, y_pred, weights):
    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * y_true ** 2)
    r2 = 1 - (numerator / denominator)
    return r2

# Step 1: Load dataset lazily
print("Loading dataset lazily...")
data_lazy = pl.scan_parquet("/home/jupyter/data/XGFeatures_partitioned/")

# Step 2: Chunked Feature Importance Calculation (Manual Chunking)
feature_importance_agg = {}
chunk_size = 500_000  # Define chunk size for memory control
total_rows = data_lazy.select(pl.len()).collect().item()
num_chunks = total_rows // chunk_size + 1

print(f"Calculating feature importance using {num_chunks} chunks...")
for i in range(num_chunks):
    chunk = data_lazy.slice(i * chunk_size, chunk_size).collect()
    X_chunk = chunk.select(pl.exclude(["responder_6", "date_id", "time_id", "weight"]))
    y_chunk = chunk["responder_6"]
    weights_chunk = chunk["weight"]

    # Train a temporary XGBoost model on this chunk
    dtrain_chunk = xgb.DMatrix(X_chunk.to_numpy(), label=y_chunk.to_numpy(), weight=weights_chunk.to_numpy())

    params_temp = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "learning_rate": 0.1,
        "max_depth": 4,
        "device": "cuda",  # Proper GPU usage
        "tree_method": "hist",
        "seed": 42
    }
    temp_model = xgb.train(params_temp, dtrain_chunk, num_boost_round=20)

    # Correct feature extraction based on actual column names
    importance = temp_model.get_score(importance_type="weight")
    column_names = X_chunk.columns
    for feature_name, score in importance.items():
        try:
            feature_idx = int(feature_name[1:])  # Extract the feature number
            actual_feature_name = column_names[feature_idx]
            feature_importance_agg[actual_feature_name] = feature_importance_agg.get(actual_feature_name, 0) + score
        except (ValueError, IndexError):
            continue

# Step 3: Select Top 30 Features with Correct Column Handling
sorted_features = sorted(feature_importance_agg.items(), key=lambda x: x[1], reverse=True)
top_features = [f[0] for f in sorted_features[:30]]
print(f"Selected Top 30 Features: {top_features}")

# Step 4: Filter the entire dataset using the corrected top 30 features
print("Filtering dataset with top features...")
filtered_data = data_lazy.select(top_features + ["responder_6", "weight", "date_id", "time_id"]).collect()

# Step 5: Save Filtered Dataset for External Memory
train_path = "/home/jupyter/data/external_memory_train.parquet"
test_path = "/home/jupyter/data/external_memory_test.parquet"

split_date = filtered_data["date_id"].quantile(0.8)

train_data = filtered_data.filter(pl.col("date_id") < split_date)
test_data = filtered_data.filter(pl.col("date_id") >= split_date)

train_data.write_parquet(train_path)
test_data.write_parquet(test_path)
print("Filtered datasets saved for external memory training!")

Loading Parquet datasets for external memory usage...


KeyboardInterrupt: 

In [1]:
import polars as pl  #optuna hyperparameter tuning
import xgboost as xgb
import optuna
import numpy as np
import pickle
import os

# Weighted R² evaluation metric
def weighted_r2_score(y_true, y_pred, weights):
    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * y_true ** 2)
    r2 = 1 - (numerator / denominator)
    return r2

# Step 1: Load Parquet Files
train_path = "/home/jupyter/data/external_memory_train.parquet"
test_path = "/home/jupyter/data/external_memory_test.parquet"

print("Loading Parquet datasets for external memory usage...")
train_data = pl.read_parquet(train_path)
test_data = pl.read_parquet(test_path)

# Step 2: Prepare DMatrix with Parquet using Explicit Labels and Weights
X_train = train_data.select(pl.exclude(["responder_6", "weight", "date_id", "time_id"])).to_numpy()
y_train = train_data["responder_6"].to_numpy()
train_weights = train_data["weight"].to_numpy()

X_test = test_data.select(pl.exclude(["responder_6", "weight", "date_id", "time_id"])).to_numpy()
y_test = test_data["responder_6"].to_numpy()
test_weights = test_data["weight"].to_numpy()

# Explicitly define DMatrix using numpy arrays
dtrain_ext = xgb.DMatrix(X_train, label=y_train, weight=train_weights)
dtest_ext = xgb.DMatrix(X_test, label=y_test, weight=test_weights)

# Checkpoint path for Optuna study
optuna_checkpoint = "/home/jupyter/data/optuna_study.pkl"

# Step 3: Define the Optuna Objective Function for Hyperparameter Optimization with Chunking and Regularization
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "learning_rate": trial.suggest_float("learning_rate", 0.25, 0.3),
        "max_depth": trial.suggest_int("max_depth", 8, 9),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 3),
        "subsample": trial.suggest_float("subsample", 0.9, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.9, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 1.0),
        "device": "cuda",
        "tree_method": "hist",
        "seed": 42
    }

    batch_size = 10
    num_boost_round = 100
    model = None
    for i in range(0, num_boost_round, batch_size):
        model = xgb.train(
            params,
            dtrain_ext,
            num_boost_round=batch_size,
            xgb_model=model if i > 0 else None
        )

    # Evaluate Performance on Test Data
    y_pred = model.predict(dtest_ext)
    return -weighted_r2_score(y_test, y_pred, test_weights)

# Step 4: Check for existing Optuna checkpoint
if os.path.exists(optuna_checkpoint):
    print("Resuming existing Optuna study...")
    with open(optuna_checkpoint, "rb") as f:
        study = pickle.load(f)
else:
    print("Creating new Optuna study...")
    study = optuna.create_study(direction="maximize")

# Step 5: Run Bayesian Hyperparameter Optimization with Optuna
try:
    print("Running Bayesian optimization with Optuna...")
    study.optimize(objective, n_trials=100)
    with open(optuna_checkpoint, "wb") as f:
        pickle.dump(study, f)
except KeyboardInterrupt:
    print("Interrupted. Saving current progress...")
    with open(optuna_checkpoint, "wb") as f:
        pickle.dump(study, f)
    raise

# Step 6: Train Final Model with Optimized Hyperparameters Using Chunking
print("Training final model with optimized hyperparameters...")
best_params = study.best_params
batch_size = 10
num_boost_round = 100
final_model = None
for i in range(0, num_boost_round, batch_size):
    final_model = xgb.train(
        best_params,
        dtrain_ext,
        num_boost_round=batch_size,
        xgb_model=final_model if i > 0 else None
    )

# Step 7: Save Final Model
final_model.save_model("/home/jupyter/data/final_xgboost_model.json")
print("Final model saved successfully!")

# Step 8: Evaluate Final Model
print("Evaluating on test data...")
y_pred = final_model.predict(dtest_ext)
final_r2 = weighted_r2_score(y_test, y_pred, test_weights)
print(f"Final Weighted R² Score: {final_r2:.4f}")


Loading Parquet datasets for external memory usage...


[I 2025-01-04 05:56:48,439] A new study created in memory with name: no-name-03234175-805f-46d7-b1e7-d0cebc109e51


Creating new Optuna study...
Running Bayesian optimization with Optuna...


[I 2025-01-04 05:58:52,618] Trial 0 finished with value: -0.7341050505638123 and parameters: {'learning_rate': 0.2569775197414957, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.9977270835704746, 'colsample_bytree': 0.9414119024295959, 'reg_lambda': 0.15183940804495094, 'reg_alpha': 0.6968014803915679}. Best is trial 0 with value: -0.7341050505638123.
[I 2025-01-04 06:00:46,333] Trial 1 finished with value: -0.6965613067150116 and parameters: {'learning_rate': 0.2918470709430596, 'max_depth': 9, 'min_child_weight': 1, 'subsample': 0.99083237194199, 'colsample_bytree': 0.9121588042277565, 'reg_lambda': 0.26192821258898086, 'reg_alpha': 0.4096982819263813}. Best is trial 1 with value: -0.6965613067150116.
[I 2025-01-04 06:02:41,512] Trial 2 finished with value: -0.7237693965435028 and parameters: {'learning_rate': 0.27275364611760267, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.9354699133591238, 'colsample_bytree': 0.9377692899879054, 'reg_lambda': 0.6576882026622826, 'r

Training final model with optimized hyperparameters...
Final model saved successfully!
Evaluating on test data...
Final Weighted R² Score: 0.7251


In [7]:
test_data = pl.read_parquet('/home/jupyter/data/jane-street/test.parquet')
print(test_data.columns)


['row_id', 'date_id', 'time_id', 'symbol_id', 'weight', 'is_scored', 'feature_00', 'feature_01', 'feature_02', 'feature_03', 'feature_04', 'feature_05', 'feature_06', 'feature_07', 'feature_08', 'feature_09', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59', 'feature_60', 'feature_61', 'feature_62', 'feature_63', 'feature_64', 'feature_65', 'featur

In [4]:
#test against train.parquet provided in competition

import polars as pl
import xgboost as xgb
import pickle

# Step 1: Load the Pre-trained Model and Feature Names
print("Loading pre-trained model and metadata...")
model = xgb.Booster()
model.load_model("/home/jupyter/data/final_xgboost_model.json")

# Load saved feature names and best parameters for alignment
with open("/home/jupyter/data/final_model_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)
    feature_columns = metadata["feature_names"]

# Step 2: Load the Test Set
print("Loading test data...")
test_data = pl.read_parquet('/home/jupyter/data/jane-street/test.parquet')

# Step 3: Feature Engineering (Ensure Consistency with Training)
exclude_columns = ['date_id', 'time_id', 'symbol_id', 'weight', 'partition_id', 'responder_6']

# Remove boolean columns for feature engineering consistency
numerical_columns = [
    col for col in test_data.columns 
    if col not in exclude_columns and test_data[col].dtype != pl.Boolean
]

# Generate Lagged Features
for feature in numerical_columns:
    test_data = test_data.with_columns(
        pl.col(feature).shift(1).over("symbol_id").alias(f"{feature}_lag_1")
    )

# Generate Difference Features
for feature in numerical_columns:
    test_data = test_data.with_columns(
        (pl.col(feature).cast(pl.Float64) - pl.col(f'{feature}_lag_1').cast(pl.Float64))
        .alias(f'{feature}_lag_diff_1')
    )

# Generate Ratio Features
for feature in numerical_columns:
    test_data = test_data.with_columns(
        (pl.col(feature).cast(pl.Float64) / (pl.col(f'{feature}_lag_1').cast(pl.Float64) + 1e-9))
        .alias(f'{feature}_lag_ratio_1')
    )

# Step 4: Handle Missing Values (Fill Nulls After Feature Engineering)
test_data = test_data.fill_null(0)

# ✅ **Step 5: Align Features with Model's Trained Features**
# Ensure test data has the exact columns used during training
if feature_columns:
    print(f"Aligning test data with {len(feature_columns)} model features...")
    test_data = test_data.select(feature_columns)
else:
    raise ValueError("Feature columns were not found in the metadata. Cannot proceed.")

# Convert the test data into a DMatrix using the same feature order
X_test = test_data.to_numpy()
dtest = xgb.DMatrix(X_test, feature_names=feature_columns)

# Step 6: Make Predictions
print("Making predictions...")
predictions = model.predict(dtest)

# Step 7: Add Predictions to the DataFrame
test_data = test_data.with_columns(
    pl.Series(name="prediction", values=predictions)
)

# Step 8: Prepare the Submission File (CSV)
submission = test_data.select(["date_id", "symbol_id", "prediction"])
submission.write_csv("/home/jupyter/data/submission.csv")
print("Submission file created successfully!")

# ✅ Final Check
print("Predictions successfully made and saved!")


Loading pre-trained model and metadata...
Loading test data...
Aligning test data with 30 model features...


ColumnNotFoundError: responder_0

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'select' <---
DF ["row_id", "date_id", "time_id", "symbol_id"]; PROJECT */325 COLUMNS; SELECTION: None

In [2]:
from sklearn.model_selection import KFold

def cross_validate_with_chunking(X, y, weights, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        weights_train, weights_val = weights[train_idx], weights[val_idx]

        # Convert to DMatrix
        dtrain = xgb.DMatrix(X_train, label=y_train, weight=weights_train)
        dval = xgb.DMatrix(X_val, label=y_val, weight=weights_val)

        # Train with the best parameters from Optuna
        model = xgb.train(best_params, dtrain, num_boost_round=100)
        
        # Predict and score
        y_pred = model.predict(dval)
        score = weighted_r2_score(y_val, y_pred, weights_val)
        scores.append(score)

    avg_score = np.mean(scores)
    print(f"Average Weighted R² Score (CV): {avg_score:.4f}")

# Run the cross-validation
cross_validate_with_chunking(X_train, y_train, train_weights)


Average Weighted R² Score (CV): 0.8823
