# Hull Tactical Market Prediction — GradientBoosting Baseline

In [1]:
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Position size that ``post_process_signal`` assigns when the raw prediction
# is above the threshold.
ALPHA_FOR_SCORER = 0.600132

# Threshold the raw prediction must exceed for a non-zero position.
TAU_ABS_FOR_SCORER = 9.43717e-05

# Lower and upper bounds the final position is clipped to.
MIN_INVESTMENT = 0.0
MAX_INVESTMENT = 2.0

# Not referenced in the visible code; presumably trading days per year
# for annualisation -- TODO confirm.
TRADING_DAYS = 252

In [3]:
def post_process_signal(y_pred,
                        *,
                        tau: float = TAU_ABS_FOR_SCORER,
                        alpha: float = ALPHA_FOR_SCORER,
                        min_investment: float = MIN_INVESTMENT,
                        max_investment: float = MAX_INVESTMENT):
    """Turn raw model outputs into clipped position sizes.

    Args:
        y_pred: Raw predictions; any array-like, flattened to 1-D floats.
        tau: Threshold a prediction must strictly exceed to take a position.
        alpha: Position size assigned to predictions above ``tau``.
        min_investment: Lower clip bound for the returned positions.
        max_investment: Upper clip bound for the returned positions.

    Returns:
        1-D NumPy array with one position per prediction: ``alpha`` where the
        prediction is above ``tau`` and ``0.0`` elsewhere, clipped to
        ``[min_investment, max_investment]``.
    """
    signal = np.asarray(y_pred, dtype=float)
    signal = signal.ravel()

    # Strict comparison: NaN predictions compare False and get a 0.0 position.
    above_threshold = signal > tau
    position = np.where(above_threshold, alpha, 0.0)

    return np.clip(position, min_investment, max_investment)

## Load Data

In [4]:
# Competition input directory (trailing slash is part of the value).
PATH = "/kaggle/input/hull-tactical-market-prediction/"
TARGET = "forward_returns"
# Columns removed from the training frame whenever they are present.
DROP_IF_EXISTS = ["row_id", "id", "risk_free_rate", "market_forward_excess_returns"]

train = pd.read_csv(f"{PATH}train.csv")

# Fail early with the full column list if the target is missing.
if TARGET not in train.columns:
    raise ValueError(f"Expected target column '{TARGET}' in train.csv; found: {list(train.columns)}")

# Keep every remaining column, in its original order.
use_cols = [name for name in train.columns if name not in DROP_IF_EXISTS]
train = train[use_cols]

## Preprocessing

In [5]:
def preprocess(df):
    """Return a copy of *df* with sparse columns dropped and gaps imputed.

    Columns whose null fraction is above 0.5 are removed. In the remaining
    columns, nulls are filled with the column median when the column has a
    NumPy integer or float dtype of any width, and with the column's first
    mode otherwise. A column with no mode is left untouched.

    All statistics are computed from *df* itself, so the result depends on
    which rows are passed in.

    Args:
        df: Input ``pandas.DataFrame``; it is not modified.

    Returns:
        A new ``pandas.DataFrame`` with the transformations applied.
    """
    df = df.copy()

    high_null_cols = [c for c in df.columns if df[c].isnull().mean() > 0.5]
    df = df.drop(columns=high_null_cols, errors='ignore')

    for col in df.columns:
        series = df[col]
        if not series.isnull().any():
            # Nothing to fill; skip the median/mode computation entirely.
            continue

        dtype = series.dtype
        # Match on dtype *kind* so float32/float16 (and any int width) take
        # the median path too, not only float64/int64. Extension, bool and
        # object dtypes deliberately stay on the mode path.
        if isinstance(dtype, np.dtype) and dtype.kind in "iuf":
            df[col] = series.fillna(series.median())
        else:
            modes = series.mode()
            if not modes.empty:
                df[col] = series.fillna(modes.iloc[0])

    return df

In [6]:
# NOTE(review): imputation statistics are computed on the full frame before
# the split, so validation rows contribute to them -- confirm this is intended.
train = preprocess(train)

y = train[TARGET]
X = train.drop(columns=[TARGET])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Column order the model is trained on; reused at inference time.
feature_cols = list(X.columns)

# NOTE(review): train_test_split shuffles rows by default; if rows are
# time-ordered, validation rows can precede training rows -- confirm.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

Features shape: (8990, 87)
Target shape: (8990,)


## Model Training

In [7]:
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(x_train)
X_train_scaled = scaler.transform(x_train)
X_val_scaled = scaler.transform(x_val)
from lightgbm import LGBMRegressor
# Hyper-parameters for the gradient-boosting regressor, gathered in one place.
gbr_params = {
    "n_estimators": 1350,
    "learning_rate": 0.04,
    "max_depth": 12,
    "random_state": 123,
    "verbose": 0,
}
model = GradientBoostingRegressor(**gbr_params)

print("training model...")
model.fit(X_train_scaled, y_train)
print("training completed!")

training model...
training completed!


In [8]:
def _rmse(actual, predicted):
    """Return the root-mean-squared error between two aligned vectors."""
    return np.sqrt(mean_squared_error(actual, predicted))


y_pred_train = model.predict(X_train_scaled)
y_pred_val = model.predict(X_val_scaled)

train_rmse = _rmse(y_train, y_pred_train)
val_rmse = _rmse(y_val, y_pred_val)

print(f"training RMSE: {train_rmse:.4f}")
print(f"validation RMSE: {val_rmse:.4f}")
# Validation error relative to training error; larger means more overfitting.
print(f"overfitting ratio: {val_rmse/train_rmse:.4f}")

training RMSE: 0.0004
validation RMSE: 0.0109
overfitting ratio: 25.2049


## Prediction Function

In [9]:
# Per-feature medians of the training matrix. ``X`` comes out of
# ``preprocess``, which filled numeric gaps with the column median, so these
# are the same values that stood in for missing data at training time.
# Assumes ``X`` still refers to the frame built in the split cell.
TRAIN_FEATURE_MEDIANS = X[feature_cols].median()


def predict(test: pl.DataFrame) -> float:
    """Return the position size for a single test row.

    The row is aligned to the training feature columns, missing values are
    filled with the training medians, and the result is scaled, scored by the
    model and mapped to a position by ``post_process_signal``.

    Args:
        test: A Polars DataFrame containing exactly one row.

    Returns:
        The position for that row as a Python float.

    Raises:
        TypeError: If ``test`` is not a Polars DataFrame.
        ValueError: If ``test`` does not have exactly one row.
    """
    if not isinstance(test, pl.DataFrame):
        raise TypeError("predict(test): expected a Polars DataFrame input")

    if test.height != 1:
        raise ValueError(f"predict(test): expected a single-row Polars DataFrame, got {test.height} rows")

    # Select exactly the training features, in training order. This also
    # discards the target, the DROP_IF_EXISTS columns and any test-only
    # columns; features absent from the row become NaN.
    row = test.to_pandas().reindex(columns=feature_cols)

    # An all-null column can arrive with a non-numeric dtype; coerce so the
    # scaler always receives floats.
    row = row.apply(pd.to_numeric, errors="coerce").astype(float)

    # Do NOT run ``preprocess`` here: on a one-row frame every missing
    # feature has a null fraction of 1.0, so the column is dropped and then
    # re-created as a raw 0, which does not match the median imputation the
    # model was trained with.
    row = row.fillna(TRAIN_FEATURE_MEDIANS)

    test_scaled = scaler.transform(row)

    raw = model.predict(test_scaled)
    pos = post_process_signal(raw)
    return float(np.asarray(pos).ravel()[0])

## Inference Server

In [10]:
import kaggle_evaluation.default_inference_server as kis
import os

# Wrap ``predict`` in the competition's inference server.
inference_server = kis.DefaultInferenceServer(predict)

# Serve when KAGGLE_IS_COMPETITION_RERUN is set to a non-empty value;
# otherwise run the local gateway against the competition input directory.
running_as_rerun = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))
if not running_as_rerun:
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    inference_server.serve()