<a href="https://www.kaggle.com/code/kriti83/market-prediction?scriptVersionId=281588969" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hull-tactical-market-prediction/train.csv
/kaggle/input/hull-tactical-market-prediction/test.csv
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_inference_server.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2_grpc.py
/kaggl

In [2]:
import os
import pandas as pd
import numpy as np
import polars as pl
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler
import kaggle_evaluation.default_inference_server

df = pd.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv")
print(f"Initial shape: {df.shape}")

Initial shape: (9021, 98)


In [3]:
# 1. Handle missing values
missing_pct = df.isnull().sum() / len(df)
keep_cols = missing_pct[missing_pct < 0.4].index.tolist()
df = df[keep_cols]

for col in df.columns:
    if df[col].isnull().any():
        df[col].fillna(df[col].median())

print(f"After missing value handling: {df.shape}")

After missing value handling: (9021, 90)


# Feature Engineering

In [4]:
def create_features(data):
    """Focus on features that create diverse predictions"""
    data = data.copy()
    
    # Historical returns
    data['past_return'] = data['forward_returns'].shift(1)
    data['past_market'] = data['market_forward_excess_returns'].shift(1)
    
    # Momentum - multiple timeframes
    for window in [3, 5, 10, 20, 40, 60]:
        data[f'momentum_{window}'] = data['past_return'].rolling(window).sum()
        data[f'market_momentum_{window}'] = data['past_market'].rolling(window).sum()
    
    # Volatility
    for window in [5, 10, 20, 40, 60]:
        data[f'vol_{window}'] = data['past_return'].rolling(window).std()
        data[f'market_vol_{window}'] = data['past_market'].rolling(window).std()
    
    # Moving averages
    for window in [5, 10, 20, 40, 60]:
        data[f'ma_{window}'] = data['past_return'].rolling(window).mean()
    
    # Mean reversion
    data['dev_ma_10'] = data['past_return'] - data['ma_10']
    data['dev_ma_20'] = data['past_return'] - data['ma_20']
    data['dev_ma_60'] = data['past_return'] - data['ma_60']
    
    # Trend
    for window in [10, 20, 40]:
        data[f'trend_{window}'] = data['past_return'].rolling(window).apply(
            lambda x: np.polyfit(range(len(x)), x, 1)[0] if len(x) == window else 0, 
            raw=True
        )
    
    # Sharpe ratios
    data['sharpe_10'] = data['ma_10'] / (data['vol_10'] + 1e-6)
    data['sharpe_20'] = data['ma_20'] / (data['vol_20'] + 1e-6)
    data['sharpe_60'] = data['ma_60'] / (data['vol_60'] + 1e-6)
    
    # Regime indicators
    data['up_ratio_10'] = (data['past_return'] > 0).rolling(10).mean()
    data['up_ratio_20'] = (data['past_return'] > 0).rolling(20).mean()
    data['strong_up_20'] = (data['past_return'] > 0.01).rolling(20).sum()
    data['strong_down_20'] = (data['past_return'] < -0.01).rolling(20).sum()
    
    # Volatility regime
    data['vol_ratio_short_long'] = data['vol_10'] / (data['vol_60'] + 1e-6)
    data['vol_vs_market'] = data['vol_20'] / (data['market_vol_20'] + 1e-6)
    
    # Relative to market
    data['vs_market_5'] = data['momentum_5'] - data['market_momentum_5']
    data['vs_market_20'] = data['momentum_20'] - data['market_momentum_20']
    data['vs_market_60'] = data['momentum_60'] - data['market_momentum_60']
    
    # Range position
    data['max_20'] = data['past_return'].rolling(20).max()
    data['min_20'] = data['past_return'].rolling(20).min()
    data['range_position'] = (data['past_return'] - data['min_20']) / (data['max_20'] - data['min_20'] + 1e-6)
    
    # Acceleration
    data['accel_5'] = data['momentum_5'] - data['momentum_5'].shift(5)
    data['accel_10'] = data['momentum_10'] - data['momentum_10'].shift(10)
    data['accel_20'] = data['momentum_20'] - data['momentum_20'].shift(20)
    
    # Crossovers
    data['ma_cross_5_20'] = data['ma_5'] - data['ma_20']
    data['ma_cross_10_40'] = data['ma_10'] - data['ma_40']
    
    # Recent lags
    for lag in [1, 2, 3, 5, 10]:
        data[f'ret_lag_{lag}'] = data['past_return'].shift(lag)
    
    # D features
    d_cols = [col for col in data.columns if col.startswith('D')]
    if len(d_cols) > 0:
        data['d_sum'] = data[d_cols].sum(axis=1)
        data['d_count'] = (data[d_cols] > 0).sum(axis=1)
    
    return data

df = create_features(df)
df = df.dropna()
print(f"After feature engineering: {len(df)} rows")

After feature engineering: 5484 rows


  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)


# Create Target

In [5]:
def create_balanced_target(data):
    """
    Create a more balanced target - equal bullish and bearish opportunities
    """
    returns = data['forward_returns'].values
    market_returns = data['market_forward_excess_returns'].values
    
    # Calculate dynamic thresholds based on distribution
    p95 = np.percentile(returns, 95)
    p90 = np.percentile(returns, 90)
    p85 = np.percentile(returns, 85)
    p75 = np.percentile(returns, 75)
    
    p25 = np.percentile(returns, 25)
    p15 = np.percentile(returns, 15)
    p10 = np.percentile(returns, 10)
    p5 = np.percentile(returns, 5)
    
    allocation = np.ones(len(returns))
    
    for i in range(len(returns)):
        ret = returns[i]
        mkt = market_returns[i]
        
        # Symmetric allocation around 1.0
        if ret > p95:
            alloc = 1.7
        elif ret > p90:
            alloc = 1.5
        elif ret > p85:
            alloc = 1.3
        elif ret > p75:
            alloc = 1.15
        elif ret > np.median(returns):
            alloc = 1.05
        elif ret < p5:
            alloc = 0.3
        elif ret < p10:
            alloc = 0.5
        elif ret < p15:
            alloc = 0.7
        elif ret < p25:
            alloc = 0.85
        else:
            alloc = 0.95
        
        # Market adjustments
        if mkt < -0.03:
            alloc *= 0.7
        elif mkt > 0.03:
            alloc *= 1.15
        
        allocation[i] = alloc
    
    return np.clip(allocation, 0, 2)

df['target'] = create_balanced_target(df)

print(f"\nTarget distribution:")
print(df['target'].describe())
print(f"\nTarget bins:")
bins = [0, 0.5, 0.8, 0.95, 1.05, 1.2, 1.5, 2.0]
target_bins = pd.cut(df['target'], bins=bins)
print(target_bins.value_counts().sort_index())

# Check balance
below_one = (df['target'] < 1.0).sum()
above_one = (df['target'] > 1.0).sum()
print(f"\nTarget balance: {below_one} below 1.0, {above_one} above 1.0")


Target distribution:
count    5484.000000
mean        1.001425
std         0.309318
min         0.210000
25%         0.925000
50%         1.000000
75%         1.075000
max         1.955000
Name: target, dtype: float64

Target bins:
target
(0.0, 0.5]       549
(0.5, 0.8]       274
(0.8, 0.95]     1919
(0.95, 1.05]    1371
(1.05, 1.2]      548
(1.2, 1.5]       548
(1.5, 2.0]       275
Name: count, dtype: int64

Target balance: 2742 below 1.0, 2742 above 1.0


# Split data

In [6]:
train_size = int(len(df) * 0.75)
train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:]

exclude = ['date_id', 'forward_returns', 'risk_free_rate', 
           'market_forward_excess_returns', 'target',
           'past_return', 'past_market']
features = [c for c in df.columns if c not in exclude]

print(f"\nFeature count: {len(features)}")
print(f"Train: {len(train_df)}, Val: {len(val_df)}")

X_train = train_df[features].values
y_train = train_df['target'].values
X_val = val_df[features].values
y_val = val_df['target'].values


Feature count: 146
Train: 4113, Val: 1371


# Train Validation Model

In [7]:
scaler = RobustScaler()  # Less sensitive to outliers
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

print("\nTraining Gradient Boosting models...")

# GB Model 1 - Deeper trees
gb1 = GradientBoostingRegressor(
    n_estimators=250,
    max_depth=8,
    min_samples_leaf=3,
    learning_rate=0.06,
    subsample=0.8,
    max_features='sqrt',
    random_state=42
)
gb1.fit(X_train, y_train)

# GB Model 2 - More trees, shallower
gb2 = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=6,
    min_samples_leaf=5,
    learning_rate=0.04,
    subsample=0.85,
    max_features='sqrt',
    random_state=123
)
gb2.fit(X_train, y_train)

# GB Model 3 - Balanced
gb3 = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=7,
    min_samples_leaf=4,
    learning_rate=0.07,
    subsample=0.8,
    max_features='sqrt',
    random_state=456
)
gb3.fit(X_train, y_train)

print("Models trained!")



Training Gradient Boosting models...
Models trained!


# Calculate validation score

In [8]:
pred1 = gb1.predict(X_val)
pred2 = gb2.predict(X_val)
pred3 = gb3.predict(X_val)

print("\n" + "="*60)
print("RAW PREDICTIONS")
print("="*60)
print(f"GB1 - Mean: {pred1.mean():.3f}, Std: {pred1.std():.3f}, Min: {pred1.min():.3f}, Max: {pred1.max():.3f}")
print(f"GB2 - Mean: {pred2.mean():.3f}, Std: {pred2.std():.3f}, Min: {pred2.min():.3f}, Max: {pred2.max():.3f}")
print(f"GB3 - Mean: {pred3.mean():.3f}, Std: {pred3.std():.3f}, Min: {pred3.min():.3f}, Max: {pred3.max():.3f}")

# Ensemble of the three GB models
pred_ensemble = (pred1 + pred2 + pred3) / 3
print(f"\nEnsemble (raw) - Mean: {pred_ensemble.mean():.3f}, Std: {pred_ensemble.std():.3f}")

# Wide clipping range
pred_clipped = np.clip(pred_ensemble, 0.3, 1.8)
print(f"After clipping [0.3, 1.8] - Mean: {pred_clipped.mean():.3f}, Std: {pred_clipped.std():.3f}")

# Minimal smoothing
pred_final = 0.95 * pred_clipped + 0.05 * 1.0
print(f"After smoothing (95/5) - Mean: {pred_final.mean():.3f}, Std: {pred_final.std():.3f}")

def modified_sharpe_ratio(strategy_returns, market_returns):
    if len(strategy_returns) == 0:
        return 0
    
    excess = strategy_returns.mean() - market_returns.mean()
    strat_vol = strategy_returns.std()
    mkt_vol = market_returns.std()
    
    if strat_vol <= 0:
        return 0
    
    sharpe = excess / strat_vol
    
    vol_ratio = strat_vol / mkt_vol if mkt_vol > 0 else 1
    if vol_ratio > 1.2:
        penalty = (vol_ratio - 1.2) * 0.5
        sharpe *= (1 - penalty)
    
    return sharpe

# Test different versions
market_returns = val_df['market_forward_excess_returns'].values

score1 = modified_sharpe_ratio(pred1 * market_returns, market_returns)
score2 = modified_sharpe_ratio(pred2 * market_returns, market_returns)
score3 = modified_sharpe_ratio(pred3 * market_returns, market_returns)
score_ensemble = modified_sharpe_ratio(pred_ensemble * market_returns, market_returns)
score_clipped = modified_sharpe_ratio(pred_clipped * market_returns, market_returns)
score_final = modified_sharpe_ratio(pred_final * market_returns, market_returns)
baseline = modified_sharpe_ratio(market_returns, market_returns)

print("\n" + "="*60)
print("VALIDATION SCORES")
print("="*60)
print(f"GB1 (raw):            {score1:.4f}")
print(f"GB2 (raw):            {score2:.4f}")
print(f"GB3 (raw):            {score3:.4f}")
print(f"Ensemble (raw):       {score_ensemble:.4f}")
print(f"After clipping:       {score_clipped:.4f}")
print(f"After smoothing:      {score_final:.4f}")
print(f"Baseline (1.0):       {baseline:.4f}")
print("="*60)

# Distribution analysis
print(f"\nFinal Allocation Distribution:")
print(f"  Mean: {pred_final.mean():.3f}")
print(f"  Std:  {pred_final.std():.3f}")
print(f"  Min:  {pred_final.min():.3f}")
print(f"  Max:  {pred_final.max():.3f}")

bins = [0, 0.5, 0.8, 0.95, 1.05, 1.2, 1.5, 2.0]
print(f"\nAllocation bins:")
print(pd.cut(pred_final, bins=bins).value_counts().sort_index())

# Balance check
below = (pred_final < 1.0).sum()
above = (pred_final > 1.0).sum()
print(f"\nPrediction balance: {below} below 1.0 ({below/len(pred_final)*100:.1f}%), {above} above 1.0 ({above/len(pred_final)*100:.1f}%)")


RAW PREDICTIONS
GB1 - Mean: 0.942, Std: 0.132, Min: 0.520, Max: 1.541
GB2 - Mean: 0.966, Std: 0.107, Min: 0.623, Max: 1.693
GB3 - Mean: 0.953, Std: 0.140, Min: 0.502, Max: 1.666

Ensemble (raw) - Mean: 0.954, Std: 0.116
After clipping [0.3, 1.8] - Mean: 0.954, Std: 0.116
After smoothing (95/5) - Mean: 0.956, Std: 0.111

VALIDATION SCORES
GB1 (raw):            0.0023
GB2 (raw):            0.0047
GB3 (raw):            0.0027
Ensemble (raw):       0.0032
After clipping:       0.0032
After smoothing:      0.0031
Baseline (1.0):       0.0000

Final Allocation Distribution:
  Mean: 0.956
  Std:  0.111
  Min:  0.644
  Max:  1.508

Allocation bins:
(0.0, 0.5]        0
(0.5, 0.8]       74
(0.8, 0.95]     662
(0.95, 1.05]    406
(1.05, 1.2]     195
(1.2, 1.5]       33
(1.5, 2.0]        1
Name: count, dtype: int64

Prediction balance: 992 below 1.0 (72.4%), 379 above 1.0 (27.6%)


# Train Model

In [9]:
print("\nTraining final models on all data...")

X_full = scaler.fit_transform(df[features].values)
y_full = df['target'].values

gb1_final = GradientBoostingRegressor(
    n_estimators=250, max_depth=8, min_samples_leaf=3,
    learning_rate=0.06, subsample=0.8, max_features='sqrt', random_state=42
)
gb1_final.fit(X_full, y_full)

gb2_final = GradientBoostingRegressor(
    n_estimators=300, max_depth=6, min_samples_leaf=5,
    learning_rate=0.04, subsample=0.85, max_features='sqrt', random_state=123
)
gb2_final.fit(X_full, y_full)

gb3_final = GradientBoostingRegressor(
    n_estimators=200, max_depth=7, min_samples_leaf=4,
    learning_rate=0.07, subsample=0.8, max_features='sqrt', random_state=456
)
gb3_final.fit(X_full, y_full)

print("Final models ready!")


Training final models on all data...
Final models ready!


# Prediction

In [10]:
def predict(test: pl.DataFrame) -> float:
    try:
        test_pd = test.to_pandas()
        
        feats = []
        for col in features:
            if col in test_pd.columns:
                val = test_pd[col].iloc[0]
                feats.append(val if not pd.isna(val) else 0)
            else:
                feats.append(0)
        
        feats_scaled = scaler.transform(np.array(feats).reshape(1, -1))
        
        # Ensemble of three GB models
        pred1 = gb1_final.predict(feats_scaled)[0]
        pred2 = gb2_final.predict(feats_scaled)[0]
        pred3 = gb3_final.predict(feats_scaled)[0]
        
        allocation = (pred1 + pred2 + pred3) / 3
        
        # Wide clipping
        allocation = np.clip(allocation, 0.3, 1.8)
        
        # Minimal smoothing
        allocation = 0.95 * allocation + 0.05 * 1.0
        
        return float(np.clip(allocation, 0, 2))
    
    except Exception as e:
        return 1.0

In [11]:
print("\nSetting up inference server...")

inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    print("Running in competition mode...")
    inference_server.serve()
else:
    print("Running in local test mode...")
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))


Setting up inference server...
Running in local test mode...
