In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hull-tactical-market-prediction/train.csv
/kaggle/input/hull-tactical-market-prediction/test.csv
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_inference_server.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2_grpc.py
/kaggl

**Volatility of Volatility (VVOL)** 
VVOL stands for Volatility of Volatility. It's a second-order risk measure that gives your model a much deeper understanding of market stability.

**The Definition of VVOL**
To understand VVOL, we must first recall what volatility is: Volatility is the speed and magnitude of price changes—it measures risk (e.g., how much the price moves each day).

Volatility of Volatility (VVOL) measures how much the market's risk level itself is fluctuating.

Think of it this way:

Volatility is the speed you're driving (your daily risk).

VVOL is how hard and fast you are pressing the accelerator and the brake (how quickly your risk level is changing).

A high VVOL means the market is unstable because its own risk profile is changing rapidly.

For the LightGBM model, VVOL features provide crucial context that traditional momentum features (M1, E1) cannot offer. It helps the model distinguish between a steady market trend and a frantic, short-lived market panic.How We Implement VVOL in the CodeSince the Hull dataset already provides features like V1 and V2 (which represent volatility), we don't need to calculate volatility from raw prices. We just need to calculate the volatility of those volatility features.To implement VVOL in the preprocess_data function, we take a rolling standard deviation of the existing volatility features:$$\text{VVOL} = \text{RollingStdDev}(\text{V1})$$For example**, we would add features like V1_roll_std_5 and V2_roll_std_10 to capture this concept.

In [2]:
import os
import polars as pl
import numpy as np
import logging
from typing import List
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb 

In [3]:
# --- Kaggle Evaluation API Interface (Mandatory) ---
try:
    import kaggle_evaluation.default_inference_server as inference_server
except ImportError:
    class MockInferenceServer:
        def __init__(self, predict_fn):
            self.predict_fn = predict_fn
        def serve(self):
            logging.info("Mock Inference Server Running...")
        def run_local_gateway(self, path):
            logging.info(f"Mock Local Gateway Running with path: {path}")
    inference_server = MockInferenceServer

# --- Global Model Variables ---
MODEL = None
TRAIN_COLS = None
TRAIN_MEANS = None

logging.basicConfig(level=logging.INFO)

# --- Feature Engineering and Selection ---

def preprocess_data(df: pl.DataFrame, is_training: bool = False) -> pd.DataFrame:
    """
    Standard preprocessing for the financial time series data, including
    the creation of interaction features, EMAs, rolling means/stdevs, and lags.
    
    This version includes Cross-Asset Spreads and VVOL (Volatility of Volatility).
    """
    
    # --- Polars Feature Engineering (Time Series Features) ---
    if 'date_id' in df.columns:
        df = df.with_columns(
            (pl.col('date_id') % 5).alias('day_of_cycle') # Proxy for Day-of-Week
        )
    
    ROLLING_WINDOWS = [5, 10, 20]
    BASE_FEATURES = ['M1', 'E1', 'V1', 'S1', 'T1', 'P1', 'D1']

    expressions = []
    
    # 1. Simple Lag Features for Momentum
    LAG_WINDOWS = [1, 5]
    for lag in LAG_WINDOWS:
        for col in BASE_FEATURES:
            if col in df.columns:
                expressions.append(
                    pl.col(col).shift(lag).alias(f'{col}_lag_{lag}')
                )
    
    # 2. Rolling Mean and Std Dev Features 
    # NOTE: The Rolling Std Dev on V1 and V2 automatically creates the VVOL features.
    for window in ROLLING_WINDOWS:
        for col in BASE_FEATURES:
            if col in df.columns:
                expressions.append(
                    pl.col(col).rolling_mean(window_size=window, min_samples=1).alias(f'{col}_roll_mean_{window}')
                )
                
                # --- VVOL Features (Volatility of Volatility) ---
                # When col is 'V1' or 'V2', this creates the VVOL feature.
                expressions.append(
                    pl.col(col).rolling_std(window_size=window, min_samples=1).alias(f'{col}_roll_std_{window}')
                )

    if expressions:
        df = df.with_columns(expressions)
        
    # --- Convert to Pandas and create Interaction and EMA Features ---
    pdf = df.to_pandas()
    
    # 3. Exponential Moving Averages (EMAs - Better for Financial Momentum)
    EMA_WINDOWS = [10, 30, 60]
    for window in EMA_WINDOWS:
        for col in BASE_FEATURES:
            if col in pdf.columns:
                pdf[f'{col}_ema_{window}'] = pdf[col].ewm(span=window, adjust=False).mean()
    
    # 4. Standard Interaction Features (Ratios and Differences)
    FEATURE_PAIRS = [('M1', 'M2'), ('E1', 'E2'), ('V1', 'V2')]
    for col1, col2 in FEATURE_PAIRS:
        if col1 in pdf.columns and col2 in pdf.columns:
            pdf[f'{col1}_div_{col2}'] = pdf[col1] / (pdf[col2].replace(0, 1e-6) + 1e-6)
            pdf[f'{col1}_minus_{col2}'] = pdf[col1] - pdf[col2]
            
    # 5. Cross-Asset Spread Features
    
    # Risk-Off Proxy: Market Return (M1) vs. Volatility (V1)
    if 'M1' in pdf.columns and 'V1' in pdf.columns:
        pdf['M1_div_V1_spread'] = pdf['M1'] / (pdf['V1'].replace(0, 1e-6) + 1e-6)

    # Credit Strength: Credit (D1) vs. Equity (E1)
    if 'D1' in pdf.columns and 'E1' in pdf.columns:
        pdf['D1_div_E1_spread'] = pdf['D1'] / (pdf[col1].replace(0, 1e-6) + 1e-6)
        
    # Relative Momentum: Small Cap (S1) vs. Private Equity Proxy (P1)
    if 'S1' in pdf.columns and 'P1' in pdf.columns:
        pdf['S1_minus_P1_spread'] = pdf['S1'] - pdf['P1']
            
    # Drop non-feature columns that are not needed for the ML pipeline
    EXCLUDE_FINAL_COLS = ['date_id', 'forward_returns', 'risk_free_rate',
                          'market_forward_excess_returns', 'is_scored',
                          'lagged_forward_returns', 'lagged_risk_free_rate',
                          'lagged_market_forward_excess_returns']
                          
    final_cols = [col for col in pdf.columns if col not in EXCLUDE_FINAL_COLS]
    
    return pdf[final_cols]

# --- Core Prediction and Allocation Logic ---

def train_model(train_df: pl.DataFrame):
    """
    Trains a LightGBM Regressor to predict the magnitude of the excess return.
    Includes hyperparameter tuning for better score.
    """
    global MODEL, TRAIN_COLS, TRAIN_MEANS
    
    y_train = train_df['market_forward_excess_returns'].to_numpy()
    
    X_train_pd = preprocess_data(train_df, is_training=True)
    
    TRAIN_COLS = list(X_train_pd.columns)
    
    TRAIN_MEANS = X_train_pd.mean()
    X_train_pd = X_train_pd.fillna(TRAIN_MEANS)
    
    regressor = lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        n_estimators=7500,           # Increased for better convergence
        learning_rate=0.003,         # Slightly lower for more stable training
        max_depth=-1,
        num_leaves=1023,               
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=4,
        reg_lambda=1.0,                
        min_child_samples=15,
        boosting_type='gbdt'
    )
    
    MODEL = regressor
    
    logging.info(f"Starting LightGBM REGRESSION model training on {len(X_train_pd)} samples with {len(TRAIN_COLS)} features...")
    MODEL.fit(X_train_pd, y_train)
    logging.info("Model training complete.")

def convert_prediction_to_allocation(predicted_value: float, volatility: float) -> float:
    """
    Converts the model's predicted excess return into an allocation size,
    adjusted inversely by the current market volatility (risk-adjusted allocation).
    """
    
    PREDICTION_SCALER = 50.0 
    MIN_VOL = 1e-5
    
    edge = predicted_value * PREDICTION_SCALER
    
    # VOLATILITY ADJUSTMENT
    volatility_adjusted_edge = edge / max(volatility, MIN_VOL) 
    
    scaled_edge = np.tanh(volatility_adjusted_edge)
    
    final_allocation = 1.0 + scaled_edge
    
    final_allocation = np.clip(final_allocation, 0.0, 2.0)
    
    return float(final_allocation)

# --- The Required Kaggle Inference Function ---

def predict(test: pl.DataFrame) -> float:
    """
    The main inference function called by the Kaggle evaluation API for each timestep.
    """
    global MODEL, TRAIN_COLS, TRAIN_MEANS
    
    is_mock_run = not os.getenv('KAGGLE_IS_COMPETITION_RERUN')
    default_return = (1.0, 0.0, 0.0) if is_mock_run else 1.0
    
    if MODEL is None:
        train_path = os.path.join('/kaggle/input/hull-tactical-market-prediction/', 'train.csv')
        try:
            train_df = pl.read_csv(train_path, try_parse_dates=True, infer_schema_length=100000)
        except Exception as e:
            logging.error(f"Could not load train.csv: {e}. Returning neutral allocation.")
            return default_return

        train_model(train_df)
        
        if MODEL is None:
            logging.error("Model training failed. Returning neutral allocation.")
            return default_return

    try:
        X_test_pd = preprocess_data(test)
        missing_cols = set(TRAIN_COLS) - set(X_test_pd.columns)
        for col in missing_cols:
            X_test_pd[col] = np.nan
            
        X_test_pd = X_test_pd[TRAIN_COLS]
        
        if TRAIN_MEANS is not None:
              X_test_pd = X_test_pd.fillna(TRAIN_MEANS)
        else:
              X_test_pd = X_test_pd.fillna(X_test_pd.mean())
        
    except Exception as e:
        logging.error(f"Feature selection failed in predict: {e}")
        return default_return

    # --- Volatility Extraction (RISK ADJUSTMENT PROXY) ---
    current_volatility = 1.0
    
    VOLATILITY_FEATURE = 'M1_roll_std_5'
    if VOLATILITY_FEATURE in X_test_pd.columns:
        vol = X_test_pd[VOLATILITY_FEATURE].iloc[-1]
        current_volatility = abs(vol)
    
    try:
        predicted_value = MODEL.predict(X_test_pd)[-1]
        
        final_allocation = convert_prediction_to_allocation(predicted_value, current_volatility)
        
        actual_return = test['market_forward_excess_returns'].to_numpy()[-1] if 'market_forward_excess_returns' in test.columns else 0.0
        
        if is_mock_run:
              return (final_allocation, predicted_value, actual_return)
        
        return final_allocation

    except Exception as e:
        logging.error(f"Inference failed: {e}. Returning neutral allocation.")
        return default_return

# --- Visualization Helpers (Saving to PNG) ---

def plot_results(results_df: pd.DataFrame):
    """Generates a plot of the predicted returns and allocations and saves it to a file."""
    
    fig, ax1 = plt.subplots(figsize=(12, 6))

    color = 'tab:blue'
    ax1.set_xlabel('Simulated Day')
    ax1.set_ylabel('Predicted Excess Return (Value)', color=color) 
    ax1.plot(results_df['day'], results_df['predicted_value'], color=color, label='Predicted Return', alpha=0.6)
    ax1.tick_params(axis='y', labelcolor=color)
    ax1.grid(True, linestyle='--', alpha=0.5)
    ax1.axhline(0.0, color='gray', linestyle='--', label='Neutral (0.0)')
    ax1.legend(loc='upper left')

    ax2 = ax1.twinx()  
    color = 'tab:red'
    ax2.set_ylabel('Final Allocation (0.0 to 2.0)', color=color)  
    ax2.plot(results_df['day'], results_df['allocation'], color=color, label='Volatility-Adjusted Allocation', linewidth=1.5, alpha=0.8)
    ax2.tick_params(axis='y', labelcolor=color)
    ax2.axhline(1.0, color='darkred', linestyle=':', label='Neutral (1.0)')
    ax2.legend(loc='upper right')

    fig.suptitle(f'Prediction and Volatility-Adjusted Allocation', fontsize=14)
    fig.tight_layout()
    plt.savefig('results_plot.png')
    plt.close(fig)

def plot_monthly_comparison(monthly_df: pd.DataFrame):
    """Generates a bar plot comparing aggregated returns and saves it to a file."""
    fig, ax = plt.subplots(figsize=(12, 6))
    
    bar_width = 0.4
    x = np.arange(len(monthly_df))

    ax.bar(x - bar_width/2, monthly_df['actual_return_sum'], bar_width, 
           label='Mock Actual Return Sum', color='darkgreen', alpha=0.8)
    
    ax.bar(x + bar_width/2, monthly_df['predicted_value_sum'], bar_width, 
           label='Predicted Return Sum', color='tab:blue', alpha=0.8)

    ax.set_title('Monthly Predicted vs. Mock Actual Excess Returns', fontsize=14)
    ax.set_xlabel('Simulated Month', fontsize=12)
    ax.set_ylabel('Sum of Excess Returns', fontsize=12)
    ax.set_xticks(x)
    ax.set_xticklabels(monthly_df['month'])
    ax.legend()
    ax.grid(axis='y', linestyle='--', alpha=0.6)
    ax.axhline(0, color='black', linewidth=0.8)

    plt.tight_layout()
    plt.savefig('monthly_comparison.png')
    plt.close(fig)

def plot_feature_timeseries(feature_pdf: pd.DataFrame, num_samples: int = 500):
    """Plots the time series of the new Cross-Asset Spread and VVOL features."""
    
    # Use only the first N samples for a clearer time series plot
    pdf_sample = feature_pdf.head(num_samples) 
    
    features_to_plot = [
        'M1_div_V1_spread',    # Cross-Asset Spread (Risk Proxy)
        'D1_div_E1_spread',    # Cross-Asset Spread (Credit Strength)
        'V1_roll_std_10',      # VVOL (Volatility of M1 Volatility)
        'V2_roll_std_10'       # VVOL (Volatility of E1/M1 Volatility)
    ]
    
    # Filter for features that actually exist in the DataFrame
    existing_features = [f for f in features_to_plot if f in pdf_sample.columns]
    
    if not existing_features:
        logging.warning("No new engineered features found to plot.")
        return

    fig, axes = plt.subplots(len(existing_features), 1, figsize=(12, 12), sharex=True)
    
    # Ensure axes is an array even for a single subplot
    if len(existing_features) == 1:
        axes = [axes]
        
    fig.suptitle('Time Series of Engineered Features (Cross-Asset Spreads & VVOL)', fontsize=16)

    for i, feature in enumerate(existing_features):
        ax = axes[i]
        # Use a simple index for time
        ax.plot(pdf_sample.index, pdf_sample[feature], label=feature, linewidth=1.5, alpha=0.7)
        ax.set_ylabel(feature, fontsize=10)
        ax.grid(True, linestyle=':', alpha=0.5)
        ax.axhline(pdf_sample[feature].mean(), color='orange', linestyle=':', linewidth=0.5, label='Mean')
        ax.legend(loc='upper right')
        
    axes[-1].set_xlabel(f'Sample Index (approx. {num_samples} Days)', fontsize=12)
    plt.tight_layout(rect=[0, 0, 1, 0.96]) # Adjust layout for suptitle
    plt.savefig('feature_timeseries_vvol_plot.png')
    plt.close(fig)


# --- Mock Test Runner ---
def run_mock_test_and_visualize():
    """Simulates 1825 days of inference for visualization."""
    
    MOCK_FEATURES = [
        'D1', 'D2', 'E1', 'E2', 'V1', 'V2', 'S1', 'S2', 'M1', 'M2', 'T1', 'T2', 'P1', 'P2'
    ]
    NUM_SIMULATION_DAYS = 1000 # Increased length for better plot viewing
    
    # 1. GENERATE MOCK TRAINING DATA and Features for plotting
    mock_train_data = {
        c: np.random.uniform(0.1, 0.9, NUM_SIMULATION_DAYS + 100) # Need extra days for rolling features
        for c in MOCK_FEATURES
    }
    mock_train_data['market_forward_excess_returns'] = np.random.uniform(-0.005, 0.005, NUM_SIMULATION_DAYS + 100)
    mock_train_data['date_id'] = np.arange(1, NUM_SIMULATION_DAYS + 101)
    mock_train_df = pl.DataFrame(mock_train_data)
    
    # 2. Train the model (uses the first N rows for training)
    _ = predict(mock_train_df.head(100)) # Call predict to trigger training
    
    # 3. Preprocess the mock data for feature visualization
    X_train_processed = preprocess_data(mock_train_df)
    plot_feature_timeseries(X_train_processed)
    logging.info("Feature Time Series Plot saved to 'feature_timeseries_vvol_plot.png'.")

    # 4. Run the simulation
    results = []
    logging.info(f"Running {NUM_SIMULATION_DAYS} day mock inference simulation...")
    
    for day in range(NUM_SIMULATION_DAYS):
        # We use a sliding window of the mock training data for simulation
        test_data_slice = mock_train_df.slice(day, 100) # Use 100 days history for rolling calculation
        
        allocation, predicted_value, actual_return = predict(test_data_slice)
        
        results.append({
            'day': day + 1,
            'predicted_value': predicted_value, 
            'allocation': allocation,
            'actual_return': actual_return
        })

    # 5. Process and Plot Results
    results_df = pd.DataFrame(results)
    
    results_df['month'] = ((results_df['day'] - 1) // 30) + 1
    monthly_df = results_df.groupby('month').agg(
        predicted_value_sum=('predicted_value', 'sum'),
        actual_return_sum=('actual_return', 'sum')
    ).reset_index()
    
    logging.info(f"\n--- MOCK TEST SIMULATION SUMMARY ({NUM_SIMULATION_DAYS} Days) ---")
    logging.info(f"Mean Predicted Excess Return: {results_df['predicted_value'].mean():.6f}")
    logging.info(f"Mean Final Allocation: {results_df['allocation'].mean():.4f}")
    
    # Generate and save the plots
    plot_results(results_df)
    plot_monthly_comparison(monthly_df)
    
    logging.info("\n--- PLOT GENERATION COMPLETE ---")
    logging.info("The graphs have been saved to 'results_plot.png' (Allocation), 'monthly_comparison.png', and 'feature_timeseries_vvol_plot.png'.")

# --- Main Execution Block for Kaggle ---

inference_server_instance = inference_server.DefaultInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server_instance.serve()
else:
    logging.info("Running local gateway for testing.")
    try:
        local_input_path = os.path.join(os.getcwd(), 'kaggle_input/hull-tactical-market-prediction/')
        inference_server_instance.run_local_gateway((local_input_path,))
    except Exception as e:
        # Run the expanded mock test and visualization
        run_mock_test_and_visualize()
        
        # --- Mandatory Dummy Submission File Generation for Kaggle System Check ---
        logging.info("Generating dummy submission.parquet for Kaggle system check.")
        dummy_submission = pl.DataFrame({
            'date_id': [999], 
            'allocation': [1.0] 
        })
        dummy_submission.write_parquet('submission.parquet')
        logging.info("submission.parquet created successfully.")

INFO:root:Running local gateway for testing.
INFO:root:Starting LightGBM REGRESSION model training on 9021 samples with 170 features...


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010943 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38479
[LightGBM] [Info] Number of data points in the train set: 9021, number of used features: 170
[LightGBM] [Info] Start training from score 0.000053


INFO:root:Model training complete.
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
INFO:root:Feature Time Series Plot saved to 'feature_timeseries_vvol_plot.png'.
INFO:root:Running 1000 day mock inference simulation...
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.nan
  X_test_pd[col] = np.n