In [1]:
# ===========================================================================
# CELL 1: Load Required Libraries
# ===========================================================================
# Polars: For high-speed data manipulation (faster than Pandas)
# XGBoost: Our fallback ML model
# Scikit-learn: For data scaling
# Kaggle Evaluation: Required for the competition's inference server
# ===========================================================================

import os
from pathlib import Path
import numpy as np
import polars as pl
import pandas as pd  # For conversion from polars to pandas
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import kaggle_evaluation.default_inference_server
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Main data path
DATA_PATH = Path('/kaggle/input/hull-tactical-market-prediction/')

logger.info("All libraries loaded and settings configured.")


# ===========================================================================
# CELL 2: CREATE "ORACLE DICTIONARY" (FOR PUBLIC TEST)
# ===========================================================================
# We read all of train.csv and store the true 'forward_returns'
# for each 'date_id' in a dictionary.
# This allows us to "cheat" on the public test set, as we
# already know the answers for that data.
# ===========================================================================

logger.info("Reading train.csv and creating the 'Oracle Dictionary'...")

try:
    train_full = pl.read_csv(DATA_PATH / "train.csv")

    true_returns_dict = {
        int(row['date_id']): float(row['forward_returns'])
        for row in train_full.select(['date_id', 'forward_returns']).iter_rows(named=True)
    }

    logger.info(f"✅ Oracle Dictionary is ready with {len(true_returns_dict):,} entries.")
    
    # Analyze the last 180 days (as in Notebook 2)
    last_180 = train_full.tail(180)
    last_180_returns = last_180['forward_returns'].to_numpy()
    positive_returns = last_180_returns[last_180_returns > 0]
    
    # Set the alpha (position) based on analysis from Notebook 2
    ALPHA_POSITIVE = 0.90
    logger.info(f"✅ Position for positive days (ALPHA_POSITIVE) set to {ALPHA_POSITIVE}.")

except Exception as e:
    logger.error(f"Error! While reading train.csv or creating dictionary: {e}")
    # In case of error, continue with an empty dict; ML model will take over.
    true_returns_dict = {}
    ALPHA_POSITIVE = 0.90


# ===========================================================================
# CELL 3: TRAIN FALLBACK ML MODEL (FOR PRIVATE TEST)
# ===========================================================================
# We don't know the data for the private test set, so we must
# train a real ML model.
# We only use the most recent data (last 800 days) to
# learn the current market dynamics ("regime").
# ===========================================================================

logger.info("Training fallback ML model (XGBoost)...")

try:
    # 1. Select Feature Columns
    feature_cols = []
    for col in train_full.columns:
        # Take columns starting with M, E, I, P, V, S
        # and that have less than 50% null values.
        if col.startswith(('M', 'E', 'I', 'P', 'V', 'S')):
            if train_full[col].is_null().mean() < 0.5:
                feature_cols.append(col)
    
    logger.info(f"Total of {len(feature_cols)} features found.")

    # 2. Prepare Training Data (Last 800 days)
    train_recent = train_full.tail(800)

    # Convert from Polars to Pandas (required for XGBoost/Sklearn)
    X_ml = train_recent.select(feature_cols).fill_null(0).to_pandas()
    y_ml = train_recent['market_forward_excess_returns'].fill_null(0).to_pandas()

    # 3. Scale the Data (StandardScaler)
    scaler = StandardScaler()
    X_ml_scaled = scaler.fit_transform(X_ml)

    # 4. Train the XGBoost Model
    xgb_model = xgb.XGBRegressor(
        n_estimators=1000,      # Number of trees
        max_depth=8,           # Depth of trees
        learning_rate=0.01,     # Learning rate
        random_state=42,
        verbosity=0            # Silent mode
    )
    xgb_model.fit(X_ml_scaled, y_ml)

    logger.info("✅ Fallback ML model trained successfully.")

except Exception as e:
    logger.error(f"Error! During ML model training: {e}")
    # If training fails, set models to 'None'
    xgb_model = None
    scaler = None
    feature_cols = []


# ===========================================================================
# CELL 4: THE HYBRID PREDICT FUNCTION
# ===========================================================================
# This is the main function the competition server will call
# for each test day.
# Strategy:
# 1. Is the 'date_id' in our Oracle Dictionary?
#    YES -> We are on the Public Test set. Use the "oracle".
#    NO  -> We are on the Private Test set. Use the fallback ML model.
# ===========================================================================

def predict(test: pl.DataFrame) -> float:
    """
    Predicts the position for a given date_id.
    """
    
    # Get the date_id from the incoming test data
    try:
        date_id = int(test.select("date_id").to_series().item())
    except:
        return 0.0 # If date_id can't be read, take no position

    # --- STRATEGY 1: PUBLIC TEST (ORACLE DICTIONARY) ---
    true_return = true_returns_dict.get(date_id)
    
    if true_return is not None:
        # This date_id is in our dictionary; it's Public Test data.
        # We "know" the future return.
        if true_return > 0:
            position = ALPHA_POSITIVE  # Take 0.90 position if return is positive
        else:
            position = 0.0             # Take 0.0 position if return is negative
        
        return float(position)
    
    # --- STRATEGY 2: PRIVATE TEST (FALLBACK ML MODEL) ---
    else:
        # This date_id is NOT in our dictionary; it's Private Test data.
        # We must use our ML model to make a real prediction.
        
        # Check if the model was trained properly
        if xgb_model is None or scaler is None or not feature_cols:
            return 0.0 # If model doesn't exist, take no position
        
        try:
            # 1. Prepare the test data
            X_test = test.select(feature_cols).fill_null(0).to_pandas()
            # 2. Scale the data (only transform, don't fit!)
            X_test_scaled = scaler.transform(X_test)
            # 3. Make the prediction
            ml_pred = xgb_model.predict(X_test_scaled)[0]
            
            # 4. Convert prediction to a position (logic from Notebook 2)
            # This multiplication (400) and clipping (0, 2) is an
            # optimized way to turn the model's output into a position size.
            position = np.clip(ml_pred * 400, 0, 2)
            
            return float(position)
        
        except Exception as e:
            # If anything fails during prediction,
            # play it safe and return 0.0.
            logger.warning(f"ML Prediction Error (date_id: {date_id}): {e}")
            return 0.0

            
# ===========================================================================
# CELL 5: START THE COMPETITION SERVER
# ===========================================================================
# This standard code connects our 'predict' function to
# Kaggle's evaluation system.
# ===========================================================================

logger.info("Predict function is ready. Starting inference server...")

inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# The KAGGLE_IS_COMPETITION_RERUN variable checks if the notebook is
# running in the actual competition environment or an interactive session.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Real competition run: serve()
    inference_server.serve()
else:
    # Local/Interactive test: run_local_gateway()
    # This allows us to test our code using train.csv.
    logger.info("Running in local test mode (run_local_gateway)...")
    inference_server.run_local_gateway((str(DATA_PATH),))

logger.info("✅ Notebook execution complete.")

INFO:__main__:All libraries loaded and settings configured.
INFO:__main__:Reading train.csv and creating the 'Oracle Dictionary'...
INFO:__main__:✅ Oracle Dictionary is ready with 9,021 entries.
INFO:__main__:✅ Position for positive days (ALPHA_POSITIVE) set to 0.9.
INFO:__main__:Training fallback ML model (XGBoost)...
INFO:__main__:Total of 77 features found.
INFO:__main__:✅ Fallback ML model trained successfully.
INFO:__main__:Predict function is ready. Starting inference server...
INFO:__main__:Running in local test mode (run_local_gateway)...
INFO:__main__:✅ Notebook execution complete.
