# Base Configuration

**Imports**

In [1]:
import os
import joblib
from pathlib import Path

import polars as pl
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.base import clone
from sklearn.metrics import mean_squared_error, r2_score
from typing import Tuple, List, Dict, Any

# Required for local testing/submission environment
import kaggle_evaluation.default_inference_server

class ParticipantVisibleError(Exception):
    pass

**Project Directory Structure**

*(From template)*

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/hull-tactical-market-prediction/train.csv
/kaggle/input/hull-tactical-market-prediction/test.csv
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_inference_server.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2_grpc.py
/kaggl

**Configuration**

In [3]:
# ============ PATHS ============
DATA_PATH: Path = Path('/kaggle/input/hull-tactical-market-prediction/')
MODEL_PATH = '/tmp/model_data.joblib'
TARGET_COL = 'market_forward_excess_returns'
EXCLUDE_COLS = ['date_id', 'forward_returns', 'risk_free_rate',TARGET_COL]
TARGET_COL_TEST = 'lagged_market_forward_excess_returns'
EXCLUDE_COLS_TEST = ['date_id', 'is_scored','lagged_forward_returns','lagged_risk_free_rate',TARGET_COL_TEST]

# ========== FILL NA CONFIG ==============
EWMA_SPAN: int = 20

# ================ THRESHOLD & SWEEP OPTIONS (MODIFIED) ===================
PI_SCORER_THRESHOLD = 0.001               # PI Threshold for ASR SCORER 
TOP_K_OPTIONS = [3, 5, 8, 10, 15]         # Feature selection by count
K_POS_OPTIONS = [10, 25, 50, 75, 100, 150, 200]      # Tanh signal optimization for (prediction>0)
K_NEG_OPTIONS = [500, 1000, 1500, 2000]              # Tanh signal optimization for (prediction<0)
MAX_ITER_OPTIONS = [500, 750, 1000, 1250, 1500, 2000]  #HGBR Max iteration sweeping option

# ============ MODEL CONFIGS ============
RANDOM_STATE = 42 
HGBR_LEARNING_RATE: float = 0.05
HGBR_MAX_DEPTH: int = 5 
HGBR_L2_REG: float = 0.1
HGBR_EARLY_STOPPING: bool = False 

# ============ RETURNS TO SIGNAL CONFIGS ============
MIN_SIGNAL: float = 0.0                         # Minimum value for submission 
MAX_SIGNAL: float = 2.0                         # Maximum value for submission

# Helper Functions

**Cleaning and Imputation for Training Data**

1. Delete features with a lot of measing values
2. Fill missing values with EWMA (and mean in the case where the first few rows are NA)

In [4]:
def clean_and_impute_data_training(df: pl.DataFrame):
    """
    Handles missing values: drops columns with >30% missing, then imputes via EWMA/Mean.
    """
    global EXCLUDE_COLS
    
    print("==================================================")
    print("DEBUG: Starting data cleaning and imputation for TRAINING")
    print(f"Initial Polars DataFrame shape: {df.shape}")
    print(f"Columns excluded from imputation (metadata/target): {EXCLUDE_COLS}")
    print("--------------------------------------------------")
    initial_rows = df.shape[0]

    missing_ratio = df.select([pl.col(c).is_null().mean()for c in df.columns if c not in EXCLUDE_COLS])
    missing_dict = missing_ratio.to_dicts()[0]
    too_missing = [col for col, ratio in missing_dict.items() if ratio > 0.3]
    df = df.drop(too_missing)

    print(f"DEBUG: Highly missing columns dropped: {len(too_missing)}")
    print(f"DEBUG: List of highly missing columns dropped: {too_missing}")
    print(f"DEBUG: Shape after column drop: {df.shape}")
    print("--------------------------------------------------")

    filter_condition = pl.all_horizontal([pl.col(c).is_not_null() for c in EXCLUDE_COLS])
    df = df.filter(filter_condition)
    rows_dropped = initial_rows - df.shape[0]
    print(f"DEBUG: Rows dropped due to NaN in critical columns: {rows_dropped}")
    print(f"DEBUG: Shape after row drop: {df.shape}")
    print("--------------------------------------------------")
    
    all_feature_columns = [col for col in df.columns if col not in EXCLUDE_COLS]
    print(f"DEBUG: Total features to be imputed/used by model: {len(all_feature_columns)}")
    print("--------------------------------------------------")

    df = df.with_columns([pl.col(col).cast(pl.Float64) for col in all_feature_columns])
    print("DEBUG: Features successfully cast to Float64.")

    if 'date_id' in df.columns:
        df = df.sort('date_id')
        print("DEBUG: DataFrame sorted by 'date_id' for EWMA calculation.")
    else:
        print("WARNING: 'date_id' not found. Assuming DataFrame is already sorted for EWMA.")
    
    df = df.with_columns(
        [
            pl.col(col).ewm_mean(span = EWMA_SPAN)
            .forward_fill()
            .fill_null(pl.col(col).mean())
            for col in all_feature_columns
        ]
    )

    print("DEBUG: Imputation complete. All remaining NaNs in feature columns filled.")
    print("==================================================")
    return df, all_feature_columns

**Imputation for test data**

*(for training purpose only)*

1. Filling NA on test data with EWMA mean
2. If (first few rows) are NA, fill with mean

In [5]:
def clean_and_impute_data_test_data(df: pl.DataFrame, feature_cols: List[str]):
    """
    Handles missing values: imputes via EWMA/Mean.
    """
    print("==================================================")
    print("DEBUG: Starting data cleaning and imputation for TESTING")

    if 'date_id' in df.columns:
        df = df.sort('date_id')
        print("DEBUG: DataFrame sorted by 'date_id' for EWMA calculation.")
    else:
        print("WARNING: 'date_id' not found. Assuming DataFrame is already sorted for EWMA.")
        
    df = df.with_columns(
        [
            pl.col(col).cast(pl.Float64).ewm_mean(span = EWMA_SPAN)
            .forward_fill()
            .fill_null(pl.col(col).mean())
            for col in feature_cols
        ]
    )

    print("DEBUG: Imputation complete. All remaining NaNs in feature columns filled.")
    print("==================================================")
    return df, feature_cols

**Load Training Set**

1. Load from csv
2. Clean and impute data (call function)

In [6]:
def load_trainset() -> pd.DataFrame:
    """
    Loads, cleans, and returns the training data as a Pandas DataFrame for sklearn.
    """
    print("==================================================")
    print("DEBUG: Starting training data loading process")
    print("--------------------------------------------------")

    # Assuming DATA_PATH is defined
    print ("DEBUG: Reading training data from CSV...")
    try:
        train_pl = pl.read_csv(os.path.join(DATA_PATH, 'train.csv'))
        print(f"DEBUG: Initial Polars DataFrame shape: {train_pl.shape}")
    except Exception as e:
        print(f"ERROR: Failed to load training data: {e}")
        return pd.DataFrame()
    print("--------------------------------------------------")

    required_cols = EXCLUDE_COLS 
    print(f"DEBUG: Required columns for metadata/target: {required_cols}")

    train_pl, _ = clean_and_impute_data_training(train_pl)
    
    print("DEBUG: Converting Polars DataFrame to Pandas...")
    train_pd = train_pl.to_pandas()
    
    print("--------------------------------------------------")
    print(f"DEBUG: Final Pandas DataFrame shape: {train_pd.shape}")
    print("DEBUG: Training data loading and preparation complete.")
    print("==================================================")
    
    return train_pd

**Load Test Set**

1. Load from csv
2. Only get features based on feature selection in training phase
3. Clean input (call function)

In [7]:
def load_testset(features: List[str]) -> pl.DataFrame:
    """
    Loads and preprocesses the testing dataset for inference. 
    """
    print("==================================================")
    print("DEBUG: Starting test data loading process for inference")
    print("--------------------------------------------------")

    print ("DEBUG: Reading test data from CSV...")
    try:
        test_pl = pl.read_csv(os.path.join(DATA_PATH, 'test.csv'))
        print(f"DEBUG: Initial Polars DataFrame shape: {test_pl.shape}")
    except Exception as e:
        print(f"ERROR: Failed to load test data: {e}")
        return pl.DataFrame()
    print("--------------------------------------------------")

    test_pl, _ = clean_and_impute_data_test_data(test_pl, features) 
    
    print("DEBUG: Test data cleaning and imputation complete.")
    print("--------------------------------------------------")
    print(f"DEBUG: Final Test DataFrame shape: {test_pl.shape}")
    print("DEBUG: Test data preparation complete.")
    print("==================================================")

    return test_pl

**Sharpe Scoring**

*from template / Competition Page*

In [8]:
def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Calculates a custom evaluation metric (volatility-adjusted Sharpe ratio).

    This metric penalizes strategies that take on significantly more volatility
    than the underlying market.

    Returns:
        float: The calculated adjusted Sharpe ratio.
    """

    if not pd.api.types.is_numeric_dtype(submission['prediction']):
        raise ParticipantVisibleError('Predictions must be numeric')

    solution = solution.copy() # Need a copy when modifying, especially in PI scorer
    solution['position'] = submission['prediction']

    if solution['position'].max() > MAX_SIGNAL:
        # In a real competition, this raises an error. For local scoring, we just proceed or clamp.
        # Since the submission uses np.clip, this is primarily for validation/safety.
        pass 
    if solution['position'].min() < MIN_SIGNAL:
        pass

    solution['strategy_returns'] = solution['risk_free_rate'] * (1 - solution['position']) + solution['position'] * solution['forward_returns']

    # Calculate strategy's Sharpe ratio
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    
    # Use product method for compounded mean excess return calculation
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    strategy_mean_excess_return = (strategy_excess_cumulative) ** (1 / len(solution)) - 1
    strategy_std = solution['strategy_returns'].std()

    trading_days_per_yr = 252
    if strategy_std == 0:
        return 0.0
    sharpe = strategy_mean_excess_return / strategy_std * np.sqrt(trading_days_per_yr)
    strategy_volatility = float(strategy_std * np.sqrt(trading_days_per_yr) * 100)

    # Calculate market return and volatility
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = (market_excess_cumulative) ** (1 / len(solution)) - 1
    market_std = solution['forward_returns'].std()

    market_volatility = float(market_std * np.sqrt(trading_days_per_yr) * 100)

    if market_volatility == 0:
        return 0.0

    # Calculate the volatility penalty
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
    vol_penalty = 1 + excess_vol

    # Calculate the return penalty
    return_gap = max(
        0,
        (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr,
    )
    return_penalty = 1 + (return_gap**2) / 100

    # Adjust the Sharpe ratio by the volatility and return penalty
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    return min(float(adjusted_sharpe), 1_000_000)

# Model Training / Fitting

**Load Training Data**

In [9]:
train_df_pd = load_trainset()
features = [col for col in train_df_pd.columns if col not in EXCLUDE_COLS]
metadata_cols = [col for col in EXCLUDE_COLS if col != TARGET_COL]

X = train_df_pd.drop(columns=[TARGET_COL])
y = train_df_pd[TARGET_COL]

print(("=================================================="))
print("DEBUG: Performing time-series split (80/20, shuffle=False)...")
X_train_meta, X_val_meta, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2, 
    shuffle=False
)

X_train = X_train_meta[features]
X_val = X_val_meta[features]
print(f"DEBUG: Split complete. ")
print(f"\nTraining Data Shapes:")
print(f"  X_train (Features ONLY): {X_train.shape}")
print(f"  y_train (Target): {y_train.shape}")
print(f"Validation Data Shapes:")
print(f"  X_val (Features ONLY): {X_val.shape}")
print(f"  y_val (Target): {y_val.shape}")
print("==================================================")

DEBUG: Starting training data loading process
--------------------------------------------------
DEBUG: Reading training data from CSV...
DEBUG: Initial Polars DataFrame shape: (9021, 98)
--------------------------------------------------
DEBUG: Required columns for metadata/target: ['date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
DEBUG: Starting data cleaning and imputation for TRAINING
Initial Polars DataFrame shape: (9021, 98)
Columns excluded from imputation (metadata/target): ['date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
--------------------------------------------------
DEBUG: Highly missing columns dropped: 12
DEBUG: List of highly missing columns dropped: ['E7', 'M1', 'M13', 'M14', 'M2', 'M5', 'M6', 'S12', 'S3', 'S8', 'V10', 'V9']
DEBUG: Shape after column drop: (9021, 86)
--------------------------------------------------
DEBUG: Rows dropped due to NaN in critical columns: 0
DEBUG: Shape after row drop: (9021

**Choosing The Best Parameters for The Each Model**

In [10]:
def run_model_optimization(
    model_pipeline: Pipeline, 
    X_train_meta: pd.DataFrame,  
    y_train: pd.Series,  
    X_val_meta: pd.DataFrame, 
    y_val: pd.Series, 
    features: List[str] 
) -> Tuple[float, Dict[str, Any]]:
    """
    Runs the entire feature selection (Discrete ASR PI + Top K) and model training 
    pipeline, performs Tanh signal sweep, and returns the best performing model info.
    """
    X_train = X_train_meta[features].to_numpy()
    X_val = X_val_meta[features].to_numpy()
    y_val_np = y_val.to_numpy()

    best_overall_sharpe = -np.inf
    best_model_data = None
    
    # FIT BASE MODEL (Used for PI calculation)
    base_pipeline = model_pipeline
    base_pipeline.fit(X_train, y_train)

    # Prepare validation DF for sharpe ratio scoring 
    solution_val = pd.DataFrame({
        'date_id': X_val_meta['date_id'].values,
        'forward_returns': y_val.values,
        'risk_free_rate': X_val_meta['risk_free_rate'].values
    })
    
    # DEFINE DISCRETE ASR SCORER (Closure for Permutation Importance)
    def discrete_asr_scorer_closure(estimator, X_subset, y_subset):
        preds = estimator.predict(X_subset)
        
        # Apply Discrete Logic (Threshold = 0.001)
        # 0 if pred <= 0, 1 if 0 < pred <= PI_SCORER_THRESHOLD, 2 otherwise
        signals = np.where(preds <= 0, 0, np.where(preds <= PI_SCORER_THRESHOLD, 1, 2))
        
        sub_df = pd.DataFrame({'date_id': X_val_meta['date_id'], 'prediction': signals})
        return score(solution_val, sub_df, 'date_id')

    # CALCULATE PERMUTATION IMPORTANCE using Discrete ASR
    print(f"\n--- Calculating PI using Discrete ASR Scorer (Threshold: {PI_SCORER_THRESHOLD:.4f}) ---")
    r = permutation_importance(
        base_pipeline, 
        X_val, 
        y_val_np, 
        scoring=discrete_asr_scorer_closure, 
        n_repeats=10, 
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    feature_importances_mean = r.importances_mean
    
    if len(features) != len(feature_importances_mean):
         raise ValueError("The length of the 'features' list does not match the size of importances_mean.")
        
    imp = pd.DataFrame({
        "feature": features,
        "importance": feature_importances_mean
    }).sort_values("importance", ascending=False)
    
    print(f"DEBUG: Feature ranking complete. Top 5 features: {imp.head(5)['feature'].tolist()}")
    
    # SWEEP OVER TOP K FEATURES
    print(f"\n--- Starting Iteration over Top K Features: {TOP_K_OPTIONS} ---") 

    for k in TOP_K_OPTIONS:
        print(f"\n==================================================")
        print(f"TOP K SWEEP: Testing Top K = {k}")
        print(f"==================================================")

        selected_features = imp.head(k)["feature"].tolist()

        if len(selected_features) == 0:
            print("No features selected (k=0). Skipping.")
            continue
            
        print(f"DEBUG: Training with {len(selected_features)} features.")
        X_train_k = X_train_meta[selected_features].to_numpy()
        X_val_k = X_val_meta[selected_features].to_numpy()

        # Refit model on selected features
        current_pipeline = clone(model_pipeline)
        current_pipeline.fit(X_train_k, y_train)
        y_pred = current_pipeline.predict(X_val_k)

        mse = mean_squared_error(y_pred, y_val)
        r2 = r2_score(y_pred, y_val)
        
        current_best_sharpe_for_k = -np.inf
        optimal_pos = None
        optimal_neg = None
        
        for pos in K_POS_OPTIONS:
            for neg in K_NEG_OPTIONS:
                # Tanh Transformation (1 + tanh(pred * K))
                signals = np.where(
                    y_pred < 0, 
                    1.0 + np.tanh(y_pred * neg), # Use K_NEG for negative predictions
                    1.0 + np.tanh(y_pred * pos)  # Use K_POS for positive predictions
                )
                
                signals_clipped = np.clip(signals, MIN_SIGNAL, MAX_SIGNAL)
                current_submission = pd.DataFrame({
                    'date_id': X_val_meta['date_id'].values, 
                    'prediction': signals_clipped 
                })
                
                current_sharpe = score(
                    solution=solution_val, 
                    submission=current_submission, 
                    row_id_column_name='date_id'
                )
                
                # Check Best Sharpe
                if current_sharpe > current_best_sharpe_for_k:
                    current_best_sharpe_for_k = current_sharpe
                    optimal_pos = pos
                    optimal_neg = neg
                    
        # Print and Check Overall Best Result for this iteration
        print(f"\n--- BEST SIGNAL RESULTS for Top K={k} ---\nü•á BEST ASR: {current_best_sharpe_for_k:.4f} (K_POS={optimal_pos}, K_NEG={optimal_neg})")
        
        if current_best_sharpe_for_k > best_overall_sharpe:
            best_overall_sharpe = current_best_sharpe_for_k
            
            # Save the current model and parameters as the best overall
            best_model_data = {
                'model': current_pipeline,
                'features': selected_features,
                'signal_k_pos': optimal_pos,
                'signal_k_neg': optimal_neg,
                'EWMA_SPAN': EWMA_SPAN,
                'pi_discrete_threshold': PI_SCORER_THRESHOLD, # Store the PI scorer threshold
                'top_k_used': k,
                'mse': mse,
                'r2': r2
            }
            print(f"*** NEW OVERALL BEST ASR FOUND: {best_overall_sharpe:.4f} with Top K={k} ***")
            
    print("\n==================================================")
    print(f"FINAL RESULT: Overall Best ASR: {best_overall_sharpe:.4f}")
    print("==================================================")
    
    return best_overall_sharpe, best_model_data

In [11]:
best_model = None
best_sharpe = -np.inf # The minimum value


for max_iter in MAX_ITER_OPTIONS:
    
    print(f"\n--- Starting HGBR Training Sweep for MAX_ITER={max_iter} ---")
    
    hgbr_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', HistGradientBoostingRegressor(
            max_iter=max_iter,
            learning_rate=HGBR_LEARNING_RATE,
            max_depth=HGBR_MAX_DEPTH,
            l2_regularization=HGBR_L2_REG,
            early_stopping=HGBR_EARLY_STOPPING,
            random_state=RANDOM_STATE
        ))
    ])
  
    sharpe, model_data = run_model_optimization(
        model_pipeline=hgbr_pipeline,
        X_train_meta=X_train_meta,
        y_train=y_train,
        X_val_meta=X_val_meta,
        y_val=y_val,
        features=features 
    )
    
    print(f"HGBR MAX_ITER={max_iter} completed. Best Sharpe: {sharpe:.4f}")
    
    if sharpe > best_sharpe:
        print(f"*** DEBUG: Found new best model! Previous Sharpe: {best_sharpe:.4f}, New Sharpe: {sharpe:.4f} ***")
        best_sharpe = sharpe
        best_model = {
            'model_type': 'HGBR',
            'max_iter': max_iter,
            'best_sharpe': sharpe,
            'model_data' : model_data
        }
    else:
        print(f"*** DEBUG: Current Sharpe {sharpe:.4f} is not better than best Sharpe {best_sharpe:.4f}. Skipping update. ***")

if best_model and best_sharpe > -np.inf:
    joblib.dump(best_model, MODEL_PATH)
    print(f"\n‚úì Best model (Sharpe: {best_sharpe:.4f}) saved to {MODEL_PATH}")
else:
    print("\n‚ö†Ô∏è WARNING: No valid model found or best_sharpe not improved from initial value. Nothing saved.")


--- Starting HGBR Training Sweep for MAX_ITER=500 ---

--- Calculating PI using Discrete ASR Scorer (Threshold: 0.0010) ---
DEBUG: Feature ranking complete. Top 5 features: ['D1', 'D2', 'D3', 'D4', 'D5']

--- Starting Iteration over Top K Features: [3, 5, 8, 10, 15] ---

TOP K SWEEP: Testing Top K = 3
DEBUG: Training with 3 features.

--- BEST SIGNAL RESULTS for Top K=3 ---
ü•á BEST ASR: -0.2105 (K_POS=10, K_NEG=500)
*** NEW OVERALL BEST ASR FOUND: -0.2105 with Top K=3 ***

TOP K SWEEP: Testing Top K = 5
DEBUG: Training with 5 features.

--- BEST SIGNAL RESULTS for Top K=5 ---
ü•á BEST ASR: 0.1731 (K_POS=200, K_NEG=2000)
*** NEW OVERALL BEST ASR FOUND: 0.1731 with Top K=5 ***

TOP K SWEEP: Testing Top K = 8
DEBUG: Training with 8 features.

--- BEST SIGNAL RESULTS for Top K=8 ---
ü•á BEST ASR: 0.0822 (K_POS=200, K_NEG=2000)

TOP K SWEEP: Testing Top K = 10
DEBUG: Training with 10 features.

--- BEST SIGNAL RESULTS for Top K=10 ---
ü•á BEST ASR: -0.1227 (K_POS=75, K_NEG=500)

TOP K

# Predict Function

**Submission Function**

In [12]:
def predict(test: pl.DataFrame):
    """
    Kaggle submission function compatible with the inference API.
    Loads the best model and applies the optimized Tanh signal transformation.
    """
    
    try:
        model = joblib.load(MODEL_PATH)
    except FileNotFoundError:
        print(f"ERROR: Model file not found at {MODEL_PATH}. Returning default position.")
        return pd.DataFrame({'date_id': test['date_id'], 'prediction': 1.0})

    model_data = model['model_data']

    print(f"DEBUG: Using Model Config: MaxIter={model['max_iter']}, TopK={model_data['top_k_used']}, PosK={model_data['signal_k_pos']}, NegK={model_data['signal_k_neg']}")
    
    # Clean & Prepare Features
    features_to_use = model_data['features']
    test_clean, _ = clean_and_impute_data_test_data(test, features_to_use)
    
    X_test_pl_features = test_clean.select(features_to_use)

    # Predict Raw
    predictions = model_data['model'].predict(X_test_pl_features.to_numpy())
    
    # Apply Tanh Signal Logic
    signals = np.where(
        predictions < 0, 
        1.0 + np.tanh(predictions * model_data['signal_k_neg']), 
        1.0 + np.tanh(predictions * model_data['signal_k_pos'])  
    )
                
    signals_clipped = np.clip(signals, MIN_SIGNAL, MAX_SIGNAL)
    
    
    submission_df = pd.DataFrame({
        'date_id': test['date_id'], # Use date_id from the original test input
        'prediction': signals_clipped
    })  

    return submission_df

# =========================================================================
# The code below is required for the Kaggle submission API
# =========================================================================
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    # This runs the local demo flow if not in the competition environment
    def run_local_predict_demo():
        print("\nStarting local predict DEMO (uses the local 'test.csv' copy)...")
        
        try:
            model = joblib.load(MODEL_PATH)
        except FileNotFoundError:
            print(f"ERROR: Model file not found at {MODEL_PATH}. Cannot run demo.")
            return

        model_data = model['model_data']

        print(f"\n‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è\nModel Data for Testing: \n {model_data}\n ‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è")
        
        # Load test set 
        X_test_pl = load_testset(features = model_data['features'])
        
        solution_val_df = pd.DataFrame({
            'date_id': X_test_pl['date_id'].to_numpy(),
            'forward_returns': X_test_pl['lagged_forward_returns'].to_numpy(),
            'risk_free_rate': X_test_pl['lagged_risk_free_rate'].to_numpy()
        })

        submission_df = predict(X_test_pl)

        final_asr_score = score(
            solution=solution_val_df, 
            submission=submission_df,
            row_id_column_name='date_id'
        )

        print("\n" + "="*50)
        print(f"‚úÖ FINAL VALIDATION ASR SCORE: {final_asr_score:.6f}")
        print("==================================================")
        return

    run_local_predict_demo()


Starting local predict DEMO (uses the local 'test.csv' copy)...

‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è
Model Data for Testing: 
 {'model': Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 HistGradientBoostingRegressor(early_stopping=False,
                                               l2_regularization=0.1,
                                               learning_rate=0.05, max_depth=5,
                                               max_iter=1000,
                                               random_state=42))]), 'features': ['D1', 'D2', 'D3', 'D4', 'D5'], 'signal_k_pos': 10, 'signal_k_neg': 2000, 'EWMA_SPAN': 20, 'pi_discrete_threshold': 0.001, 'top_k_used': 5, 'mse': 0.00013221212175279363, 'r2': -14.68338442526505}
 ‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è‚ÄºÔ∏è
DEBUG: Starting test data loading process for inference
--------------------------------------------------
DEBUG: Reading test data from CSV...
DEBUG: Initia