## 1️⃣ Setup: Load Modules

In [None]:
import os
import sys
from pathlib import Path
import yaml

# ========== CONFIGURATION ==========
DATASET_NAME = "my-hull-models"
# ===================================

print("="*80)
print("SETTING UP ENVIRONMENT")
print("="*80)

# Kaggle dataset path
DATASET_PATH = Path(f"/kaggle/input/{DATASET_NAME}")

if DATASET_PATH.exists():
    # Path of the src folder shipped inside the dataset
    src_path = DATASET_PATH / "src"
    sys.path.insert(0, str(src_path))
    sys.path.insert(0, str(DATASET_PATH))

    print(f"‚úì Dataset found: {DATASET_PATH}")
    print(f"‚úì Src path: {src_path}")

    # ========== Create a virtual 'src' package ==========
    import importlib.util
    import types

    # Register an empty module named 'src' whose __path__ points at the
    # dataset's src folder, so `src.<name>` submodules can be attached to it.
    src_module = types.ModuleType('src')
    src_module.__path__ = [str(src_path)]
    sys.modules['src'] = src_module

    # Load the modules in dependency order
    py_files_to_load = [
        'utils', 'metric', 'cv', 'data', 'features', 'models',
        'tuner', 'backtest', 'position', 'risk', 'interpretability',
        'ensemble', 'timeseries_risk'
    ]

    for module_name in py_files_to_load:
        py_file = src_path / f"{module_name}.py"
        if not py_file.exists():
            # Modules missing from the dataset are skipped.
            continue

        full_module_name = f"src.{module_name}"
        spec = importlib.util.spec_from_file_location(full_module_name, py_file)
        if not (spec and spec.loader):
            continue

        module = importlib.util.module_from_spec(spec)
        # Register before executing so that imports of `src.<name>` issued
        # while the module body runs resolve to this object.
        sys.modules[full_module_name] = module
        setattr(src_module, module_name, module)
        try:
            spec.loader.exec_module(module)
        except Exception as e:
            # Do not leave a half-initialised module registered: a later
            # `from src.<name> import X` would otherwise fail with a
            # misleading "cannot import name" instead of the real error.
            sys.modules.pop(full_module_name, None)
            if getattr(src_module, module_name, None) is module:
                delattr(src_module, module_name)
            print(f"  ‚ö†Ô∏è  Error loading src.{module_name}: {e}")
        else:
            print(f"  ‚úì Loaded: src.{module_name}")

    print(f"\n‚úì Created virtual 'src' module")

    # Set the working directory
    os.chdir("/kaggle/working")
    print(f"‚úì Working directory: {os.getcwd()}")

else:
    print(f"‚ùå Dataset not found: {DATASET_PATH}")
    raise FileNotFoundError(f"Dataset '{DATASET_NAME}' not found!")

# ========== Copy CONFIG ==========
# Mirror the dataset's conf/params.yaml into /kaggle/working/conf and put
# /kaggle/working on sys.path; only warn when the dataset has no config.
config_path = DATASET_PATH / "conf" / "params.yaml"
if not config_path.exists():
    print(f"\n‚ö†Ô∏è  Config file not found: {config_path}")
else:
    working_config_dir = Path("/kaggle/working/conf")
    working_config_dir.mkdir(exist_ok=True)

    import shutil
    target_config = working_config_dir / "params.yaml"
    shutil.copy(config_path, target_config)

    working_root = working_config_dir.parent
    sys.path.insert(0, str(working_root))
    print(f"\n‚úì Config copied to: {working_config_dir}/params.yaml")

# ========== Import required modules ==========
from src.features import FeatureEngineering
from src.position import SharpeScalingMapper
from src.data import load_data
from src.cv import PurgedGroupTimeSeriesSplit
from src.models import LightGBMModel
from src.metric import sharpe_ratio

import pandas as pd
import numpy as np
import pickle
import json
from datetime import datetime

print("\n‚úÖ Setup complete!")

## 2️⃣ Load Competition Data

In [None]:
print("="*80)
print("LOADING COMPETITION DATA")
print("="*80)

# Load train data from competition
comp_data_path = Path("/kaggle/input/hull-tactical-market-prediction")

train_df = pd.read_csv(comp_data_path / "train.csv")
print(f"‚úì Loaded train data: {train_df.shape}")
print(f"  Date range: {train_df['date_id'].min()} to {train_df['date_id'].max()}")

# The symbol count is informational only, so a missing column must not abort
# the cell with a KeyError.
# NOTE(review): the raw competition train.csv is believed to be single-asset
# with no 'symbol' column - confirm against the data description.
if 'symbol' in train_df.columns:
    print(f"  Symbols: {train_df['symbol'].nunique()}")
else:
    print("  Symbols: n/a (no 'symbol' column in train.csv)")

print("\n‚úÖ Data loaded!")

## 3️⃣ Feature Engineering

In [None]:
print("="*80)
print("FEATURE ENGINEERING")
print("="*80)

# Run the project's feature pipeline over the raw training frame. `fe` is
# kept at module level because predict() reuses it at inference time.
fe = FeatureEngineering()
train_features = fe.transform(train_df)

print(f"‚úì Features created: {train_features.shape}")

# Columns that are not counted as features in the summary below.
excluded_from_count = ['date_id', 'symbol', 'return_1d']
n_original_features = sum(
    1 for name in train_df.columns if name not in excluded_from_count
)
n_total_features = sum(
    1 for name in train_features.columns if name not in excluded_from_count
)
print(f"  Original features: {n_original_features}")
print(f"  Total features: {n_total_features}")

print("\n‚úÖ Feature engineering complete!")

## 4️⃣ Train Return Model (4-Fold CV)

In [None]:
print("="*80)
print("TRAINING RETURN MODEL")
print("="*80)

# Split the engineered frame into feature matrix, target and CV groups.
# Every column except the date id, the target and the symbol is a feature.
feature_cols = [
    col for col in train_features.columns
    if col not in ('date_id', 'return_1d', 'symbol')
]
X = train_features[feature_cols]
y = train_features['return_1d']
groups = train_features['date_id']

# Splitter shared by all later training cells: 4 splits grouped by date_id,
# constructed with group_gap=5.
cv = PurgedGroupTimeSeriesSplit(n_splits=4, group_gap=5)

# LightGBM parameters for the return model.
best_params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    verbosity=-1,
    seed=42,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
)

return_models = []
fold_rule = '=' * 60

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    print(f"\n{fold_rule}")
    print(f"Fold {fold + 1}/4")
    print(fold_rule)

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Fit one model per fold; the validation split drives early stopping
    # (50 rounds) and is logged every 100 iterations.
    model = LightGBMModel(params=best_params)
    stop_early = model.lgb.early_stopping(stopping_rounds=50, verbose=False)
    log_progress = model.lgb.log_evaluation(period=100)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[stop_early, log_progress],
    )
    return_models.append(model)

    # Mean squared error on the held-out part of this fold.
    val_pred = model.predict(X_val)
    squared_errors = (y_val - val_pred) ** 2
    val_mse = np.mean(squared_errors)
    print(f"  Validation MSE: {val_mse:.6f}")

print(f"\n‚úÖ Return model training complete! ({len(return_models)} folds)")

## 5️⃣ Feature Selection for Return Model

In [None]:
print("="*80)
print("FEATURE SELECTION - RETURN MODEL")
print("="*80)

# Rank the features by the gain importance reported by the first fold's model.
importances = return_models[0].model.feature_importance(importance_type='gain')
feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': importances}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Keep the highest-ranked features (all of them if fewer than n_features exist).
n_features = 150
selected_features_return = feature_importance['feature'].head(n_features).tolist()

print(f"‚úì Selected top {len(selected_features_return)} features")
print(f"  Top 5: {selected_features_return[:5]}")

# Refit every fold on the reduced feature matrix.
X_selected = X[selected_features_return]
return_models_final = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_selected, y, groups)):
    print(f"\nRetraining Fold {fold + 1}/4 with {len(selected_features_return)} features...")

    X_train, y_train = X_selected.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X_selected.iloc[val_idx], y.iloc[val_idx]

    model = LightGBMModel(params=best_params)
    stop_early = model.lgb.early_stopping(stopping_rounds=50, verbose=False)
    log_progress = model.lgb.log_evaluation(period=100)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[stop_early, log_progress],
    )

    return_models_final.append(model)

# Swap in the refitted models only after every fold has finished.
return_models = return_models_final

print(f"\n‚úÖ Feature selection complete!")

## 6️⃣ Train Risk Model (4-Fold CV)

In [None]:
print("="*80)
print("TRAINING RISK MODEL")
print("="*80)

# Risk target: per-symbol rolling standard deviation of return_1d over a
# 20-row window (at least 5 observations), in the frame's existing row order.
print("Calculating realized volatility...")


def _rolling_volatility(returns):
    """Rolling standard deviation of one symbol's returns."""
    return returns.rolling(window=20, min_periods=5).std()


train_features['volatility'] = (
    train_features.groupby('symbol')['return_1d'].transform(_rolling_volatility)
)
# Rows without enough history are filled with the overall median volatility.
volatility_median = train_features['volatility'].median()
train_features['volatility'] = train_features['volatility'].fillna(volatility_median)

y_risk = train_features['volatility']

# LightGBM parameters for the risk model.
risk_params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    verbosity=-1,
    seed=42,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
)

risk_models = []
fold_rule = '=' * 60

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y_risk, groups)):
    print(f"\n{fold_rule}")
    print(f"Fold {fold + 1}/4")
    print(fold_rule)

    X_train, y_train = X.iloc[train_idx], y_risk.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y_risk.iloc[val_idx]

    # Fit one model per fold; the validation split drives early stopping
    # (50 rounds) and is logged every 100 iterations.
    model = LightGBMModel(params=risk_params)
    stop_early = model.lgb.early_stopping(stopping_rounds=50, verbose=False)
    log_progress = model.lgb.log_evaluation(period=100)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[stop_early, log_progress],
    )
    risk_models.append(model)

    # Mean squared error on the held-out part of this fold.
    val_pred = model.predict(X_val)
    squared_errors = (y_val - val_pred) ** 2
    val_mse = np.mean(squared_errors)
    print(f"  Validation MSE: {val_mse:.6f}")

print(f"\n‚úÖ Risk model training complete! ({len(risk_models)} folds)")

## 7️⃣ Feature Selection for Risk Model

In [None]:
print("="*80)
print("FEATURE SELECTION - RISK MODEL")
print("="*80)

# Rank the features by the gain importance reported by the first fold's model.
importances = risk_models[0].model.feature_importance(importance_type='gain')
feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': importances}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Keep the highest-ranked features (all of them if fewer than n_features exist).
n_features = 150
selected_features_risk = feature_importance['feature'].head(n_features).tolist()

print(f"‚úì Selected top {len(selected_features_risk)} features")
print(f"  Top 5: {selected_features_risk[:5]}")

# Refit every fold on the reduced feature matrix.
X_selected = X[selected_features_risk]
risk_models_final = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_selected, y_risk, groups)):
    print(f"\nRetraining Fold {fold + 1}/4 with {len(selected_features_risk)} features...")

    X_train, y_train = X_selected.iloc[train_idx], y_risk.iloc[train_idx]
    X_val, y_val = X_selected.iloc[val_idx], y_risk.iloc[val_idx]

    model = LightGBMModel(params=risk_params)
    stop_early = model.lgb.early_stopping(stopping_rounds=50, verbose=False)
    log_progress = model.lgb.log_evaluation(period=100)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[stop_early, log_progress],
    )

    risk_models_final.append(model)

# Swap in the refitted models only after every fold has finished.
risk_models = risk_models_final

print(f"\n‚úÖ Feature selection complete!")

## 8️⃣ Optimize Position Strategy

In [None]:
print("="*80)
print("OPTIMIZING POSITION STRATEGY")
print("="*80)

# Use the validation rows of the last (4th) CV split as the tuning set.
fold_idx = 3
all_splits = list(cv.split(X, y, groups))
train_idx, val_idx = all_splits[fold_idx]

X_val_return = X[selected_features_return].iloc[val_idx]
X_val_risk = X[selected_features_risk].iloc[val_idx]
y_val = y.iloc[val_idx]

# Average the per-fold model predictions row by row.
r_hat = np.mean([m.predict(X_val_return) for m in return_models], axis=0)
sigma_hat = np.mean([m.predict(X_val_risk) for m in risk_models], axis=0)

# Search (alpha, beta, gamma) with SciPy's differential evolution.
from scipy.optimize import differential_evolution


def objective(params):
    """Return the negative Sharpe ratio of the strategy for *params*.

    ``params`` unpacks to ``(alpha, beta, gamma)`` for ``SharpeScalingMapper``.
    Each (predicted return, predicted risk) pair is mapped to an allocation,
    clipped to [0, 2] and multiplied by the realised validation return.
    """
    alpha, beta, gamma = params
    mapper = SharpeScalingMapper(alpha=alpha, beta=beta, gamma=gamma)

    raw_allocations = [mapper.map(r, s) for r, s in zip(r_hat, sigma_hat)]
    allocations = np.clip(np.array(raw_allocations), 0, 2)

    strategy_returns = allocations * y_val.values
    # The optimizer minimizes, so negate the Sharpe ratio.
    return -sharpe_ratio(strategy_returns)


print("Running optimization...")
search_bounds = [(0.5, 2.0), (0.5, 2.0), (0.0, 1.0)]  # alpha, beta, gamma
result = differential_evolution(
    objective,
    bounds=search_bounds,
    maxiter=50,
    popsize=10,
    seed=42,
)

best_alpha, best_beta, best_gamma = result.x
# Plain Python floats, keyed by the mapper's keyword names.
strategy_params = {
    name: float(value)
    for name, value in zip(('alpha', 'beta', 'gamma'), (best_alpha, best_beta, best_gamma))
}

print(f"\n‚úÖ Optimization complete!")
print(f"  Best alpha: {best_alpha:.4f}")
print(f"  Best beta: {best_beta:.4f}")
print(f"  Best gamma: {best_gamma:.4f}")
print(f"  Validation Sharpe: {-result.fun:.4f}")

## 9️⃣ Define Prediction Function

In [None]:
import polars as pl

def predict(test: pl.DataFrame) -> float:
    """
    Produce one allocation for a batch of test rows.

    This is the callback handed to the Kaggle inference server. It reads the
    module-level ``fe``, ``return_models``, ``risk_models``,
    ``selected_features_return``, ``selected_features_risk`` and
    ``strategy_params`` objects built by the earlier cells; none of them is
    reassigned here.

    Args:
        test: Polars DataFrame with batch of test features

    Returns:
        allocation: float between 0.0 and 2.0. Returns 0.0 when the mapped
        allocation is NaN or when any step raises.
    """
    try:
        # Convert to pandas for feature engineering
        test_pd = test.to_pandas()

        # Feature engineering
        test_features = fe.transform(test_pd)

        # Select features for each model
        test_return = test_features[selected_features_return]
        test_risk = test_features[selected_features_risk]

        # Ensemble predictions: average over the rows of the batch for each
        # fold model, then average across the fold models.
        r_hat_preds = [model.predict(test_return) for model in return_models]
        r_hat = float(np.mean([np.mean(pred) for pred in r_hat_preds]))

        sigma_hat_preds = [model.predict(test_risk) for model in risk_models]
        sigma_hat = float(np.mean([np.mean(pred) for pred in sigma_hat_preds]))

        # Map to position
        mapper = SharpeScalingMapper(**strategy_params)
        allocation = float(mapper.map(r_hat, sigma_hat))

        # Comparisons with NaN are always False, so max(0.0, min(2.0, nan))
        # evaluates to 2.0: a NaN would silently become the maximum
        # allocation. Fall back to the same safe default as the error path.
        if np.isnan(allocation):
            print("Prediction warning: allocation is NaN; returning 0.0")
            return 0.0

        # Ensure within bounds [0, 2]
        return max(0.0, min(2.0, allocation))

    except Exception as e:
        # Boundary with the inference server: report the failure and answer
        # with a neutral allocation instead of propagating the exception.
        print(f"‚ö†Ô∏è  Prediction error: {e}")
        import traceback
        traceback.print_exc()
        return 0.0  # Safe default

print("‚úÖ Prediction function defined!")

## 🔟 Start Inference Server

In [None]:
import kaggle_evaluation.default_inference_server

print("="*80)
print("STARTING INFERENCE SERVER")
print("="*80)

# Recap of what the earlier cells produced.
n_return_features = len(selected_features_return)
n_risk_features = len(selected_features_risk)
strategy_summary = ", ".join(
    f"{key}={strategy_params[key]:.4f}" for key in ('alpha', 'beta', 'gamma')
)

print("\nüìä Training Summary:")
print(f"  Return models: {len(return_models)} folds with {n_return_features} features")
print(f"  Risk models: {len(risk_models)} folds with {n_risk_features} features")
print(f"  Position strategy: {strategy_summary}")

# Wrap predict() in the competition's default inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# A non-empty KAGGLE_IS_COMPETITION_RERUN selects the serving path.
is_competition_rerun = bool(os.getenv('KAGGLE_IS_COMPETITION_RERUN'))

if not is_competition_rerun:
    # Local testing: run the gateway against the data in the competition folder.
    print("\nüß™ Running in LOCAL mode - testing on public data")
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    # Production: serve the API for the hidden test set.
    print("\nüöÄ Running in COMPETITION mode - serving real-time predictions")
    inference_server.serve()

print("\n‚úÖ Inference complete!")