## 1️⃣ Setup: Configure Dataset & CV Strategy

In [None]:
import os
import sys
from pathlib import Path
import yaml

# ========== CONFIGURATION ==========
DATASET_NAME = "my-hull-models"  # <- name of the uploaded Kaggle dataset

# ========== CV STRATEGY selection (edit only this line) ==========
# One of: time_based, expanding_window, purged_walk_forward, regime_aware
CV_STRATEGY = "time_based"
# ====================================================================

_SETUP_RULE = "=" * 80
print(_SETUP_RULE)
print(f"CV STRATEGY EVALUATION: {CV_STRATEGY.upper()}")
print(_SETUP_RULE)

# Kaggle dataset path
DATASET_PATH = Path("/kaggle/input") / DATASET_NAME

# Fail fast when the dataset is not attached, listing what *is* attached so
# the name above can be corrected.
if not DATASET_PATH.exists():
    print(f"‚ùå Dataset not found: {DATASET_PATH}")
    input_dir = Path("/kaggle/input/")
    if input_dir.exists():
        print("\nüìÅ Available datasets:")
        for entry in input_dir.iterdir():
            print(f"  - {entry.name}")
    raise FileNotFoundError(f"Dataset '{DATASET_NAME}' not found!")

# Put the dataset root on sys.path so modules stored in it can be imported.
sys.path.insert(0, str(DATASET_PATH))
print(f"‚úì Dataset found: {DATASET_PATH}")

# Copy the config into the writable working directory; a missing config only
# produces a warning here.
config_path = DATASET_PATH / "conf" / "params.yaml"
if not config_path.exists():
    print(f"‚ö†Ô∏è  Config file not found: {config_path}")
else:
    import shutil

    working_config_dir = Path("/kaggle/working/conf")
    working_config_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy(config_path, working_config_dir / "params.yaml")
    sys.path.insert(0, str(working_config_dir.parent))
    print(f"‚úì Config copied to: {working_config_dir}/params.yaml")

print(f"\n‚úÖ Setup complete! Using CV strategy: {CV_STRATEGY}")

## 2️⃣ Train Models with Selected CV Strategy

In [None]:
import warnings

import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import pickle
import json
from typing import List, Tuple

from src.data import DataLoader
from src.features import FeatureEngineering
from src.cv import create_cv_strategy
from src.metric import CompetitionMetric
from src.position import SharpeScalingMapper

# Silence numeric RuntimeWarnings and pandas PerformanceWarnings for the run.
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

print("="*80)
print(f"TRAINING WITH {CV_STRATEGY.upper()} CV")
print("="*80)

# ========== 1. Data Preparation ==========
print("\nüìä Step 1: Loading data...")
data_loader = DataLoader("conf/params.yaml")
train_df, _ = data_loader.load_data()
print(f"‚úì Loaded {len(train_df)} samples")

print("\nüîß Step 2: Preprocessing...")
train_processed, _ = data_loader.preprocess_timeseries(
    train_df,
    handle_outliers=True,
    normalize=True,
    scale=True,
    window=60
)

# Put the frame in chronological order BEFORE the folds are built. The fold
# indices are consumed positionally (via .iloc) further down, so they must be
# computed against the same row order the training loop uses. A stable sort
# keeps the existing order of rows that share a date_id.
train_processed = (
    train_processed
    .sort_values('date_id', kind='stable')
    .reset_index(drop=True)
)

# Get feature columns (exclude metadata and target columns)
# NOTE(review): any other target-derived column that survives preprocessing
# would be used as a feature here - confirm against the preprocessing output.
exclude_cols = {'date_id', 'forward_returns', 'realized_vol', 'risk_free_rate'}
feature_cols = [col for col in train_processed.columns if col not in exclude_cols]
print(f"\n‚úì Using {len(feature_cols)} features from preprocessing")

# ========== 2. Create CV Strategy ==========
print(f"\nüîÄ Step 3: Creating {CV_STRATEGY} CV strategy...")
cv_strategy = create_cv_strategy(
    config_path="conf/params.yaml",
    strategy=CV_STRATEGY
)
# Materialise the folds once so they can be counted and iterated later.
folds = list(cv_strategy.get_folds(train_processed))
print(f"‚úì Created {len(folds)} folds")

# ========== 3. Model Parameters ==========
def _gbdt_regression_params(num_leaves):
    """Build the LightGBM parameter dict shared by both models.

    The return and risk models use the same settings and differ only in
    tree size, so `num_leaves` is the single knob.
    """
    return {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': num_leaves,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'verbosity': -1,
        'n_jobs': 1,
    }


# Return model: larger trees.
return_params = _gbdt_regression_params(num_leaves=31)

# Risk (volatility) model: smaller trees.
risk_params = _gbdt_regression_params(num_leaves=15)

# ========== 4. Train Models ==========
print("\nüéØ Step 4: Training models...")

# Rolling-window settings for the realized-volatility target. Kept as named
# constants so the log line below always reports the value actually used.
VOL_WINDOW = 30
VOL_MIN_PERIODS = 5

# Prepare realized volatility target
# Sort by date_id so the rolling window runs over chronologically ordered rows.
# NOTE(review): `folds` is consumed positionally below; if the frame was not
# already in date order when the folds were built, their indices refer to the
# pre-sort row positions.
train_processed = train_processed.sort_values('date_id').reset_index(drop=True)

# Rolling standard deviation of forward_returns, computed over the whole frame.
train_processed['realized_vol'] = (
    train_processed['forward_returns']
    .rolling(window=VOL_WINDOW, min_periods=VOL_MIN_PERIODS)
    .std()
)

# The first rows have fewer than VOL_MIN_PERIODS observations and come out as
# NaN; fill those with the mean of the computed values.
train_processed['realized_vol'] = train_processed['realized_vol'].fillna(
    train_processed['realized_vol'].mean()
)

print(f"‚úì Realized volatility calculated (rolling window={VOL_WINDOW})")
print(f"  Mean: {train_processed['realized_vol'].mean():.6f}")
print(f"  Std: {train_processed['realized_vol'].std():.6f}")
print(f"  NaN count: {train_processed['realized_vol'].isna().sum()}")

# Per-fold boosters plus out-of-fold buffers; `mask` records which rows
# received a validation prediction.
return_models = []
risk_models = []
oof_predictions = {
    'r_hat': np.zeros(len(train_processed)),
    'sigma_hat': np.zeros(len(train_processed)),
    'allocations': np.zeros(len(train_processed)),
    'mask': np.zeros(len(train_processed), dtype=bool)
}

def _fit_early_stopped(params, X_fit, y_fit, X_holdout, y_holdout):
    """Fit one LightGBM booster, early-stopping on the hold-out split."""
    fit_set = lgb.Dataset(X_fit, label=y_fit)
    holdout_set = lgb.Dataset(X_holdout, label=y_holdout, reference=fit_set)
    return lgb.train(
        params,
        fit_set,
        num_boost_round=100,
        valid_sets=[holdout_set],
        callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)],
    )


def _rmse(actual, predicted):
    """Root-mean-squared error between two aligned sequences."""
    return np.sqrt(np.mean((actual - predicted)**2))


for fold_idx, (train_idx, val_idx) in enumerate(folds):
    print(f"\n  üìÅ Fold {fold_idx + 1}/{len(folds)}")

    # Slice the fold once; both models share the same feature matrices.
    fit_rows = train_processed.iloc[train_idx]
    holdout_rows = train_processed.iloc[val_idx]
    X_train, X_val = fit_rows[feature_cols], holdout_rows[feature_cols]

    # Return model: predicts forward_returns.
    y_train, y_val = fit_rows['forward_returns'], holdout_rows['forward_returns']
    return_model = _fit_early_stopped(return_params, X_train, y_train, X_val, y_val)
    return_models.append(return_model)
    r_hat = return_model.predict(X_val)

    # Risk model: predicts realized_vol; predictions are floored at 1e-6.
    y_train_vol, y_val_vol = fit_rows['realized_vol'], holdout_rows['realized_vol']
    risk_model = _fit_early_stopped(risk_params, X_train, y_train_vol, X_val, y_val_vol)
    risk_models.append(risk_model)
    sigma_hat = np.maximum(risk_model.predict(X_val), 1e-6)

    # Position strategy: turn (r_hat, sigma_hat) into allocations.
    position_mapper = SharpeScalingMapper("conf/params.yaml")
    allocations = position_mapper.map_positions(
        r_hat=r_hat,
        sigma_hat=sigma_hat,
        k=1.0,
        b=2.0,
    )

    # Record the out-of-fold values and mark these rows as covered.
    for oof_key, oof_values in (
        ('r_hat', r_hat),
        ('sigma_hat', sigma_hat),
        ('allocations', allocations),
        ('mask', True),
    ):
        oof_predictions[oof_key][val_idx] = oof_values

    print(f"    Return RMSE: {_rmse(y_val, r_hat):.6f}")
    print(f"    Risk RMSE: {_rmse(y_val_vol, sigma_hat):.6f}")

# ========== 5. Calculate OOF Score ==========
print("\nüìä Step 5: Calculating OOF competition score...")

# Score only the rows that received an out-of-fold prediction.
oof_mask = oof_predictions['mask']
metric_calc = CompetitionMetric()

# The risk-free rate is optional: pass None when the column is absent.
if 'risk_free_rate' in train_processed.columns:
    oof_risk_free = train_processed['risk_free_rate'].values[oof_mask]
else:
    oof_risk_free = None

oof_score = metric_calc.calculate_score(
    allocations=oof_predictions['allocations'][oof_mask],
    forward_returns=train_processed['forward_returns'].values[oof_mask],
    risk_free_rate=oof_risk_free,
)

_OOF_RULE = "=" * 80
print("\n" + _OOF_RULE)
print(f"OOF SCORE - {CV_STRATEGY.upper()}")
print(_OOF_RULE)
print(f"Competition Score: {oof_score['score']:.6f}")
# (label, key in the score dict, float format) for each score component.
for _label, _key, _spec in (
    ("Sharpe Ratio", 'sharpe', ".6f"),
    ("Vol Penalty", 'vol_penalty', ".4f"),
    ("Return Penalty", 'return_penalty', ".4f"),
    ("Vol Ratio", 'vol_ratio', ".4f"),
):
    print(f"  ‚Üí {_label}: {oof_score[_key]:{_spec}}")
# Share of training rows that have an out-of-fold prediction.
oof_coverage = oof_mask.sum() / len(train_processed)
print(f"Coverage: {oof_coverage:.2%}")
print(_OOF_RULE)

# ========== 6. Save Models & OOF Score ==========
print("\nüíæ Step 6: Saving models and results...")

model_dir = Path("/kaggle/working/artifacts")
model_dir.mkdir(parents=True, exist_ok=True)

# One sub-directory per model family, one pickle per fold:
#   <model_dir>/return_models/model_fold_<i>.pkl
#   <model_dir>/risk_models/model_fold_<i>.pkl
return_dir = model_dir / "return_models"
risk_dir = model_dir / "risk_models"
for family_dir, boosters in ((return_dir, return_models), (risk_dir, risk_models)):
    family_dir.mkdir(exist_ok=True)
    for fold_no, booster in enumerate(boosters):
        with open(family_dir / f"model_fold_{fold_no}.pkl", 'wb') as fh:
            pickle.dump(booster, fh)

# Feature names as reported by the first booster of each family; inference
# reads this file to rebuild the feature columns in the same order.
feature_info = {
    'return_features': return_models[0].feature_name(),
    'risk_features': risk_models[0].feature_name(),
}
with open(model_dir / "feature_names.json", 'w') as fh:
    json.dump(feature_info, fh)

# OOF score summary; values are cast to plain floats before being dumped.
oof_summary = {
    'cv_strategy': CV_STRATEGY,
    'oof_score': float(oof_score['score']),
    'oof_sharpe': float(oof_score['sharpe']),
    'oof_vol_penalty': float(oof_score['vol_penalty']),
    'oof_return_penalty': float(oof_score['return_penalty']),
    'oof_vol_ratio': float(oof_score['vol_ratio']),
    'coverage': float(oof_mask.sum() / len(train_processed)),
}
with open(model_dir / "oof_score.json", 'w') as fh:
    json.dump(oof_summary, fh, indent=2)

print(f"‚úì Models saved to {model_dir}")
print(f"‚úì Feature names saved:")
print(f"  - Return features: {len(feature_info['return_features'])}")
print(f"  - Risk features: {len(feature_info['risk_features'])}")
print(f"‚úì OOF score saved: {oof_score['score']:.6f}")
print("\n‚úÖ Training complete!")

ModuleNotFoundError: No module named 'polars'

## 3️⃣ Load Models & Start Inference Server

In [None]:
# ========== Cell 3: Inference - Competition Server or Local Parquet ==========

# Read back the saved feature lists so inference can build its inputs with
# the same columns, in the same order, as each model family was trained on.
feature_names_path = model_dir / "feature_names.json"
with open(feature_names_path, 'r') as fh:
    feature_info = json.load(fh)

return_feature_names = feature_info['return_features']
risk_feature_names = feature_info['risk_features']

def predict(test, model=None):
    """
    Prediction function for the competition inference server.

    Preprocesses the incoming rows, averages the per-fold return and risk
    model predictions for the first row, maps them to a position and clips
    the result.

    NOTE(review): preprocessing is re-run on whatever rows `test` holds, with
    window=60. If the server passes a single row per call, the rolling
    statistics presumably see no history - confirm this matches training.

    Args:
        test: polars DataFrame with test features (must expose `to_pandas()`)
        model: Not used (models are read from the module-level
            `return_models` / `risk_models` lists)

    Returns:
        float: allocation in [0.0, 2.0]. Falls back to 0.0 when anything
        raises or when the computed allocation is NaN.
    """
    try:
        # Convert to pandas
        test_pd = test.to_pandas()

        # Preprocess test data (same preprocessing as applied in training)
        data_loader = DataLoader("conf/params.yaml")
        test_processed, _ = data_loader.preprocess_timeseries(
            test_pd,
            handle_outliers=True,
            normalize=True,
            scale=True,
            window=60
        )

        # Build each model's input in the exact training column order.
        # Columns missing from the test frame are filled with 0.0.
        X_test_return = test_processed.reindex(
            columns=return_feature_names, fill_value=0.0
        )
        X_test_risk = test_processed.reindex(
            columns=risk_feature_names, fill_value=0.0
        )

        # Ensemble prediction - return: mean over folds, first row only.
        r_hat_preds = np.array(
            [booster.predict(X_test_return) for booster in return_models]
        )
        r_hat = float(np.mean(r_hat_preds, axis=0)[0])

        # Ensemble prediction - risk: mean over folds, floored at 1e-6.
        sigma_hat_preds = np.array(
            [booster.predict(X_test_risk) for booster in risk_models]
        )
        sigma_hat = float(np.mean(sigma_hat_preds, axis=0)[0])
        sigma_hat = max(sigma_hat, 1e-6)

        # Position mapping. Built from the same config file the training
        # loop passes, so both sides use one configuration.
        mapper = SharpeScalingMapper("conf/params.yaml")
        allocation = float(
            mapper.map_positions(
                r_hat=np.array([r_hat]),
                sigma_hat=np.array([sigma_hat]),
                k=1.0,
                b=2.0
            )[0]
        )

        # NaN must be handled before clipping: min(2.0, nan) returns 2.0, so
        # a NaN would otherwise be turned into the maximum allocation.
        if np.isnan(allocation):
            print("Prediction produced a NaN allocation; falling back to 0.0")
            return 0.0

        # Ensure bounds
        return max(0.0, min(2.0, allocation))

    except Exception as e:
        # Deliberate best-effort boundary: the server must always receive a
        # number, so any failure is logged and mapped to a 0.0 allocation.
        print(f"‚ö†Ô∏è  Prediction error: {e}")
        import traceback
        traceback.print_exc()
        return 0.0

print("‚úì Prediction function defined!")

# ========== Check if running in Kaggle environment ==========
import os

_RUN_RULE = "=" * 80

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # ========== LOCAL MODE: Generate submission.parquet ==========
    print("\n" + _RUN_RULE)
    print("üíª LOCAL MODE - GENERATING SUBMISSION.PARQUET")
    print(_RUN_RULE)

    # Auto-detect test data path: first attached dataset holding a test.csv
    input_dir = Path("/kaggle/input/")
    test_path = None
    if input_dir.exists():
        csv_candidates = (
            entry / "test.csv"
            for entry in input_dir.iterdir()
            if entry.is_dir()
        )
        test_path = next(
            (candidate for candidate in csv_candidates if candidate.exists()),
            None,
        )
        if test_path is not None:
            print(f"‚úì Found test data: {test_path}")

    if test_path is not None and test_path.exists():
        test_data = pl.read_csv(test_path)
        print(f"‚úì Loaded test data: {len(test_data)} rows")

        # Generate predictions: predict() is called with one single-row
        # frame per test row.
        allocations = [
            predict(test_data[row:row + 1]) for row in range(len(test_data))
        ]

        # Create submission
        submission = pl.DataFrame({
            'date_id': test_data['date_id'].to_list(),
            'allocation': allocations,
        })
    else:
        # No test file: write a constant placeholder submission instead.
        print("‚ö†Ô∏è  Warning: Test file not found, using placeholder")
        submission = pl.DataFrame({
            'date_id': range(100),
            'allocation': [1.0] * 100,
        })

    # Save to parquet
    output_path = Path("/kaggle/working/submission.parquet")
    submission.write_parquet(output_path)

    print(f"\n‚úÖ Submission saved to: {output_path}")
    print(f"üìä Prediction Summary:")
    allocation_col = submission['allocation']
    for _stat_label, _stat_fn in (
        ("Mean", allocation_col.mean),
        ("Std", allocation_col.std),
        ("Min", allocation_col.min),
        ("Max", allocation_col.max),
    ):
        print(f"  {_stat_label} allocation: {_stat_fn():.6f}")
    print(f"\nüîç First 5 predictions:")
    print(submission.head())
    print(_RUN_RULE)

else:
    # ========== COMPETITION MODE: Start Inference Server ==========
    print("\n" + _RUN_RULE)
    print("üöÄ COMPETITION MODE - STARTING INFERENCE SERVER")
    print(_RUN_RULE)
    print(f"CV Strategy: {CV_STRATEGY.upper()}")
    print(f"OOF Score: {oof_score['score']:.6f}")
    print(f"Models: {len(return_models)} folds")
    print(_RUN_RULE)

    import kaggle_evaluation.default_inference_server

    # Hand predict() to the evaluation server and serve requests.
    inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)
    inference_server.serve()

    print("\n‚úÖ Inference server completed!")