In [1]:
"""
Ïù¥ÎçîÎ¶¨ÏõÄ Ìä∏Î†àÏù¥Îî© Î∞±ÌÖåÏä§ÌåÖ ÏãúÏä§ÌÖú (Data Leakage Î∞©ÏßÄ Î≤ÑÏ†Ñ)
29Í∞ú Î™®Îç∏ √ó 7Í∞ú Fold √ó 8Í∞ú Ìè¨ÏßÄÏÖò Ï†ÑÎûµ √ó 3Í∞ÄÏßÄ Í±∞Îûò Î∞©Ïãù
"""

import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass, field
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

UPSTAGE_API_KEY = "up_VVMdYIYXa3F6qY8ns8fO8kRGNSeWQ"

# ==================== ÏÑ§Ï†ï ====================
@dataclass
class Config:
    base_path: Path = Path("../model_results/2025-10-26")
    raw_data_path: Path = Path("../macro_data/macro_data/macro_crypto_data.csv") 
    output_path: Path = None
    
    models: List[str] = field(default_factory=lambda: 
                              ['RandomForest', 'LightGBM', 'XGBoost', 'SVM', 'LogisticRegression', 'NaiveBayes', 
                               'KNN', 'AdaBoost', 'CatBoost', 'DecisionTree', 'ExtraTrees', 'Bagging', 
                               'GradientBoosting', 'HistGradientBoosting', 'StackingEnsemble', 'VotingHard', 
                               'VotingSoft', 'MLP', 'LSTM', 'BiLSTM', 'GRU', 'TCN', 'CNN_LSTM', 
                               'LSTM_Attention', 'DTW_LSTM', 'VMD_Hybrid', 'EMD_LSTM', 'Hybrid_LSTM_GRU', 
                               'Residual_LSTM'])
    
    folds: List[int] = field(default_factory=lambda: list(range(1, 8)))
    
    position_strategies: Dict[str, float] = field(default_factory=lambda: {
        'fixed_01': 0.01, 'fixed_05': 0.05, 'fixed_10': 0.10,
        'fixed_30': 0.30, 'fixed_50': 0.50, 'fixed_80': 0.80,
        'kelly': None, 'confidence_based': None
    })
    
    initial_capital: float = 1_000_000  # 100ÎßåÏõê
    trading_cost: float = 0.004  # 0.4%
    
    # Swing Trading ÌååÎùºÎØ∏ÌÑ∞
    swing_trend_window: int = 5
    swing_entry_threshold: float = 0.35
    swing_exit_threshold: float = 0.15
    swing_min_confidence: float = 0.3
    swing_stop_loss: float = -0.03
    swing_take_profit: float = 0.15
    swing_max_holding: int = 20
    
    # Kelly Criterion ÏµúÏÜå Í±∞Îûò Ïàò
    kelly_min_trades: int = 20
    
    def __post_init__(self):
        if self.output_path is None:
            self.output_path = self.base_path / "backtest_results_comprehensive"
        self.output_path.mkdir(parents=True, exist_ok=True)
        (self.output_path / "detailed_trades").mkdir(exist_ok=True)

# ==================== Í∞úÏÑ†Îêú Îç∞Ïù¥ÌÑ∞ Î°úÎçî ====================
class ImprovedDataLoader:
    """Î∞±ÌÖåÏä§ÌåÖÏö© Îç∞Ïù¥ÌÑ∞ Î°úÎçî - ÏõêÎ≥∏ Í∞ÄÍ≤© Î≥ëÌï©"""
    
    def __init__(self, config: Config):
        self.config = config
        
        # ‚úÖ ÏõêÎ≥∏ Îç∞Ïù¥ÌÑ∞ Î°úÎìú (Ìïú Î≤àÎßå)
        print(f"Loading raw price data from: {config.raw_data_path}")
        self.raw_prices = pd.read_csv(config.raw_data_path)
        self.raw_prices['date'] = pd.to_datetime(self.raw_prices['date'])
        
        # Î∞±ÌÖåÏä§ÌåÖÏóê ÌïÑÏöîÌïú Ïª¨ÎüºÎßå ÏÑ†ÌÉù
        price_cols = ['date', 'ETH_Open', 'ETH_High', 'ETH_Low', 'ETH_Close', 'ETH_Volume']
        available_cols = [col for col in price_cols if col in self.raw_prices.columns]
        
        if len(available_cols) < 5:
            raise ValueError(f"ÏõêÎ≥∏ Îç∞Ïù¥ÌÑ∞Ïóê ÌïÑÏàò Ïª¨Îüº ÎàÑÎùΩ: {set(price_cols) - set(available_cols)}")
        
        self.raw_prices = self.raw_prices[available_cols].copy()
        print(f"‚úì Loaded {len(self.raw_prices)} rows of price data")
    
    def load_fold_data(self, fold: int) -> Tuple[pd.DataFrame, Dict[str, pd.DataFrame]]:
        """FoldÎ≥Ñ ÏãúÏû• Îç∞Ïù¥ÌÑ∞ÏôÄ Î™®Îç∏ ÏòàÏ∏° Î°úÎìú + ÏõêÎ≥∏ Í∞ÄÍ≤© Î≥ëÌï©"""
        fold_name = f"fold_{fold}_walk_forward_rolling_reverse"
        
        # ===== 1. ÏòàÏ∏°/ÌÉÄÍ≤ü Îç∞Ïù¥ÌÑ∞ Î°úÎìú =====
        market_path = self.config.base_path / "raw_data/direction/walk_forward" / fold_name / "test_raw.csv"
        
        if not market_path.exists():
            raise FileNotFoundError(f"Market data not found: {market_path}")
        
        market_df = pd.read_csv(market_path)
        market_df['date'] = pd.to_datetime(market_df['date'])
        
        # Target Ïª¨Îüº
        target_cols = ['date', 'next_open', 'next_close', 'next_direction', 'next_log_return']
        available_targets = [col for col in target_cols if col in market_df.columns]
        market_df = market_df[available_targets].copy()
        
        # ===== 2. ÏõêÎ≥∏ Í∞ÄÍ≤© Îç∞Ïù¥ÌÑ∞ Î≥ëÌï© =====
        market_df = market_df.merge(
            self.raw_prices, 
            on='date', 
            how='left'
        )
        
        # ===== 3. Í∞ÄÍ≤© Ï†ïÎ†¨ Í≤ÄÏ¶ù =====
        if 'next_open' in market_df.columns and 'ETH_Open' in market_df.columns:
            # next_openÏùÄ shift(-1)Ïù¥ÎØÄÎ°ú, ETH_OpenÏùò Îã§Ïùå ÎÇ†Í≥º ÏùºÏπòÌï¥Ïïº Ìï®
            market_df['_check'] = market_df['next_open'] - market_df['ETH_Open'].shift(-1)
            misalignment = market_df['_check'].abs().max()
            
            if pd.notna(misalignment) and misalignment > 0.01:
                print(f"  ‚ö†Ô∏è  Fold {fold}: Í∞ÄÍ≤© Ï†ïÎ†¨ Î∂àÏùºÏπò (max diff: {misalignment:.2f})")
            
            market_df.drop('_check', axis=1, inplace=True)
        
        # ===== 4. NaN Ï≤¥ÌÅ¨ =====
        price_nan = market_df[['ETH_Open', 'ETH_Close', 'next_open', 'next_close']].isna().sum()
        if price_nan.any():
            print(f"  ‚ö†Ô∏è  Fold {fold} NaN Î∞úÍ≤¨: {price_nan[price_nan > 0].to_dict()}")
        
        # ===== 5. Î™®Îç∏ ÏòàÏ∏° Î°úÎìú =====
        predictions = {}
        pred_base = self.config.base_path / "fold_results/direction" / fold_name
        
        for model in self.config.models:
            pred_path = pred_base / f"{model}_predictions.csv"
            if pred_path.exists():
                pred_df = pd.read_csv(pred_path)
                pred_df['date'] = pd.to_datetime(pred_df['date'])
                predictions[model] = pred_df
        
        return market_df, predictions
    
    def calculate_market_regime(self, market_df: pd.DataFrame) -> str:
        """ÏãúÏû• Ï≤¥Ï†ú Î∂ÑÎ•ò"""
        if 'next_close' not in market_df.columns or 'next_open' not in market_df.columns:
            return 'UNKNOWN'
        
        total_return = (market_df['next_close'].iloc[-1] / market_df['next_open'].iloc[0] - 1) * 100
        
        if total_return >= 5:
            return 'BULL'
        elif total_return <= -5:
            return 'BEAR'
        else:
            return 'SIDEWAYS'

# ==================== Ìè¨ÏßÄÏÖò Í¥ÄÎ¶¨ ====================
class PositionManager:
    def __init__(self, config: Config):
        self.config = config
        self.trade_history = []
    
    def calculate_position_size(self, strategy: str, capital: float, 
                               confidence: float = 0, trades: List = None) -> float:
        """Ìè¨ÏßÄÏÖò ÌÅ¨Í∏∞ Í≥ÑÏÇ∞"""
        if strategy.startswith('fixed_'):
            ratio = self.config.position_strategies[strategy]
            return capital * ratio
        
        elif strategy == 'confidence_based':
            return capital * confidence * 0.3
        
        elif strategy == 'kelly':
            if trades is None or len(trades) < self.config.kelly_min_trades:
                return capital * 0.10  # Í∏∞Î≥∏Í∞í
            
            wins = [t for t in trades if t['pnl'] > 0]
            losses = [t for t in trades if t['pnl'] < 0]
            
            if not wins or not losses:
                return capital * 0.10
            
            win_rate = len(wins) / len(trades)
            avg_win = np.mean([t['pnl_pct'] for t in wins])
            avg_loss = abs(np.mean([t['pnl_pct'] for t in losses]))
            
            kelly_pct = (win_rate * avg_win - (1 - win_rate) * avg_loss) / avg_win
            kelly_pct = np.clip(kelly_pct, 0.01, 0.50)  # 1-50% Ï†úÌïú
            
            return capital * kelly_pct
        
        return capital * 0.10

# ==================== Day Trading Î∞±ÌÖåÏä§ÌÑ∞ ====================
class DayTradingBacktester:
    def __init__(self, config: Config):
        self.config = config
        self.position_manager = PositionManager(config)
    
    def backtest(self, market_df: pd.DataFrame, pred_df: pd.DataFrame, 
                 strategy: str) -> Dict:
        """
        Day Trading Î∞±ÌÖåÏä§ÌåÖ
        
        market_df Íµ¨Ï°∞:
        - date: ÏòàÏ∏°Ïùº (T)
        - ETH_Open, ETH_Close: TÏùº Ïã§Ï†ú Í∞ÄÍ≤© (ÌòÑÏû¨Í∞Ä)
        - next_open: T+1Ïùº ÏãúÍ∞Ä (ÏßÑÏûÖ)
        - next_close: T+1Ïùº Ï¢ÖÍ∞Ä (Ï≤≠ÏÇ∞)
        """
        capital = self.config.initial_capital
        trades = []
        daily_capital = []
        
        # Î≥ëÌï©
        merged_df = pred_df.merge(market_df, on='date', how='left')
        
        for idx, row in merged_df.iterrows():
            pred_direction = row['pred_direction']
            confidence = row.get('confidence', 0.5)
            
            # ===== Í∞ÄÍ≤© Í≤ÄÏ¶ù =====
            if pd.isna(row.get('next_open')) or pd.isna(row.get('next_close')):
                daily_capital.append(capital)
                continue
            
            if pred_direction == 1:  # Îß§Ïàò Ïã†Ìò∏
                position_size = self.position_manager.calculate_position_size(
                    strategy, capital, confidence, trades
                )
                
                # TÏùº Ï¢ÖÍ∞ÄÏóê ÏòàÏ∏° ‚Üí T+1Ïùº ÏãúÍ∞ÄÏóê ÏßÑÏûÖ
                entry_price = row['next_open']
                exit_price = row['next_close']
                
                # Í±∞Îûò ÎπÑÏö©
                trading_cost = position_size * self.config.trading_cost * 2
                
                # ÏÜêÏùµ
                gross_pnl = position_size * (exit_price / entry_price - 1)
                net_pnl = gross_pnl - trading_cost
                
                capital += net_pnl
                
                trades.append({
                    'date': row['date'],
                    'type': 'day_trade',
                    'entry_price': entry_price,
                    'exit_price': exit_price,
                    'position_size': position_size,
                    'pnl': net_pnl,
                    'pnl_pct': net_pnl / position_size,
                    'confidence': confidence
                })
            
            daily_capital.append(capital)
        
        return self._calculate_metrics(trades, daily_capital, market_df)
    
    def _calculate_metrics(self, trades: List, daily_capital: List, 
                          market_df: pd.DataFrame) -> Dict:
        """ÏÑ±Í≥º ÏßÄÌëú Í≥ÑÏÇ∞"""
        if not trades:
            return self._empty_metrics()
        
        final_capital = daily_capital[-1]
        total_return = (final_capital / self.config.initial_capital - 1) * 100
        
        # Í±∞Îûò Î∂ÑÏÑù
        winning_trades = [t for t in trades if t['pnl'] > 0]
        losing_trades = [t for t in trades if t['pnl'] < 0]
        
        win_rate = len(winning_trades) / len(trades) if trades else 0
        
        total_wins = sum(t['pnl'] for t in winning_trades)
        total_losses = abs(sum(t['pnl'] for t in losing_trades))
        profit_factor = total_wins / total_losses if total_losses > 0 else 0
        
        # ÏµúÎåÄ ÎÇôÌè≠
        peak = self.config.initial_capital
        max_dd = 0
        for capital in daily_capital:
            if capital > peak:
                peak = capital
            dd = (peak - capital) / peak
            if dd > max_dd:
                max_dd = dd
        
        # Sharpe/Sortino
        returns = pd.Series(daily_capital).pct_change().dropna()
        sharpe = returns.mean() / returns.std() * np.sqrt(252) if len(returns) > 0 and returns.std() > 0 else 0
        
        negative_returns = returns[returns < 0]
        sortino = returns.mean() / negative_returns.std() * np.sqrt(252) if len(negative_returns) > 0 and negative_returns.std() > 0 else 0
        
        # Ïó∞Í∞Ñ ÏàòÏùµÎ•†
        days = len(daily_capital)
        annual_return = ((final_capital / self.config.initial_capital) ** (252 / days) - 1) * 100 if days > 0 else 0
        
        return {
            'total_return_pct': total_return,
            'annual_return_pct': annual_return,
            'num_trades': len(trades),
            'win_rate': win_rate,
            'profit_factor': profit_factor,
            'max_drawdown': max_dd,
            'sharpe_ratio': sharpe,
            'sortino_ratio': sortino,
            'final_capital': final_capital,
            'trades': trades
        }
    
    def _empty_metrics(self) -> Dict:
        return {
            'total_return_pct': 0, 'annual_return_pct': 0,
            'num_trades': 0, 'win_rate': 0, 'profit_factor': 0,
            'max_drawdown': 0, 'sharpe_ratio': 0, 'sortino_ratio': 0,
            'final_capital': self.config.initial_capital, 'trades': []
        }

# ==================== Swing Trading Î∞±ÌÖåÏä§ÌÑ∞ ====================
class SwingTradingBacktester:
    def __init__(self, config: Config):
        self.config = config
        self.position_manager = PositionManager(config)
    
    def backtest(self, market_df: pd.DataFrame, pred_df: pd.DataFrame, 
                 strategy: str) -> Dict:
        """
        Swing Trading Î∞±ÌÖåÏä§ÌåÖ
        
        ÏßÑÏûÖ: T+1Ïùº ÏãúÍ∞Ä (next_open)
        Ï≤≠ÏÇ∞: Ï°∞Í±¥ Ï∂©Ï°± Ïãú ÎãπÏùº Ï¢ÖÍ∞Ä (ETH_Close)
        """
        capital = self.config.initial_capital
        trades = []
        daily_capital = []
        
        in_position = False
        entry_price = 0
        entry_date = None
        position_size = 0
        holding_days = 0
        entry_cost = 0
        
        merged_df = pred_df.merge(market_df, on='date', how='left')
        
        for idx in range(len(merged_df)):
            row = merged_df.iloc[idx]
            
            if not in_position:
                # ===== ÏßÑÏûÖ Ï°∞Í±¥ =====
                trend_strength = self._calculate_trend_strength(merged_df, idx)
                
                if (trend_strength > self.config.swing_entry_threshold and
                    row['pred_direction'] == 1 and
                    row.get('confidence', 0) > self.config.swing_min_confidence and
                    pd.notna(row.get('next_open'))):
                    
                    position_size = self.position_manager.calculate_position_size(
                        strategy, capital, row.get('confidence', 0.5), trades
                    )
                    
                    # ÏßÑÏûÖ: T+1Ïùº ÏãúÍ∞Ä
                    entry_price = row['next_open']
                    entry_date = row['date']
                    entry_cost = position_size * self.config.trading_cost
                    
                    in_position = True
                    holding_days = 0
            
            else:
                # ===== Ï≤≠ÏÇ∞ Ï°∞Í±¥ =====
                holding_days += 1
                
                # ‚úÖ ÌòÑÏû¨Í∞Ä: ÎãπÏùº Ï¢ÖÍ∞Ä (ETH_Close)
                current_price = row.get('ETH_Close', entry_price)
                if pd.isna(current_price):
                    current_price = entry_price
                
                current_return = (current_price / entry_price - 1)
                trend_strength = self._calculate_trend_strength(merged_df, idx)
                
                should_exit = (
                    current_return <= self.config.swing_stop_loss or
                    current_return >= self.config.swing_take_profit or
                    trend_strength < self.config.swing_exit_threshold or
                    row['pred_direction'] == 0 or
                    holding_days >= self.config.swing_max_holding or
                    idx == len(merged_df) - 1
                )
                
                if should_exit:
                    exit_price = current_price
                    exit_cost = position_size * self.config.trading_cost
                    
                    gross_pnl = position_size * (exit_price / entry_price - 1)
                    net_pnl = gross_pnl - entry_cost - exit_cost
                    
                    capital += net_pnl
                    
                    trades.append({
                        'entry_date': entry_date,
                        'exit_date': row['date'],
                        'type': 'swing_trade',
                        'entry_price': entry_price,
                        'exit_price': exit_price,
                        'position_size': position_size,
                        'holding_days': holding_days,
                        'pnl': net_pnl,
                        'pnl_pct': net_pnl / position_size,
                        'exit_reason': self._get_exit_reason(
                            current_return, trend_strength, 
                            row['pred_direction'], holding_days, 
                            idx, len(merged_df)
                        )
                    })
                    
                    in_position = False
            
            daily_capital.append(capital)
        
        return self._calculate_metrics(trades, daily_capital, market_df)
    
    def _calculate_trend_strength(self, df: pd.DataFrame, idx: int) -> float:
        """ÏµúÍ∑º NÏùº Ìä∏Î†åÎìú Í∞ïÎèÑ Í≥ÑÏÇ∞"""
        start_idx = max(0, idx - self.config.swing_trend_window + 1)
        window_df = df.iloc[start_idx:idx+1]
        
        if len(window_df) == 0:
            return 0
        
        up_ratio = window_df['pred_direction'].sum() / len(window_df)
        avg_confidence = window_df.get('confidence', pd.Series([0.5]*len(window_df))).mean()
        
        return up_ratio * avg_confidence
    
    def _get_exit_reason(self, return_pct: float, trend: float, 
                        pred: int, days: int, idx: int, total_len: int) -> str:
        """Ï≤≠ÏÇ∞ ÏÇ¨Ïú†"""
        if idx == total_len - 1:
            return 'end_of_period'
        if return_pct <= self.config.swing_stop_loss:
            return 'stop_loss'
        elif return_pct >= self.config.swing_take_profit:
            return 'take_profit'
        elif trend < self.config.swing_exit_threshold:
            return 'trend_weakening'
        elif pred == 0:
            return 'reversal_signal'
        elif days >= self.config.swing_max_holding:
            return 'max_holding'
        return 'unknown'
    
    def _calculate_metrics(self, trades: List, daily_capital: List, 
                          market_df: pd.DataFrame) -> Dict:
        """ÏÑ±Í≥º ÏßÄÌëú"""
        backtester = DayTradingBacktester(self.config)
        return backtester._calculate_metrics(trades, daily_capital, market_df)

# ==================== Buy & Hold Î∞±ÌÖåÏä§ÌÑ∞ ====================
class BuyHoldBacktester:
    def __init__(self, config: Config):
        self.config = config
    
    def backtest(self, market_df: pd.DataFrame) -> Dict:
        """Buy & Hold Î∞±ÌÖåÏä§ÌåÖ"""
        if 'next_open' not in market_df.columns or 'next_close' not in market_df.columns:
            return {
                'total_return_pct': 0, 'annual_return_pct': 0,
                'num_trades': 0, 'win_rate': 0, 'profit_factor': 0,
                'max_drawdown': 0, 'sharpe_ratio': 0, 'sortino_ratio': 0,
                'final_capital': self.config.initial_capital, 'trades': []
            }
        
        entry_price = market_df['next_open'].iloc[0]
        exit_price = market_df['next_close'].iloc[-1]
        
        capital = self.config.initial_capital
        position_size = capital
        
        # Îß§Ïàò/Îß§ÎèÑ ÎπÑÏö©
        entry_cost = position_size * self.config.trading_cost
        exit_cost = position_size * self.config.trading_cost
        
        gross_pnl = position_size * (exit_price / entry_price - 1)
        net_pnl = gross_pnl - entry_cost - exit_cost
        
        final_capital = capital + net_pnl
        total_return = (final_capital / capital - 1) * 100
        
        days = len(market_df)
        annual_return = ((final_capital / capital) ** (252 / days) - 1) * 100 if days > 0 else 0
        
        return {
            'total_return_pct': total_return,
            'annual_return_pct': annual_return,
            'num_trades': 1,
            'win_rate': 1 if net_pnl > 0 else 0,
            'profit_factor': 0,
            'max_drawdown': 0,
            'sharpe_ratio': 0,
            'sortino_ratio': 0,
            'final_capital': final_capital,
            'trades': [{
                'entry_date': market_df['date'].iloc[0],
                'exit_date': market_df['date'].iloc[-1],
                'entry_price': entry_price,
                'exit_price': exit_price,
                'pnl': net_pnl
            }]
        }

# ==================== Î©îÏù∏ Î∞±ÌÖåÏä§ÌåÖ ÏóîÏßÑ ====================
class BacktestEngine:
    def __init__(self, config: Config):
        self.config = config
        self.loader = ImprovedDataLoader(config)  # ‚úÖ Í∞úÏÑ†Îêú Î°úÎçî ÏÇ¨Ïö©
        self.day_backtester = DayTradingBacktester(config)
        self.swing_backtester = SwingTradingBacktester(config)
        self.buyhold_backtester = BuyHoldBacktester(config)
        
        self.results = defaultdict(list)
        self.buyhold_results = {}
        self.market_regimes = {}
    
    def run_all_backtests(self):
        """Ï†ÑÏ≤¥ Î∞±ÌÖåÏä§ÌåÖ Ïã§Ìñâ"""
        print("=" * 80)
        print("Ïù¥ÎçîÎ¶¨ÏõÄ Ìä∏Î†àÏù¥Îî© Î∞±ÌÖåÏä§ÌåÖ ÏãúÏä§ÌÖú (Data Leakage Î∞©ÏßÄ)")
        print("=" * 80)
        
        total_backtests = len(self.config.folds) * len(self.config.models) * \
                         len(self.config.position_strategies) * 2  # Day + Swing
        
        print(f"\nÏ≤òÎ¶¨ Ï§ë: {len(self.config.folds)} Folds √ó {len(self.config.models)} Models √ó "
              f"{len(self.config.position_strategies)} Strategies √ó 2 Methods = {total_backtests} backtests\n")
        
        # FoldÎ≥Ñ Ï≤òÎ¶¨
        for fold in self.config.folds:
            print(f"Processing Fold {fold}...")
            
            try:
                market_df, predictions = self.loader.load_fold_data(fold)
                
                # ÏãúÏû• Ï≤¥Ï†ú Î∂ÑÎ•ò
                regime = self.loader.calculate_market_regime(market_df)
                self.market_regimes[fold] = regime
                
                # Buy & Hold Î∞±ÌÖåÏä§ÌåÖ
                buyhold_result = self.buyhold_backtester.backtest(market_df)
                self.buyhold_results[fold] = buyhold_result
                
                # Î™®Îç∏Î≥Ñ Î∞±ÌÖåÏä§ÌåÖ
                for model in self.config.models:
                    if model not in predictions:
                        continue
                    
                    pred_df = predictions[model]
                    
                    for strategy in self.config.position_strategies.keys():
                        # Day Trading
                        try:
                            day_result = self.day_backtester.backtest(market_df, pred_df, strategy)
                            self._store_result('day_trading', fold, model, strategy, 
                                              day_result, buyhold_result, regime)
                        except Exception as e:
                            print(f"  ‚ö†Ô∏è  {model} Day Trading Ïò§Î•ò: {str(e)[:50]}")
                        
                        # Swing Trading
                        try:
                            swing_result = self.swing_backtester.backtest(market_df, pred_df, strategy)
                            self._store_result('swing_trading', fold, model, strategy, 
                                              swing_result, buyhold_result, regime)
                        except Exception as e:
                            print(f"  ‚ö†Ô∏è  {model} Swing Trading Ïò§Î•ò: {str(e)[:50]}")
            
            except Exception as e:
                print(f"  ‚ùå Fold {fold} Ï≤òÎ¶¨ Ïã§Ìå®: {str(e)}")
                continue
        
        self._save_results()
        self._print_summary()
    
    def _store_result(self, method: str, fold: int, model: str, strategy: str,
                     result: Dict, buyhold_result: Dict, regime: str):
        """Í≤∞Í≥º Ï†ÄÏû•"""
        alpha = result['total_return_pct'] - buyhold_result['total_return_pct']
        
        self.results[method].append({
            'fold': fold,
            'model': model,
            'strategy': strategy,
            'regime': regime,
            'total_return_pct': result['total_return_pct'],
            'annual_return_pct': result['annual_return_pct'],
            'alpha': alpha,
            'num_trades': result['num_trades'],
            'win_rate': result['win_rate'],
            'profit_factor': result['profit_factor'],
            'max_drawdown': result['max_drawdown'],
            'sharpe_ratio': result['sharpe_ratio'],
            'sortino_ratio': result['sortino_ratio'],
            'final_capital': result['final_capital'],
            'buyhold_return': buyhold_result['total_return_pct']
        })
    
    def _save_results(self):
        """Í≤∞Í≥º ÌååÏùº Ï†ÄÏû•"""
        print("\nÍ≤∞Í≥º Ï†ÄÏû• Ï§ë...")
        
        # 1. Day Trading Results
        if self.results['day_trading']:
            day_df = pd.DataFrame(self.results['day_trading'])
            day_df.to_csv(self.config.output_path / "day_trading_results.csv", index=False)
        
        # 2. Swing Trading Results
        if self.results['swing_trading']:
            swing_df = pd.DataFrame(self.results['swing_trading'])
            swing_df.to_csv(self.config.output_path / "swing_trading_results.csv", index=False)
        
        # 3. Buy & Hold Benchmark
        if self.buyhold_results:
            buyhold_list = []
            for fold, result in self.buyhold_results.items():
                buyhold_list.append({
                    'fold': fold,
                    'regime': self.market_regimes.get(fold, 'UNKNOWN'),
                    'total_return_pct': result['total_return_pct'],
                    'annual_return_pct': result['annual_return_pct']
                })
            buyhold_df = pd.DataFrame(buyhold_list)
            buyhold_df.to_csv(self.config.output_path / "buy_hold_benchmark.csv", index=False)
        
        # 4. Comparison Summary
        if self.results['day_trading'] or self.results['swing_trading']:
            summary_list = []
            
            day_df = pd.DataFrame(self.results['day_trading']) if self.results['day_trading'] else pd.DataFrame()
            swing_df = pd.DataFrame(self.results['swing_trading']) if self.results['swing_trading'] else pd.DataFrame()
            
            if not day_df.empty:
                day_df['method'] = 'day_trading'
            if not swing_df.empty:
                swing_df['method'] = 'swing_trading'
            
            all_df = pd.concat([day_df, swing_df], ignore_index=True)
            
            if not all_df.empty:
                for (model, strategy, method), group in all_df.groupby(['model', 'strategy', 'method']):
                    bull_alpha = group[group['regime'] == 'BULL']['alpha'].mean() if 'BULL' in group['regime'].values else 0
                    bear_alpha = group[group['regime'] == 'BEAR']['alpha'].mean() if 'BEAR' in group['regime'].values else 0
                    profitable_folds = (group['alpha'] > 0).sum()
                    
                    summary_list.append({
                        'model': model,
                        'strategy': strategy,
                        'method': method,
                        'avg_alpha': group['alpha'].mean(),
                        'avg_return': group['total_return_pct'].mean(),
                        'bull_alpha': bull_alpha,
                        'bear_alpha': bear_alpha,
                        'profitable_folds': profitable_folds,
                        'avg_win_rate': group['win_rate'].mean(),
                        'avg_sharpe': group['sharpe_ratio'].mean()
                    })
                
                summary_df = pd.DataFrame(summary_list)
                summary_df.to_csv(self.config.output_path / "comparison_summary.csv", index=False)
                
                # 5. Top 20 Models
                top20 = summary_df.nlargest(20, 'avg_alpha')
                top20.to_csv(self.config.output_path / "top_20_models.csv", index=False)
        
        print(f"‚úì Í≤∞Í≥º Ï†ÄÏû• ÏôÑÎ£å: {self.config.output_path}")
    
    def _print_summary(self):
        """ÏöîÏïΩ Ï∂úÎ†•"""
        print("\n" + "=" * 80)
        print("Î∞±ÌÖåÏä§ÌåÖ ÏôÑÎ£å - ÏöîÏïΩ")
        print("=" * 80)
        
        # ÏãúÏû• Ï≤¥Ï†ú
        if self.market_regimes:
            regimes = pd.Series(self.market_regimes.values()).value_counts()
            print(f"\nÏãúÏû• Ï≤¥Ï†ú: BULL {regimes.get('BULL', 0)}Í∞ú, BEAR {regimes.get('BEAR', 0)}Í∞ú, "
                  f"SIDEWAYS {regimes.get('SIDEWAYS', 0)}Í∞ú")
        
        # TOP 20
        summary_path = self.config.output_path / "comparison_summary.csv"
        if summary_path.exists():
            summary_df = pd.read_csv(summary_path)
            top20 = summary_df.nlargest(20, 'avg_alpha')
            
            print("\n" + "-" * 80)
            print("TOP 20 MODELS BY ALPHA")
            print("-" * 80)
            
            for idx, row in top20.head(10).iterrows():
                print(f"[Rank {idx+1}] {row['model']} + {row['strategy']} + {row['method']}")
                print(f"  Avg Alpha: {row['avg_alpha']:+.1f}%, Profitable: {int(row['profitable_folds'])}/7, "
                      f"Bull Alpha: {row['bull_alpha']:+.1f}%, Bear Alpha: {row['bear_alpha']:+.1f}%")
            
            # Î∞©Î≤ïÎ°† ÎπÑÍµê
            print("\n" + "-" * 80)
            print("METHOD COMPARISON")
            print("-" * 80)
            
            day_results = summary_df[summary_df['method'] == 'day_trading']
            swing_results = summary_df[summary_df['method'] == 'swing_trading']
            
            if not day_results.empty and not swing_results.empty:
                day_avg_return = day_results['avg_return'].mean()
                day_avg_alpha = day_results['avg_alpha'].mean()
                swing_avg_return = swing_results['avg_return'].mean()
                swing_avg_alpha = swing_results['avg_alpha'].mean()
                
                if self.buyhold_results:
                    buyhold_avg = pd.DataFrame([r for r in self.buyhold_results.values()])['total_return_pct'].mean()
                    print(f"Buy & Hold: {buyhold_avg:+.1f}% (baseline)")
                
                print(f"Day Trading: {day_avg_return:+.1f}% (Alpha {day_avg_alpha:+.1f}%)")
                print(f"Swing Trading: {swing_avg_return:+.1f}% (Alpha {swing_avg_alpha:+.1f}%)")
                
                best_method = 'Swing Trading' if swing_avg_alpha > day_avg_alpha else 'Day Trading'
                best_alpha = max(swing_avg_alpha, day_avg_alpha)
                print(f"\n‚Üí {best_method}Ïù¥ {best_alpha:+.1f}% Ïö∞Ïàò")
        
        print("\n" + "=" * 80)

# ==================== Í∞ïÌôîÌïôÏäµ Í∏∞Î∞ò ÎèôÏ†Å Ìè¨ÏßÄÏÖò Í¥ÄÎ¶¨ ====================
class RLPositionManager:
    """Q-Learning Í∏∞Î∞ò Ìè¨ÏßÄÏÖò ÌÅ¨Í∏∞ Í≤∞Ï†ï"""
    def __init__(self, config: Config):
        self.config = config
        self.q_table = {}
        self.epsilon = 0.1
        self.alpha = 0.1
        self.gamma = 0.95
        self.actions = [0.1, 0.3, 0.5, 0.7]
    
    def get_state(self, confidence: float, recent_performance: float, 
                  volatility: float) -> tuple:
        """ÏÉÅÌÉú Ïù¥ÏÇ∞Ìôî"""
        conf_bin = int(confidence * 10) / 10
        perf_bin = 'good' if recent_performance > 0.02 else 'bad' if recent_performance < -0.02 else 'neutral'
        vol_bin = 'high' if volatility > 0.03 else 'low'
        return (conf_bin, perf_bin, vol_bin)
    
    def select_action(self, state: tuple, training: bool = True) -> float:
        """ÌñâÎèô ÏÑ†ÌÉù (Œµ-greedy)"""
        if state not in self.q_table:
            self.q_table[state] = {a: 0.0 for a in self.actions}
        
        if training and np.random.random() < self.epsilon:
            return np.random.choice(self.actions)
        
        return max(self.q_table[state].items(), key=lambda x: x[1])[0]
    
    def update_q_value(self, state: tuple, action: float, reward: float, 
                       next_state: tuple):
        """Q-ÌÖåÏù¥Î∏î ÏóÖÎç∞Ïù¥Ìä∏"""
        if state not in self.q_table:
            self.q_table[state] = {a: 0.0 for a in self.actions}
        if next_state not in self.q_table:
            self.q_table[next_state] = {a: 0.0 for a in self.actions}
        
        current_q = self.q_table[state][action]
        max_next_q = max(self.q_table[next_state].values())
        
        new_q = current_q + self.alpha * (reward + self.gamma * max_next_q - current_q)
        self.q_table[state][action] = new_q

# ==================== ÎèôÏ†Å Ìè¨ÏßÄÏÖò ÌÅ¨Í∏∞ Ï°∞Ï†ï ====================
class DynamicPositionManager(PositionManager):
    """Î≥ÄÎèôÏÑ±Í≥º Ï∂îÏÑ∏ Í∏∞Î∞ò ÎèôÏ†Å Ìè¨ÏßÄÏÖò Ï°∞Ï†ï"""
    
    def calculate_dynamic_position(self, capital: float, confidence: float,
                                   recent_volatility: float, trend_strength: float,
                                   win_streak: int) -> float:
        """Îã§Ï§ë ÏöîÏÜå Í∏∞Î∞ò Ìè¨ÏßÄÏÖò ÌÅ¨Í∏∞"""
        base_size = 0.3
        
        confidence_factor = confidence * 0.5 + 0.5
        volatility_factor = max(0.5, 1.0 - recent_volatility * 10)
        trend_factor = 0.8 + trend_strength * 0.4
        streak_bonus = 1.0 + min(win_streak, 3) * 0.05
        
        final_size = base_size * confidence_factor * volatility_factor * trend_factor * streak_bonus
        final_size = np.clip(final_size, 0.05, 0.7)
        
        return capital * final_size

# ==================== ÌôïÏû•Îêú Î∞±ÌÖåÏä§ÌåÖ ÏóîÏßÑ ====================
class EnhancedBacktestEngine(BacktestEngine):
    """Í∞ïÌôîÌïôÏäµ Î∞è LLM ÌÜµÌï© Î∞±ÌÖåÏä§ÌåÖ"""
    
    def __init__(self, config: Config):
        super().__init__(config)
        self.rl_manager = RLPositionManager(config)
        self.dynamic_manager = DynamicPositionManager(config)
    
    def run_enhanced_backtests(self):
        """Í∞ïÌôîÌïôÏäµ/ÎèôÏ†Å Ìè¨ÏßÄÏÖò Î∞±ÌÖåÏä§ÌåÖ"""
        print("\n" + "=" * 80)
        print("Í≥†Í∏â Ï†ÑÎûµ Î∞±ÌÖåÏä§ÌåÖ (RL + Dynamic)")
        print("=" * 80)
        
        enhanced_results = []
        
        for fold in self.config.folds:
            try:
                market_df, predictions = self.loader.load_fold_data(fold)
                regime = self.market_regimes.get(fold, self.loader.calculate_market_regime(market_df))
                
                for model in self.config.models:
                    if model not in predictions:
                        continue
                    
                    pred_df = predictions[model]
                    
                    # RL Í∏∞Î∞ò
                    rl_result = self._backtest_with_rl(market_df, pred_df, fold)
                    enhanced_results.append({
                        'fold': fold, 'model': model, 'strategy': 'rl_position',
                        'method': 'swing_trading', 'regime': regime, **rl_result
                    })
                    
                    # ÎèôÏ†Å Ìè¨ÏßÄÏÖò
                    dynamic_result = self._backtest_with_dynamic(market_df, pred_df, fold)
                    enhanced_results.append({
                        'fold': fold, 'model': model, 'strategy': 'dynamic_position',
                        'method': 'day_trading', 'regime': regime, **dynamic_result
                    })
            
            except Exception as e:
                print(f"  ‚ö†Ô∏è  Fold {fold} Í≥†Í∏â Ï†ÑÎûµ Ïò§Î•ò: {str(e)[:50]}")
                continue
        
        if enhanced_results:
            enhanced_df = pd.DataFrame(enhanced_results)
            enhanced_df.to_csv(self.config.output_path / "enhanced_strategies.csv", index=False)
            print(f"‚úì Í≥†Í∏â Ï†ÑÎûµ Î∞±ÌÖåÏä§ÌåÖ ÏôÑÎ£å: {len(enhanced_results)} Í≤∞Í≥º")
    
    def _backtest_with_rl(self, market_df: pd.DataFrame, pred_df: pd.DataFrame,
                         fold: int) -> dict:
        """Í∞ïÌôîÌïôÏäµ Í∏∞Î∞ò Ìè¨ÏßÄÏÖò Í¥ÄÎ¶¨"""
        capital = self.config.initial_capital
        trades = []
        recent_returns = []
        
        merged_df = pred_df.merge(market_df, on='date')
        
        for idx, row in merged_df.iterrows():
            if row['pred_direction'] == 1 and pd.notna(row.get('next_open')) and pd.notna(row.get('next_close')):
                recent_perf = np.mean(recent_returns[-10:]) if recent_returns else 0
                volatility = np.std(recent_returns[-20:]) if len(recent_returns) >= 20 else 0.02
                
                state = self.rl_manager.get_state(row.get('confidence', 0.5), recent_perf, volatility)
                action = self.rl_manager.select_action(state, training=True)
                
                position_size = capital * action
                
                entry_price = row['next_open']
                exit_price = row['next_close']
                
                pnl = position_size * (exit_price / entry_price - 1) - \
                      position_size * self.config.trading_cost * 2
                
                capital += pnl
                return_pct = pnl / position_size
                recent_returns.append(return_pct)
                
                reward = return_pct * 100
                next_state = self.rl_manager.get_state(row.get('confidence', 0.5), return_pct, volatility)
                self.rl_manager.update_q_value(state, action, reward, next_state)
                
                trades.append({'pnl': pnl, 'pnl_pct': return_pct})
        
        return self._calculate_simple_metrics(trades, capital)
    
    def _backtest_with_dynamic(self, market_df: pd.DataFrame, 
                              pred_df: pd.DataFrame, fold: int) -> dict:
        """ÎèôÏ†Å Ìè¨ÏßÄÏÖò Í¥ÄÎ¶¨"""
        capital = self.config.initial_capital
        trades = []
        win_streak = 0
        recent_returns = []
        
        merged_df = pred_df.merge(market_df, on='date')
        
        for idx in range(len(merged_df)):
            row = merged_df.iloc[idx]
            
            if row['pred_direction'] == 1 and pd.notna(row.get('next_open')) and pd.notna(row.get('next_close')):
                volatility = np.std(recent_returns[-20:]) if len(recent_returns) >= 20 else 0.02
                
                start_idx = max(0, idx - 5)
                trend_strength = merged_df.iloc[start_idx:idx+1]['pred_direction'].mean()
                
                position_size = self.dynamic_manager.calculate_dynamic_position(
                    capital, row.get('confidence', 0.5), volatility, trend_strength, win_streak
                )
                
                entry_price = row['next_open']
                exit_price = row['next_close']
                
                pnl = position_size * (exit_price / entry_price - 1) - \
                      position_size * self.config.trading_cost * 2
                
                capital += pnl
                return_pct = pnl / position_size
                recent_returns.append(return_pct)
                
                if pnl > 0:
                    win_streak += 1
                else:
                    win_streak = 0
                
                trades.append({'pnl': pnl, 'pnl_pct': return_pct})
        
        return self._calculate_simple_metrics(trades, capital)
    
    def _calculate_simple_metrics(self, trades: List, final_capital: float) -> dict:
        """Í∞ÑÎã®Ìïú Î©îÌä∏Î¶≠"""
        if not trades:
            return {'total_return_pct': 0, 'alpha': 0, 'num_trades': 0, 'win_rate': 0}
        
        total_return = (final_capital / self.config.initial_capital - 1) * 100
        wins = [t for t in trades if t['pnl'] > 0]
        
        return {
            'total_return_pct': total_return,
            'alpha': total_return,
            'num_trades': len(trades),
            'win_rate': len(wins) / len(trades),
            'final_capital': final_capital
        }

# ==================== Ïã§Ìñâ ====================
if __name__ == "__main__":
    config = Config()
    
    # ===== Í∏∞Î≥∏ Î∞±ÌÖåÏä§ÌåÖ =====
    print("\nüöÄ Í∏∞Î≥∏ Î∞±ÌÖåÏä§ÌåÖ ÏãúÏûë")
    engine = BacktestEngine(config)
    engine.run_all_backtests()
    
    # ===== Í≥†Í∏â Î∞±ÌÖåÏä§ÌåÖ (ÏòµÏÖò) =====
    print("\nüöÄ Í≥†Í∏â Î∞±ÌÖåÏä§ÌåÖ ÏãúÏûë")
    enhanced_engine = EnhancedBacktestEngine(config)
    enhanced_engine.run_enhanced_backtests()
    
    print("\n‚úì Î™®Îì† Î∞±ÌÖåÏä§ÌåÖ ÏôÑÎ£å!")
    print(f"‚úì Í≤∞Í≥º ÌååÏùº ÏúÑÏπò: {config.output_path}")



üöÄ Í∏∞Î≥∏ Î∞±ÌÖåÏä§ÌåÖ ÏãúÏûë
Loading raw price data from: ../macro_data/macro_data/macro_crypto_data.csv
‚úì Loaded 3219 rows of price data
Ïù¥ÎçîÎ¶¨ÏõÄ Ìä∏Î†àÏù¥Îî© Î∞±ÌÖåÏä§ÌåÖ ÏãúÏä§ÌÖú (Data Leakage Î∞©ÏßÄ)

Ï≤òÎ¶¨ Ï§ë: 7 Folds √ó 29 Models √ó 8 Strategies √ó 2 Methods = 3248 backtests

Processing Fold 1...
  ‚ö†Ô∏è  SVM Day Trading Ïò§Î•ò: float division by zero
Processing Fold 2...
  ‚ö†Ô∏è  SVM Day Trading Ïò§Î•ò: float division by zero
Processing Fold 3...
  ‚ö†Ô∏è  SVM Day Trading Ïò§Î•ò: float division by zero
Processing Fold 4...


KeyboardInterrupt: 

In [4]:
"""
Îç∞Ïù¥ÌÑ∞ Í≤ÄÏ¶ù Ïä§ÌÅ¨Î¶ΩÌä∏ - Î∞±ÌÖåÏä§ÌåÖ Ï†Ñ Îç∞Ïù¥ÌÑ∞ Ïù¥ÏÉÅ ÌÉêÏßÄ
"""

import pandas as pd
import numpy as np
from pathlib import Path

# ÏÑ§Ï†ï
BASE_PATH = Path("../model_results/2025-10-26")
RAW_DATA_PATH = Path("../macro_data/macro_data/macro_crypto_data.csv")
FOLDS = list(range(1, 8))
MODELS_TO_CHECK = ['SVM', 'RandomForest', 'LightGBM', 'XGBoost']  

def load_raw_prices():
    """ÏõêÎ≥∏ Í∞ÄÍ≤© Îç∞Ïù¥ÌÑ∞ Î°úÎìú"""
    print("Loading raw price data...")
    raw_prices = pd.read_csv(RAW_DATA_PATH)
    raw_prices['date'] = pd.to_datetime(raw_prices['date'])
    
    price_cols = ['date', 'ETH_Open', 'ETH_High', 'ETH_Low', 'ETH_Close', 'ETH_Volume']
    available_cols = [col for col in price_cols if col in raw_prices.columns]
    raw_prices = raw_prices[available_cols].copy()
    
    print(f"Loaded {len(raw_prices)} rows")
    return raw_prices

def load_fold_data(fold, raw_prices):
    """FoldÎ≥Ñ Îç∞Ïù¥ÌÑ∞ Î°úÎìú"""
    fold_name = f"fold_{fold}_walk_forward_rolling_reverse"
    
    # ÏãúÏû• Îç∞Ïù¥ÌÑ∞
    market_path = BASE_PATH / "raw_data/direction/walk_forward" / fold_name / "test_raw.csv"
    
    if not market_path.exists():
        print(f"Market data not found: {market_path}")
        return None, {}
    
    market_df = pd.read_csv(market_path)
    market_df['date'] = pd.to_datetime(market_df['date'])
    
    # Target Ïª¨Îüº
    target_cols = ['date', 'next_open', 'next_close', 'next_direction', 'next_log_return']
    available_targets = [col for col in target_cols if col in market_df.columns]
    market_df = market_df[available_targets].copy()
    
    # ÏõêÎ≥∏ Í∞ÄÍ≤© Î≥ëÌï©
    market_df = market_df.merge(raw_prices, on='date', how='left')
    
    # Î™®Îç∏ ÏòàÏ∏° Î°úÎìú
    predictions = {}
    pred_base = BASE_PATH / "fold_results/direction" / fold_name
    
    for model in MODELS_TO_CHECK:
        pred_path = pred_base / f"{model}_predictions.csv"
        if pred_path.exists():
            pred_df = pd.read_csv(pred_path)
            pred_df['date'] = pd.to_datetime(pred_df['date'])
            predictions[model] = pred_df
    
    return market_df, predictions

def validate_data(market_df, pred_df, model_name, fold):
    """Îç∞Ïù¥ÌÑ∞ Í≤ÄÏ¶ù"""
    print(f"\n{'='*80}")
    print(f"Fold {fold} - {model_name} Îç∞Ïù¥ÌÑ∞ Í≤ÄÏ¶ù")
    print(f"{'='*80}")
    
    if market_df is None or market_df.empty:
        print("ERROR: Market data is empty or None")
        return
    
    if pred_df is None or pred_df.empty:
        print("ERROR: Prediction data is empty or None")
        return
    
    print(f"\n[Îç∞Ïù¥ÌÑ∞ ÌÅ¨Í∏∞]")
    print(f"Market rows: {len(market_df)}")
    print(f"Pred rows: {len(pred_df)}")
    
    # ÎÇ†Ïßú Î≤îÏúÑ
    print(f"\n[ÎÇ†Ïßú Î≤îÏúÑ]")
    print(f"Market: {market_df['date'].min()} ~ {market_df['date'].max()}")
    print(f"Pred: {pred_df['date'].min()} ~ {pred_df['date'].max()}")
    
    # Í∞ÄÍ≤© ÌÜµÍ≥Ñ
    print(f"\n[Í∞ÄÍ≤© ÌÜµÍ≥Ñ]")
    price_cols = ['next_open', 'next_close', 'ETH_Open', 'ETH_Close']
    available_price_cols = [col for col in price_cols if col in market_df.columns]
    
    if available_price_cols:
        print(market_df[available_price_cols].describe())
    else:
        print("WARNING: No price columns found")
    
    # ÏòàÏ∏° Î∂ÑÌè¨
    print(f"\n[ÏòàÏ∏° Î∂ÑÌè¨]")
    if 'pred_direction' in pred_df.columns:
        print(pred_df['pred_direction'].value_counts())
        print(f"Upward predictions: {(pred_df['pred_direction'] == 1).sum()}")
        print(f"Downward predictions: {(pred_df['pred_direction'] == 0).sum()}")
    else:
        print("WARNING: pred_direction column not found")
    
    # Confidence ÌÜµÍ≥Ñ
    if 'confidence' in pred_df.columns:
        print(f"\n[Confidence ÌÜµÍ≥Ñ]")
        print(pred_df['confidence'].describe())
    
    # NaN Í∞úÏàò
    print(f"\n[NaN Í∞úÏàò]")
    for col in available_price_cols:
        nan_count = market_df[col].isna().sum()
        print(f"{col}: {nan_count}")
    
    # 0 Ïù¥Ìïò Í∞ÄÍ≤©
    print(f"\n[Ïù¥ÏÉÅ Í∞ÄÍ≤© (0 Ïù¥Ìïò)]")
    for col in available_price_cols:
        zero_or_below = (market_df[col] <= 0).sum()
        if zero_or_below > 0:
            print(f"WARNING: {col} has {zero_or_below} values <= 0")
            print(f"  Min value: {market_df[col].min()}")
        else:
            print(f"{col}: OK")
    
    # Í∑πÎã®Í∞í
    print(f"\n[Í∑πÎã® Í∞ÄÍ≤© Î≥ÄÌôî]")
    if 'next_open' in market_df.columns and 'next_close' in market_df.columns:
        market_df_clean = market_df[['next_open', 'next_close']].dropna()
        if len(market_df_clean) > 0:
            returns = (market_df_clean['next_close'] / market_df_clean['next_open'] - 1)
            extreme_returns = returns[abs(returns) > 0.5]
            
            if len(extreme_returns) > 0:
                print(f"WARNING: {len(extreme_returns)} extreme returns (>50%)")
                print(f"  Max return: {returns.max():.2%}")
                print(f"  Min return: {returns.min():.2%}")
            else:
                print("OK: No extreme returns")
    
    # Î≥ëÌï© ÌõÑ Îç∞Ïù¥ÌÑ∞ ÏÜêÏã§ Ï≤¥ÌÅ¨
    print(f"\n[Î≥ëÌï© Í≤ÄÏ¶ù]")
    merged = pred_df.merge(market_df, on='date', how='left')
    merged_valid = merged[['next_open', 'next_close']].dropna()
    
    print(f"Original pred rows: {len(pred_df)}")
    print(f"After merge: {len(merged)}")
    print(f"Valid (no NaN): {len(merged_valid)}")
    print(f"Data loss: {len(pred_df) - len(merged_valid)} rows ({(len(pred_df) - len(merged_valid))/len(pred_df)*100:.1f}%)")

def main():
    """Î©îÏù∏ Ïã§Ìñâ"""
    print("="*80)
    print("Î∞±ÌÖåÏä§ÌåÖ Îç∞Ïù¥ÌÑ∞ Í≤ÄÏ¶ù Ïä§ÌÅ¨Î¶ΩÌä∏")
    print("="*80)
    
    # ÏõêÎ≥∏ Í∞ÄÍ≤© Î°úÎìú
    raw_prices = load_raw_prices()
    
    # FoldÎ≥Ñ Í≤ÄÏ¶ù
    for fold in FOLDS:
        print(f"\n\nProcessing Fold {fold}...")
        
        market_df, predictions = load_fold_data(fold, raw_prices)
        
        if market_df is None:
            print(f"Skipping Fold {fold}: Data loading failed")
            continue
        
        # Í∞Å Î™®Îç∏Î≥Ñ Í≤ÄÏ¶ù
        for model_name in MODELS_TO_CHECK:
            if model_name in predictions:
                pred_df = predictions[model_name]
                validate_data(market_df, pred_df, model_name, fold)
            else:
                print(f"\nWARNING: {model_name} predictions not found for Fold {fold}")
    
    print("\n" + "="*80)
    print("Í≤ÄÏ¶ù ÏôÑÎ£å")
    print("="*80)

if __name__ == "__main__":
    main()


Î∞±ÌÖåÏä§ÌåÖ Îç∞Ïù¥ÌÑ∞ Í≤ÄÏ¶ù Ïä§ÌÅ¨Î¶ΩÌä∏
Loading raw price data...
Loaded 3219 rows


Processing Fold 1...

Fold 1 - SVM Îç∞Ïù¥ÌÑ∞ Í≤ÄÏ¶ù

[Îç∞Ïù¥ÌÑ∞ ÌÅ¨Í∏∞]
Market rows: 150
Pred rows: 150

[ÎÇ†Ïßú Î≤îÏúÑ]
Market: 2022-12-07 00:00:00 ~ 2023-05-05 00:00:00
Pred: 2022-12-07 00:00:00 ~ 2023-05-05 00:00:00

[Í∞ÄÍ≤© ÌÜµÍ≥Ñ]
         next_open   next_close     ETH_Open    ETH_Close
count   150.000000   150.000000   150.000000   150.000000
mean   1604.183429  1608.541800  1599.357256  1604.089905
std     247.956502   247.375085   247.333680   248.098470
min    1167.882690  1167.609863  1167.882690  1167.609863
25%    1431.515656  1441.899292  1420.734833  1431.533752
50%    1633.073853  1637.197021  1630.098022  1632.986145
75%    1794.596893  1803.997589  1793.613190  1795.031555
max    2120.001221  2120.005859  2120.001221  2120.005859

[ÏòàÏ∏° Î∂ÑÌè¨]
1    136
0     14
Name: pred_direction, dtype: int64
Upward predictions: 136
Downward predictions: 14

[Confidence ÌÜµÍ≥Ñ]
count    150.0


Fold 3 - SVM Îç∞Ïù¥ÌÑ∞ Í≤ÄÏ¶ù

[Îç∞Ïù¥ÌÑ∞ ÌÅ¨Í∏∞]
Market rows: 150
Pred rows: 150

[ÎÇ†Ïßú Î≤îÏúÑ]
Market: 2023-10-03 00:00:00 ~ 2024-02-29 00:00:00
Pred: 2023-10-03 00:00:00 ~ 2024-02-29 00:00:00

[Í∞ÄÍ≤© ÌÜµÍ≥Ñ]
         next_open   next_close     ETH_Open    ETH_Close
count   150.000000   150.000000   150.000000   150.000000
mean   2207.666781  2219.380427  2196.472924  2207.524639
std     401.014524   410.803404   392.483401   401.029175
min    1539.432861  1539.612427  1539.432861  1539.612427
25%    1940.605164  1960.981384  1908.845581  1943.020477
50%    2233.593872  2236.552124  2232.305420  2232.611572
75%    2358.306824  2368.834412  2356.872253  2358.443848
max    3386.802734  3435.053955  3386.802734  3385.703857

[ÏòàÏ∏° Î∂ÑÌè¨]
0    91
1    59
Name: pred_direction, dtype: int64
Upward predictions: 59
Downward predictions: 91

[Confidence ÌÜµÍ≥Ñ]
count    150.000000
mean       0.090512
std        0.066779
min        0.000000
25%        0.032755
50%        0.081989
75%   

count    150.000000
mean       0.122158
std        0.088272
min        0.000300
25%        0.036740
50%        0.105022
75%        0.212012
max        0.264644
Name: confidence, dtype: float64

[NaN Í∞úÏàò]
next_open: 0
next_close: 0
ETH_Open: 0
ETH_Close: 0

[Ïù¥ÏÉÅ Í∞ÄÍ≤© (0 Ïù¥Ìïò)]
next_open: OK
next_close: OK
ETH_Open: OK
ETH_Close: OK

[Í∑πÎã® Í∞ÄÍ≤© Î≥ÄÌôî]
OK: No extreme returns

[Î≥ëÌï© Í≤ÄÏ¶ù]
Original pred rows: 150
After merge: 150
Valid (no NaN): 150
Data loss: 0 rows (0.0%)


Processing Fold 6...

Fold 6 - SVM Îç∞Ïù¥ÌÑ∞ Í≤ÄÏ¶ù

[Îç∞Ïù¥ÌÑ∞ ÌÅ¨Í∏∞]
Market rows: 150
Pred rows: 150

[ÎÇ†Ïßú Î≤îÏúÑ]
Market: 2024-12-26 00:00:00 ~ 2025-05-24 00:00:00
Pred: 2024-12-26 00:00:00 ~ 2025-05-24 00:00:00

[Í∞ÄÍ≤© ÌÜµÍ≥Ñ]
         next_open   next_close     ETH_Open    ETH_Close
count   150.000000   150.000000   150.000000   150.000000
mean   2444.672059  2439.465636  2451.088153  2444.662048
std     630.206443   626.054112   635.962964   630.213524
min    1472.601440  1472.553101  1472

In [6]:
"""
next_open/next_close shift Í≤ÄÏ¶ù - Data Leakage ÌôïÏù∏
"""

import pandas as pd
import numpy as np
from pathlib import Path

BASE_PATH = Path("../model_results/2025-10-26")
RAW_DATA_PATH = Path("../macro_data/macro_data/macro_crypto_data.csv")
FOLDS = [1, 2, 3]  # ÏùºÎ∂ÄÎßå ÌôïÏù∏

def load_raw_prices():
    """ÏõêÎ≥∏ Í∞ÄÍ≤© Îç∞Ïù¥ÌÑ∞ Î°úÎìú"""
    raw_prices = pd.read_csv(RAW_DATA_PATH)
    raw_prices['date'] = pd.to_datetime(raw_prices['date'])
    price_cols = ['date', 'ETH_Open', 'ETH_Close']
    return raw_prices[price_cols].copy()

def validate_shift_operation(fold, raw_prices):
    """shift(-1) Ïó∞ÏÇ∞ Í≤ÄÏ¶ù"""
    fold_name = f"fold_{fold}_walk_forward_rolling_reverse"
    
    # test_raw.csv Î°úÎìú
    market_path = BASE_PATH / "raw_data/direction/walk_forward" / fold_name / "test_raw.csv"
    
    if not market_path.exists():
        print(f"Fold {fold}: File not found")
        return
    
    market_df = pd.read_csv(market_path)
    market_df['date'] = pd.to_datetime(market_df['date'])
    
    # ÏõêÎ≥∏ Í∞ÄÍ≤© Î≥ëÌï©
    merged_df = market_df.merge(raw_prices, on='date', how='left')
    
    print(f"\n{'='*80}")
    print(f"Fold {fold} - shift(-1) Í≤ÄÏ¶ù")
    print(f"{'='*80}")
    
    print(f"\n[1Îã®Í≥Ñ] test_raw.csvÏóêÏÑú next_open/next_close ÌôïÏù∏")
    print(f"  - next_open Ïª¨Îüº Ï°¥Ïû¨: {'next_open' in market_df.columns}")
    print(f"  - next_close Ïª¨Îüº Ï°¥Ïû¨: {'next_close' in market_df.columns}")
    
    if 'next_open' not in market_df.columns or 'next_close' not in market_df.columns:
        print("  ERROR: Target columns missing!")
        return
    
    print(f"\n[2Îã®Í≥Ñ] ÏõêÎ≥∏ Îç∞Ïù¥ÌÑ∞ Î≥ëÌï© ÌôïÏù∏")
    print(f"  - ETH_Open Ïª¨Îüº Ï°¥Ïû¨: {'ETH_Open' in merged_df.columns}")
    print(f"  - ETH_Close Ïª¨Îüº Ï°¥Ïû¨: {'ETH_Close' in merged_df.columns}")
    
    if 'ETH_Open' not in merged_df.columns or 'ETH_Close' not in merged_df.columns:
        print("  ERROR: Original price columns missing!")
        return
    
    print(f"\n[3Îã®Í≥Ñ] shift(-1) Í≤ÄÏ¶ù: next_open == ETH_OpenÏùò Îã§Ïùå Ìñâ?")
    print("-" * 80)
    
    # Ï≤òÏùå 10Í∞úÎßå ÏÉòÌîå Ï∂úÎ†•
    sample_df = merged_df[['date', 'next_open', 'ETH_Open', 'next_close', 'ETH_Close']].head(10).copy()
    
    # Îã§Ïùå ÎÇ† ETH_Open Í≥ÑÏÇ∞
    sample_df['next_day_ETH_Open'] = sample_df['ETH_Open'].shift(-1)
    sample_df['next_day_ETH_Close'] = sample_df['ETH_Close'].shift(-1)
    
    # Ï∞®Ïù¥ Í≥ÑÏÇ∞
    sample_df['diff_open'] = sample_df['next_open'] - sample_df['next_day_ETH_Open']
    sample_df['diff_close'] = sample_df['next_close'] - sample_df['next_day_ETH_Close']
    
    print(sample_df.to_string(index=False))
    
    print(f"\n[4Îã®Í≥Ñ] Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞ Ï∞®Ïù¥ ÌÜµÍ≥Ñ")
    merged_df['expected_next_open'] = merged_df['ETH_Open'].shift(-1)
    merged_df['expected_next_close'] = merged_df['ETH_Close'].shift(-1)
    
    merged_df['diff_open'] = merged_df['next_open'] - merged_df['expected_next_open']
    merged_df['diff_close'] = merged_df['next_close'] - merged_df['expected_next_close']
    
    # NaN Ï†úÏô∏ (ÎßàÏßÄÎßâ ÌñâÏùÄ shift(-1)Î°ú NaN)
    diff_open_clean = merged_df['diff_open'].dropna()
    diff_close_clean = merged_df['diff_close'].dropna()
    
    print(f"\nÏ∞®Ïù¥ ÌÜµÍ≥Ñ (next_open - ETH_Open.shift(-1)):")
    print(f"  Count: {len(diff_open_clean)}")
    print(f"  Mean: {diff_open_clean.mean():.6f}")
    print(f"  Std: {diff_open_clean.std():.6f}")
    print(f"  Max abs diff: {diff_open_clean.abs().max():.6f}")
    
    print(f"\nÏ∞®Ïù¥ ÌÜµÍ≥Ñ (next_close - ETH_Close.shift(-1)):")
    print(f"  Count: {len(diff_close_clean)}")
    print(f"  Mean: {diff_close_clean.mean():.6f}")
    print(f"  Std: {diff_close_clean.std():.6f}")
    print(f"  Max abs diff: {diff_close_clean.abs().max():.6f}")
    
    print(f"\n[5Îã®Í≥Ñ] ÏµúÏ¢Ö ÌåêÏ†ï")
    
    # ÌóàÏö© Ïò§Ï∞®: 0.01 (1ÏÑºÌä∏)
    tolerance = 0.01
    
    if diff_open_clean.abs().max() < tolerance and diff_close_clean.abs().max() < tolerance:
        print(f"  STATUS: PASS")
        print(f"  - next_openÏùÄ Ï†ïÌôïÌûà Îã§Ïùå ÎÇ† ETH_OpenÏûÖÎãàÎã§.")
        print(f"  - next_closeÎäî Ï†ïÌôïÌûà Îã§Ïùå ÎÇ† ETH_CloseÏûÖÎãàÎã§.")
        print(f"  - shift(-1) Ïó∞ÏÇ∞Ïù¥ Ïò¨Î∞îÎ•¥Í≤å Ï†ÅÏö©ÎêòÏóàÏäµÎãàÎã§.")
    else:
        print(f"  STATUS: FAIL")
        print(f"  - Ï∞®Ïù¥Í∞Ä ÌóàÏö© Î≤îÏúÑ({tolerance})Î•º Ï¥àÍ≥ºÌï©ÎãàÎã§!")
        print(f"  - Data Leakage ÎòêÎäî Îç∞Ïù¥ÌÑ∞ Ïò§Ï†ïÎ†¨ Í∞ÄÎä•ÏÑ±Ïù¥ ÏûàÏäµÎãàÎã§!")
        
        # Í∞ÄÏû• ÌÅ∞ Ï∞®Ïù¥Í∞Ä ÎÇòÎäî Ìñâ Ï∂úÎ†•
        max_diff_idx_open = diff_open_clean.abs().idxmax()
        max_diff_idx_close = diff_close_clean.abs().idxmax()
        
        print(f"\n  Í∞ÄÏû• ÌÅ∞ Ï∞®Ïù¥ (Open):")
        problem_row_open = merged_df.loc[max_diff_idx_open, ['date', 'next_open', 'expected_next_open', 'diff_open']]
        print(f"    {problem_row_open.to_dict()}")
        
        print(f"\n  Í∞ÄÏû• ÌÅ∞ Ï∞®Ïù¥ (Close):")
        problem_row_close = merged_df.loc[max_diff_idx_close, ['date', 'next_close', 'expected_next_close', 'diff_close']]
        print(f"    {problem_row_close.to_dict()}")
    
    print(f"\n[6Îã®Í≥Ñ] Î∞±ÌÖåÏä§ÌåÖ ÏãúÎÆ¨Î†àÏù¥ÏÖò")
    print("-" * 80)
    
    # Ïã§Ï†ú Î∞±ÌÖåÏä§ÌåÖ ÏãúÎÇòÎ¶¨Ïò§ Ïû¨ÌòÑ
    test_idx = 5  # 6Î≤àÏß∏ ÎÇ†
    test_row = merged_df.iloc[test_idx]
    
    print(f"\nÏãúÎÇòÎ¶¨Ïò§: {test_row['date']} Ï¢ÖÍ∞ÄÏóê ÏòàÏ∏° ÏàòÌñâ")
    print(f"  - ÏòàÏ∏° ÎÇ†Ïßú: {test_row['date']}")
    print(f"  - ÎãπÏùº Ï¢ÖÍ∞Ä: ${test_row['ETH_Close']:,.2f}")
    print(f"\nÏòàÏ∏° Í≤∞Í≥º: ÏÉÅÏäπ ÏòàÏ∏° (Îß§Ïàò Ïã†Ìò∏)")
    print(f"  ‚Üí Îã§Ïùå ÎÇ† ({merged_df.iloc[test_idx+1]['date']}) ÏãúÍ∞ÄÏóê ÏßÑÏûÖ")
    print(f"  ‚Üí Îã§Ïùå ÎÇ† Ï¢ÖÍ∞ÄÏóê Ï≤≠ÏÇ∞")
    
    print(f"\nÎ∞±ÌÖåÏä§ÌåÖ ÏÇ¨Ïö© Í∞ÄÍ≤©:")
    print(f"  - ÏßÑÏûÖÍ∞Ä (next_open): ${test_row['next_open']:,.2f}")
    print(f"  - Ï≤≠ÏÇ∞Í∞Ä (next_close): ${test_row['next_close']:,.2f}")
    
    print(f"\nÏã§Ï†ú Îã§Ïùå ÎÇ† Í∞ÄÍ≤© (Í≤ÄÏ¶ù):")
    next_day = merged_df.iloc[test_idx + 1]
    print(f"  - Ïã§Ï†ú Îã§Ïùå ÎÇ† ÏãúÍ∞Ä (ETH_Open): ${next_day['ETH_Open']:,.2f}")
    print(f"  - Ïã§Ï†ú Îã§Ïùå ÎÇ† Ï¢ÖÍ∞Ä (ETH_Close): ${next_day['ETH_Close']:,.2f}")
    
    print(f"\nÏùºÏπò Ïó¨Î∂Ä:")
    open_match = abs(test_row['next_open'] - next_day['ETH_Open']) < tolerance
    close_match = abs(test_row['next_close'] - next_day['ETH_Close']) < tolerance
    
    print(f"  - ÏßÑÏûÖÍ∞Ä ÏùºÏπò: {'YES' if open_match else 'NO'} (Ï∞®Ïù¥: ${abs(test_row['next_open'] - next_day['ETH_Open']):.6f})")
    print(f"  - Ï≤≠ÏÇ∞Í∞Ä ÏùºÏπò: {'YES' if close_match else 'NO'} (Ï∞®Ïù¥: ${abs(test_row['next_close'] - next_day['ETH_Close']):.6f})")
    
    if open_match and close_match:
        print(f"\n  RESULT: Î∞±ÌÖåÏä§ÌåÖ Í∞ÄÍ≤©Ïù¥ Ï†ïÌôïÌï©ÎãàÎã§. Data Leakage ÏóÜÏùå.")
    else:
        print(f"\n  RESULT: Î∞±ÌÖåÏä§ÌåÖ Í∞ÄÍ≤©Ïù¥ Î∂ÄÏ†ïÌôïÌï©ÎãàÎã§. Îç∞Ïù¥ÌÑ∞ Ïò§Î•ò Í∞ÄÎä•ÏÑ±!")

def main():
    """Î©îÏù∏ Ïã§Ìñâ"""
    print("="*80)
    print("shift(-1) Ïó∞ÏÇ∞ Î∞è Data Leakage Í≤ÄÏ¶ù")
    print("="*80)
    
    raw_prices = load_raw_prices()
    
    for fold in FOLDS:
        validate_shift_operation(fold, raw_prices)
    
    print("\n" + "="*80)
    print("Í≤ÄÏ¶ù ÏôÑÎ£å")
    print("="*80)

if __name__ == "__main__":
    main()


shift(-1) Ïó∞ÏÇ∞ Î∞è Data Leakage Í≤ÄÏ¶ù

Fold 1 - shift(-1) Í≤ÄÏ¶ù

[1Îã®Í≥Ñ] test_raw.csvÏóêÏÑú next_open/next_close ÌôïÏù∏
  - next_open Ïª¨Îüº Ï°¥Ïû¨: True
  - next_close Ïª¨Îüº Ï°¥Ïû¨: True

[2Îã®Í≥Ñ] ÏõêÎ≥∏ Îç∞Ïù¥ÌÑ∞ Î≥ëÌï© ÌôïÏù∏
  - ETH_Open Ïª¨Îüº Ï°¥Ïû¨: True
  - ETH_Close Ïª¨Îüº Ï°¥Ïû¨: True

[3Îã®Í≥Ñ] shift(-1) Í≤ÄÏ¶ù: next_open == ETH_OpenÏùò Îã§Ïùå Ìñâ?
--------------------------------------------------------------------------------
      date   next_open    ETH_Open  next_close   ETH_Close  next_day_ETH_Open  next_day_ETH_Close  diff_open  diff_close
2022-12-07 1232.451782 1271.553101 1281.116333 1232.437500        1232.451782         1281.116333        0.0         0.0
2022-12-08 1281.077271 1232.451782 1264.284790 1281.116333        1281.077271         1264.284790        0.0         0.0
2022-12-09 1264.375488 1281.077271 1266.384155 1264.284790        1264.375488         1266.384155        0.0         0.0
2022-12-10 1266.417847 1264.375488 1263.868530 1266.384155       