In [6]:
!pip install boruta

Collecting boruta
  Downloading Boruta-0.4.3-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: boruta
Successfully installed boruta-0.4.3


In [1]:
import pandas as pd
import numpy as np
import os
import pandas_ta as ta
from datetime import datetime, timedelta
from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFE
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_percentage_error, r2_score, accuracy_score, mean_squared_error
import warnings
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.vector_ar.var_model import VAR
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore')

# ============================================================================ 
# 1. 날짜 파싱 및 CSV 로드 함수
# ============================================================================ 
def standardize_date_column(df, file_name):
    """Normalize a frame's date column to tz-naive, midnight-aligned timestamps.

    The first column whose name contains "date" (case-insensitive) is renamed
    to ``date``, parsed to datetime, stripped of timezone information and
    truncated to midnight. Rows whose date cannot be parsed are dropped.

    Args:
        df: Input frame. It is not modified; a new frame is returned.
        file_name: File name or path the frame was loaded from. Only its base
            name is inspected, to select the two-digit-year format used for
            ``eth_onchain.csv``.

    Returns:
        A new DataFrame whose ``date`` column is tz-naive with the time set to
        00:00. If no date-like column exists, a warning is printed and ``df``
        itself is returned unchanged.
    """
    date_cols = [col for col in df.columns if 'date' in col.lower()]
    if not date_cols:
        print("[Warning] 날짜 컬럼을 찾을 수 없습니다.")
        return df

    # rename() returns a new frame, so the caller's DataFrame is left intact.
    df = df.rename(columns={date_cols[0]: 'date'})
    raw_dates = df['date']

    # Callers pass a full path (e.g. './macro_data/eth_onchain.csv'), so the
    # comparison must be made on the base name or this branch never runs.
    if os.path.basename(str(file_name)) == 'eth_onchain.csv':
        parsed = pd.to_datetime(raw_dates, format='%y-%m-%d', errors='coerce')
        # Fall back to general parsing for values that do not use the
        # two-digit-year layout instead of silently dropping those rows.
        if parsed.isna().any():
            parsed = parsed.fillna(pd.to_datetime(raw_dates, errors='coerce'))
    else:
        # infer_datetime_format is deprecated (a no-op since pandas 2.0).
        parsed = pd.to_datetime(raw_dates, errors='coerce')

    df['date'] = parsed
    df = df.dropna(subset=['date']).copy()

    dates = df['date']
    # Drop the timezone first (converting to UTC) and truncate afterwards;
    # the reverse order leaves a non-midnight time for tz-aware input.
    if dates.dt.tz is not None:
        dates = dates.dt.tz_convert(None)
    df['date'] = dates.dt.normalize()
    return df

def load_and_standardize_data(filepath):
    """Read the CSV at ``filepath`` and return it with a standardized date column."""
    return standardize_date_column(pd.read_csv(filepath), filepath)
# ============================================================================ 
# 2. 데이터 로딩
# ============================================================================ 
DATA_DIR = './macro_data'

def load_from_macro_data(filename):
    """Load ``filename`` from ``DATA_DIR`` with its date column standardized."""
    csv_path = os.path.join(DATA_DIR, filename)
    return load_and_standardize_data(csv_path)

# Load every input CSV from DATA_DIR; each frame comes back with a
# standardized `date` column (see load_and_standardize_data).
macro_df = load_from_macro_data('macro_crypto_data.csv')
news_df = load_from_macro_data('news_data.csv')
eth_onchain_df = load_from_macro_data('eth_onchain.csv')
fear_greed_df = load_from_macro_data('fear_greed.csv')
usdt_eth_mcap_df = load_from_macro_data('usdt_eth_mcap.csv')
aave_tvl_df = load_from_macro_data('aave_eth_tvl.csv')
lido_tvl_df = load_from_macro_data('lido_eth_tvl.csv')
makerdao_tvl_df = load_from_macro_data('makerdao_eth_tvl.csv')
eth_chain_tvl_df = load_from_macro_data('eth_chain_tvl.csv')
eth_funding_df = load_from_macro_data('eth_funding_rate.csv')
sp500_df = load_from_macro_data('SP500.csv')
vix_df = load_from_macro_data('VIX.csv')
gold_df = load_from_macro_data('GOLD.csv')
dxy_df = load_from_macro_data('DXY.csv')

# ============================================================================ 
# 3. 기준 날짜 설정 (Lido TVL 시작일 기준)
# ============================================================================ 
# First training day (per the section header: the start of the Lido TVL series).
train_start_date = pd.to_datetime('2020-12-19')
# Keep 200 extra days before training starts as a lookback window.
lookback_start_date = train_start_date - timedelta(days=200)
# Last day included in the merged dataset.
end_date= pd.to_datetime('2025-10-06')

# ============================================================================ 
# 4. 뉴스 감성 피처 생성 
# ============================================================================ 
def create_sentiment_features(news_df):
    """
    Build per-day sentiment features from labelled Korean news (extended version).
    Source: "Cryptocurrency Price Prediction Model Based on Sentiment Analysis" (2024)

    Args:
        news_df: Frame with a ``date`` column and a ``label`` column; the code
            counts ``label == 1`` as positive and ``label == -1`` as negative.

    Returns:
        One row per date (in groupby order) with the aggregate, derived and
        rolling sentiment columns; every remaining NaN is replaced by 0.
    """
    def _share_of(value):
        # Fraction of a day's labels equal to `value`.
        return lambda x: (x == value).sum() / len(x)

    def _count_of(value):
        # Number of a day's labels equal to `value`.
        return lambda x: (x == value).sum()

    # ----- daily aggregates -----
    sentiment_agg = news_df.groupby('date').agg(
        sentiment_mean=('label', 'mean'),
        sentiment_std=('label', 'std'),
        news_count=('label', 'count'),
        positive_ratio=('label', _share_of(1)),
        negative_ratio=('label', _share_of(-1)),
        extreme_positive_count=('label', _count_of(1)),
        extreme_negative_count=('label', _count_of(-1)),
        sentiment_sum=('label', 'sum'),
    ).reset_index()

    # A day with a single article has no standard deviation; treat it as 0.
    sentiment_agg = sentiment_agg.fillna(0)

    # ----- derived ratios -----
    pos = sentiment_agg['positive_ratio']
    neg = sentiment_agg['negative_ratio']
    daily_mean = sentiment_agg['sentiment_mean']
    daily_count = sentiment_agg['news_count']

    # Polarity: net direction of opinion.
    sentiment_agg['sentiment_polarity'] = pos - neg
    # Intensity: share of non-neutral articles.
    sentiment_agg['sentiment_intensity'] = pos + neg
    # Disagreement: high when both extremes are present.
    sentiment_agg['sentiment_disagreement'] = pos * neg
    # Bull/bear ratio; the epsilon avoids division by zero.
    sentiment_agg['bull_bear_ratio'] = pos / (neg + 1e-10)
    # Mean sentiment weighted by log article volume.
    sentiment_agg['weighted_sentiment'] = daily_mean * np.log1p(daily_count)
    # Share of articles that are positive or negative.
    extreme_total = (
        sentiment_agg['extreme_positive_count'] + sentiment_agg['extreme_negative_count']
    )
    sentiment_agg['extremity_index'] = extreme_total / (daily_count + 1e-10)

    # ----- rolling statistics (windows count rows, i.e. days that have news) -----
    for window in (3, 7, 14):
        rolling_mean = daily_mean.rolling(window=window, min_periods=1)
        sentiment_agg[f'sentiment_ma{window}'] = rolling_mean.mean()
        sentiment_agg[f'sentiment_volatility_{window}'] = rolling_mean.std()

    # First and second differences of the daily mean.
    sentiment_agg['sentiment_trend'] = daily_mean.diff()
    sentiment_agg['sentiment_acceleration'] = sentiment_agg['sentiment_trend'].diff()

    # Relative change and moving averages of article volume.
    sentiment_agg['news_volume_change'] = daily_count.pct_change()
    for window in (7, 14):
        sentiment_agg[f'news_volume_ma{window}'] = (
            daily_count.rolling(window=window, min_periods=1).mean()
        )

    print(f"✓ 감성 지표 생성 완료: {sentiment_agg.shape[1] - 1}개 (date 제외)")

    # diff / pct_change / rolling std leave NaN in the leading rows.
    return sentiment_agg.fillna(0)


# Aggregate the raw news labels into one row of sentiment features per date.
sentiment_features = create_sentiment_features(news_df)



# ============================================================================ 
# 5. 데이터 병합
# ============================================================================ 
def add_prefix(df, prefix):
    """Rename every column except ``date`` to ``<prefix>_<column>``.

    The frame is modified in place and also returned for convenience.
    """
    renamed = []
    for col in df.columns:
        renamed.append(col if col == 'date' else prefix + '_' + col)
    df.columns = renamed
    return df

# Namespace each source's columns so they cannot collide after merging.
# macro_df and sentiment_features keep their original column names.
eth_onchain_df = add_prefix(eth_onchain_df, 'eth')
fear_greed_df = add_prefix(fear_greed_df, 'fg')
usdt_eth_mcap_df = add_prefix(usdt_eth_mcap_df, 'usdt')
aave_tvl_df = add_prefix(aave_tvl_df, 'aave')
lido_tvl_df = add_prefix(lido_tvl_df, 'lido')
makerdao_tvl_df = add_prefix(makerdao_tvl_df, 'makerdao')
eth_chain_tvl_df = add_prefix(eth_chain_tvl_df, 'chain')
eth_funding_df = add_prefix(eth_funding_df, 'funding')
sp500_df = add_prefix(sp500_df, 'sp500')
vix_df = add_prefix(vix_df, 'vix')
gold_df = add_prefix(gold_df, 'gold')
dxy_df = add_prefix(dxy_df, 'dxy')

# Master calendar: one row per day from the lookback start to the end date.
date_range = pd.date_range(start=lookback_start_date, end=end_date, freq='D')
df_merged = pd.DataFrame(date_range, columns=['date'])

# List of all dataframes to be merged
dataframes_to_merge = [
    macro_df, sentiment_features, eth_onchain_df, fear_greed_df, usdt_eth_mcap_df,
    aave_tvl_df, lido_tvl_df, makerdao_tvl_df, eth_chain_tvl_df,
    eth_funding_df, sp500_df, vix_df, gold_df, dxy_df
]

# Sequentially merge all dataframes onto the master date range
# (left join: days missing from a source become NaN in its columns).
for df_to_merge in dataframes_to_merge:
    df_merged = pd.merge(df_merged, df_to_merge, on='date', how='left')

    
################## Handle missing sentiment values separately ##################


# Only columns whose name contains 'extreme' are selected here
# (extreme_positive_count / extreme_negative_count).
# NOTE(review): the other sentiment columns keep their NaN on days without
# news -- confirm that this is intended.
sentiment_cols = [col for col in df_merged.columns 
                 if any(x in col for x in ['extreme'])]

print(f"\n병합 후 감성 지표 결측치 처리:")
for col in sentiment_cols:
    missing_before = df_merged[col].isnull().sum()
    if missing_before > 0:
        # A day without news has zero positive / negative articles.
        df_merged[col] = df_merged[col].fillna(0)
        print(f"  {col}: {missing_before}개 → 0")
    
    
# ============================================================================
# 5.1. Drop columns that are entirely missing during the lookback period
# ============================================================================
# Select the lookback window: lookback_start_date <= date < train_start_date
# (200 days, see lookback_start_date).
lookback_period_df = df_merged[(df_merged['date'] >= lookback_start_date) & (df_merged['date'] < train_start_date)]

# Find columns where all values in this period are NaN.
# Columns starting with "lido" are exempt (presumably because the Lido series
# only begins on the training start date -- verify).
cols_to_drop = [col for col in lookback_period_df.columns if lookback_period_df[col].isnull().all() and not col.startswith("lido")]

if cols_to_drop:
    print(f"Dropping columns with all NaN values during the lookback period ({lookback_start_date.date()} to {train_start_date.date()}):")
    print(cols_to_drop)
    df_merged.drop(columns=cols_to_drop, inplace=True)
else:
    print("No columns to drop; all columns have at least one value in the lookback period.")

# Keep only rows on or after lookback_start_date
df_merged = df_merged[df_merged['date'] >= lookback_start_date].reset_index(drop=True)


# Report the final shape and date range
print("최종 데이터 shape:", df_merged.shape)
print("날짜 범위:", df_merged['date'].min(), "~", df_merged['date'].max())

✓ 감성 지표 생성 완료: 25개 (date 제외)

병합 후 감성 지표 결측치 처리:
  extreme_positive_count: 39개 → 0
  extreme_negative_count: 39개 → 0
Dropping columns with all NaN values during the lookback period (2020-06-02 to 2020-12-19):
['usdt_totalBridgedToUSD']
최종 데이터 shape: (1953, 95)
날짜 범위: 2020-06-02 00:00:00 ~ 2025-10-06 00:00:00


In [69]:
# Inspect the last 10 rows of the raw macro/crypto price frame.
macro_df.tail(10)

Unnamed: 0,date,BTC_Open,BTC_High,BTC_Low,BTC_Close,BTC_Volume,ETH_Open,ETH_High,ETH_Low,ETH_Close,...,AVAX_Open,AVAX_High,AVAX_Low,AVAX_Close,AVAX_Volume,DOT_Open,DOT_High,DOT_Low,DOT_Close,DOT_Volume
3192,2025-09-28,109681.945312,112375.484375,109236.945312,112122.640625,33371048505,4018.659668,4143.003906,3969.792969,4141.476562,...,28.790525,30.125532,27.963934,30.007946,839460800.0,3.892598,4.014097,3.799078,3.996965,197272471.0
3193,2025-09-29,112117.875,114473.570312,111589.953125,114400.382812,60000147466,4141.356445,4234.782715,4087.927246,4217.341797,...,30.008083,30.791185,29.210913,30.452417,1273919000.0,3.997001,4.020795,3.871955,3.983713,274838183.0
3194,2025-09-30,114396.523438,114836.617188,112740.5625,114056.085938,58986330258,4217.055176,4238.671387,4095.443604,4145.95752,...,30.452972,30.657635,28.84609,30.003811,981285200.0,3.983701,3.989437,3.814803,3.907708,206562308.0
3195,2025-10-01,114057.59375,118648.929688,113981.398438,118648.929688,71328680132,4146.033691,4351.112305,4125.541992,4351.112305,...,30.003811,31.077721,29.648453,30.704655,1013651000.0,3.907708,4.121581,3.889161,4.121088,280950394.0
3196,2025-10-02,118652.382812,121086.40625,118383.15625,120681.257812,71415163912,4352.240723,4517.665039,4336.526367,4487.923828,...,30.70508,31.361811,29.621159,31.023582,1558734000.0,4.121101,4.337359,4.10423,4.308879,367515728.0
3197,2025-10-03,120656.984375,123944.703125,119344.3125,122266.53125,83941392228,4486.93457,4591.443848,4431.479004,4514.870605,...,31.023441,31.529491,30.045366,31.361156,1167826000.0,4.308879,4.377396,4.188099,4.320178,327374794.0
3198,2025-10-04,122267.46875,122857.640625,121577.570312,122425.429688,36769171735,4514.90918,4519.526855,4444.012695,4489.197266,...,31.361156,31.40214,29.905663,30.142843,675889600.0,4.320165,4.326745,4.142659,4.201499,226165346.0
3199,2025-10-05,122419.671875,125559.210938,122191.960938,123513.476562,73689317763,4489.053223,4616.533203,4472.138672,4515.422852,...,30.142824,31.152348,29.699591,30.085981,788787400.0,4.201445,4.365485,4.092994,4.135801,292655150.0
3200,2025-10-06,123510.453125,126198.070312,123196.046875,124752.53125,72568881188,4515.300781,4736.208984,4492.870117,4687.771484,...,30.085981,30.975527,29.979013,30.712078,805717500.0,4.135799,4.41482,4.118031,4.387514,378617545.0
3201,2025-10-07,124724.65625,125012.789062,123526.539062,123536.101562,70358523904,4686.296387,4726.128906,4643.724121,4643.724121,...,30.710276,30.710276,29.633904,29.633904,807789400.0,4.388586,4.426277,4.255881,4.255881,410121056.0


In [70]:
# Inspect the last 3 rows of the merged frame.
df_merged.tail(3)

Unnamed: 0,date,BTC_Open,BTC_High,BTC_Low,BTC_Close,BTC_Volume,ETH_Open,ETH_High,ETH_Low,ETH_Close,...,usdt_totalUnreleased,aave_aave_eth_tvl,lido_lido_eth_tvl,makerdao_makerdao_eth_tvl,chain_eth_chain_tvl,funding_fundingRate,sp500_SP500,vix_VIX,gold_GOLD,dxy_DXY
1950,2025-10-04,122267.46875,122857.640625,121577.570312,122425.429688,36769171735,4514.90918,4519.526855,4444.012695,4489.197266,...,1379634000.0,35415262516,38371520000.0,6117770105,200146271727,3.7e-05,,,,
1951,2025-10-05,122419.671875,125559.210938,122191.960938,123513.476562,73689317763,4489.053223,4616.533203,4472.138672,4515.422852,...,1121429000.0,35432813372,38274800000.0,6262956668,199387628316,8.5e-05,,,,
1952,2025-10-06,123510.453125,126198.070312,123196.046875,124752.53125,72568881188,4515.300781,4736.208984,4492.870117,4687.771484,...,1238672000.0,35558310333,38534840000.0,6258821993,200181178768,9.3e-05,6740.279785,16.370001,3984.399902,98.170998


In [71]:
# ============================================================================ 
# Extra: report the date range covered by each input file
# ============================================================================ 
print("\n--- 파일별 날짜 범위 ---")

dataframes_info = [
    ('macro_crypto_data.csv', macro_df),
    ('news_data.csv', news_df),
    ('eth_onchain.csv', eth_onchain_df),
    ('fear_greed.csv', fear_greed_df),
    ('usdt_eth_mcap.csv', usdt_eth_mcap_df),
    ('aave_eth_tvl.csv', aave_tvl_df),
    ('lido_eth_tvl.csv', lido_tvl_df),
    ('makerdao_eth_tvl.csv', makerdao_tvl_df),
    ('eth_chain_tvl.csv', eth_chain_tvl_df),
    ('eth_funding_rate.csv', eth_funding_df),
    ('SP500.csv', sp500_df),
    ('VIX.csv', vix_df),
    ('GOLD.csv', gold_df),
    ('DXY.csv', dxy_df)
]

# Loop variables use report-specific names on purpose: the previous names
# (df, start_date, end_date) overwrote the notebook-level `end_date`
# Timestamp with a string once this cell had run.
for file_label, source_frame in dataframes_info:
    if 'date' in source_frame.columns:
        first_day = source_frame['date'].min().strftime('%Y-%m-%d')
        last_day = source_frame['date'].max().strftime('%Y-%m-%d')
        print(f"**{file_label.ljust(25)}**: 시작일={first_day}, 종료일={last_day}, 행 수={len(source_frame)}")
    else:
        print(f"**{file_label.ljust(25)}**: 날짜 컬럼 ('date')을 찾을 수 없습니다.")

print("----------------------\n")


--- 파일별 날짜 범위 ---
**macro_crypto_data.csv    **: 시작일=2017-01-01, 종료일=2025-10-07, 행 수=3202
**news_data.csv            **: 시작일=2020-01-01, 종료일=2025-10-06, 행 수=26005
**eth_onchain.csv          **: 시작일=2015-08-07, 종료일=2025-10-07, 행 수=3715
**fear_greed.csv           **: 시작일=2018-02-01, 종료일=2025-10-07, 행 수=2802
**usdt_eth_mcap.csv        **: 시작일=2017-11-29, 종료일=2025-10-07, 행 수=2870
**aave_eth_tvl.csv         **: 시작일=2020-05-20, 종료일=2025-10-07, 행 수=1968
**lido_eth_tvl.csv         **: 시작일=2020-12-19, 종료일=2025-10-07, 행 수=1754
**makerdao_eth_tvl.csv     **: 시작일=2019-01-04, 종료일=2025-10-07, 행 수=2470
**eth_chain_tvl.csv        **: 시작일=2017-09-27, 종료일=2025-10-07, 행 수=2933
**eth_funding_rate.csv     **: 시작일=2019-11-27, 종료일=2025-10-07, 행 수=2142
**SP500.csv                **: 시작일=2017-01-03, 종료일=2025-10-06, 행 수=2202
**VIX.csv                  **: 시작일=2017-01-03, 종료일=2025-10-06, 행 수=2202
**GOLD.csv                 **: 시작일=2017-01-03, 종료일=2025-10-06, 행 수=2203
**DXY.csv                  **: 시작일=2017-01-0

In [2]:

def add_indicator_to_df(df_ta, indicator):
    """Attach a pandas_ta result to ``df_ta`` in place.

    A non-empty DataFrame contributes each of its columns; a non-empty Series
    is stored under its own name (or 'Unnamed' when the name is falsy).
    ``None``, empty results and any other type are ignored.
    """
    if indicator is None:
        return

    if isinstance(indicator, pd.DataFrame):
        if indicator.empty:
            return
        for col in indicator.columns:
            df_ta[col] = indicator[col]
        return

    if isinstance(indicator, pd.Series):
        if indicator.empty:
            return
        df_ta[indicator.name or 'Unnamed'] = indicator

def safe_add(df_ta, func, *args, **kwargs):
    """Compute an indicator and attach it to ``df_ta`` without ever raising.

    Returns True when ``func(*args, **kwargs)`` and the attach step both
    succeed; otherwise prints a short warning (message truncated to 50
    characters) and returns False.
    """
    try:
        add_indicator_to_df(df_ta, func(*args, **kwargs))
    except Exception as e:
        # Fall back to str(func) for callables that have no __name__.
        func_name = func.__name__ if hasattr(func, '__name__') else str(func)
        print(f"    ⚠ {func_name.upper()} 생성 실패: {str(e)[:50]}")
        return False
    return True

def calculate_technical_indicators(df):
    """
    Generate the curated ETH technical-indicator feature set (paper-based, 2024-2025).
    Sources:
    - "CryptoPulse: Short-Term Cryptocurrency Forecasting" (2024)
    - "Enhancing Price Prediction in Cryptocurrency Using Transformer" (2024)
    - "Bitcoin Trend Prediction with Attention-Based Deep Learning" (2024)

    Args:
        df: Frame with a ``date`` column and an ``ETH_Close`` column.
            ``ETH_High``, ``ETH_Low`` and ``ETH_Open`` fall back to the close
            series when absent; ``ETH_Volume`` falls back to a constant 1.

    Returns:
        A copy of ``df`` sorted by ``date`` (index reset) with the indicator
        columns appended. Generation is best-effort: optional indicators that
        fail are reported and skipped, and if an unexpected error escapes, its
        traceback is printed and the columns built so far are returned.

    Raises:
        KeyError: if ``date`` or ``ETH_Close`` is missing.
    """
    print("\n=== 기술적 지표 생성 중 ===")
    df = df.sort_values('date').reset_index(drop=True)
    df_ta = df.copy()

    close = df['ETH_Close']
    high = df.get('ETH_High', close)
    low = df.get('ETH_Low', close)
    volume = df.get('ETH_Volume', pd.Series(index=df.index, data=1))
    open_ = df.get('ETH_Open', close)

    try:
        # ===== [core] MOMENTUM INDICATORS =====
        print("  - Momentum 지표 생성 중...")

        # RSI (required; original note: "achieved 92.4% accuracy")
        df_ta['RSI_14'] = ta.rsi(close, length=14)
        df_ta['RSI_30'] = ta.rsi(close, length=30)
        df_ta['RSI_200'] = ta.rsi(close, length=200)  # long-term RSI

        # MACD (required; original note: "top feature importance")
        safe_add(df_ta, ta.macd, close, fast=12, slow=26, signal=9)

        # Stochastic Oscillator (%K, %D)
        safe_add(df_ta, ta.stoch, high, low, close, k=14, d=3)
        safe_add(df_ta, ta.stoch, high, low, close, k=30, d=3)  # 30-day
        safe_add(df_ta, ta.stoch, high, low, close, k=200, d=3)  # 200-day

        # Williams %R
        df_ta['WILLR_14'] = ta.willr(high, low, close, length=14)

        # ROC (Rate of Change)
        df_ta['ROC_10'] = ta.roc(close, length=10)
        df_ta['ROC_20'] = ta.roc(close, length=20)

        # MOM (Momentum, several horizons)
        df_ta['MOM_10'] = ta.mom(close, length=10)
        df_ta['MOM_30'] = ta.mom(close, length=30)

        # CCI (Commodity Channel Index)
        df_ta['CCI_20'] = ta.cci(high, low, close, length=20)

        # TSI (True Strength Index)
        safe_add(df_ta, ta.tsi, close, fast=13, slow=25, signal=13)

        # UO (Ultimate Oscillator) -- optional, skipped on failure
        try:
            df_ta['UO_7_14_28'] = ta.uo(high, low, close)
        except Exception as e:
            print(f"    ⚠ UO 생성 실패: {str(e)[:50]}")

        # KST Oscillator
        safe_add(df_ta, ta.kst, close)

        # ===== Ichimoku Cloud =====
        try:
            ichimoku = ta.ichimoku(high, low, close)
            # Only the first element of the returned tuple is used.
            if ichimoku is not None and isinstance(ichimoku, tuple):
                ichimoku_df = ichimoku[0]
                if ichimoku_df is not None:
                    for col in ichimoku_df.columns:
                        df_ta[col] = ichimoku_df[col]
        except Exception as e:
            print(f"    ⚠ ICHIMOKU 생성 실패")

        # ===== [core] OVERLAP INDICATORS =====
        print("  - Overlap 지표 생성 중...")

        # SMA (required; used for golden/death cross signals below)
        df_ta['SMA_10'] = ta.sma(close, length=10)
        df_ta['SMA_20'] = ta.sma(close, length=20)
        df_ta['SMA_50'] = ta.sma(close, length=50)
        df_ta['SMA_200'] = ta.sma(close, length=200)

        # EMA (required)
        df_ta['EMA_12'] = ta.ema(close, length=12)
        df_ta['EMA_26'] = ta.ema(close, length=26)
        df_ta['EMA_50'] = ta.ema(close, length=50)
        df_ta['EMA_200'] = ta.ema(close, length=200)

        # TEMA (Triple EMA)
        df_ta['TEMA_10'] = ta.tema(close, length=10)
        df_ta['TEMA_30'] = ta.tema(close, length=30)

        # WMA (Weighted Moving Average)
        df_ta['WMA_10'] = ta.wma(close, length=10)
        df_ta['WMA_20'] = ta.wma(close, length=20)

        # HMA (Hull Moving Average)
        df_ta['HMA_9'] = ta.hma(close, length=9)

        # DEMA (Double EMA)
        df_ta['DEMA_10'] = ta.dema(close, length=10)

        # TRIMA
        df_ta['TRIMA_10'] = ta.trima(close, length=10)

        # VWMA (Volume Weighted)
        df_ta['VWMA_20'] = ta.vwma(close, volume, length=20)

        # ZLMA (Zero Lag MA)
        safe_add(df_ta, ta.zlma, close, length=20)

        # Price combinations
        df_ta['HL2'] = ta.hl2(high, low)
        df_ta['HLC3'] = ta.hlc3(high, low, close)
        df_ta['OHLC4'] = ta.ohlc4(open_, high, low, close)

        # ===== [core] VOLATILITY INDICATORS =====
        print("  - Volatility 지표 생성 중...")

        # Bollinger Bands (required; two window lengths)
        safe_add(df_ta, ta.bbands, close, length=20, std=2)
        safe_add(df_ta, ta.bbands, close, length=50, std=2)

        # ATR (required)
        df_ta['ATR_7'] = ta.atr(high, low, close, length=7)
        df_ta['ATR_14'] = ta.atr(high, low, close, length=14)
        df_ta['ATR_21'] = ta.atr(high, low, close, length=21)

        # NATR (Normalized ATR)
        df_ta['NATR_14'] = ta.natr(high, low, close, length=14)

        # True Range -- optional, skipped on failure
        try:
            tr = ta.true_range(high, low, close)
            if isinstance(tr, pd.Series) and not tr.empty:
                df_ta['TRUERANGE'] = tr
            elif isinstance(tr, pd.DataFrame) and not tr.empty:
                df_ta['TRUERANGE'] = tr.iloc[:, 0]
        except Exception as e:
            print(f"    ⚠ TRUE_RANGE 생성 실패: {str(e)[:50]}")

        # Keltner Channel
        safe_add(df_ta, ta.kc, high, low, close, length=20)

        # Donchian Channel -- optional, skipped on failure
        try:
            dc = ta.donchian(high, low, lower_length=20, upper_length=20)
            if dc is not None and isinstance(dc, pd.DataFrame) and not dc.empty:
                for col in dc.columns:
                    df_ta[col] = dc[col]
        except Exception as e:
            print(f"    ⚠ DONCHIAN 생성 실패: {str(e)[:50]}")

        # MASSI (Mass Index) -- optional, skipped on failure
        try:
            massi = ta.massi(high, low)
            if isinstance(massi, pd.Series) and not massi.empty:
                df_ta['MASSI_9_25'] = massi
        except Exception as e:
            print(f"    ⚠ MASSI 생성 실패: {str(e)[:50]}")

        # ===== [core] VOLUME INDICATORS =====
        print("  - Volume 지표 생성 중...")

        # OBV (required; On-Balance Volume)
        df_ta['OBV'] = ta.obv(close, volume)

        # AD (Accumulation/Distribution)
        df_ta['AD'] = ta.ad(high, low, close, volume)

        # ADOSC
        df_ta['ADOSC_3_10'] = ta.adosc(high, low, close, volume, fast=3, slow=10)

        # MFI (Money Flow Index)
        df_ta['MFI_14'] = ta.mfi(high, low, close, volume, length=14)

        # CMF (Chaikin Money Flow)
        df_ta['CMF_20'] = ta.cmf(high, low, close, volume, length=20)

        # EFI (Elder Force Index)
        df_ta['EFI_13'] = ta.efi(close, volume, length=13)

        # EOM (Ease of Movement)
        safe_add(df_ta, ta.eom, high, low, close, volume, length=14)

        # NVI/PVI
        safe_add(df_ta, ta.nvi, close, volume)
        safe_add(df_ta, ta.pvi, close, volume)

        # PVT (Price Volume Trend)
        df_ta['PVT'] = ta.pvt(close, volume)

        # VWAP (Volume Weighted Average Price) -- optional, skipped on failure
        try:
            df_ta['VWAP'] = ta.vwap(high, low, close, volume)
        except Exception as e:
            print(f"    ⚠ VWAP 생성 실패: {str(e)[:50]}")

        # ===== [core] TREND INDICATORS =====
        print("  - Trend 지표 생성 중...")

        # ADX (required; Average Directional Index)
        safe_add(df_ta, ta.adx, high, low, close, length=14)

        # Aroon -- optional, skipped on failure
        try:
            aroon = ta.aroon(high, low, length=25)
            if aroon is not None and isinstance(aroon, pd.DataFrame):
                for col in aroon.columns:
                    df_ta[col] = aroon[col]
        except Exception as e:
            print(f"    ⚠ AROON 생성 실패: {str(e)[:50]}")

        # PSAR (Parabolic SAR) -- optional, skipped on failure
        try:
            psar = ta.psar(high, low, close)
            if psar is not None:
                if isinstance(psar, pd.DataFrame) and not psar.empty:
                    for col in psar.columns:
                        df_ta[col] = psar[col]
                elif isinstance(psar, pd.Series) and not psar.empty:
                    df_ta[psar.name] = psar
        except Exception as e:
            print(f"    ⚠ PSAR 생성 실패: {str(e)[:50]}")

        # Vortex
        safe_add(df_ta, ta.vortex, high, low, close, length=14)

        # QStick
        safe_add(df_ta, ta.qstick, open_, close, length=14)

        # DPO (Detrended Price Oscillator) -- optional, skipped on failure
        try:
            df_ta['DPO_20'] = ta.dpo(close, length=20)
        except Exception as e:
            print(f"    ⚠ DPO 생성 실패: {str(e)[:50]}")

        # ===== [extra] DERIVED FEATURES =====
        print("  - 파생 지표 생성 중...")

        # Price change over several horizons
        df_ta['PRICE_CHANGE'] = close.pct_change()
        df_ta['PRICE_CHANGE_2'] = close.pct_change(periods=2)
        df_ta['PRICE_CHANGE_5'] = close.pct_change(periods=5)
        df_ta['PRICE_CHANGE_10'] = close.pct_change(periods=10)

        # Volatility (rolling std of daily returns)
        df_ta['VOLATILITY_5'] = close.pct_change().rolling(window=5).std()
        df_ta['VOLATILITY_10'] = close.pct_change().rolling(window=10).std()
        df_ta['VOLATILITY_20'] = close.pct_change().rolling(window=20).std()
        df_ta['VOLATILITY_30'] = close.pct_change().rolling(window=30).std()

        # Momentum (price ratio)
        df_ta['MOMENTUM_5'] = close / close.shift(5) - 1
        df_ta['MOMENTUM_10'] = close / close.shift(10) - 1
        df_ta['MOMENTUM_20'] = close / close.shift(20) - 1
        df_ta['MOMENTUM_30'] = close / close.shift(30) - 1

        # Position relative to moving averages
        df_ta['PRICE_VS_SMA10'] = close / df_ta['SMA_10'] - 1
        df_ta['PRICE_VS_SMA20'] = close / df_ta['SMA_20'] - 1
        df_ta['PRICE_VS_SMA50'] = close / df_ta['SMA_50'] - 1
        df_ta['PRICE_VS_SMA200'] = close / df_ta['SMA_200'] - 1
        df_ta['PRICE_VS_EMA12'] = close / df_ta['EMA_12'] - 1
        df_ta['PRICE_VS_EMA26'] = close / df_ta['EMA_26'] - 1

        # Cross signals (golden/death cross); 1 while the fast MA is above the slow MA
        df_ta['SMA_CROSS_SIGNAL'] = (df_ta['SMA_10'] > df_ta['SMA_20']).astype(int)
        df_ta['SMA_GOLDEN_CROSS'] = (df_ta['SMA_50'] > df_ta['SMA_200']).astype(int)
        df_ta['EMA_CROSS_SIGNAL'] = (df_ta['EMA_12'] > df_ta['EMA_26']).astype(int)

        # Volume features (epsilon guards against division by zero)
        df_ta['VOLUME_SMA_20'] = ta.sma(volume, length=20)
        df_ta['VOLUME_RATIO'] = volume / (df_ta['VOLUME_SMA_20'] + 1e-10)
        df_ta['VOLUME_CHANGE'] = volume.pct_change()
        df_ta['VOLUME_CHANGE_5'] = volume.pct_change(periods=5)

        # Range features
        df_ta['HIGH_LOW_RANGE'] = (high - low) / (close + 1e-10)
        df_ta['HIGH_CLOSE_RANGE'] = np.abs(high - close.shift()) / (close + 1e-10)
        df_ta['CLOSE_LOW_RANGE'] = (close - low) / (close + 1e-10)

        # Intraday position of the close within the day's range
        df_ta['INTRADAY_POSITION'] = (close - low) / ((high - low) + 1e-10)

        # Linear regression slope
        try:
            df_ta['SLOPE_5'] = ta.linreg(close, length=5, slope=True)
            df_ta['SLOPE_10'] = ta.linreg(close, length=10, slope=True)
            df_ta['LINREG_14'] = ta.linreg(close, length=14)
        except Exception:
            # Fallback: own implementation via a rolling least-squares fit.
            # Note that LINREG_14 is not produced on this path.
            df_ta['SLOPE_5'] = close.rolling(window=5).apply(
                lambda x: np.polyfit(np.arange(len(x)), x, 1)[0] if len(x) == 5 else np.nan, raw=True
            )
            df_ta['SLOPE_10'] = close.rolling(window=10).apply(
                lambda x: np.polyfit(np.arange(len(x)), x, 1)[0] if len(x) == 10 else np.nan, raw=True
            )

        # Increasing / decreasing flags
        df_ta['INC_1'] = (close > close.shift(1)).astype(int)
        df_ta['DEC_1'] = (close < close.shift(1)).astype(int)
        df_ta['INC_3'] = (close > close.shift(3)).astype(int)
        df_ta['INC_5'] = (close > close.shift(5)).astype(int)

        # BOP (Balance of Power)
        df_ta['BOP'] = (close - open_) / ((high - low) + 1e-10)
        df_ta['BOP'] = df_ta['BOP'].fillna(0)

        # ===== [extra] ADVANCED DERIVED FEATURES =====
        print("  - 고급 파생 지표 생성 중...")

        # Bollinger-band derived features (only if the 20-day bands exist)
        if 'BBL_20_2.0' in df_ta.columns and 'BBU_20_2.0' in df_ta.columns:
            df_ta['BB_WIDTH'] = (df_ta['BBU_20_2.0'] - df_ta['BBL_20_2.0']) / df_ta['BBM_20_2.0']
            df_ta['BB_POSITION'] = (close - df_ta['BBL_20_2.0']) / (df_ta['BBU_20_2.0'] - df_ta['BBL_20_2.0'])

        # RSI derived flags (overbought / oversold)
        df_ta['RSI_OVERBOUGHT'] = (df_ta['RSI_14'] > 70).astype(int)
        df_ta['RSI_OVERSOLD'] = (df_ta['RSI_14'] < 30).astype(int)

        # Change of the MACD histogram
        if 'MACDh_12_26_9' in df_ta.columns:
            df_ta['MACD_HIST_CHANGE'] = df_ta['MACDh_12_26_9'].diff()

        # Relative volume strength versus the 50-day mean
        df_ta['VOLUME_STRENGTH'] = volume / volume.rolling(window=50).mean()

        # Price acceleration (difference of daily returns)
        df_ta['PRICE_ACCELERATION'] = close.pct_change().diff()

        # Gap (open versus previous close)
        df_ta['GAP'] = (open_ - close.shift(1)) / (close.shift(1) + 1e-10)

        # Rolling extremes and distance from them
        df_ta['ROLLING_MAX_20'] = close.rolling(window=20).max()
        df_ta['ROLLING_MIN_20'] = close.rolling(window=20).min()
        df_ta['DISTANCE_FROM_HIGH'] = (df_ta['ROLLING_MAX_20'] - close) / (df_ta['ROLLING_MAX_20'] + 1e-10)
        df_ta['DISTANCE_FROM_LOW'] = (close - df_ta['ROLLING_MIN_20']) / (close + 1e-10)

        added = df_ta.shape[1] - df.shape[1]
        print(f"\n✓ 기술적 지표 생성 완료: {added}개 추가")
        print(f"  총 컬럼 수: {df_ta.shape[1]}")

    except Exception as e:
        # Best-effort: report the failure and return whatever was built so far.
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()

    return df_ta



def add_pairwise_correlation_features(df, window_sizes=(7, 14, 30)):
    """
    Add rolling correlations between ETH and BTC daily returns.

    Args:
        df: Frame containing ``ETH_Close`` and ``BTC_Close`` columns.
        window_sizes: Rolling window lengths (in rows); one column
            ``corr_ETH_BTC_<w>`` is added per window.

    Returns:
        A copy of ``df`` with the correlation columns appended; ``df`` itself
        is not modified.
    """
    df2 = df.copy()
    # The return series do not depend on the window, so compute them once.
    eth_returns = df2['ETH_Close'].pct_change()
    btc_returns = df2['BTC_Close'].pct_change()
    for w in window_sizes:
        df2[f"corr_ETH_BTC_{w}"] = eth_returns.rolling(window=w).corr(btc_returns)
    return df2

def add_volatility_correlation(df, window_sizes=(14, 30)):
    """
    Add rolling correlations between ETH and BTC volatility proxies.

    Two proxies are used per window: absolute daily returns
    (``volcorr_abs_<w>``) and squared daily returns (``volcorr_sq_<w>``).

    Args:
        df: Frame containing ``ETH_Close`` and ``BTC_Close`` columns.
        window_sizes: Rolling window lengths (in rows).

    Returns:
        A copy of ``df`` with the two columns per window appended; ``df``
        itself is not modified.
    """
    df2 = df.copy()
    ret_eth = df2['ETH_Close'].pct_change()
    ret_btc = df2['BTC_Close'].pct_change()
    # Window-independent series are computed once, outside the loop.
    abs_eth, abs_btc = ret_eth.abs(), ret_btc.abs()
    sq_eth, sq_btc = ret_eth**2, ret_btc**2
    for w in window_sizes:
        df2[f"volcorr_abs_{w}"] = abs_eth.rolling(w).corr(abs_btc)
        df2[f"volcorr_sq_{w}"] = sq_eth.rolling(w).corr(sq_btc)
    return df2

def add_granger_causality_features(df, maxlag=5):
    """
    Add Granger-causality p-values between BTC and ETH returns as features.
    (Caution: computationally expensive.)

    For every lag from 1 to ``maxlag`` the ``ssr_ftest`` p-value is stored in
    ``p_btc_to_eth_lag<k>`` and ``p_eth_to_btc_lag<k>``. The tests run once on
    the full return history, so each column holds a single constant value.

    Args:
        df: Frame containing ``ETH_Close`` and ``BTC_Close`` columns.
        maxlag: Highest lag passed to ``grangercausalitytests``.

    Returns:
        A copy of ``df`` with the p-value columns appended.
    """
    df2 = df.copy()

    # Daily returns of both assets, restricted to rows where both exist.
    eth_returns = df2['ETH_Close'].pct_change().dropna()
    btc_returns = df2['BTC_Close'].pct_change().dropna()
    returns = pd.concat([eth_returns, btc_returns], axis=1).dropna()
    returns.columns = ['eth_ret', 'btc_ret']

    def _ssr_ftest_pvalues(ordered_cols):
        # p-value of the SSR F-test for each lag 1..maxlag, in lag order.
        outcome = grangercausalitytests(returns[ordered_cols], maxlag=maxlag, verbose=False)
        return [outcome[lag][0]['ssr_ftest'][1] for lag in range(1, maxlag + 1)]

    # Per the statsmodels docs, the test asks whether the SECOND column
    # Granger-causes the FIRST one, hence [eth, btc] for BTC -> ETH.
    for lag, p_val in enumerate(_ssr_ftest_pvalues(['eth_ret', 'btc_ret']), start=1):
        df2.loc[:, f"p_btc_to_eth_lag{lag}"] = p_val

    # Opposite direction: ETH -> BTC.
    for lag, p_val in enumerate(_ssr_ftest_pvalues(['btc_ret', 'eth_ret']), start=1):
        df2.loc[:, f"p_eth_to_btc_lag{lag}"] = p_val

    return df2

def add_var_impulse_response_features(df, lags=5, irf_steps=3):
    """
    Attach VAR impulse-response coefficients between BTC and ETH returns.

    A VAR is fitted on the ``pct_change()`` of ``[ETH_Close, BTC_Close]`` and
    the impulse responses for steps ``1..irf_steps`` are broadcast to every
    row as ``irf_eth_from_btc_step{i}`` (ETH response to a BTC shock) and
    ``irf_btc_from_eth_step{i}`` (BTC response to an ETH shock).

    Fix: ``irfs[i][r, s]`` is the response of variable ``r`` to a shock in
    variable ``s``, with variables ordered like the fitted columns
    (ETH = 0, BTC = 1). The previous code read ``[1, 0]`` for the
    "ETH from BTC" column, i.e. the two directions were swapped.

    NOTE(review): the coefficients are estimated on the full sample and are
    constant across rows - confirm that is intended for model features.

    Args:
        df: DataFrame containing ``ETH_Close`` and ``BTC_Close`` columns.
        lags: Lag order passed to ``VAR.fit``.
        irf_steps: Number of impulse-response steps to export.

    Returns:
        A copy of ``df``. When fitting or the IRF computation fails, the
        error is printed and the copy is returned without IRF columns
        (best-effort behaviour kept from the original).
    """
    df2 = df.copy()
    # Only the return series are used for the VAR.
    tmp = df2[['ETH_Close', 'BTC_Close']].pct_change().dropna()
    # Variable positions inside the VAR follow the column order of ``tmp``.
    eth_idx = tmp.columns.get_loc('ETH_Close')
    btc_idx = tmp.columns.get_loc('BTC_Close')

    model = VAR(tmp)
    try:
        res = model.fit(lags)
        irf = res.irf(irf_steps)
        # Collect everything first so a failure cannot leave a partial set
        # of IRF columns behind.
        new_cols = {}
        for i in range(1, irf_steps + 1):
            step = irf.irfs[i]
            # rows = responding variable, columns = shocked variable
            new_cols[f"irf_eth_from_btc_step{i}"] = step[eth_idx, btc_idx]
            new_cols[f"irf_btc_from_eth_step{i}"] = step[btc_idx, eth_idx]
        for col, val in new_cols.items():
            df2.loc[:, col] = val
    except Exception as e:
        print("VAR/IRF 계산 실패:", e)
    return df2

In [3]:
# ============================================================================
# 2. Lag 적용
# ============================================================================
def apply_lag_features(df, news_lag=2, onchain_lag=1):
    """
    Append lagged copies of sentiment, on-chain and other external columns.

    The original (lag 0) columns are always kept; lagged columns are added
    next to them:

    * raw sentiment columns -> ``{col}_lag1`` .. ``{col}_lag{news_lag}``
    * on-chain columns      -> ``{col}_lag{onchain_lag}``
    * other external columns (TVL, funding, macro, ...) -> ``{col}_lag1``

    Price columns, ``event_*`` / ``period_*`` columns and columns that already
    contain ``_lag`` are never lagged by the on-chain / external passes.

    Fixes over the previous version:

    * the on-chain lag column is named after the actual ``onchain_lag``
      (it used to be called ``_lag1`` even when a different shift was used);
    * a column that matches both the on-chain and the "other" keyword lists
      is handled once (as on-chain) instead of being written and counted
      twice;
    * exclusion membership is tested against a set.

    NOTE(review): the ``no_lag_patterns`` filter is only applied to the
    sentiment list; on-chain / external columns whose names contain e.g.
    ``_ma`` are still lagged although the summary line reports them as
    "not lagged". Left unchanged to keep the produced feature set stable -
    confirm the intended behaviour.

    Args:
        df: Input feature frame.
        news_lag: Highest lag generated for the raw sentiment columns.
        onchain_lag: Shift (in rows) applied to on-chain columns.

    Returns:
        A copy of ``df`` with the lag columns appended.
    """
    print("\n=== Lag 피처 적용 중 (원본 유지) ===")

    df_lagged = df.copy()

    # Raw sentiment indicators: the only sentiment columns that get lags.
    raw_sentiment_cols = [
        'sentiment_mean', 'sentiment_std', 'sentiment_sum',
        'news_count', 'positive_ratio', 'negative_ratio',
        'sentiment_polarity', 'sentiment_intensity',
        'sentiment_disagreement', 'bull_bear_ratio',
        'weighted_sentiment', 'extremity_index',
        'extreme_positive_count', 'extreme_negative_count'
    ]

    # Name fragments of derived columns (moving averages, differences, ...)
    # that already reference the past.
    no_lag_patterns = [
        '_ma', '_volatility_', '_trend', '_acceleration',
        '_volume_change', '_volume_ma'
    ]

    # Keyword lists are matched case-insensitively against column names.
    onchain_keywords = ('eth_tx', 'eth_active', 'eth_new',
                        'eth_large', 'eth_token', 'eth_contract',
                        'eth_avg_gas', 'eth_total_gas',
                        'eth_avg_block')
    other_keywords = ('tvl', 'funding', 'lido_', 'aave_', 'makerdao_',
                      'chain_', 'usdt_', 'sp500_', 'vix_', 'gold_', 'dxy_', 'fg_')

    onchain_cols = [col for col in df.columns
                    if any(keyword in col.lower() for keyword in onchain_keywords)]
    other_cols = [col for col in df.columns
                  if any(keyword in col.lower() for keyword in other_keywords)]

    # Columns that must never be lagged by the on-chain / external passes.
    exclude_cols = {
        'date', 'ETH_Close', 'ETH_High', 'ETH_Low', 'ETH_Volume', 'ETH_Open',
        'BTC_Close', 'BTC_High', 'BTC_Low', 'BTC_Volume', 'BTC_Open',
        'BNB_Close', 'XRP_Close', 'SOL_Close', 'ADA_Close', 'DOGE_Close',
        'AVAX_Close', 'DOT_Close'
    }
    # Events are used same-day, so no lag.
    exclude_cols.update(col for col in df.columns if 'event_' in col or 'period_' in col)
    # Columns that already are lags.
    exclude_cols.update(col for col in df.columns if '_lag' in col)

    # ----- 1. sentiment lags (original column kept) -----
    lag_count = 0
    print("  [감성 지표 Lag]")
    for col in raw_sentiment_cols:
        if col not in df.columns:
            continue
        if any(pattern in col for pattern in no_lag_patterns):
            continue  # derived column, already looks backwards
        for lag in range(1, news_lag + 1):
            df_lagged[f"{col}_lag{lag}"] = df[col].shift(lag)
            lag_count += 1
        print(f"    {col}: 원본 유지 + lag1~{news_lag} 추가")

    # ----- 2. on-chain lags (original column kept) -----
    print("  [온체인 지표 Lag]")
    onchain_lagged = [col for col in onchain_cols if col not in exclude_cols]
    for col in onchain_lagged:
        # Name the column after the shift that is really applied.
        df_lagged[f"{col}_lag{onchain_lag}"] = df[col].shift(onchain_lag)
    onchain_lag_count = len(onchain_lagged)
    print(f"    {onchain_lag_count}개 컬럼: 원본 유지 + lag{onchain_lag} 추가")

    # ----- 3. other external variables (original column kept) -----
    print("  [기타 외부 변수 Lag]")
    already_lagged = set(onchain_lagged)
    other_lagged = [col for col in other_cols
                    if col not in exclude_cols and col not in already_lagged]
    for col in other_lagged:
        df_lagged[f"{col}_lag1"] = df[col].shift(1)
    other_lag_count = len(other_lagged)
    print(f"    {other_lag_count}개 컬럼: 원본 유지 + lag1 추가")

    # ----- 4. report columns matching the "derived" patterns -----
    no_lag_cols = [col for col in df.columns if any(p in col for p in no_lag_patterns)]
    print(f"  [Lag 미적용] {len(no_lag_cols)}개 컬럼 (이동평균/차분 - 이미 과거 참조)")

    total_lag = lag_count + onchain_lag_count + other_lag_count
    print(f"\n✓ 총 Lag 피처 추가: {total_lag}개")
    print(f"  총 컬럼 수: {df_lagged.shape[1]} (원본 {df.shape[1]} + lag {total_lag})")

    return df_lagged



def add_price_lag_features_first(df):
    """
    Append lagged ETH price/volume features (run before the technical
    indicators).

    Added columns, in this order:

    * ``close_lag{k}`` for k in 1, 2, 3, 5, 7, 14, 21, 30
    * ``high_lag{k}`` / ``low_lag{k}`` (interleaved) for k in 1, 2, 3, 5, 7
    * ``volume_lag{k}`` for k in 1, 2, 3, 5, 7
    * ``return_lag{k}``: k-row percentage change of the close, shifted by one
      row, for k in 1, 2, 3, 5, 7
    * ``close_ratio_lag{k}``: current close divided by the close k rows
      earlier, for k in 1, 7, 30

    Args:
        df: DataFrame with ``ETH_Close``, ``ETH_High``, ``ETH_Low`` and
            ``ETH_Volume`` columns.

    Returns:
        A copy of ``df`` with the columns above appended.
    """
    print("\n=== [STEP 0] 과거 가격 피처 추가 ===")

    df_new = df.copy()
    eth_close = df['ETH_Close']
    eth_high = df['ETH_High']
    eth_low = df['ETH_Low']
    eth_volume = df['ETH_Volume']

    short_lags = (1, 2, 3, 5, 7)

    # Build every feature first (dicts keep insertion order), then attach
    # them to the copy in that same order.
    features = {
        f'close_lag{k}': eth_close.shift(k)
        for k in (1, 2, 3, 5, 7, 14, 21, 30)
    }
    for k in short_lags:
        features[f'high_lag{k}'] = eth_high.shift(k)
        features[f'low_lag{k}'] = eth_low.shift(k)
    features.update(
        (f'volume_lag{k}', eth_volume.shift(k)) for k in short_lags
    )
    features.update(
        (f'return_lag{k}', eth_close.pct_change(periods=k).shift(1))
        for k in short_lags
    )
    features.update(
        (f'close_ratio_lag{k}', eth_close / eth_close.shift(k))
        for k in (1, 7, 30)
    )

    for column_name, values in features.items():
        df_new[column_name] = values

    added = df_new.shape[1] - df.shape[1]
    print(f"  ✓ 추가된 과거 가격 피처: {added}개")

    return df_new


# ============================================================================
# 3. 타겟 변수 생성
# ============================================================================
def create_targets(df):
    """
    Create the next-day prediction targets from ``ETH_Close``.

    Added columns:

    * ``next_log_return``: ``log(close[t+1] / close[t])``
    * ``next_direction``: 1.0 if ``close[t+1] > close[t]``, 0.0 otherwise,
      NaN when either close is missing
    * ``next_close``: ``close[t+1]`` (kept for reference)

    Fix: the last row has no next-day close. Comparing NaN with ``>`` yields
    False, so the direction used to be reported as 0 ("down") instead of
    missing. It is now NaN, consistent with the other two targets, so that
    ``dropna(subset=[..., 'next_direction'])`` removes the row. As a
    consequence the column dtype is float rather than int.

    Args:
        df: DataFrame with an ``ETH_Close`` column ordered by time.

    Returns:
        A copy of ``df`` with the three target columns appended.
    """
    print("\n=== 타겟 변수 생성 ===")

    df_target = df.copy()
    close = df['ETH_Close']

    # Tomorrow's close aligned on today's row.
    next_close = close.shift(-1)

    # Today -> tomorrow log return.
    df_target['next_log_return'] = np.log(next_close / close)

    # Today -> tomorrow direction; unknown wherever a close is missing.
    direction_known = next_close.notna() & close.notna()
    df_target['next_direction'] = (next_close > close).astype(float).where(direction_known)

    # Reference: tomorrow's actual close.
    df_target['next_close'] = next_close

    print("✓ 타겟 변수 생성 완료")

    # Sanity summary.
    print(f"\n  검증:")
    print(f"  오늘 종가 평균: {close.mean():.2f}")
    print(f"  내일 종가 평균: {next_close.mean():.2f}")
    print(f"  로그 수익률 평균: {df_target['next_log_return'].mean():.6f}")
    print(f"  상승 비율: {df_target['next_direction'].mean():.2%}")

    return df_target

# ============================================================================
# 4. Train-Val-Test 분할
# ============================================================================
def split_train_val_test(df, train_start_date, train_ratio=0.7, val_ratio=0.15):
    """
    Chronological train / validation / test split.

    Rows dated before ``train_start_date`` and rows with a missing target
    (``next_log_return``, ``next_direction`` or ``next_close``) are dropped.
    The remaining rows are ordered by ``date`` and cut by position into
    train, validation and test parts.

    Fix: the rows are now explicitly sorted by date (stable sort) before the
    positional cut. Previously an unsorted frame produced splits that mixed
    later dates into the training part. For input that is already sorted the
    result is unchanged.

    Args:
        df: DataFrame with a ``date`` column and the three target columns.
        train_start_date: First date (inclusive) to keep.
        train_ratio: Fraction of the kept rows assigned to train.
        val_ratio: Fraction of the kept rows assigned to validation; the rest
            goes to test.

    Returns:
        Tuple ``(train_df, val_df, test_df)`` of independent copies.
    """
    print("\n=== Train-Val-Test 분할 ===")

    in_period = df[df['date'] >= train_start_date]
    df_clean = (
        in_period
        .dropna(subset=['next_log_return', 'next_direction', 'next_close'])
        # Positional slicing below is only a time split if rows are ordered.
        .sort_values('date', kind='stable')
    )

    print(f"결측치 제거 후: {len(df_clean)} samples")
    print(f"기간: {df_clean['date'].min()} ~ {df_clean['date'].max()}")

    n = len(df_clean)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))

    train_df = df_clean.iloc[:train_end].copy()
    val_df = df_clean.iloc[train_end:val_end].copy()
    test_df = df_clean.iloc[val_end:].copy()

    print(f"Train: {len(train_df)} ({train_df['date'].min()} ~ {train_df['date'].max()})")
    print(f"Val: {len(val_df)} ({val_df['date'].min()} ~ {val_df['date'].max()})")
    print(f"Test: {len(test_df)} ({test_df['date'].min()} ~ {test_df['date'].max()})")

    return train_df, val_df, test_df



# ============================================================================
# 5. 결측치 처리 
# ============================================================================
def handle_missing_values_paper_based(df, train_start_date):
    """
    Fill or report missing values per feature family, then cut the lookback.

    Processing steps (the fills run on the full frame, before rows dated
    earlier than ``train_start_date`` are removed, so earlier rows can seed
    the forward fill):

    1. Technical-indicator columns: missing values are only counted.
    2. External columns (name contains ``eth_``, ``fg_``, ``usdt_``, ...):
       forward fill, then remaining leading gaps are set to 0.
    3. Sentiment/news columns: forward fill, then remaining gaps set to 0.
    4. ``event_*`` / ``period_*`` columns: if anything is missing, fill with
       0 and cast to int.
    5. ``*_lag*`` columns: missing values are only counted.
       NOTE(review): lag columns whose names also match the external or
       sentiment fragments are filled in steps 2-3 - confirm intended.
    6. ETH OHLCV columns: missing values are only reported.
    7. Rows dated before ``train_start_date`` are dropped and the index is
       reset.
    8-9. Summary statistics and a warning for columns with more than 20 %
       missing values are printed.

    Fixes over the previous version:

    * ``fillna(method='ffill')`` is deprecated in recent pandas; the
      equivalent ``ffill()`` is used instead.
    * An empty result (no row on/after ``train_start_date``) no longer
      divides by zero or calls ``.date()`` on ``NaT``.

    Args:
        df: Feature frame with a ``date`` column and the ETH OHLCV columns
            ``ETH_Close``, ``ETH_High``, ``ETH_Low``, ``ETH_Volume``,
            ``ETH_Open``.
        train_start_date: First date (inclusive) kept in the result.

    Returns:
        The processed copy of ``df`` restricted to ``date >= train_start_date``.
    """
    print("\n=== 결측치 처리 ===")

    df_processed = df.copy()
    initial_shape = df_processed.shape

    # ===== 1. Technical indicators (warm-up NaNs are kept) =====
    tech_cols = [col for col in df.columns if any(x in col for x in
                ['RSI', 'MACD', 'SMA', 'EMA', 'WMA', 'HMA', 'ATR', 'BBL', 'BBM', 'BBU',
                 'WILLR', 'ROC', 'MOM', 'CCI', 'STOCH', 'ADX', 'AROON', 'TSI', 'UO',
                 'FISHER', 'KST', 'VWMA', 'ZLMA', 'NATR', 'UI', 'MASSI', 'CHOP',
                 'DPO', 'ICS', 'LINREG', 'SLOPE'])]

    tech_missing = df_processed[tech_cols].isnull().sum().sum() if tech_cols else 0
    print(f"  기술적 지표: {tech_missing:,}개 (초기 계산 기간, 유지)")

    # ===== 2. External variables (forward fill) =====
    external_cols = [col for col in df.columns if any(x in col for x in
                    ['eth_', 'fg_', 'usdt_', 'aave_', 'lido_', 'makerdao_',
                     'chain_', 'funding_', 'sp500_', 'vix_', 'gold_', 'dxy_'])]

    if external_cols:
        ext_before = df_processed[external_cols].isnull().sum().sum()
        # Forward fill only propagates past values to later rows.
        df_processed[external_cols] = df_processed[external_cols].ffill()

        # Whatever is still NaN sits before the first observation -> 0.
        df_processed[external_cols] = df_processed[external_cols].fillna(0)
        ext_after = df_processed[external_cols].isnull().sum().sum()
        print(f"  외부 변수: {ext_before:,} → {ext_after:,}개")

    # ===== 3. Sentiment indicators (forward fill + leading 0) =====
    sentiment_cols = [col for col in df.columns if any(x in col for x in
                     ['sentiment', 'news', 'positive_ratio', 'negative_ratio',
                      'bull_bear', 'weighted_sentiment', 'extremity'])]

    if sentiment_cols:
        sent_before = df_processed[sentiment_cols].isnull().sum().sum()
        df_processed[sentiment_cols] = df_processed[sentiment_cols].ffill()

        # Leading gap -> 0.
        df_processed[sentiment_cols] = df_processed[sentiment_cols].fillna(0)
        sent_after = df_processed[sentiment_cols].isnull().sum().sum()
        print(f"  감성 지표: {sent_before:,} → {sent_after:,}개")

    # ===== 4. Event indicators (should not contain NaN) =====
    event_cols = [col for col in df.columns if 'event_' in col or 'period_' in col]

    if event_cols:
        event_missing = df_processed[event_cols].isnull().sum().sum()
        if event_missing > 0:
            print(f"  [경고] 이벤트 지표 결측치: {event_missing}개 → 0으로 채움")
            df_processed[event_cols] = df_processed[event_cols].fillna(0).astype(int)
        else:
            print(f"  이벤트 지표: 결측치 없음 ✓")

    # ===== 5. Lag features (only counted here) =====
    lag_cols = [col for col in df.columns if '_lag' in col]
    lag_missing = df_processed[lag_cols].isnull().sum().sum() if lag_cols else 0
    print(f"  Lag 피처: {lag_missing:,}개 (초기 기간, 유지)")

    # ===== 6. Price data check =====
    price_cols = ['ETH_Close', 'ETH_High', 'ETH_Low', 'ETH_Volume', 'ETH_Open']
    price_missing = df_processed[price_cols].isnull().sum().sum()
    if price_missing > 0:
        print(f"  [오류] 가격 데이터 결측: {price_missing}개 - 확인 필요!")
    else:
        print(f"  가격 데이터: 결측치 없음 ✓")

    # ===== 7. Remove the lookback period =====
    before_count = len(df_processed)
    df_processed = df_processed[df_processed['date'] >= train_start_date].reset_index(drop=True)
    removed = before_count - len(df_processed)
    print(f"  Lookback 제거: {removed}행")

    # ===== 8. Final statistics =====
    total_missing = df_processed.isnull().sum().sum()
    total_cells = df_processed.shape[0] * df_processed.shape[1]
    # Guard against an empty frame (no rows on/after train_start_date).
    missing_pct = (total_missing / total_cells) * 100 if total_cells else 0.0

    print(f"\n✓ 최종 결측치: {total_missing:,}/{total_cells:,} ({missing_pct:.2f}%)")
    print(f"  Shape: {initial_shape} → {df_processed.shape}")
    if len(df_processed):
        print(f"  기간: {df_processed['date'].min().date()} ~ {df_processed['date'].max().date()}")

    # ===== 9. Warn about columns with many missing values =====
    if len(df_processed):
        col_missing_pct = (df_processed.isnull().sum() / len(df_processed) * 100).sort_values(ascending=False)
        high_missing = col_missing_pct[col_missing_pct > 20]

        if len(high_missing) > 0:
            print(f"\n  [주의] 결측치 20% 이상: {len(high_missing)}개 컬럼")
            for col, pct in high_missing.head(5).items():
                print(f"    - {col}: {pct:.1f}%")

    return df_processed


# ============================================================================
# 7. 데이터 품질 검증
# ============================================================================
def validate_data_leakage(train_df, val_df, test_df):
    """
    Check that the three splits are disjoint and chronologically ordered.

    Two conditions are verified on the ``date`` column of each split:

    * no date appears in more than one split;
    * the latest train date is before the earliest validation date and the
      latest validation date is before the earliest test date.

    A short report is printed along the way.

    Returns:
        ``True`` when both conditions hold, ``False`` otherwise.
    """
    print("\n=== 데이터 누수 검증 ===")

    train_dates, val_dates, test_dates = (
        set(frame['date']) for frame in (train_df, val_df, test_df)
    )

    # Boundary dates of each split.
    train_max = train_df['date'].max()
    val_min, val_max = val_df['date'].min(), val_df['date'].max()
    test_min = test_df['date'].min()

    # Number of dates shared by each pair of splits.
    shared_train_val = len(train_dates.intersection(val_dates))
    shared_val_test = len(val_dates.intersection(test_dates))
    shared_train_test = len(train_dates.intersection(test_dates))

    print("날짜 순서:")
    print(f"  Train: ~ {train_max.date()}")
    print(f"  Val:   {val_min.date()} ~ {val_max.date()}")
    print(f"  Test:  {test_min.date()} ~")

    print("\n날짜 중복:")
    print(f"  Train-Val: {shared_train_val}개")
    print(f"  Val-Test: {shared_val_test}개")
    print(f"  Train-Test: {shared_train_test}개")

    total_shared = shared_train_val + shared_val_test + shared_train_test
    if total_shared > 0:
        print("[경고] 데이터 누수 가능성 발견!")
        return False

    in_order = train_max < val_min and val_max < test_min
    if not in_order:
        print("[경고] 날짜 순서 오류!")
        return False

    print("[통과] 데이터 누수 없음")
    return True

In [4]:
# Single-day event flags as (column name, date), listed in output column order.
_ETH_POINT_EVENTS = (
    # --- Pre-launch (2013-2014) ---
    ('event_whitepaper', '2013-11-27'),          # whitepaper released
    ('event_yellowpaper', '2014-04-01'),         # yellowpaper (technical spec)
    ('event_ether_sale_start', '2014-07-22'),    # ether sale / ICO start
    ('event_ether_sale_end', '2014-09-02'),      # ether sale / ICO end
    # --- 2015: genesis & Frontier ---
    ('event_frontier', '2015-07-30'),            # Frontier mainnet launch
    ('event_frontier_thawing', '2015-09-07'),    # gas limit lifted
    ('event_ice_age', '2015-09-08'),             # difficulty bomb introduced
    # --- 2016: Homestead & DAO crisis ---
    ('event_homestead', '2016-03-14'),           # first major upgrade
    ('event_dao_created', '2016-04-30'),         # The DAO crowdsale start
    ('event_dao_hack', '2016-06-17'),            # DAO hack
    ('event_dao_fork', '2016-07-20'),            # DAO hard fork (ETH/ETC split)
    ('event_tangerine_whistle', '2016-10-18'),   # DoS attack response
    ('event_spurious_dragon', '2016-11-22'),     # further DoS attack response
    # --- 2017: ICO boom & Metropolis ---
    ('event_eea_launch', '2017-02-28'),          # Enterprise Ethereum Alliance
    ('event_byzantium', '2017-10-16'),           # Metropolis part 1
    ('event_btc_peak_2017', '2017-12-17'),       # Bitcoin all-time high
    # --- 2018: bear market ---
    ('event_crash_2018', '2018-01-16'),          # start of the large sell-off
    ('event_coincheck_hack', '2018-01-26'),      # Coincheck NEM hack
    ('event_eth_bottom_2018', '2018-12-15'),     # ETH cycle low
    # --- 2019: recovery & Istanbul ---
    ('event_constantinople', '2019-02-28'),      # Metropolis part 2
    ('event_istanbul', '2019-12-08'),            # Istanbul upgrade
    # --- 2020: DeFi summer & ETH 2.0 start ---
    ('event_muir_glacier', '2020-01-02'),        # difficulty bomb delay
    ('event_covid_crash', '2020-03-12'),         # COVID-19 "Black Thursday"
    ('event_btc_halving_2020', '2020-05-11'),    # Bitcoin halving
    ('event_defi_summer_start', '2020-06-15'),   # COMP token distribution
    ('event_deposit_contract', '2020-11-04'),    # staking deposit contract
    ('event_beacon_genesis', '2020-12-01'),      # Beacon Chain genesis
    # --- 2021: bull run & NFT boom ---
    ('event_berlin', '2021-04-15'),              # Berlin upgrade
    ('event_eth_ath_may2021', '2021-05-12'),     # ETH all-time high (May)
    ('event_china_ban', '2021-05-21'),           # China crypto ban
    ('event_london', '2021-08-05'),              # London upgrade (EIP-1559)
    ('event_altair', '2021-10-27'),              # Altair upgrade
    ('event_eth_ath_nov2021', '2021-11-16'),     # ETH all-time high (Nov)
    ('event_arrow_glacier', '2021-12-09'),       # Arrow Glacier
    # --- 2022: crypto winter & the Merge ---
    ('event_terra_collapse', '2022-05-09'),      # Terra/LUNA collapse
    ('event_gray_glacier', '2022-06-30'),        # Gray Glacier
    ('event_celsius_bankruptcy', '2022-07-13'),  # Celsius bankruptcy
    ('event_bellatrix', '2022-09-06'),           # Bellatrix (Merge preparation)
    ('event_merge', '2022-09-15'),               # the Merge (PoW -> PoS)
    ('event_ftx_collapse', '2022-11-11'),        # FTX collapse
    # --- 2023: staking withdrawals & recovery ---
    ('event_svb_collapse', '2023-03-10'),        # SVB collapse
    ('event_shanghai', '2023-04-12'),            # Shanghai/Capella (withdrawals)
    # --- 2024: proto-danksharding & ETF ---
    ('event_dencun', '2024-03-13'),              # Dencun (EIP-4844 blobs)
    ('event_btc_halving_2024', '2024-04-19'),    # Bitcoin halving
    ('event_eth_etf_approval', '2024-05-23'),    # ETH spot ETF approval
    ('event_eth_etf_trading', '2024-07-23'),     # ETH spot ETF trading starts
    # --- 2025: Pectra ---
    ('event_pectra', '2025-05-07'),              # Pectra (EIP-7251, EIP-7702)
)

# Long-running regimes as (suffix, first day, last day), both inclusive.
_ETH_PERIODS = (
    ('ico_boom', '2017-01-01', '2018-01-31'),
    ('bear_2018', '2018-02-01', '2019-03-31'),
    ('defi_summer', '2020-06-01', '2020-09-30'),
    ('nft_boom', '2021-01-01', '2021-12-31'),
    ('crypto_winter', '2022-05-01', '2023-03-31'),
    ('l2_scaling', '2023-04-01', '2025-10-07'),
)

# Events that additionally get pre/post window flags, in output order.
_ETH_CRITICAL_EVENTS = (
    'event_dao_fork', 'event_london', 'event_merge', 'event_shanghai',
    'event_dencun', 'event_pectra', 'event_eth_etf_approval',
)

# Events that feed ``days_since_last_event`` / ``event_count_90d``.
_ETH_META_EVENTS = (
    'event_whitepaper', 'event_yellowpaper', 'event_ether_sale_start',
    'event_frontier', 'event_homestead', 'event_dao_hack', 'event_dao_fork',
    'event_byzantium', 'event_constantinople', 'event_istanbul',
    'event_covid_crash', 'event_btc_halving_2020', 'event_defi_summer_start',
    'event_beacon_genesis', 'event_berlin', 'event_london',
    'event_eth_ath_nov2021', 'event_terra_collapse', 'event_merge',
    'event_ftx_collapse', 'event_shanghai', 'event_dencun',
    'event_btc_halving_2024', 'event_eth_etf_approval', 'event_pectra',
)


def create_ethereum_event_features_complete(df):
    """
    Add Ethereum timeline event indicators (2013 to 2025-10-07).

    Columns appended to a copy of ``df`` (all integer valued):

    * one 0/1 flag per entry of ``_ETH_POINT_EVENTS`` (1 on the event date);
    * ``period_{name}`` 0/1 flags for the inclusive ranges in
      ``_ETH_PERIODS``;
    * for every event in ``_ETH_CRITICAL_EVENTS``:
      ``{event}_pre30`` (the 29 days before the event, event day excluded),
      ``{event}_post7`` (days 1-7 after) and
      ``{event}_post30`` (days 8-30 after);
    * ``days_since_last_event``: whole days since the most recent
      ``_ETH_META_EVENTS`` date strictly before the row date (0 if none);
    * ``event_count_90d``: number of those dates ``d`` with
      ``date - 90 days <= d < date``.

    Changes over the previous version:

    * the per-event copy-paste is replaced by the tables above (same column
      names, same order, same values);
    * the two meta indicators are computed with ``numpy.searchsorted``
      instead of ``iterrows`` + ``.at``. That removes the per-row Python
      loop and assigns by position, so frames with duplicate index labels
      no longer get the value of one row written into another.

    NOTE(review): flags such as market tops/bottoms and the ``_pre30``
    windows are only knowable in hindsight - confirm they are acceptable as
    model inputs.

    Args:
        df: DataFrame with a timezone-naive datetime ``date`` column.

    Returns:
        A copy of ``df`` with the event columns appended.
    """
    print("\n=== 이더리움 이벤트 지표 생성 (2013-2025 완전판) ===")

    df_event = df.copy()
    dates = df_event['date']
    event_dates = {name: pd.to_datetime(day) for name, day in _ETH_POINT_EVENTS}

    # ----- single-day flags -----
    for name, when in event_dates.items():
        df_event[name] = (dates == when).astype(int)

    # ----- long-running periods -----
    for suffix, first_day, last_day in _ETH_PERIODS:
        start, end = pd.to_datetime(first_day), pd.to_datetime(last_day)
        df_event[f'period_{suffix}'] = ((dates >= start) & (dates <= end)).astype(int)

    # ----- windows around the critical events -----
    week, month = pd.Timedelta(days=7), pd.Timedelta(days=30)
    for name in _ETH_CRITICAL_EVENTS:
        when = event_dates[name]
        df_event[f'{name}_pre30'] = ((dates > when - month) & (dates < when)).astype(int)
        df_event[f'{name}_post7'] = ((dates > when) & (dates <= when + week)).astype(int)
        df_event[f'{name}_post30'] = ((dates > when + week) & (dates <= when + month)).astype(int)

    # ----- meta indicators (vectorised) -----
    day_values = dates.to_numpy(dtype='datetime64[ns]')
    anchors = np.sort(np.array(
        [event_dates[name].to_datetime64() for name in _ETH_META_EVENTS],
        dtype='datetime64[ns]',
    ))
    known = ~np.isnat(day_values)  # rows with a missing date keep 0

    # Number of anchor events strictly before each row date.
    n_past = np.searchsorted(anchors, day_values, side='left')

    days_since = np.zeros(len(day_values), dtype='int64')
    has_past = known & (n_past > 0)
    days_since[has_past] = (
        (day_values[has_past] - anchors[n_past[has_past] - 1]) // np.timedelta64(1, 'D')
    )

    # Events strictly before (date - 90d) fall outside the look-back window.
    n_before_window = np.searchsorted(
        anchors, day_values - np.timedelta64(90, 'D'), side='left'
    )
    recent_count = np.where(known, n_past - n_before_window, 0).astype('int64')

    df_event['days_since_last_event'] = days_since
    df_event['event_count_90d'] = recent_count

    event_count = (
        len(_ETH_POINT_EVENTS)
        + len(_ETH_PERIODS)
        + 3 * len(_ETH_CRITICAL_EVENTS)
        + 2
    )

    print(f"✓ 이벤트 지표 추가: {event_count}개")
    print(f"  기간: 2013-11-27 ~ 2025-10-07")

    return df_event


In [5]:


# ============================================================================
# 실행
# ============================================================================
print("=== 데이터 전처리 파이프라인 시작 ===\n")


# 4. Event indicators - currently disabled: the merged frame is passed on
#    unchanged. Re-enable the call below to add the event features.
#df_with_events = create_ethereum_event_features_complete(df_merged)
df_with_events=df_merged

# 5. Lagged price features
df_with_price_lags = add_price_lag_features_first(df_with_events)

# 6. Technical indicators
df_with_indicators = calculate_technical_indicators(df_with_price_lags)

# 7. ETH/BTC rolling correlations
df = add_pairwise_correlation_features(df_with_indicators, window_sizes=[7,14,30])
df = add_volatility_correlation(df, window_sizes=[14,30])

# 8. Lag features (original columns are kept)
df_with_lags = apply_lag_features(df, news_lag=2, onchain_lag=1)

# 9. Targets
df_final = create_targets(df_with_lags)

# 10. Missing-value handling, then drop rows without a target
df_clean = handle_missing_values_paper_based(df_final, train_start_date)
df_clean = df_clean.dropna(subset=['next_log_return', 'next_close', 'next_direction'])

# Extra handling for two indicator columns: forward-fill their gaps.
# `ffill()` replaces the deprecated `fillna(method='ffill')`.
# NOTE(review): if ICS_26 / DPO_20 come from pandas_ta defaults (Ichimoku
# chikou span / centered DPO) they are shifted backwards in time, i.e. each
# row would contain future prices - confirm against
# calculate_technical_indicators and consider dropping them.
if 'ICS_26' in df_clean.columns:
    df_clean['ICS_26'] = df_clean['ICS_26'].ffill()
if 'DPO_20' in df_clean.columns:
    df_clean['DPO_20'] = df_clean['DPO_20'].ffill()

# Drop the rows where any lag feature is still missing
lag_cols = [col for col in df_clean.columns if '_lag' in col]
df_clean = df_clean.dropna(subset=lag_cols).reset_index(drop=True)

# 11. Split
train_df, val_df, test_df = split_train_val_test(df_clean, train_start_date)

# 12. Leakage check (result is only printed, not acted upon)
validate_data_leakage(train_df, val_df, test_df)

print(f"\n=== 최종 결과 ===")
# 4 non-feature columns: date + the three targets
print(f"총 피처: {df_clean.shape[1] - 4}개")

event_features = [col for col in df_clean.columns if 'event_' in col or 'period_' in col]
print(f"이벤트 피처: {len(event_features)}개")



=== 데이터 전처리 파이프라인 시작 ===


=== [STEP 0] 과거 가격 피처 추가 ===
  ✓ 추가된 과거 가격 피처: 31개

=== 기술적 지표 생성 중 ===
  - Momentum 지표 생성 중...
  - Overlap 지표 생성 중...
  - Volatility 지표 생성 중...
  - Volume 지표 생성 중...
  - Trend 지표 생성 중...
  - 파생 지표 생성 중...
  - 고급 파생 지표 생성 중...

✓ 기술적 지표 생성 완료: 124개 추가
  총 컬럼 수: 250

=== Lag 피처 적용 중 (원본 유지) ===
  [감성 지표 Lag]
    sentiment_mean: 원본 유지 + lag1~2 추가
    sentiment_std: 원본 유지 + lag1~2 추가
    sentiment_sum: 원본 유지 + lag1~2 추가
    news_count: 원본 유지 + lag1~2 추가
    positive_ratio: 원본 유지 + lag1~2 추가
    negative_ratio: 원본 유지 + lag1~2 추가
    sentiment_polarity: 원본 유지 + lag1~2 추가
    sentiment_intensity: 원본 유지 + lag1~2 추가
    sentiment_disagreement: 원본 유지 + lag1~2 추가
    bull_bear_ratio: 원본 유지 + lag1~2 추가
    weighted_sentiment: 원본 유지 + lag1~2 추가
    extremity_index: 원본 유지 + lag1~2 추가
    extreme_positive_count: 원본 유지 + lag1~2 추가
    extreme_negative_count: 원본 유지 + lag1~2 추가
  [온체인 지표 Lag]
    10개 컬럼: 원본 유지 + lag1 추가
  [기타 외부 변수 Lag]
    14개 컬럼: 원본 유지 + lag1 추가
  [Lag 미적용]

In [None]:
##### 위의 내용이 지표 생성 및 결측치 처리 train-val-test 셋 분리 #######

In [90]:
# Display the last five rows of the cleaned frame as a quick sanity check.
df_clean.tail(5)

Unnamed: 0,date,BTC_Open,BTC_High,BTC_Low,BTC_Close,BTC_Volume,ETH_Open,ETH_High,ETH_Low,ETH_Close,...,makerdao_makerdao_eth_tvl_lag1,chain_eth_chain_tvl_lag1,funding_fundingRate_lag1,sp500_SP500_lag1,vix_VIX_lag1,gold_GOLD_lag1,dxy_DXY_lag1,next_log_return,next_direction,next_close
1747,2025-10-01,114057.59375,118648.929688,113981.398438,118648.929688,71328680132,4146.033691,4351.112305,4125.541992,4351.112305,...,5826659000.0,188625100000.0,2.7e-07,6688.459961,16.280001,3840.800049,97.769997,0.030959,1,4487.923828
1748,2025-10-02,118652.382812,121086.40625,118383.15625,120681.257812,71415163912,4352.240723,4517.665039,4336.526367,4487.923828,...,6071020000.0,186607100000.0,8.433333e-07,6711.200195,16.290001,3867.5,97.709999,0.005986,1,4514.870605
1749,2025-10-03,120656.984375,123944.703125,119344.3125,122266.53125,83941392228,4486.93457,4591.443848,4431.479004,4514.870605,...,6211323000.0,193734000000.0,2.751e-05,6715.350098,16.629999,3839.699951,97.849998,-0.005703,0,4489.197266
1750,2025-10-04,122267.46875,122857.640625,121577.570312,122425.429688,36769171735,4514.90918,4519.526855,4444.012695,4489.197266,...,6367490000.0,198772200000.0,4.946e-05,6715.790039,16.65,3880.800049,97.720001,0.005825,1,4515.422852
1751,2025-10-05,122419.671875,125559.210938,122191.960938,123513.476562,73689317763,4489.053223,4616.533203,4472.138672,4515.422852,...,6117770000.0,200146300000.0,3.698333e-05,6715.790039,16.65,3880.800049,97.720001,0.037458,1,4687.771484


In [6]:
# ============================================================================
# df_clean 결측치 상세 분석
# ============================================================================

def analyze_missing_values(df, name="df_clean"):
    """
    Print a detailed missing-value report for a DataFrame.

    The report covers, in order: overall totals, per-column counts, counts
    grouped by feature category (matched by column-name substring), the
    per-row distribution, the ten rows with the most missing values, the ten
    calendar months with the most missing values, and a suggested handling
    for the ten columns with the most missing values.

    Args:
        df: DataFrame to inspect. It must have a datetime ``date`` column,
            because the report prints the date range and groups by month.
        name: Label shown in the report header. It is also used as the
            variable name in the suggested fill commands, so the suggestions
            can be pasted as-is for whichever frame was analysed.

    Returns:
        None. Everything is written to stdout. When the frame has no missing
        values the function returns right after the overall summary.
    """
    print(f"\n{'='*80}")
    print(f"결측치 분석: {name}")
    print(f"{'='*80}")

    # 1. Overall statistics
    total_cells = df.shape[0] * df.shape[1]
    total_missing = df.isnull().sum().sum()
    missing_pct = (total_missing / total_cells) * 100

    print("\n[전체 통계]")
    print(f"  Shape: {df.shape}")
    print(f"  전체 셀: {total_cells:,}개")
    print(f"  결측치: {total_missing:,}개 ({missing_pct:.2f}%)")
    print(f"  날짜 범위: {df['date'].min().date()} ~ {df['date'].max().date()}")

    # 2. Per-column missing counts (only columns that have any)
    col_missing = df.isnull().sum()
    col_missing_pct = (col_missing / len(df) * 100)

    missing_cols = col_missing[col_missing > 0].sort_values(ascending=False)

    if len(missing_cols) == 0:
        print("\n✓ 결측치가 없습니다!")
        return

    print(f"\n[컬럼별 결측치] (총 {len(missing_cols)}개 컬럼)")
    print(f"{'컬럼명':<50} {'결측치 개수':>12} {'비율':>10}")
    print("-" * 80)

    for col, count in missing_cols.items():
        pct = col_missing_pct[col]
        print(f"{col:<50} {int(count):>12,}개 {pct:>9.2f}%")

    # 3. Grouping by category. A column belongs to a category when its name
    #    contains any of the category's keywords, so one column can be
    #    counted under several categories.
    print("\n[카테고리별 결측치]")

    categories = {
        '기술적 지표': ['RSI', 'MACD', 'SMA', 'EMA', 'WMA', 'HMA', 'ATR', 'BBL', 'BBM', 'BBU',
                     'WILLR', 'ROC', 'MOM', 'CCI', 'STOCH', 'ADX', 'AROON', 'TSI', 'UO',
                     'DPO', 'ICS', 'LINREG', 'SLOPE', 'TEMA', 'DEMA', 'VWMA', 'NATR'],
        '감성 지표': ['sentiment', 'news', 'positive_ratio', 'negative_ratio', 'bull_bear'],
        '온체인': ['eth_tx', 'eth_active', 'eth_new', 'eth_large', 'eth_token', 'eth_contract', 'eth_avg'],
        'TVL/펀딩': ['tvl', 'lido', 'aave', 'makerdao', 'funding'],
        '거시경제': ['sp500', 'vix', 'gold', 'dxy'],
        '이벤트': ['event_', 'period_'],
        'Lag 피처': ['_lag'],
        '가격': ['Close', 'High', 'Low', 'Volume', 'Open'],
        '타겟': ['next_']
    }

    for cat_name, keywords in categories.items():
        cat_cols = [col for col in missing_cols.index
                   if any(kw in col for kw in keywords)]
        if cat_cols:
            cat_missing = sum(col_missing[col] for col in cat_cols)
            print(f"  {cat_name}: {len(cat_cols)}개 컬럼, {cat_missing:,}개 결측치")

    # 4. Distribution of missing values per row
    row_missing = df.isnull().sum(axis=1)
    rows_with_missing = (row_missing > 0).sum()

    print("\n[행별 결측치 분포]")
    print(f"  결측치 있는 행: {rows_with_missing:,}개 / {len(df):,}개 ({rows_with_missing/len(df)*100:.2f}%)")

    if rows_with_missing > 0:
        print("\n  결측치 개수별 행 분포:")
        missing_counts = row_missing.value_counts().sort_index()
        for num_missing, count in missing_counts.items():
            if num_missing > 0:
                print(f"    {int(num_missing):3}개 결측: {int(count):>6,}행")

    # 5. The rows with the most missing values
    if rows_with_missing > 0:
        print("\n[결측치가 가장 많은 상위 10개 행]")
        top_missing_rows = row_missing.nlargest(10)

        for idx, num_missing in top_missing_rows.items():
            # nlargest can include complete rows when fewer than ten rows
            # have gaps; skip those.
            if num_missing > 0:
                date = df.loc[idx, 'date'].date()
                missing_cols_in_row = df.loc[idx][df.loc[idx].isnull()].index.tolist()
                print(f"  행 {idx} ({date}): {int(num_missing)}개 결측")
                print(f"    결측 컬럼 (처음 5개): {missing_cols_in_row[:5]}")

    # 6. Missing-value pattern by calendar month (work on a copy so the
    #    caller's frame does not get the helper columns)
    print("\n[시간대별 결측치 패턴]")
    df_temp = df.copy()
    df_temp['year_month'] = df_temp['date'].dt.to_period('M')
    df_temp['missing_count'] = df_temp.isnull().sum(axis=1)

    period_missing = df_temp.groupby('year_month')['missing_count'].agg(['mean', 'sum'])
    period_missing = period_missing[period_missing['sum'] > 0].sort_values('sum', ascending=False).head(10)

    if len(period_missing) > 0:
        print("  결측치가 많은 상위 10개 월:")
        for period, row in period_missing.iterrows():
            print(f"    {period}: 평균 {row['mean']:.1f}개/행, 총 {int(row['sum'])}개")

    # 7. Suggested handling for the ten worst columns. The suggested commands
    #    reference `name` (not a hard-coded frame) and use Series.ffill(),
    #    since fillna(method='ffill') is deprecated in recent pandas.
    print("\n[권장 처리 방법]")

    for col in missing_cols.head(10).index:
        pct = col_missing_pct[col]

        if 'RSI' in col or 'MACD' in col or 'SMA' in col or 'EMA' in col:
            suggestion = "→ 초기 계산 기간 결측 (자연스러움, 유지 권장)"
        elif '_lag' in col:
            suggestion = "→ Lag로 인한 자연 결측 (유지)"
        elif 'sentiment' in col or 'news' in col:
            suggestion = f"→ {name}['{col}'].ffill().fillna(0)"
        elif 'eth_' in col or 'tvl' in col:
            suggestion = f"→ {name}['{col}'].ffill()"
        elif 'event_' in col:
            suggestion = f"→ {name}['{col}'].fillna(0).astype(int)"
        else:
            suggestion = "→ 원인 확인 필요"

        print(f"  {col} ({pct:.1f}%): {suggestion}")

# ============================================================================
# 실행
# ============================================================================

# Full missing-value report for the cleaned frame
analyze_missing_values(df_clean, "df_clean")

# Extra: per-column drill-down
banner = '=' * 80
print(f"\n{banner}")
print("특정 컬럼 상세 확인")
print(f"{banner}")

# The ten columns with the highest missing counts
missing_per_column = df_clean.isnull().sum()
top_missing_cols = missing_per_column.sort_values(ascending=False).head(10)

for col in top_missing_cols.index:
    n_missing = top_missing_cols[col]
    if not n_missing > 0:
        # Fully populated column: nothing to show
        continue

    print(f"\n[{col}]")
    print(f"  결측치: {n_missing}개")
    print(f"  데이터 타입: {df_clean[col].dtype}")
    print(f"  비결측치 통계:")
    print(df_clean[col].describe())

    # First five dates on which this column is missing
    is_missing = df_clean[col].isnull()
    missing_dates = df_clean[is_missing]['date'].head()
    print(f"  결측 날짜 (처음 5개): {missing_dates.dt.date.tolist()}")



결측치 분석: df_clean

[전체 통계]
  Shape: (1752, 312)
  전체 셀: 546,624개
  결측치: 0개 (0.00%)
  날짜 범위: 2020-12-19 ~ 2025-10-05

✓ 결측치가 없습니다!

특정 컬럼 상세 확인


In [None]:
############ 1006 버전 일단 복사 #################
############ 1006 버전 일단 복사 #################
############ 1006 버전 일단 복사 #################
############ 1006 버전 일단 복사 #################
############ 1006 버전 일단 복사 #################
############ 1006 버전 일단 복사 #################


In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, 
    AdaBoostClassifier, VotingClassifier, StackingClassifier, 
    BaggingClassifier, ExtraTreesClassifier
)
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, roc_curve, confusion_matrix
)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')


def prepare_feature_target(df, task='classification'):
    """
    Split a prepared frame into features, target, dates and prices.

    Columns excluded from the feature matrix are the date, the three target
    columns, and the raw ETH open/high/low/close prices.

    Args:
        df: DataFrame containing at least ``date``, ``ETH_Close``,
            ``next_direction`` and ``next_log_return``.
        task: ``'classification'`` selects ``next_direction`` as the target;
            any other value selects ``next_log_return``.

    Returns:
        Tuple ``(X, y, dates, prices, feature_cols)`` where ``X`` is a copy of
        the feature columns, ``y`` a copy of the target column, ``dates`` a
        copy of ``date``, ``prices`` a copy of ``ETH_Close`` and
        ``feature_cols`` the list of feature column names in frame order.
    """
    exclude_cols = ['date', 'next_log_return', 'next_direction','next_close',
                   'ETH_Close', 'ETH_High', 'ETH_Low', 'ETH_Open']
    # The previous version then looped over feature_cols printing any name
    # found in exclude_cols; that could never fire after this filter, so the
    # dead loop was removed.
    feature_cols = [col for col in df.columns if col not in exclude_cols]

    X = df[feature_cols].copy()
    target_col = 'next_direction' if task == 'classification' else 'next_log_return'
    y = df[target_col].copy()
    dates = df['date'].copy()
    prices = df['ETH_Close'].copy()

    return X, y, dates, prices, feature_cols


def feature_selection_before_scaling(X_train, y_train, X_val, X_test, n_features=100):
    """
    Keep the top-scoring features according to an ANOVA F-test (SelectKBest).

    The selector is fitted on the training split only and then applied
    unchanged to the validation and test splits.

    Args:
        X_train: Training features; must expose ``.columns`` (a DataFrame).
        y_train: Training labels used to score the features.
        X_val: Validation features.
        X_test: Test features.
        n_features: Requested number of features; capped at the number of
            columns in ``X_train``.

    Returns:
        Tuple ``(X_train_selected, X_val_selected, X_test_selected,
        selected_features)``; the first three are the selector's outputs and
        the last is the list of retained column names.
    """
    n_available = X_train.shape[1]
    print(f"\n[FEATURE SELECTION] {n_available} -> {n_features}")

    kbest = SelectKBest(score_func=f_classif, k=min(n_features, n_available))
    train_selected = kbest.fit_transform(X_train, y_train)
    val_selected, test_selected = (kbest.transform(split) for split in (X_val, X_test))

    keep_mask = kbest.get_support()
    selected_features = X_train.columns[keep_mask].tolist()
    print(selected_features)
    print(f"  Selected: {len(selected_features)} features")

    return train_selected, val_selected, test_selected, selected_features


def check_data_leakage(X_train, X_val, X_test, train_dates, val_dates, test_dates):
    """
    Run sanity checks on a train / validation / test split and print a report.

    Checks, in order: pairwise index overlap between the splits, date-range
    overlap between consecutive splits, NaN counts per split, and inf counts
    per split.

    Args:
        X_train, X_val, X_test: Feature frames of the three splits.
        train_dates, val_dates, test_dates: Date series matching each split.

    Returns:
        True when no issue was recorded, False otherwise. NaN and inf
        warnings also count as issues.
    """
    rule = "=" * 100
    print("\n" + rule)
    print("DATA LEAKAGE VERIFICATION")
    print(rule)

    frames = (("Train", X_train), ("Validation", X_val), ("Test", X_test))
    issues = []

    # Index overlap between each pair of splits
    row_ids = {label: set(frame.index) for label, frame in frames}
    overlap_checks = (
        ("Train", "Validation", "CRITICAL: Train and validation index overlap"),
        ("Train", "Test", "CRITICAL: Train and test index overlap"),
        ("Validation", "Test", "CRITICAL: Validation and test index overlap"),
    )
    for left, right, message in overlap_checks:
        if row_ids[left] & row_ids[right]:
            issues.append(message)

    # Chronological order: each split must end before the next one starts
    if train_dates.max() >= val_dates.min():
        issues.append("CRITICAL: Train dates overlap with validation dates")
    if val_dates.max() >= test_dates.min():
        issues.append("CRITICAL: Validation dates overlap with test dates")

    # Missing values per split
    for label, frame in frames:
        nan_total = frame.isnull().sum().sum()
        if nan_total > 0:
            issues.append(f"WARNING: {label} has {nan_total} NaN values")

    # Infinite values per split
    for label, frame in frames:
        inf_total = np.isinf(frame).sum().sum()
        if inf_total > 0:
            issues.append(f"WARNING: {label} has {inf_total} inf values")

    print(f"\nTrain: {len(X_train)} samples, {train_dates.min()} to {train_dates.max()}")
    print(f"Val:   {len(X_val)} samples, {val_dates.min()} to {val_dates.max()}")
    print(f"Test:  {len(X_test)} samples, {test_dates.min()} to {test_dates.max()}")

    if not issues:
        print("\nNo data leakage detected")
        return True

    print("\nData leakage issues detected:")
    for issue in issues:
        print(f"  {issue}")
    return False


def scale_features(X_train, X_val, X_test):
    """
    Scale the three splits with a RobustScaler fitted on the training split.

    Returns:
        Tuple ``(X_train_scaled, X_val_scaled, X_test_scaled, scaler)``; the
        fitted scaler is returned so it can be reused later.
    """
    scaler = RobustScaler()
    # Fit on train only; validation and test reuse the fitted statistics.
    train_scaled = scaler.fit_transform(X_train)
    val_scaled, test_scaled = (scaler.transform(split) for split in (X_val, X_test))

    return train_scaled, val_scaled, test_scaled, scaler


def get_all_models():
    """
    Build the dictionary of unfitted classifiers used in the experiments.

    The dictionary holds, in insertion order: single-model baselines (tree
    ensembles, linear models, SVMs, MLPs, KNN, naive Bayes, bagging, XGBoost,
    LightGBM), then soft/hard voting and stacking ensembles, then
    isotonic-calibrated wrappers around the XGBoost and LightGBM entries.

    Returns:
        dict mapping model name to an unfitted estimator.
    """
    seed = 42

    # Hyper-parameters shared by several entries
    forest_kwargs = dict(
        n_estimators=200, max_depth=10, min_samples_split=20,
        min_samples_leaf=10, random_state=seed, n_jobs=-1
    )
    mlp_kwargs = dict(
        activation='relu', solver='adam', batch_size=64,
        learning_rate='adaptive', max_iter=300,
        early_stopping=True, random_state=seed
    )

    def _small_rf():
        # Lighter random forest used as an ensemble member
        return RandomForestClassifier(n_estimators=100, max_depth=8, random_state=seed, n_jobs=-1)

    def _small_gb():
        # Lighter gradient boosting used as an ensemble member
        return GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=seed)

    def _voting_members():
        # Fresh estimator instances for each voting ensemble
        return [
            ('rf', _small_rf()),
            ('gb', _small_gb()),
            ('lr', LogisticRegression(C=0.1, random_state=seed, n_jobs=-1)),
        ]

    def _isotonic_calibrated(estimator):
        # Isotonic calibration with time-ordered cross-validation folds
        return CalibratedClassifierCV(
            estimator,
            method='isotonic',
            cv=TimeSeriesSplit(n_splits=3)
        )

    models = {}

    # --- Tree-based models ---
    models['RandomForest'] = RandomForestClassifier(**forest_kwargs)
    models['GradientBoosting'] = GradientBoostingClassifier(
        n_estimators=200, max_depth=4, learning_rate=0.03,
        subsample=0.75, random_state=seed
    )
    models['ExtraTrees'] = ExtraTreesClassifier(**forest_kwargs)
    models['AdaBoost'] = AdaBoostClassifier(
        n_estimators=100, learning_rate=0.5, random_state=seed
    )
    models['DecisionTree'] = DecisionTreeClassifier(
        max_depth=8, min_samples_split=20, min_samples_leaf=10,
        random_state=seed
    )

    # --- Linear models ---
    models['LogisticRegression'] = LogisticRegression(
        C=0.1, penalty='l2', max_iter=2000, random_state=seed, n_jobs=-1
    )
    models['RidgeClassifier'] = RidgeClassifier(alpha=1.0, random_state=seed)

    # --- Support vector machines ---
    models['SVM_RBF'] = SVC(
        kernel='rbf', C=1.0, gamma='scale',
        probability=True, random_state=seed
    )
    models['SVM_Linear'] = SVC(
        kernel='linear', C=0.1,
        probability=True, random_state=seed
    )

    # --- Neural networks ---
    models['MLP_Small'] = MLPClassifier(
        hidden_layer_sizes=(64, 32), alpha=0.01, **mlp_kwargs
    )
    models['MLP_Medium'] = MLPClassifier(
        hidden_layer_sizes=(128, 64, 32), alpha=0.001, **mlp_kwargs
    )

    # --- Instance-based / probabilistic / bagging ---
    models['KNN'] = KNeighborsClassifier(
        n_neighbors=15, weights='distance', n_jobs=-1
    )
    models['NaiveBayes'] = GaussianNB()
    models['Bagging_RF'] = BaggingClassifier(
        estimator=DecisionTreeClassifier(max_depth=8, random_state=seed),
        n_estimators=50, random_state=seed, n_jobs=-1
    )

    # --- Gradient boosting libraries (configured for GPU training) ---
    models['XGBoost_GPU'] = XGBClassifier(
        n_estimators=200,
        learning_rate=0.03,
        max_depth=3,
        min_child_weight=5,
        subsample=0.7,
        colsample_bytree=0.7,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=5,
        tree_method='gpu_hist',
        random_state=seed
    )
    models['LightGBM_GPU'] = LGBMClassifier(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=20,
        max_depth=4,
        min_child_samples=20,
        subsample=0.7,
        colsample_bytree=0.7,
        reg_alpha=0.1,
        reg_lambda=5,
        device='gpu',
        random_state=seed,
        verbose=-1,
        n_jobs=-1
    )

    # --- Ensembles ---
    models['Voting_Soft'] = VotingClassifier(estimators=_voting_members(), voting='soft')
    models['Voting_Hard'] = VotingClassifier(estimators=_voting_members(), voting='hard')
    models['Stacking'] = StackingClassifier(
        estimators=[
            ('rf', _small_rf()),
            ('gb', _small_gb()),
            ('et', ExtraTreesClassifier(n_estimators=100, max_depth=8, random_state=seed, n_jobs=-1)),
        ],
        final_estimator=LogisticRegression(C=0.1, random_state=seed),
        cv=5
    )

    # --- Calibrated wrappers around the boosting entries defined above ---
    models['XGBoost_Calibrated'] = _isotonic_calibrated(models['XGBoost_GPU'])
    models['LightGBM_Calibrated'] = _isotonic_calibrated(models['LightGBM_GPU'])

    return models


def optimize_threshold_on_validation(y_val, y_proba_val, val_dates, val_prices):
    """
    Grid-search buy/sell probability thresholds on the validation split.

    Every (buy, sell) pair from the two grids is backtested with
    ``calculate_trading_performance_corrected`` and the pair with the highest
    Sharpe ratio is kept. Ties keep the first pair encountered.

    Args:
        y_val: True validation labels (passed through to the backtest).
        y_proba_val: Array of predicted positive-class probabilities.
        val_dates: Validation dates.
        val_prices: Validation prices.

    Returns:
        Tuple ``(buy_threshold, sell_threshold)``. Falls back to
        ``(0.55, 0.45)`` only if no grid point beats negative infinity.
    """
    # The hard signal depends only on the fixed 0.5 cut-off, not on the
    # thresholds being searched, so compute it once instead of per grid point.
    predictions_temp = (y_proba_val > 0.5).astype(int)

    buy_grid = np.arange(0.50, 0.70, 0.05)
    sell_grid = np.arange(0.30, 0.50, 0.05)

    best_sharpe = -np.inf
    best_thresholds = (0.55, 0.45)

    for buy_th in buy_grid:
        for sell_th in sell_grid:
            temp_result = calculate_trading_performance_corrected(
                predictions_temp, y_proba_val, val_dates, val_prices, y_val,
                initial_capital=10000, transaction_cost=0.002, slippage=0.001,
                buy_threshold=buy_th, sell_threshold=sell_th
            )

            if temp_result['sharpe_ratio'] > best_sharpe:
                best_sharpe = temp_result['sharpe_ratio']
                best_thresholds = (buy_th, sell_th)

    return best_thresholds


def train_all_models(X_train, y_train, X_val, y_val, X_test, y_test, 
                     val_dates=None, val_prices=None, optimize_thresholds=False):
    """
    Fit every model from ``get_all_models`` and evaluate it on all splits.

    A model that raises during fitting or evaluation is reported on stdout
    and left out of every returned container; the remaining models still run.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.
        val_dates, val_prices: Validation dates and prices; only needed for
            threshold optimisation.
        optimize_thresholds: When True and both ``val_dates`` and
            ``val_prices`` are given, buy/sell thresholds are tuned per model
            on the validation split; otherwise ``(0.55, 0.45)`` is used.

    Returns:
        Tuple ``(results_df, models_trained, predictions, probabilities,
        thresholds)``: a metrics DataFrame with one row per successful model,
        and dicts keyed by model name holding the fitted model, the test
        predictions, the test scores and the (buy, sell) thresholds.
    """
    print("\n" + "="*100)
    print("MODEL TRAINING AND EVALUATION")
    print("="*100)
    
    models_config = get_all_models()
    
    results = []
    models_trained = {}
    predictions = {}
    probabilities = {}
    thresholds = {}
    
    print(f"\n{'Model':<30} {'Train Acc':<12} {'Val Acc':<12} {'Test Acc':<12} {'Test AUC':<12} Status")
    print("-" * 110)
    
    for name, model in models_config.items():
        try:
            model.fit(X_train, y_train)
            
            train_pred = model.predict(X_train)
            val_pred = model.predict(X_val)
            test_pred = model.predict(X_test)
            
            if hasattr(model, 'predict_proba'):
                train_proba = model.predict_proba(X_train)[:, 1]
                val_proba = model.predict_proba(X_val)[:, 1]
                test_proba = model.predict_proba(X_test)[:, 1]
            else:
                # Models without probability output fall back to their hard
                # labels, so their "probabilities" are just the predictions.
                train_proba = train_pred
                val_proba = val_pred
                test_proba = test_pred
            
            if optimize_thresholds and val_dates is not None and val_prices is not None:
                buy_th, sell_th = optimize_threshold_on_validation(
                    y_val, val_proba, val_dates, val_prices
                )
                thresholds[name] = (buy_th, sell_th)
            else:
                thresholds[name] = (0.55, 0.45)
            
            train_acc = accuracy_score(y_train, train_pred)
            val_acc = accuracy_score(y_val, val_pred)
            test_acc = accuracy_score(y_test, test_pred)
            
            test_precision = precision_score(y_test, test_pred, zero_division=0)
            test_recall = recall_score(y_test, test_pred, zero_division=0)
            test_f1 = f1_score(y_test, test_pred, zero_division=0)
            
            try:
                train_auc = roc_auc_score(y_train, train_proba)
                val_auc = roc_auc_score(y_val, val_proba)
                test_auc = roc_auc_score(y_test, test_proba)
            except Exception:
                # AUC could not be computed (e.g. the scorer rejected the
                # labels); report chance level for all splits. `Exception`
                # rather than a bare except so Ctrl-C and SystemExit still
                # propagate.
                train_auc = val_auc = test_auc = 0.5
            
            results.append({
                'Model': name,
                'Train_Acc': train_acc,
                'Val_Acc': val_acc,
                'Test_Acc': test_acc,
                'Train_AUC': train_auc,
                'Val_AUC': val_auc,
                'Test_AUC': test_auc,
                'Test_Precision': test_precision,
                'Test_Recall': test_recall,
                'Test_F1': test_f1,
                'Overfit_Gap': train_acc - test_acc,
                'Buy_Threshold': thresholds[name][0],
                'Sell_Threshold': thresholds[name][1]
            })
            
            models_trained[name] = model
            predictions[name] = test_pred
            probabilities[name] = test_proba
            
            status = "OK"
            print(f"{name:<30} {train_acc:<12.4f} {val_acc:<12.4f} {test_acc:<12.4f} {test_auc:<12.4f} {status}")
            
        except Exception as e:
            # Best-effort loop: one failing model must not stop the others.
            print(f"{name:<30} {'ERROR':<12} {'ERROR':<12} {'ERROR':<12} {'ERROR':<12} {str(e)[:20]}")
    
    return pd.DataFrame(results), models_trained, predictions, probabilities, thresholds


def walk_forward_validation(df, models_config, n_splits=5, n_features=100):
    """
    Evaluate every model with expanding-window (TimeSeriesSplit) validation.

    For each fold, feature selection (SelectKBest with the ANOVA F-test) and
    scaling (RobustScaler) are fitted on that fold's training part only, then
    a fresh clone of every model is fitted and scored on the fold's test part.

    Args:
        df: Prepared frame accepted by ``prepare_feature_target``.
        models_config: dict mapping model name to an unfitted estimator; the
            estimators themselves are not fitted, only clones are.
        n_splits: Number of TimeSeriesSplit folds.
        n_features: Requested number of features per fold, capped at the
            number of available columns.

    Returns:
        Tuple ``(results_df, summary)``: one row per (fold, model) that ran
        successfully, and the per-model mean/std of each metric. When no
        model ran successfully, ``results_df`` is empty and ``summary`` is an
        empty DataFrame.
    """
    # Imported once here rather than inside the innermost loop.
    from sklearn.base import clone

    print("\n" + "="*100)
    print("WALK-FORWARD VALIDATION")
    print("="*100)
    
    tscv = TimeSeriesSplit(n_splits=n_splits)
    X, y, dates, prices, feature_cols = prepare_feature_target(df)
    all_results = []
    fold_num = 0
    
    for train_idx, test_idx in tscv.split(X):
        fold_num += 1
        print(f"\n--- Fold {fold_num}/{n_splits} ---")
        
        X_train_fold = X.iloc[train_idx]
        y_train_fold = y.iloc[train_idx]
        X_test_fold = X.iloc[test_idx]
        y_test_fold = y.iloc[test_idx]
        
        print(f"Train: {len(train_idx)} samples | Test: {len(test_idx)} samples")
        
        # Selection and scaling are fitted on the training part of the fold only.
        selector = SelectKBest(score_func=f_classif, k=min(n_features, X_train_fold.shape[1]))
        X_train_selected = selector.fit_transform(X_train_fold, y_train_fold)
        X_test_selected = selector.transform(X_test_fold)
        
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        X_test_scaled = scaler.transform(X_test_selected)
        
        for name, model in models_config.items():
            try:
                model_copy = clone(model)
                
                model_copy.fit(X_train_scaled, y_train_fold)
                test_pred = model_copy.predict(X_test_scaled)
                
                if hasattr(model_copy, 'predict_proba'):
                    test_proba = model_copy.predict_proba(X_test_scaled)[:, 1]
                else:
                    # No probability output: score with the hard labels.
                    test_proba = test_pred
                
                test_acc = accuracy_score(y_test_fold, test_pred)
                test_precision = precision_score(y_test_fold, test_pred, zero_division=0)
                test_recall = recall_score(y_test_fold, test_pred, zero_division=0)
                test_f1 = f1_score(y_test_fold, test_pred, zero_division=0)
                
                try:
                    test_auc = roc_auc_score(y_test_fold, test_proba)
                except Exception:
                    # AUC could not be computed for this fold; use chance
                    # level. `Exception` keeps Ctrl-C working.
                    test_auc = 0.5
                
                all_results.append({
                    'Fold': fold_num,
                    'Model': name,
                    'Test_Acc': test_acc,
                    'Test_Precision': test_precision,
                    'Test_Recall': test_recall,
                    'Test_F1': test_f1,
                    'Test_AUC': test_auc
                })
                
            except Exception as e:
                # Best-effort: a failing model is reported and skipped.
                print(f"  {name}: Error - {str(e)[:50]}")
    
    results_df = pd.DataFrame(all_results)
    
    print("\n" + "="*100)
    print("WALK-FORWARD VALIDATION SUMMARY")
    print("="*100)
    
    if results_df.empty:
        # Without this guard groupby('Model') fails on a frame with no columns.
        print("\nNo model completed any fold; nothing to summarize")
        return results_df, pd.DataFrame()
    
    summary = results_df.groupby('Model').agg({
        'Test_Acc': ['mean', 'std'],
        'Test_Precision': ['mean', 'std'],
        'Test_Recall': ['mean', 'std'],
        'Test_F1': ['mean', 'std'],
        'Test_AUC': ['mean', 'std']
    }).round(4)
    
    print("\n", summary)
    
    return results_df, summary


def calculate_trading_performance_corrected(predictions, probabilities, dates, prices, y_true,
                                           initial_capital=10000, transaction_cost=0.002, slippage=0.001,
                                           buy_threshold=0.55, sell_threshold=0.45):
    """
    Simulate a long-only, all-in/all-out strategy driven by model signals.

    The signal of row ``i`` is executed at the price of row ``i + 1``. A buy
    spends 95% of the available cash when the signal is 1 and the probability
    exceeds ``buy_threshold``; an open position is closed when the signal is 0
    or the probability drops below ``sell_threshold``. Transaction cost and
    slippage are added together and charged on both sides.

    Args:
        predictions: Hard signals (0/1), one per row.
        probabilities: Positive-class scores, one per row.
        dates: Series of dates (``.values`` is used).
        prices: Series of prices (``.values`` is used).
        y_true: Series of realised directions; stored but not used in the
            simulation.
        initial_capital: Starting cash.
        transaction_cost: Proportional fee per trade.
        slippage: Proportional slippage per trade.
        buy_threshold: Minimum probability required to open a position.
        sell_threshold: Probability below which an open position is closed.

    Returns:
        dict with ``final_value``, ``total_return`` (%), ``buy_hold_return``
        (%), ``sharpe_ratio`` (scaled by sqrt(252)), ``max_drawdown`` (%),
        ``n_trades`` and ``n_buys``.
    """
    frame = pd.DataFrame({
        'date': dates.values,
        'price': prices.values,
        'prediction': predictions,
        'probability': probabilities,
        'actual_direction': y_true.values
    })

    # Pull the columns out once instead of slicing a row on every step.
    price_col = frame['price'].to_numpy()
    signal_col = frame['prediction'].to_numpy()
    proba_col = frame['probability'].to_numpy()
    date_col = frame['date']

    fee_rate = transaction_cost + slippage
    cash = initial_capital
    holding = 0          # 1 while a position is open, 0 otherwise
    units = 0
    equity_curve = [initial_capital]
    trade_log = []

    for today in range(len(frame) - 1):
        tomorrow = today + 1
        signal = signal_col[today]
        confidence = proba_col[today]

        # Orders decided today are filled at the next row's price.
        fill_price = price_col[tomorrow]

        if signal == 1 and holding == 0 and confidence > buy_threshold:
            units_wanted = (cash * 0.95) / fill_price
            outlay = units_wanted * fill_price * (1 + fee_rate)
            if outlay <= cash:
                units = units_wanted
                cash -= outlay
                holding = 1
                trade_log.append({'action': 'BUY', 'price': fill_price, 'date': date_col.iloc[tomorrow]})

        elif (signal == 0 or confidence < sell_threshold) and holding == 1:
            proceeds = units * fill_price * (1 - fee_rate)
            cash += proceeds
            units = 0
            holding = 0
            trade_log.append({'action': 'SELL', 'price': fill_price, 'date': date_col.iloc[tomorrow]})

        # Mark the portfolio to market at the fill price.
        equity_curve.append(cash + (units * fill_price))

    final_value = equity_curve[-1]
    total_return = (final_value - initial_capital) / initial_capital * 100

    last_price = price_col[-1]
    first_price = price_col[0]
    buy_hold_return = (last_price - first_price) / first_price * 100

    equity = np.array(equity_curve)
    sharpe_ratio = 0
    max_drawdown = 0
    if len(equity) > 1:
        step_returns = (equity[1:] / equity[:-1]) - 1
        step_returns = step_returns[np.isfinite(step_returns)]

        if len(step_returns) > 0 and np.std(step_returns) > 0:
            sharpe_ratio = np.mean(step_returns) / np.std(step_returns) * np.sqrt(252)

        running_peak = np.maximum.accumulate(equity)
        drawdown = (equity - running_peak) / running_peak
        if len(drawdown) > 0:
            max_drawdown = np.min(drawdown) * 100

    n_trades = len(trade_log)
    n_buys = sum(1 for trade in trade_log if trade['action'] == 'BUY')

    return {
        'final_value': final_value,
        'total_return': total_return,
        'buy_hold_return': buy_hold_return,
        'sharpe_ratio': sharpe_ratio,
        'max_drawdown': max_drawdown,
        'n_trades': n_trades,
        'n_buys': n_buys
    }


def backtest_all_models(models, predictions, probabilities, test_dates, test_prices, y_test, thresholds=None):
    """
    Backtest every trained model on the test split and print a comparison.

    Each model is simulated with ``calculate_trading_performance_corrected``
    using 10000 starting capital, 0.2% transaction cost and 0.1% slippage,
    and compared with a buy-and-hold baseline over the same prices. A model
    whose backtest raises is reported on stdout and left out of the result.

    Args:
        models: Mapping whose keys are the model names to backtest.
        predictions: dict of test predictions keyed by model name.
        probabilities: dict of test scores keyed by model name.
        test_dates: Series of test dates.
        test_prices: Series of test prices.
        y_test: Series of true test labels.
        thresholds: Optional dict of ``(buy, sell)`` thresholds keyed by
            model name; models without an entry use ``(0.55, 0.45)``.

    Returns:
        DataFrame with one row of backtest metrics per successful model.
    """
    rule = "=" * 100
    print("\n" + rule)
    print("BACKTESTING RESULTS (Improved: Slippage 0.1% + Transaction Cost 0.2%)")
    print(rule)

    rows = []

    # Baseline: buy at the first test price, hold until the last one.
    closing_price = test_prices.iloc[-1]
    opening_price = test_prices.iloc[0]
    buy_hold_return = (closing_price - opening_price) / opening_price * 100

    print(f"\n{'Model':<30} {'Final Value':<15} {'Return %':<12} {'vs B&H':<12} {'Sharpe':<10} {'Max DD %':<12} {'Trades':<10}")
    print("-" * 110)

    default_pair = (0.55, 0.45)

    for name in models.keys():
        try:
            has_custom = thresholds and name in thresholds
            buy_th, sell_th = thresholds[name] if has_custom else default_pair

            perf = calculate_trading_performance_corrected(
                predictions[name],
                probabilities[name],
                test_dates,
                test_prices,
                y_test,
                initial_capital=10000,
                transaction_cost=0.002,
                slippage=0.001,
                buy_threshold=buy_th,
                sell_threshold=sell_th
            )

            outperformance = perf['total_return'] - buy_hold_return

            row = {
                'Model': name,
                'Final_Value': perf['final_value'],
                'Total_Return': perf['total_return'],
                'Buy_Hold_Return': buy_hold_return,
                'Outperformance': outperformance,
                'Sharpe_Ratio': perf['sharpe_ratio'],
                'Max_Drawdown': perf['max_drawdown'],
                'N_Trades': perf['n_trades'],
                'N_Buys': perf['n_buys'],
                'Buy_Threshold': buy_th,
                'Sell_Threshold': sell_th
            }
            rows.append(row)

            print(f"{name:<30} ${perf['final_value']:<14,.2f} {perf['total_return']:<11.2f}% "
                  f"{outperformance:<11.2f}% {perf['sharpe_ratio']:<9.3f} "
                  f"{perf['max_drawdown']:<11.2f}% {perf['n_trades']:<10}")

        except Exception as e:
            # Best-effort: report the failure and move on to the next model.
            print(f"{name:<30} Error: {str(e)[:50]}")

    print("-" * 110)
    print(f"{'Buy & Hold Baseline':<30} ${10000 * (1 + buy_hold_return/100):<14,.2f} {buy_hold_return:<11.2f}% "
          f"{'0.00':<11}% {'N/A':<9} {'N/A':<11} {'0':<10}")

    return pd.DataFrame(rows)


def create_comprehensive_report(results_df):
    """Print a ranked table of test metrics plus summary statistics.

    Args:
        results_df: DataFrame with one row per model and the columns
            'Model', 'Test_Acc', 'Test_Precision', 'Test_Recall',
            'Test_F1', 'Test_AUC' and 'Overfit_Gap'.

    Returns:
        ``results_df`` sorted by 'Test_AUC' in descending order with a fresh
        0-based index. An empty ``results_df`` is returned as-is after a
        notice is printed.
    """
    print("\n" + "="*100)
    print("DETAILED PERFORMANCE REPORT")
    print("="*100)

    # A frame built from zero result rows has no 'Test_AUC' column to sort on.
    if results_df.empty:
        print("\nNo model results to report")
        return results_df

    results_sorted = results_df.sort_values('Test_AUC', ascending=False).reset_index(drop=True)

    print(f"\n{'Rank':<6} {'Model':<30} {'Acc':<10} {'Prec':<10} {'Recall':<10} {'F1':<10} {'AUC':<10} {'Overfit':<10}")
    print("-" * 110)

    for rank, row in enumerate(results_sorted.to_dict('records'), start=1):
        print(f"{rank:<6} {row['Model']:<30} {row['Test_Acc']:<10.4f} {row['Test_Precision']:<10.4f} "
              f"{row['Test_Recall']:<10.4f} {row['Test_F1']:<10.4f} {row['Test_AUC']:<10.4f} {row['Overfit_Gap']:<10.4f}")

    # The table is ordered by AUC, so row 0 is only the best-AUC model.
    # Accuracy and F1 each need their own lookup.
    best_acc = results_sorted.nlargest(1, 'Test_Acc').iloc[0]
    best_auc = results_sorted.iloc[0]
    best_f1 = results_sorted.nlargest(1, 'Test_F1').iloc[0]

    print("\n" + "="*100)
    print("STATISTICAL SUMMARY")
    print("="*100)
    print(f"Best Test Accuracy:  {best_acc['Model']} ({best_acc['Test_Acc']:.4f})")
    print(f"Best Test AUC:       {best_auc['Model']} ({best_auc['Test_AUC']:.4f})")
    print(f"Best Test F1:        {best_f1['Model']} ({results_sorted['Test_F1'].max():.4f})")
    print(f"\nMean Test Accuracy:  {results_df['Test_Acc'].mean():.4f} +/- {results_df['Test_Acc'].std():.4f}")
    print(f"Mean Test AUC:       {results_df['Test_AUC'].mean():.4f} +/- {results_df['Test_AUC'].std():.4f}")
    print(f"Mean Overfit Gap:    {results_df['Overfit_Gap'].mean():.4f} +/- {results_df['Overfit_Gap'].std():.4f}")

    return results_sorted


def walk_forward_backtest(df, models_config, n_splits=5, n_features=100):
    """Backtest every model on each ``TimeSeriesSplit`` fold of ``df``.

    Within a fold, SelectKBest and RobustScaler are fit on the training rows
    only and then applied to the test rows. Each model is cloned, fit on the
    training rows, and its test predictions are fed to
    ``calculate_trading_performance_corrected`` with the default thresholds.

    Args:
        df: frame accepted by ``prepare_feature_target``.
        models_config: mapping of model name -> estimator. Estimators are
            cloned per fold, so the instances passed in are not fitted here.
        n_splits: number of ``TimeSeriesSplit`` folds.
        n_features: upper bound on the number of features SelectKBest keeps.

    Returns:
        ``(backtest_df, summary)``. ``backtest_df`` has one row per
        successful (model, fold) backtest. ``summary`` aggregates return,
        Sharpe, drawdown and trade count per model, sorted by mean return.
        When no backtest succeeded, both are empty DataFrames.
    """
    # Imported once per call instead of once per model per fold.
    from sklearn.base import clone

    print("\n" + "="*100)
    print("WALK-FORWARD BACKTESTING (Improved: Slippage 0.1% + Transaction Cost 0.2%)")
    print("="*100)

    tscv = TimeSeriesSplit(n_splits=n_splits)
    X, y, dates, prices, _ = prepare_feature_target(df)
    all_backtest_results = []

    for fold_num, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
        print(f"\n--- Fold {fold_num}/{n_splits} Backtest ---")

        X_train_fold = X.iloc[train_idx]
        y_train_fold = y.iloc[train_idx]
        X_test_fold = X.iloc[test_idx]
        y_test_fold = y.iloc[test_idx]
        test_dates_fold = dates.iloc[test_idx]
        test_prices_fold = prices.iloc[test_idx]

        print(f"Train: {len(train_idx)} samples | Test: {len(test_idx)} samples")

        # Selector and scaler see the training rows only.
        selector = SelectKBest(score_func=f_classif, k=min(n_features, X_train_fold.shape[1]))
        X_train_selected = selector.fit_transform(X_train_fold, y_train_fold)
        X_test_selected = selector.transform(X_test_fold)

        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        X_test_scaled = scaler.transform(X_test_selected)

        for name, model in models_config.items():
            # Best effort: one failing model must not abort the whole run.
            try:
                model_copy = clone(model)

                model_copy.fit(X_train_scaled, y_train_fold)
                test_pred = model_copy.predict(X_test_scaled)

                # Models without probabilities fall back to hard labels.
                if hasattr(model_copy, 'predict_proba'):
                    test_proba = model_copy.predict_proba(X_test_scaled)[:, 1]
                else:
                    test_proba = test_pred

                bt_result = calculate_trading_performance_corrected(
                    test_pred,
                    test_proba,
                    test_dates_fold,
                    test_prices_fold,
                    y_test_fold,
                    initial_capital=10000,
                    transaction_cost=0.002,
                    slippage=0.001
                )

                bt_result['Model'] = name
                bt_result['Fold'] = fold_num
                all_backtest_results.append(bt_result)

            except Exception as e:
                print(f"  {name}: Backtest Error - {str(e)[:50]}")

    backtest_df = pd.DataFrame(all_backtest_results)

    print("\n" + "="*100)
    print("WALK-FORWARD BACKTEST SUMMARY")
    print("="*100)

    # With zero successful backtests there is no 'Model' column to group by.
    if backtest_df.empty:
        print("\nNo successful backtests to summarize")
        return backtest_df, pd.DataFrame()

    summary = backtest_df.groupby('Model').agg({
        'total_return': ['mean', 'std'],
        'sharpe_ratio': ['mean', 'std'],
        'max_drawdown': ['mean', 'std'],
        'n_trades': ['mean', 'sum']
    }).round(4)

    # Flatten the (metric, statistic) column MultiIndex into 'metric_statistic'.
    summary.columns = ['_'.join(col).strip() for col in summary.columns.values]
    summary = summary.sort_values('total_return_mean', ascending=False)

    print("\n", summary)

    print(f"\n{'Model':<30} {'Avg Return %':<15} {'Avg Sharpe':<12} {'Avg Max DD %':<15} {'Total Trades':<12}")
    print("-" * 110)
    for model_name in summary.index:
        model_data = backtest_df[backtest_df['Model'] == model_name]
        avg_return = model_data['total_return'].mean()
        avg_sharpe = model_data['sharpe_ratio'].mean()
        avg_dd = model_data['max_drawdown'].mean()
        total_trades = model_data['n_trades'].sum()
        print(f"{model_name:<30} {avg_return:<15.2f} {avg_sharpe:<12.3f} {avg_dd:<15.2f} {total_trades:<12.0f}")

    return backtest_df, summary

In [40]:
# End-to-end driver cell: split -> leakage check -> feature selection ->
# scaling -> training/evaluation -> reporting -> backtests.
# Relies on train_df, val_df, test_df and df_clean created by earlier cells.

# Unfitted model registry, passed to the two walk-forward runs below.
all_models = get_all_models()

# Each call unpacks (features, target, dates, close prices, feature names);
# the feature-name list is discarded here.
X_train, y_train, train_dates, train_prices, _ = prepare_feature_target(train_df)
X_val, y_val, val_dates, val_prices, _ = prepare_feature_target(val_df)
X_test, y_test, test_dates, test_prices, _ = prepare_feature_target(test_df)

print(f"\nDataset: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")

# The returned pass/fail flag is ignored; the check only prints its findings.
check_data_leakage(X_train, X_val, X_test, train_dates, val_dates, test_dates)

# Select features first, then scale; the selected-name list and the fitted
# scaler (the fourth return values) are discarded.
X_train_sel, X_val_sel, X_test_sel, _ = feature_selection_before_scaling(
    X_train, y_train, X_val, X_test, n_features=100
)
X_train_scaled, X_val_scaled, X_test_scaled, _ = scale_features(
    X_train_sel, X_val_sel, X_test_sel
)

# Passing validation dates/prices together with optimize_thresholds=True
# requests per-model buy/sell threshold tuning on the validation split.
results_df, models, predictions, probabilities, thresholds = train_all_models(
    X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test,
    val_dates=val_dates, val_prices=val_prices, optimize_thresholds=True
)

results_sorted = create_comprehensive_report(results_df)

# NOTE(review): walk_forward_validation is defined elsewhere in the notebook.
wf_results, wf_summary = walk_forward_validation(df_clean, all_models, n_splits=5, n_features=100)

# Hold-out backtest on the test split using the tuned thresholds.
backtest_fixed = backtest_all_models(models, predictions, probabilities, test_dates, test_prices, y_test, thresholds)

wf_backtest, wf_backtest_summary = walk_forward_backtest(df_clean, all_models, n_splits=5, n_features=100)




Dataset: Train=1225, Val=262, Test=263

DATA LEAKAGE VERIFICATION

Train: 1225 samples, 2020-12-20 00:00:00 to 2024-04-27 00:00:00
Val:   262 samples, 2024-04-28 00:00:00 to 2025-01-14 00:00:00
Test:  263 samples, 2025-01-15 00:00:00 to 2025-10-04 00:00:00

No data leakage detected

[FEATURE SELECTION] 250 -> 100
['BTC_Volume', 'BNB_Open', 'BNB_High', 'BNB_Low', 'BNB_Close', 'XRP_Volume', 'SOL_Low', 'SOL_Close', 'AVAX_Open', 'AVAX_High', 'AVAX_Low', 'AVAX_Close', 'DOT_Volume', 'ETH_Volume', 'eth_tx_count', 'eth_active_addresses', 'eth_new_addresses', 'eth_large_eth_transfers', 'eth_token_transfers', 'eth_contract_events', 'eth_total_gas_used', 'fg_fear_greed', 'usdt_totalCirculating', 'usdt_totalCirculatingUSD', 'usdt_totalMintedUSD', 'makerdao_makerdao_eth_tvl', 'dxy_DXY', 'close_lag1', 'close_lag2', 'close_lag3', 'close_lag7', 'close_lag14', 'close_lag30', 'high_lag1', 'low_lag1', 'high_lag2', 'low_lag2', 'high_lag3', 'low_lag3', 'high_lag5', 'low_lag5', 'high_lag7', 'low_lag7', 'cl

LogisticRegression             $37,559.30      275.59     % 245.49     % 3.076     -11.26     % 45        
RidgeClassifier                $29,017.06      190.17     % 160.07     % 3.220     -13.49     % 33        
SVM_RBF                        $18,597.94      85.98      % 55.88      % 1.339     -45.61     % 3         
SVM_Linear                     $35,448.30      254.48     % 224.38     % 3.312     -8.96      % 35        
MLP_Small                      $25,106.99      151.07     % 120.97     % 2.516     -14.60     % 42        
MLP_Medium                     $30,208.62      202.09     % 171.98     % 2.457     -19.79     % 79        
KNN                            $14,130.76      41.31      % 11.21      % 1.290     -11.97     % 16        
NaiveBayes                     $17,399.61      74.00      % 43.89      % 1.223     -44.19     % 23        
Bagging_RF                     $12,217.91      22.18      % -7.92      % 0.998     -3.64      % 4         
XGBoost_GPU                    $36,61

In [None]:
# Re-run of the end-to-end driver: split -> leakage check -> feature
# selection -> scaling -> training/evaluation -> reporting -> backtests.
# Relies on train_df, val_df, test_df and df_clean created by earlier cells.

# Unfitted model registry, passed to the two walk-forward runs below.
all_models = get_all_models()

# Each call unpacks (features, target, dates, close prices, feature names);
# the feature-name list is discarded here.
X_train, y_train, train_dates, train_prices, _ = prepare_feature_target(train_df)
X_val, y_val, val_dates, val_prices, _ = prepare_feature_target(val_df)
X_test, y_test, test_dates, test_prices, _ = prepare_feature_target(test_df)

print(f"\nDataset: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")

# The returned pass/fail flag is ignored; the check only prints its findings.
check_data_leakage(X_train, X_val, X_test, train_dates, val_dates, test_dates)

# Select features first, then scale; the selected-name list and the fitted
# scaler (the fourth return values) are discarded.
X_train_sel, X_val_sel, X_test_sel, _ = feature_selection_before_scaling(
    X_train, y_train, X_val, X_test, n_features=100
)
X_train_scaled, X_val_scaled, X_test_scaled, _ = scale_features(
    X_train_sel, X_val_sel, X_test_sel
)

# Passing validation dates/prices together with optimize_thresholds=True
# requests per-model buy/sell threshold tuning on the validation split.
results_df, models, predictions, probabilities, thresholds = train_all_models(
    X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test,
    val_dates=val_dates, val_prices=val_prices, optimize_thresholds=True
)

results_sorted = create_comprehensive_report(results_df)

# NOTE(review): walk_forward_validation is defined elsewhere in the notebook.
wf_results, wf_summary = walk_forward_validation(df_clean, all_models, n_splits=5, n_features=100)

# Hold-out backtest on the test split using the tuned thresholds.
backtest_fixed = backtest_all_models(models, predictions, probabilities, test_dates, test_prices, y_test, thresholds)

wf_backtest, wf_backtest_summary = walk_forward_backtest(df_clean, all_models, n_splits=5, n_features=100)


Dataset: Train=1226, Val=263, Test=263

DATA LEAKAGE VERIFICATION

Train: 1226 samples, 2020-12-19 00:00:00 to 2024-04-27 00:00:00
Val:   263 samples, 2024-04-28 00:00:00 to 2025-01-15 00:00:00
Test:  263 samples, 2025-01-16 00:00:00 to 2025-10-05 00:00:00

No data leakage detected

[FEATURE SELECTION] 304 -> 100
['BTC_Volume', 'ETH_Volume', 'BNB_Open', 'BNB_High', 'BNB_Low', 'BNB_Close', 'SOL_Close', 'AVAX_Open', 'AVAX_High', 'AVAX_Low', 'AVAX_Close', 'DOT_Volume', 'sentiment_mean', 'positive_ratio', 'negative_ratio', 'extreme_negative_count', 'sentiment_polarity', 'bull_bear_ratio', 'weighted_sentiment', 'sentiment_ma3', 'sentiment_volatility_3', 'sentiment_volatility_7', 'sentiment_ma14', 'sentiment_volatility_14', 'news_volume_ma7', 'news_volume_ma14', 'eth_tx_count', 'eth_active_addresses', 'eth_new_addresses', 'eth_large_eth_transfers', 'eth_token_transfers', 'eth_contract_events', 'usdt_totalCirculating', 'usdt_totalCirculatingUSD', 'usdt_totalMintedUSD', 'makerdao_makerdao_eth

In [None]:
#################1007 버전 1 클로드 버전#######################
#################1007 버전 1 클로드 버전#######################
#################1007 버전 1 클로드 버전#######################
#################1007 버전 1 클로드 버전#######################
#################1007 버전 1 클로드 버전#######################
#################1007 버전 1 클로드 버전#######################

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, 
    AdaBoostClassifier, VotingClassifier, StackingClassifier, 
    BaggingClassifier, ExtraTreesClassifier
)
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, roc_curve, confusion_matrix
)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')


def prepare_feature_target(df, task='direction'):
    """Split a prepared frame into features, target, dates and ETH close.

    Args:
        df: DataFrame holding 'date', 'ETH_Close', the target column(s) for
            ``task`` and any number of feature columns.
        task: which target to return. One of 'direction', 'log_return',
            'close', 'multi_direction_close' or 'multi_direction_log'.

    Returns:
        ``(X, y, dates, eth_close, feature_cols)``. ``X`` is a copy of every
        column that is not a date, target or raw ETH OHLC column. ``y`` is a
        copy of the target (a Series for the single-target tasks, a
        two-column DataFrame for the multi tasks). ``feature_cols`` lists
        the column names of ``X`` in order.

    Raises:
        ValueError: if ``task`` is not one of the supported names.
    """
    # Targets, the date and the raw ETH OHLC columns are never features.
    non_features = ('date', 'next_log_return', 'next_direction', 'next_close',
                    'ETH_Close', 'ETH_High', 'ETH_Low', 'ETH_Open')
    feature_cols = [name for name in df.columns if name not in non_features]
    X = df[feature_cols].copy()

    # task name -> target column (str gives a Series, list gives a DataFrame)
    target_by_task = (
        ('direction', 'next_direction'),
        ('log_return', 'next_log_return'),
        ('close', 'next_close'),
        ('multi_direction_close', ['next_direction', 'next_close']),
        ('multi_direction_log', ['next_direction', 'next_log_return']),
    )
    selection = next((cols for key, cols in target_by_task if task == key), None)
    if selection is None:
        raise ValueError("지원하지 않는 task 종류입니다.")

    y = df[selection].copy()
    return X, y, df['date'], df['ETH_Close'], feature_cols



def check_data_leakage(X_train, X_val, X_test, train_dates, val_dates, test_dates):
    """Run and print sanity checks on the train/validation/test split.

    Checks performed, in reporting order:
      * no row index is shared between any two splits,
      * the latest train date precedes the earliest validation date, and
        the latest validation date precedes the earliest test date,
      * no split contains NaN values,
      * no split contains infinite values.

    Args:
        X_train, X_val, X_test: feature DataFrames of the three splits.
        train_dates, val_dates, test_dates: date Series of the three splits.

    Returns:
        True when no issue was recorded, False otherwise. NaN/inf warnings
        count as issues as well.
    """
    banner = "=" * 100
    print("\n" + banner)
    print("DATA LEAKAGE VERIFICATION")
    print(banner)

    frames = (("Train", X_train), ("Validation", X_val), ("Test", X_test))
    row_ids = {label: set(frame.index) for label, frame in frames}

    problems = []

    # Shared index labels mean the same row sits in two splits.
    index_pairs = (
        ("Train", "Validation", "CRITICAL: Train and validation index overlap"),
        ("Train", "Test", "CRITICAL: Train and test index overlap"),
        ("Validation", "Test", "CRITICAL: Validation and test index overlap"),
    )
    problems.extend(
        message for left, right, message in index_pairs
        if row_ids[left] & row_ids[right]
    )

    # Each split must end strictly before the next one begins.
    if train_dates.max() >= val_dates.min():
        problems.append("CRITICAL: Train dates overlap with validation dates")
    if val_dates.max() >= test_dates.min():
        problems.append("CRITICAL: Validation dates overlap with test dates")

    for label, frame in frames:
        missing = frame.isnull().sum().sum()
        if missing > 0:
            problems.append(f"WARNING: {label} has {missing} NaN values")

    for label, frame in frames:
        infinite = np.isinf(frame).sum().sum()
        if infinite > 0:
            problems.append(f"WARNING: {label} has {infinite} inf values")

    print(f"\nTrain: {len(X_train)} samples, {train_dates.min()} to {train_dates.max()}")
    print(f"Val:   {len(X_val)} samples, {val_dates.min()} to {val_dates.max()}")
    print(f"Test:  {len(X_test)} samples, {test_dates.min()} to {test_dates.max()}")

    if problems:
        print("\nData leakage issues detected:")
        for problem in problems:
            print(f"  {problem}")
        return False

    print("\nNo data leakage detected")
    return True


def feature_selection_before_scaling(X_train, y_train, X_val, X_test, n_features=100):
    """Keep the top-scoring features according to ``f_classif`` on train.

    The selector is fit on the training split only and then applied to all
    three splits.

    Args:
        X_train: training feature DataFrame (its columns name the features).
        y_train: training target used for scoring.
        X_val, X_test: validation and test features with the same columns.
        n_features: requested number of features. The selector uses
            ``min(n_features, number of columns)``, while the header line
            prints the requested value.

    Returns:
        ``(X_train_selected, X_val_selected, X_test_selected,
        selected_features)``: the three transformed splits followed by the
        list of kept column names.
    """
    total_features = X_train.shape[1]
    print(f"\n[FEATURE SELECTION] {total_features} -> {n_features}")

    kbest = SelectKBest(score_func=f_classif, k=min(n_features, total_features))
    kbest.fit(X_train, y_train)
    train_sel, val_sel, test_sel = (
        kbest.transform(part) for part in (X_train, X_val, X_test)
    )

    kept_names = X_train.columns[kbest.get_support()].tolist()
    print(f"  Selected: {len(kept_names)} features")

    return train_sel, val_sel, test_sel, kept_names


def scale_features(X_train, X_val, X_test):
    """Scale the three splits with a ``RobustScaler`` fit on train only.

    Returns:
        ``(X_train_scaled, X_val_scaled, X_test_scaled, scaler)``, where
        ``scaler`` is the fitted ``RobustScaler``.
    """
    fitted = RobustScaler().fit(X_train)
    scaled_splits = [fitted.transform(split) for split in (X_train, X_val, X_test)]
    return (*scaled_splits, fitted)


def get_all_models():
    """Build the registry of unfitted classifiers used by the pipeline.

    Returns:
        dict mapping model name -> unfitted estimator, in this insertion
        order: the sixteen stand-alone models, then 'Voting_Soft',
        'Voting_Hard', 'Stacking', 'XGBoost_Calibrated' and
        'LightGBM_Calibrated'. Every estimator that accepts a seed uses
        ``random_state=42``.
    """
    seed = 42

    # Hyperparameters shared by more than one estimator.
    forest_kwargs = dict(
        n_estimators=200, max_depth=10, min_samples_split=20,
        min_samples_leaf=10, random_state=seed, n_jobs=-1,
    )
    mlp_kwargs = dict(
        activation='relu', solver='adam', batch_size=64,
        learning_rate='adaptive', max_iter=300,
        early_stopping=True, random_state=seed,
    )
    boosted_kwargs = dict(
        n_estimators=200, learning_rate=0.03, subsample=0.7,
        colsample_bytree=0.7, reg_alpha=0.1, reg_lambda=5,
        random_state=seed,
    )

    registry = {}
    registry['RandomForest'] = RandomForestClassifier(**forest_kwargs)
    registry['GradientBoosting'] = GradientBoostingClassifier(
        n_estimators=200, max_depth=4, learning_rate=0.03,
        subsample=0.75, random_state=seed,
    )
    registry['ExtraTrees'] = ExtraTreesClassifier(**forest_kwargs)
    registry['AdaBoost'] = AdaBoostClassifier(
        n_estimators=100, learning_rate=0.5, random_state=seed,
    )
    registry['DecisionTree'] = DecisionTreeClassifier(
        max_depth=8, min_samples_split=20, min_samples_leaf=10,
        random_state=seed,
    )
    registry['LogisticRegression'] = LogisticRegression(
        C=0.1, penalty='l2', max_iter=2000, random_state=seed, n_jobs=-1,
    )
    registry['RidgeClassifier'] = RidgeClassifier(alpha=1.0, random_state=seed)
    registry['SVM_RBF'] = SVC(
        kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=seed,
    )
    registry['SVM_Linear'] = SVC(
        kernel='linear', C=0.1, probability=True, random_state=seed,
    )
    registry['MLP_Small'] = MLPClassifier(
        hidden_layer_sizes=(64, 32), alpha=0.01, **mlp_kwargs
    )
    registry['MLP_Medium'] = MLPClassifier(
        hidden_layer_sizes=(128, 64, 32), alpha=0.001, **mlp_kwargs
    )
    registry['KNN'] = KNeighborsClassifier(
        n_neighbors=15, weights='distance', n_jobs=-1,
    )
    registry['NaiveBayes'] = GaussianNB()
    registry['Bagging_RF'] = BaggingClassifier(
        estimator=DecisionTreeClassifier(max_depth=8, random_state=seed),
        n_estimators=50, random_state=seed, n_jobs=-1,
    )
    # NOTE(review): 'gpu_hist' and device='gpu' presumably require
    # GPU-enabled builds of XGBoost / LightGBM; confirm on the target machine.
    registry['XGBoost_GPU'] = XGBClassifier(
        max_depth=3, min_child_weight=5, gamma=0.1,
        tree_method='gpu_hist', **boosted_kwargs
    )
    registry['LightGBM_GPU'] = LGBMClassifier(
        num_leaves=20, max_depth=4, min_child_samples=20,
        device='gpu', verbose=-1, n_jobs=-1, **boosted_kwargs
    )

    # Factories for ensemble members: every ensemble gets fresh instances.
    def small_forest():
        return RandomForestClassifier(
            n_estimators=100, max_depth=8, random_state=seed, n_jobs=-1)

    def small_boost():
        return GradientBoostingClassifier(
            n_estimators=100, max_depth=3, random_state=seed)

    def small_logit():
        return LogisticRegression(C=0.1, random_state=seed, n_jobs=-1)

    for key, mode in (('Voting_Soft', 'soft'), ('Voting_Hard', 'hard')):
        registry[key] = VotingClassifier(
            estimators=[
                ('rf', small_forest()),
                ('gb', small_boost()),
                ('lr', small_logit()),
            ],
            voting=mode,
        )

    # NOTE(review): StackingClassifier appears to build its meta-features
    # with cross_val_predict, which expects CV test folds that partition the
    # data; TimeSeriesSplit folds do not. Confirm that 'Stacking' fits.
    registry['Stacking'] = StackingClassifier(
        estimators=[
            ('rf', small_forest()),
            ('gb', small_boost()),
            ('et', ExtraTreesClassifier(
                n_estimators=100, max_depth=8, random_state=seed, n_jobs=-1)),
        ],
        final_estimator=LogisticRegression(C=0.1, random_state=seed),
        cv=TimeSeriesSplit(n_splits=3),
    )

    # Isotonic calibration wrapped around the two boosted models above.
    for key, base_key in (('XGBoost_Calibrated', 'XGBoost_GPU'),
                          ('LightGBM_Calibrated', 'LightGBM_GPU')):
        registry[key] = CalibratedClassifierCV(
            registry[base_key],
            method='isotonic',
            cv=TimeSeriesSplit(n_splits=3),
        )

    return registry


def optimize_threshold_on_validation(y_val, y_proba_val, val_dates, val_prices):
    """Pick the (buy, sell) thresholds with the best validation Sharpe ratio.

    Every combination of buy thresholds from ``np.arange(0.50, 0.70, 0.05)``
    and sell thresholds from ``np.arange(0.30, 0.50, 0.05)`` is backtested
    with ``calculate_trading_performance_corrected`` on the validation data.

    Args:
        y_val: true validation targets (forwarded to the backtest).
        y_proba_val: validation scores; values above 0.5 form the 0/1 signal.
        val_dates, val_prices: validation dates and prices for the backtest.

    Returns:
        ``(buy_threshold, sell_threshold)`` of the first combination that
        reaches the highest Sharpe ratio, or ``(0.55, 0.45)`` when no
        combination scores above negative infinity.
    """
    # NOTE(review): the backtest closes a position whenever the signal is 0,
    # and the signal built here is 0 exactly when the score is <= 0.5. Every
    # sell threshold in the grid is below 0.5, so sell_th looks unable to
    # change the outcome. Confirm whether that is intended.

    # The hard signal does not depend on the thresholds being searched.
    hard_signal = (y_proba_val > 0.5).astype(int)

    grid = [
        (buy_th, sell_th)
        for buy_th in np.arange(0.50, 0.70, 0.05)
        for sell_th in np.arange(0.30, 0.50, 0.05)
    ]

    top_sharpe = -np.inf
    chosen = (0.55, 0.45)
    for buy_th, sell_th in grid:
        outcome = calculate_trading_performance_corrected(
            hard_signal, y_proba_val, val_dates, val_prices, y_val,
            initial_capital=10000, transaction_cost=0.002, slippage=0.001,
            buy_threshold=buy_th, sell_threshold=sell_th
        )
        sharpe = outcome['sharpe_ratio']
        # Strict comparison keeps the earliest combination on ties.
        if sharpe > top_sharpe:
            top_sharpe = sharpe
            chosen = (buy_th, sell_th)

    return chosen


def train_all_models(X_train, y_train, X_val, y_val, X_test, y_test, 
                     val_dates=None, val_prices=None, optimize_thresholds=False):
    """Fit every model from ``get_all_models`` and evaluate it on all splits.

    A model that raises anywhere during fitting or evaluation is reported on
    an ERROR line and left out of the returned results, models, predictions
    and probabilities.

    Args:
        X_train, y_train: training features and targets.
        X_val, y_val: validation features and targets.
        X_test, y_test: test features and targets.
        val_dates, val_prices: validation dates and prices. Both are needed
            for threshold optimisation.
        optimize_thresholds: when True (and ``val_dates`` / ``val_prices``
            are given), tune buy/sell thresholds per model with
            ``optimize_threshold_on_validation``; otherwise use (0.55, 0.45).

    Returns:
        ``(results_df, models_trained, predictions, probabilities,
        thresholds)``: a DataFrame of metrics with one row per successful
        model, followed by dicts keyed by model name holding the fitted
        model, its test predictions, its test scores and its
        (buy, sell) thresholds.
    """
    print("\n" + "="*100)
    print("MODEL TRAINING AND EVALUATION")
    print("="*100)

    models_config = get_all_models()

    results = []
    models_trained = {}
    predictions = {}
    probabilities = {}
    thresholds = {}

    print(f"\n{'Model':<30} {'Train Acc':<12} {'Val Acc':<12} {'Test Acc':<12} {'Test AUC':<12} Status")
    print("-" * 110)

    for name, model in models_config.items():
        # Best effort: a failing model is reported and skipped.
        try:
            model.fit(X_train, y_train)

            train_pred = model.predict(X_train)
            val_pred = model.predict(X_val)
            test_pred = model.predict(X_test)

            # Models without predict_proba use their hard labels as scores.
            if hasattr(model, 'predict_proba'):
                train_proba = model.predict_proba(X_train)[:, 1]
                val_proba = model.predict_proba(X_val)[:, 1]
                test_proba = model.predict_proba(X_test)[:, 1]
            else:
                train_proba = train_pred
                val_proba = val_pred
                test_proba = test_pred

            if optimize_thresholds and val_dates is not None and val_prices is not None:
                buy_th, sell_th = optimize_threshold_on_validation(
                    y_val, val_proba, val_dates, val_prices
                )
                thresholds[name] = (buy_th, sell_th)
            else:
                thresholds[name] = (0.55, 0.45)

            train_acc = accuracy_score(y_train, train_pred)
            val_acc = accuracy_score(y_val, val_pred)
            test_acc = accuracy_score(y_test, test_pred)

            test_precision = precision_score(y_test, test_pred, zero_division=0)
            test_recall = recall_score(y_test, test_pred, zero_division=0)
            test_f1 = f1_score(y_test, test_pred, zero_division=0)

            # If AUC cannot be computed, fall back to the chance level of 0.5.
            # `except Exception` (not a bare `except`) lets KeyboardInterrupt
            # and SystemExit propagate.
            try:
                train_auc = roc_auc_score(y_train, train_proba)
                val_auc = roc_auc_score(y_val, val_proba)
                test_auc = roc_auc_score(y_test, test_proba)
            except Exception:
                train_auc = val_auc = test_auc = 0.5

            results.append({
                'Model': name,
                'Train_Acc': train_acc,
                'Val_Acc': val_acc,
                'Test_Acc': test_acc,
                'Train_AUC': train_auc,
                'Val_AUC': val_auc,
                'Test_AUC': test_auc,
                'Test_Precision': test_precision,
                'Test_Recall': test_recall,
                'Test_F1': test_f1,
                'Overfit_Gap': train_acc - test_acc,
                'Buy_Threshold': thresholds[name][0],
                'Sell_Threshold': thresholds[name][1]
            })

            models_trained[name] = model
            predictions[name] = test_pred
            probabilities[name] = test_proba

            status = "OK"
            print(f"{name:<30} {train_acc:<12.4f} {val_acc:<12.4f} {test_acc:<12.4f} {test_auc:<12.4f} {status}")

        except Exception as e:
            print(f"{name:<30} {'ERROR':<12} {'ERROR':<12} {'ERROR':<12} {'ERROR':<12} {str(e)[:20]}")

    return pd.DataFrame(results), models_trained, predictions, probabilities, thresholds



def calculate_trading_performance_corrected(predictions, probabilities, dates, prices, y_true,
                                           initial_capital=10000, transaction_cost=0.002, slippage=0.001,
                                           buy_threshold=0.55, sell_threshold=0.45):
    """Simulate a long-only, single-position strategy and report its metrics.

    The signal on row ``i`` is executed at the price of row ``i + 1``.
    A buy spends 95% of the available capital and happens when the signal is
    1, no position is open and the confidence exceeds ``buy_threshold``.
    An open position is closed when the signal is 0 or the confidence drops
    below ``sell_threshold``. Both sides pay ``transaction_cost + slippage``
    as a fraction of the traded notional. A position still open at the end
    is marked to the last price without liquidation cost.

    Args:
        predictions: array-like of 0/1 signals.
        probabilities: array-like of confidences compared to the thresholds.
        dates: array-like of dates; only its length is checked.
        prices: array-like of prices, one per row.
        y_true: array-like of true targets; only its length is checked.
        initial_capital: starting cash.
        transaction_cost: fee fraction per trade.
        slippage: slippage fraction per trade.
        buy_threshold: minimum confidence (exclusive) to open a position.
        sell_threshold: confidence below which an open position is closed.

    Returns:
        dict with 'final_value', 'total_return' (%), 'buy_hold_return' (%),
        'sharpe_ratio' (per-step returns scaled by sqrt(252)),
        'max_drawdown' (%, non-positive), 'n_trades' and 'n_buys'.
        Empty input yields the initial capital with all other metrics zero.

    Raises:
        ValueError: if the five inputs do not share one length.
    """
    # Plain arrays avoid a per-row DataFrame lookup and also accept lists or
    # ndarrays where `.values` would not exist.
    signals = np.asarray(predictions)
    confidences = np.asarray(probabilities)
    price_path = np.asarray(prices, dtype=float)

    n_rows = len(price_path)
    if len({len(signals), len(confidences), len(dates), len(y_true), n_rows}) != 1:
        raise ValueError("All inputs must have the same length")

    if n_rows == 0:
        # Nothing to trade and no price to compare against.
        return {
            'final_value': initial_capital,
            'total_return': 0.0,
            'buy_hold_return': 0.0,
            'sharpe_ratio': 0,
            'max_drawdown': 0,
            'n_trades': 0,
            'n_buys': 0
        }

    capital = initial_capital
    eth_holdings = 0
    in_position = False
    portfolio_values = [initial_capital]
    n_buys = 0
    n_sells = 0

    total_cost = transaction_cost + slippage

    # Pair each row's signal with the NEXT row's price, where it is executed.
    for signal, confidence, trade_price in zip(signals[:-1], confidences[:-1], price_path[1:]):
        if signal == 1 and not in_position and confidence > buy_threshold:
            eth_to_buy = (capital * 0.95) / trade_price
            cost = eth_to_buy * trade_price * (1 + total_cost)
            if cost <= capital:
                eth_holdings = eth_to_buy
                capital -= cost
                in_position = True
                n_buys += 1

        elif (signal == 0 or confidence < sell_threshold) and in_position:
            revenue = eth_holdings * trade_price * (1 - total_cost)
            capital += revenue
            eth_holdings = 0
            in_position = False
            n_sells += 1

        # Mark the portfolio to the execution price of this step.
        portfolio_values.append(capital + (eth_holdings * trade_price))

    final_value = portfolio_values[-1]
    total_return = (final_value - initial_capital) / initial_capital * 100
    buy_hold_return = (price_path[-1] - price_path[0]) / price_path[0] * 100

    portfolio_values = np.array(portfolio_values)
    if len(portfolio_values) > 1:
        returns = (portfolio_values[1:] / portfolio_values[:-1]) - 1
        # Drop undefined step returns (e.g. after a zero portfolio value).
        returns = returns[~np.isnan(returns) & ~np.isinf(returns)]

        sharpe_ratio = np.mean(returns) / np.std(returns) * np.sqrt(252) if len(returns) > 0 and np.std(returns) > 0 else 0

        cummax = np.maximum.accumulate(portfolio_values)
        drawdown = (portfolio_values - cummax) / cummax
        max_drawdown = np.min(drawdown) * 100 if len(drawdown) > 0 else 0
    else:
        sharpe_ratio = 0
        max_drawdown = 0

    return {
        'final_value': final_value,
        'total_return': total_return,
        'buy_hold_return': buy_hold_return,
        'sharpe_ratio': sharpe_ratio,
        'max_drawdown': max_drawdown,
        'n_trades': n_buys + n_sells,
        'n_buys': n_buys
    }


def backtest_all_models(models, predictions, probabilities, test_dates, test_prices, y_test, thresholds=None):
    """Backtest each model's test predictions and print a comparison table.

    Args:
        models: mapping whose keys name the models to backtest.
        predictions: mapping of model name -> test predictions.
        probabilities: mapping of model name -> test scores.
        test_dates, test_prices: test-period dates and prices (Series).
        y_test: true test targets.
        thresholds: optional mapping of model name -> (buy, sell)
            thresholds; models without an entry use (0.55, 0.45).

    Returns:
        DataFrame with one row per model that backtested without raising.
        A model whose backtest raises is reported on an "Error:" line
        instead.
    """
    rule = "=" * 100
    print("\n" + rule)
    print("BACKTESTING RESULTS (Improved: Slippage 0.1% + Transaction Cost 0.2%)")
    print(rule)

    # Baseline: hold from the first to the last test price.
    first_price = test_prices.iloc[0]
    buy_hold_return = (test_prices.iloc[-1] - first_price) / first_price * 100

    print(f"\n{'Model':<30} {'Final Value':<15} {'Return %':<12} {'vs B&H':<12} {'Sharpe':<10} {'Max DD %':<12} {'Trades':<10}")
    print("-" * 110)

    rows = []
    for name in models:
        try:
            buy_th, sell_th = (thresholds or {}).get(name, (0.55, 0.45))

            perf = calculate_trading_performance_corrected(
                predictions[name],
                probabilities[name],
                test_dates,
                test_prices,
                y_test,
                initial_capital=10000,
                transaction_cost=0.002,
                slippage=0.001,
                buy_threshold=buy_th,
                sell_threshold=sell_th
            )

            final_value = perf['final_value']
            total_return = perf['total_return']
            outperformance = total_return - buy_hold_return

            rows.append({
                'Model': name,
                'Final_Value': final_value,
                'Total_Return': total_return,
                'Buy_Hold_Return': buy_hold_return,
                'Outperformance': outperformance,
                'Sharpe_Ratio': perf['sharpe_ratio'],
                'Max_Drawdown': perf['max_drawdown'],
                'N_Trades': perf['n_trades'],
                'N_Buys': perf['n_buys'],
                'Buy_Threshold': buy_th,
                'Sell_Threshold': sell_th
            })

            print(f"{name:<30} ${final_value:<14,.2f} {total_return:<11.2f}% "
                  f"{outperformance:<11.2f}% {perf['sharpe_ratio']:<9.3f} "
                  f"{perf['max_drawdown']:<11.2f}% {perf['n_trades']:<10}")

        except Exception as e:
            # Best effort: report the failure and continue with the next model.
            print(f"{name:<30} Error: {str(e)[:50]}")

    print("-" * 110)
    baseline_value = 10000 * (1 + buy_hold_return/100)
    print(f"{'Buy & Hold Baseline':<30} ${baseline_value:<14,.2f} {buy_hold_return:<11.2f}% "
          f"{'0.00':<11}% {'N/A':<9} {'N/A':<11} {'0':<10}")

    return pd.DataFrame(rows)


def create_comprehensive_report(results_df):
    """Print a ranked table of test metrics plus summary statistics.

    Args:
        results_df: DataFrame with one row per model and the columns
            'Model', 'Test_Acc', 'Test_Precision', 'Test_Recall',
            'Test_F1', 'Test_AUC' and 'Overfit_Gap'.

    Returns:
        ``results_df`` sorted by 'Test_AUC' in descending order with a fresh
        0-based index. An empty ``results_df`` is returned as-is after a
        notice is printed.
    """
    print("\n" + "="*100)
    print("DETAILED PERFORMANCE REPORT")
    print("="*100)

    # A frame built from zero result rows has no 'Test_AUC' column to sort on.
    if results_df.empty:
        print("\nNo model results to report")
        return results_df

    results_sorted = results_df.sort_values('Test_AUC', ascending=False).reset_index(drop=True)

    print(f"\n{'Rank':<6} {'Model':<30} {'Acc':<10} {'Prec':<10} {'Recall':<10} {'F1':<10} {'AUC':<10} {'Overfit':<10}")
    print("-" * 110)

    for rank, row in enumerate(results_sorted.to_dict('records'), start=1):
        print(f"{rank:<6} {row['Model']:<30} {row['Test_Acc']:<10.4f} {row['Test_Precision']:<10.4f} "
              f"{row['Test_Recall']:<10.4f} {row['Test_F1']:<10.4f} {row['Test_AUC']:<10.4f} {row['Overfit_Gap']:<10.4f}")

    # The table is ordered by AUC, so row 0 is only the best-AUC model.
    # Accuracy and F1 each need their own lookup.
    best_acc = results_sorted.nlargest(1, 'Test_Acc').iloc[0]
    best_auc = results_sorted.iloc[0]
    best_f1 = results_sorted.nlargest(1, 'Test_F1').iloc[0]

    print("\n" + "="*100)
    print("STATISTICAL SUMMARY")
    print("="*100)
    print(f"Best Test Accuracy:  {best_acc['Model']} ({best_acc['Test_Acc']:.4f})")
    print(f"Best Test AUC:       {best_auc['Model']} ({best_auc['Test_AUC']:.4f})")
    print(f"Best Test F1:        {best_f1['Model']} ({results_sorted['Test_F1'].max():.4f})")
    print(f"\nMean Test Accuracy:  {results_df['Test_Acc'].mean():.4f} +/- {results_df['Test_Acc'].std():.4f}")
    print(f"Mean Test AUC:       {results_df['Test_AUC'].mean():.4f} +/- {results_df['Test_AUC'].std():.4f}")
    print(f"Mean Overfit Gap:    {results_df['Overfit_Gap'].mean():.4f} +/- {results_df['Overfit_Gap'].std():.4f}")

    return results_sorted


In [None]:


def walk_forward_backtest(df, models_config, n_splits=5, n_features=100):
    """Backtest every model on successive ``TimeSeriesSplit`` folds.

    Per fold, features are reduced with ``SelectKBest(f_classif)`` and scaled
    with ``RobustScaler`` (both fitted on the fold's training rows only). A
    fresh clone of each model is then trained and its test-fold predictions
    are passed to ``calculate_trading_performance_corrected`` using a 0.2%
    transaction cost and 0.1% slippage.

    Args:
        df: Frame accepted by ``prepare_feature_target``.
        models_config: Mapping of model name -> unfitted estimator.
        n_splits: Number of walk-forward folds.
        n_features: Upper bound on the number of features kept per fold.

    Returns:
        ``(backtest_df, summary)``. ``backtest_df`` has one row per
        (model, fold); ``summary`` aggregates return, Sharpe, drawdown and
        trade count per model. Both are empty when no model succeeded.
    """
    # Imported once here instead of once per model inside the loop.
    from sklearn.base import clone

    print("\n" + "="*100)
    print("WALK-FORWARD BACKTESTING (Improved: Slippage 0.1% + Transaction Cost 0.2%)")
    print("="*100)

    tscv = TimeSeriesSplit(n_splits=n_splits)
    # Only the first four items are used; star-unpacking tolerates versions
    # of prepare_feature_target that return extra trailing values
    # (e.g. feature_cols, event_cols).
    X, y, dates, prices, *_ = prepare_feature_target(df)
    all_backtest_results = []

    for fold_num, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
        print(f"\n--- Fold {fold_num}/{n_splits} Backtest ---")

        X_train_fold = X.iloc[train_idx]
        y_train_fold = y.iloc[train_idx]
        X_test_fold = X.iloc[test_idx]
        y_test_fold = y.iloc[test_idx]
        test_dates_fold = dates.iloc[test_idx]
        test_prices_fold = prices.iloc[test_idx]

        print(f"Train: {len(train_idx)} samples | Test: {len(test_idx)} samples")

        # Selector and scaler see training rows only, so no test-fold
        # statistics leak into the fit.
        selector = SelectKBest(score_func=f_classif, k=min(n_features, X_train_fold.shape[1]))
        X_train_selected = selector.fit_transform(X_train_fold, y_train_fold)
        X_test_selected = selector.transform(X_test_fold)

        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        X_test_scaled = scaler.transform(X_test_selected)

        for name, model in models_config.items():
            # Best effort: one failing model must not abort the whole run.
            try:
                model_copy = clone(model)

                model_copy.fit(X_train_scaled, y_train_fold)
                test_pred = model_copy.predict(X_test_scaled)

                if hasattr(model_copy, 'predict_proba'):
                    test_proba = model_copy.predict_proba(X_test_scaled)[:, 1]
                else:
                    # No probabilities available: use the hard labels as confidence.
                    test_proba = test_pred

                bt_result = calculate_trading_performance_corrected(
                    test_pred,
                    test_proba,
                    test_dates_fold,
                    test_prices_fold,
                    y_test_fold,
                    initial_capital=10000,
                    transaction_cost=0.002,
                    slippage=0.001
                )

                bt_result['Model'] = name
                bt_result['Fold'] = fold_num
                all_backtest_results.append(bt_result)

            except Exception as e:
                print(f"  {name}: Backtest Error - {str(e)[:50]}")

    backtest_df = pd.DataFrame(all_backtest_results)

    print("\n" + "="*100)
    print("WALK-FORWARD BACKTEST SUMMARY")
    print("="*100)

    # Without this guard groupby('Model') raises KeyError on an empty frame.
    if backtest_df.empty:
        print("\nNo successful backtests to summarise.")
        return backtest_df, pd.DataFrame()

    summary = backtest_df.groupby('Model').agg({
        'total_return': ['mean', 'std'],
        'sharpe_ratio': ['mean', 'std'],
        'max_drawdown': ['mean', 'std'],
        'n_trades': ['mean', 'sum']
    }).round(4)

    # Flatten the (metric, stat) MultiIndex into names like 'total_return_mean'.
    summary.columns = ['_'.join(col).strip() for col in summary.columns.values]
    summary = summary.sort_values('total_return_mean', ascending=False)

    print("\n", summary)

    print(f"\n{'Model':<30} {'Avg Return %':<15} {'Avg Sharpe':<12} {'Avg Max DD %':<15} {'Total Trades':<12}")
    print("-" * 110)
    for model_name in summary.index:
        model_data = backtest_df[backtest_df['Model'] == model_name]
        avg_return = model_data['total_return'].mean()
        avg_sharpe = model_data['sharpe_ratio'].mean()
        avg_dd = model_data['max_drawdown'].mean()
        total_trades = model_data['n_trades'].sum()
        print(f"{model_name:<30} {avg_return:<15.2f} {avg_sharpe:<12.3f} {avg_dd:<15.2f} {total_trades:<12.0f}")

    return backtest_df, summary




def walk_forward_validation(df, models_config, n_splits=5, n_features=100):
    """Score every model on successive ``TimeSeriesSplit`` folds.

    Per fold, features are reduced with ``SelectKBest(f_classif)`` and scaled
    with ``RobustScaler`` (both fitted on the fold's training rows only). A
    fresh clone of each model is trained and its accuracy, precision, recall,
    F1 and ROC-AUC on the test fold are recorded.

    Args:
        df: Frame accepted by ``prepare_feature_target``.
        models_config: Mapping of model name -> unfitted estimator.
        n_splits: Number of walk-forward folds.
        n_features: Upper bound on the number of features kept per fold.

    Returns:
        ``(results_df, summary)``. ``results_df`` has one row per
        (model, fold); ``summary`` holds the per-model mean/std of each
        metric. Both are empty when no model succeeded.
    """
    # Imported once here instead of once per model inside the loop.
    from sklearn.base import clone

    print("\n" + "="*100)
    print("WALK-FORWARD VALIDATION")
    print("="*100)

    tscv = TimeSeriesSplit(n_splits=n_splits)
    # Only X and y are used; star-unpacking tolerates versions of
    # prepare_feature_target that return a different number of trailing values.
    X, y, *_ = prepare_feature_target(df)
    all_results = []

    for fold_num, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
        print(f"\n--- Fold {fold_num}/{n_splits} ---")

        X_train_fold = X.iloc[train_idx]
        y_train_fold = y.iloc[train_idx]
        X_test_fold = X.iloc[test_idx]
        y_test_fold = y.iloc[test_idx]

        print(f"Train: {len(train_idx)} samples | Test: {len(test_idx)} samples")

        # Selector and scaler see training rows only.
        selector = SelectKBest(score_func=f_classif, k=min(n_features, X_train_fold.shape[1]))
        X_train_selected = selector.fit_transform(X_train_fold, y_train_fold)
        X_test_selected = selector.transform(X_test_fold)

        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train_selected)
        X_test_scaled = scaler.transform(X_test_selected)

        for name, model in models_config.items():
            # Best effort: one failing model must not abort the whole run.
            try:
                model_copy = clone(model)

                model_copy.fit(X_train_scaled, y_train_fold)
                test_pred = model_copy.predict(X_test_scaled)

                if hasattr(model_copy, 'predict_proba'):
                    test_proba = model_copy.predict_proba(X_test_scaled)[:, 1]
                else:
                    # No probabilities available: rank by the hard labels.
                    test_proba = test_pred

                test_acc = accuracy_score(y_test_fold, test_pred)
                test_precision = precision_score(y_test_fold, test_pred, zero_division=0)
                test_recall = recall_score(y_test_fold, test_pred, zero_division=0)
                test_f1 = f1_score(y_test_fold, test_pred, zero_division=0)

                try:
                    test_auc = roc_auc_score(y_test_fold, test_proba)
                except ValueError:
                    # AUC is undefined (e.g. a single class in the fold):
                    # fall back to the chance level.
                    test_auc = 0.5

                all_results.append({
                    'Fold': fold_num,
                    'Model': name,
                    'Test_Acc': test_acc,
                    'Test_Precision': test_precision,
                    'Test_Recall': test_recall,
                    'Test_F1': test_f1,
                    'Test_AUC': test_auc
                })

            except Exception as e:
                print(f"  {name}: Error - {str(e)[:50]}")

    results_df = pd.DataFrame(all_results)

    print("\n" + "="*100)
    print("WALK-FORWARD VALIDATION SUMMARY")
    print("="*100)

    # Without this guard groupby('Model') raises KeyError on an empty frame.
    if results_df.empty:
        print("\nNo successful model evaluations to summarise.")
        return results_df, pd.DataFrame()

    summary = results_df.groupby('Model').agg({
        'Test_Acc': ['mean', 'std'],
        'Test_Precision': ['mean', 'std'],
        'Test_Recall': ['mean', 'std'],
        'Test_F1': ['mean', 'std'],
        'Test_AUC': ['mean', 'std']
    }).round(4)

    print("\n", summary)

    return results_df, summary


In [None]:
#################1007 버전 2 퍼플렉시티 버전#######################
#################1007 버전 2 퍼플렉시티 버전#######################
#################1007 버전 2 퍼플렉시티 버전#######################
#################1007 버전 2 퍼플렉시티 버전#######################

In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, 
    AdaBoostClassifier, VotingClassifier, StackingClassifier, 
    BaggingClassifier, ExtraTreesClassifier
)
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, roc_curve, confusion_matrix
)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')


# ============================================================================
# 1. prepare_feature_target (수정: 이벤트 분리)
# ============================================================================

def prepare_feature_target(df, task='classification'):
    """Split ``df`` into features, target, dates and prices, and flag event columns.

    Args:
        df: Frame containing ``date``, ``ETH_Close``, the target column and
            the feature columns.
        task: ``'classification'`` selects ``next_direction`` as the target;
            any other value selects ``next_log_return``.

    Returns:
        ``(X, y, dates, prices, feature_cols, event_cols)``. ``feature_cols``
        is every column not in the exclusion list; ``event_cols`` is every
        column whose name contains one of the event markers.
    """
    non_features = {'date', 'next_log_return', 'next_direction', 'next_close',
                    'ETH_Close', 'ETH_High', 'ETH_Low', 'ETH_Open'}
    event_markers = ('event_', 'period_', 'in_upgrade', 'in_crisis',
                     'days_since_last_event', 'event_count_90d')

    # Event columns are identified by substring match on the column name.
    event_cols = []
    for column in df.columns:
        if any(marker in column for marker in event_markers):
            event_cols.append(column)

    # Everything except targets, raw prices and the date is a feature.
    feature_cols = []
    for column in df.columns:
        if column not in non_features:
            feature_cols.append(column)

    if task == 'classification':
        y = df['next_direction'].copy()
    else:
        y = df['next_log_return'].copy()

    X = df[feature_cols].copy()
    dates = df['date'].copy()
    prices = df['ETH_Close'].copy()

    print(f"  총 피처: {len(feature_cols)}개 (이벤트: {len(event_cols)}개)")

    return X, y, dates, prices, feature_cols, event_cols


# ============================================================================
# 2. feature_selection_before_scaling (수정: 이벤트 강제 포함)
# ============================================================================

def feature_selection_before_scaling(X_train, y_train, X_val, X_test,
                                     event_cols=None, n_features=100):
    """Select continuous features with SelectKBest while always keeping event features.

    Columns named in ``event_cols`` bypass selection and are appended, in
    their ``X_train`` column order, after the selected continuous columns.
    The selector is fitted on ``X_train`` / ``y_train`` only.

    Args:
        X_train, X_val, X_test: Feature DataFrames sharing the same columns.
        y_train: Training target used to score the continuous features.
        event_cols: Names of columns that must always be kept (optional).
        n_features: Maximum number of continuous features to keep.

    Returns:
        ``(X_train_final, X_val_final, X_test_final, selected_features,
        n_event_features)`` where the three matrices are numpy arrays and the
        event columns occupy the last ``n_event_features`` positions.
    """
    print("\n[FEATURE SELECTION]")

    forced = [] if event_cols is None else event_cols

    # Partition the training columns into event vs. continuous features.
    is_event = X_train.columns.isin(forced)
    continuous_cols = X_train.columns[~is_event].tolist()
    actual_event_cols = X_train.columns[is_event].tolist()
    n_keep = min(n_features, len(continuous_cols))

    print(f"  연속 피처: {len(continuous_cols)}개 -> {n_keep}개")
    print(f"  이벤트 피처: {len(actual_event_cols)}개 (전부 유지)")

    if continuous_cols:
        # Score and pick the continuous features on the training split only.
        selector = SelectKBest(score_func=f_classif, k=n_keep)
        train_part = selector.fit_transform(X_train[continuous_cols], y_train)
        val_part = selector.transform(X_val[continuous_cols])
        test_part = selector.transform(X_test[continuous_cols])

        kept_mask = selector.get_support()
        selected_continuous = [name for name, kept in zip(continuous_cols, kept_mask) if kept]
    else:
        # No continuous columns: zero-width placeholders keep hstack valid.
        train_part, val_part, test_part = (
            np.array([]).reshape(len(frame), 0) for frame in (X_train, X_val, X_test)
        )
        selected_continuous = []

    if actual_event_cols:
        # Event columns always go last so the scaler can skip them by position.
        X_train_final = np.hstack([train_part, X_train[actual_event_cols].values])
        X_val_final = np.hstack([val_part, X_val[actual_event_cols].values])
        X_test_final = np.hstack([test_part, X_test[actual_event_cols].values])
    else:
        X_train_final, X_val_final, X_test_final = train_part, val_part, test_part

    selected_features = selected_continuous + actual_event_cols

    print(f"  최종: {len(selected_features)}개")

    return X_train_final, X_val_final, X_test_final, selected_features, len(actual_event_cols)


# ============================================================================
# 3. scale_features (수정: 이벤트 스케일링 제외)
# ============================================================================

def scale_features(X_train, X_val, X_test, n_event_features=0):
    """Robust-scale the leading continuous columns, leaving trailing event columns as-is.

    The last ``n_event_features`` columns are treated as event indicators and
    copied through unscaled; the remaining leading columns are scaled with a
    ``RobustScaler`` fitted on ``X_train`` only.

    Args:
        X_train, X_val, X_test: 2-D array-likes with identical column layout.
        n_event_features: Number of trailing columns to leave unscaled.

    Returns:
        ``(X_train_scaled, X_val_scaled, X_test_scaled, scaler)``. The three
        matrices are new float arrays (inputs are not modified). ``scaler`` is
        fitted only when at least one continuous column exists.
    """
    print("\n[SCALING]")

    # Always work on float copies: writing scaled values into an
    # integer-typed copy would silently truncate them.
    X_train_scaled = np.array(X_train, dtype=float)
    X_val_scaled = np.array(X_val, dtype=float)
    X_test_scaled = np.array(X_test, dtype=float)

    n_continuous = X_train_scaled.shape[1] - n_event_features

    scaler = RobustScaler()

    # Scale continuous columns only; statistics come from the training split.
    if n_continuous > 0:
        X_train_scaled[:, :n_continuous] = scaler.fit_transform(X_train_scaled[:, :n_continuous])
        X_val_scaled[:, :n_continuous] = scaler.transform(X_val_scaled[:, :n_continuous])
        X_test_scaled[:, :n_continuous] = scaler.transform(X_test_scaled[:, :n_continuous])

    print(f"  연속 피처: {n_continuous}개 스케일링")
    print(f"  이벤트 피처: {n_event_features}개 유지 (0/1)")

    return X_train_scaled, X_val_scaled, X_test_scaled, scaler


# ============================================================================
# 4. create_sample_weights (새로 추가: 이벤트 가중치)
# ============================================================================

def create_sample_weights(train_df, event_cols):
    """Build per-row training weights that emphasise event periods.

    Rules, first match wins:
      - 5.0 when any "major event" column equals 1,
      - 2.0 when any ``_post7`` / ``_post30`` window column equals 1,
      - 1.5 when ``in_crisis_window`` equals 1,
      - 1.0 otherwise.

    Rows are addressed by position, so ``train_df`` may carry any index
    (dates, a slice of a larger frame, ...).

    Args:
        train_df: Training frame holding the event indicator columns.
        event_cols: Candidate event column names; names missing from
            ``train_df`` are ignored.

    Returns:
        numpy float array of length ``len(train_df)``.
    """
    major_markers = ('event_merge', 'event_london', 'event_shanghai',
                     'event_eth_etf_approval', 'event_ftx_collapse',
                     'event_terra_collapse', 'event_dencun')
    n_rows = len(train_df)

    def _any_flag_set(columns):
        """Return a positional mask that is True where any listed column equals 1."""
        present = [col for col in columns if col in train_df.columns]
        if not present:
            return np.zeros(n_rows, dtype=bool)
        return (train_df[present].to_numpy() == 1).any(axis=1)

    # NOTE(review): matching is by substring, so a window column such as
    # 'event_merge_post7' also counts as a major event and gets 5.0 -
    # confirm this is intended.
    major_events = [col for col in event_cols
                    if any(marker in col for marker in major_markers)]
    event_windows = [col for col in event_cols if '_post7' in col or '_post30' in col]

    major_mask = _any_flag_set(major_events)
    window_mask = _any_flag_set(event_windows)
    if 'in_crisis_window' in train_df.columns:
        crisis_mask = train_df['in_crisis_window'].to_numpy() == 1
    else:
        crisis_mask = np.zeros(n_rows, dtype=bool)

    # Apply from lowest to highest priority so the stronger rule overwrites.
    weights = np.ones(n_rows)
    weights[crisis_mask] = 1.5
    weights[window_mask] = 2.0
    weights[major_mask] = 5.0

    n_weighted = (weights > 1.0).sum()
    weighted_pct = n_weighted / n_rows * 100 if n_rows else 0.0
    print(f"\n[SAMPLE WEIGHTS]")
    print(f"  가중 샘플: {n_weighted}개 / {len(weights)}개 ({weighted_pct:.1f}%)")
    print(f"  가중치 5.0: {(weights == 5.0).sum()}개")
    print(f"  가중치 2.0: {(weights == 2.0).sum()}개")
    print(f"  가중치 1.5: {(weights == 1.5).sum()}개")

    return weights


# ============================================================================
# 5. check_data_leakage (기존 그대로)
# ============================================================================

def check_data_leakage(X_train, X_val, X_test, train_dates, val_dates, test_dates):
    """Run sanity checks on a train/validation/test split and print the findings.

    Checks, in order: pairwise index overlap between the three frames,
    chronological ordering of the date ranges (train before validation before
    test), NaN counts and inf counts per frame.

    Args:
        X_train, X_val, X_test: Feature DataFrames of the three splits.
        train_dates, val_dates, test_dates: Date Series of the three splits.

    Returns:
        ``True`` when no issue was found, ``False`` otherwise.
    """
    rule = "=" * 100
    print("\n" + rule)
    print("DATA LEAKAGE VERIFICATION")
    print(rule)

    splits = (("Train", X_train), ("Validation", X_val), ("Test", X_test))
    issues = []

    # 1) The same row label must not appear in two splits.
    train_rows, val_rows, test_rows = (set(frame.index) for _, frame in splits)
    if train_rows & val_rows:
        issues.append("CRITICAL: Train and validation index overlap")
    if train_rows & test_rows:
        issues.append("CRITICAL: Train and test index overlap")
    if val_rows & test_rows:
        issues.append("CRITICAL: Validation and test index overlap")

    # 2) Each split must end strictly before the next one starts.
    if train_dates.max() >= val_dates.min():
        issues.append("CRITICAL: Train dates overlap with validation dates")
    if val_dates.max() >= test_dates.min():
        issues.append("CRITICAL: Validation dates overlap with test dates")

    # 3) Missing values per split.
    for label, frame in splits:
        nan_count = frame.isnull().sum().sum()
        if nan_count > 0:
            issues.append(f"WARNING: {label} has {nan_count} NaN values")

    # 4) Infinite values per split.
    for label, frame in splits:
        inf_count = np.isinf(frame.values).sum()
        if inf_count > 0:
            issues.append(f"WARNING: {label} has {inf_count} inf values")

    print(f"\nTrain: {len(X_train)} samples, {train_dates.min()} to {train_dates.max()}")
    print(f"Val:   {len(X_val)} samples, {val_dates.min()} to {val_dates.max()}")
    print(f"Test:  {len(X_test)} samples, {test_dates.min()} to {test_dates.max()}")

    if not issues:
        print("\n✓ No data leakage detected")
        return True

    print("\n⚠ Data leakage issues detected:")
    for issue in issues:
        print(f"  {issue}")
    return False


# ============================================================================
# 6. get_all_models (기존 그대로)
# ============================================================================

def get_all_models():
    """Build the catalogue of unfitted classifiers used by the pipeline.

    Returns:
        dict mapping model name -> unfitted estimator, in this order: the
        individual models, the voting/stacking ensembles, and finally the
        isotonic-calibrated XGBoost and LightGBM wrappers.
    """
    # Settings shared by several estimators.
    forest_kwargs = dict(n_estimators=200, max_depth=10, min_samples_split=20,
                         min_samples_leaf=10, random_state=42, n_jobs=-1)
    mlp_kwargs = dict(activation='relu', solver='adam', batch_size=64,
                      learning_rate='adaptive', max_iter=300,
                      early_stopping=True, random_state=42)

    def _rf_gb(third_member):
        """Return fresh (name, estimator) pairs: rf, gb and the given third member."""
        return [
            ('rf', RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42, n_jobs=-1)),
            ('gb', GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42)),
            third_member,
        ]

    def _small_logreg():
        """Logistic-regression member shared by both voting ensembles."""
        return ('lr', LogisticRegression(C=0.1, random_state=42, n_jobs=-1))

    models = {}

    # --- Tree-based models ---
    models['RandomForest'] = RandomForestClassifier(**forest_kwargs)
    models['GradientBoosting'] = GradientBoostingClassifier(
        n_estimators=200, max_depth=4, learning_rate=0.03,
        subsample=0.75, random_state=42,
    )
    models['ExtraTrees'] = ExtraTreesClassifier(**forest_kwargs)
    models['AdaBoost'] = AdaBoostClassifier(
        n_estimators=100, learning_rate=0.5, random_state=42,
    )
    models['DecisionTree'] = DecisionTreeClassifier(
        max_depth=8, min_samples_split=20, min_samples_leaf=10, random_state=42,
    )

    # --- Linear models and SVMs ---
    models['LogisticRegression'] = LogisticRegression(
        C=0.1, penalty='l2', max_iter=2000, random_state=42, n_jobs=-1,
    )
    models['RidgeClassifier'] = RidgeClassifier(alpha=1.0, random_state=42)
    models['SVM_RBF'] = SVC(
        kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42,
    )
    models['SVM_Linear'] = SVC(
        kernel='linear', C=0.1, probability=True, random_state=42,
    )

    # --- Neural nets, instance-based and probabilistic models ---
    models['MLP_Small'] = MLPClassifier(
        hidden_layer_sizes=(64, 32), alpha=0.01, **mlp_kwargs
    )
    models['MLP_Medium'] = MLPClassifier(
        hidden_layer_sizes=(128, 64, 32), alpha=0.001, **mlp_kwargs
    )
    models['KNN'] = KNeighborsClassifier(
        n_neighbors=15, weights='distance', n_jobs=-1,
    )
    models['NaiveBayes'] = GaussianNB()

    # The key says "RF" but the bagged base estimator is a single decision tree.
    models['Bagging_RF'] = BaggingClassifier(
        estimator=DecisionTreeClassifier(max_depth=8, random_state=42),
        n_estimators=50, random_state=42, n_jobs=-1,
    )

    # --- Gradient-boosting libraries, configured for GPU training ---
    # NOTE(review): tree_method='gpu_hist' is rejected or deprecated by some
    # XGBoost releases - confirm against the installed version.
    models['XGBoost_GPU'] = XGBClassifier(
        n_estimators=200,
        learning_rate=0.03,
        max_depth=3,
        min_child_weight=5,
        subsample=0.7,
        colsample_bytree=0.7,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=5,
        tree_method='gpu_hist',
        random_state=42,
    )
    models['LightGBM_GPU'] = LGBMClassifier(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=20,
        max_depth=4,
        min_child_samples=20,
        subsample=0.7,
        colsample_bytree=0.7,
        reg_alpha=0.1,
        reg_lambda=5,
        device='gpu',
        random_state=42,
        verbose=-1,
        n_jobs=-1,
    )

    # --- Ensembles (each one gets its own fresh member instances) ---
    models['Voting_Soft'] = VotingClassifier(
        estimators=_rf_gb(_small_logreg()), voting='soft',
    )
    models['Voting_Hard'] = VotingClassifier(
        estimators=_rf_gb(_small_logreg()), voting='hard',
    )
    models['Stacking'] = StackingClassifier(
        estimators=_rf_gb(
            ('et', ExtraTreesClassifier(n_estimators=100, max_depth=8, random_state=42, n_jobs=-1))
        ),
        final_estimator=LogisticRegression(C=0.1, random_state=42),
        cv=5,
    )

    # --- Isotonic calibration wrapped around the boosted models ---
    for calibrated_name, source_name in (('XGBoost_Calibrated', 'XGBoost_GPU'),
                                         ('LightGBM_Calibrated', 'LightGBM_GPU')):
        models[calibrated_name] = CalibratedClassifierCV(
            models[source_name],
            method='isotonic',
            cv=TimeSeriesSplit(n_splits=3),
        )

    return models


# ============================================================================
# 7. optimize_threshold_on_validation (기존 그대로)
# ============================================================================

def optimize_threshold_on_validation(y_val, y_proba_val, val_dates, val_prices):
    """Grid-search the buy/sell probability thresholds that maximise validation Sharpe.

    Buy thresholds are drawn from ``np.arange(0.50, 0.70, 0.05)`` and sell
    thresholds from ``np.arange(0.30, 0.50, 0.05)``. Every pair is backtested
    with ``calculate_trading_performance_corrected`` (0.2% transaction cost,
    0.1% slippage) and the first pair reaching the highest Sharpe ratio wins.

    Args:
        y_val: True validation labels.
        y_proba_val: Predicted positive-class probabilities (array supporting
            ``> 0.5`` and ``.astype``).
        val_dates: Validation dates.
        val_prices: Validation prices.

    Returns:
        ``(buy_threshold, sell_threshold)``; ``(0.55, 0.45)`` when no pair
        beats ``-inf``.
    """
    # Hard labels depend only on the probabilities, not on the grid point.
    hard_labels = (y_proba_val > 0.5).astype(int)

    buy_grid = np.arange(0.50, 0.70, 0.05)
    sell_grid = np.arange(0.30, 0.50, 0.05)

    top_sharpe = -np.inf
    winner = (0.55, 0.45)

    for buy_th in buy_grid:
        for sell_th in sell_grid:
            outcome = calculate_trading_performance_corrected(
                hard_labels, y_proba_val, val_dates, val_prices, y_val,
                initial_capital=10000, transaction_cost=0.002, slippage=0.001,
                buy_threshold=buy_th, sell_threshold=sell_th
            )
            candidate_sharpe = outcome['sharpe_ratio']

            # Strict '>' keeps the earliest pair on ties.
            if candidate_sharpe > top_sharpe:
                top_sharpe = candidate_sharpe
                winner = (buy_th, sell_th)

    return winner


# ============================================================================
# 8. train_all_models (수정: 샘플 가중치 추가)
# ============================================================================

def train_all_models(X_train, y_train, X_val, y_val, X_test, y_test,
                     val_dates=None, val_prices=None, optimize_thresholds=False,
                     sample_weights=None):
    """Fit every model from ``get_all_models`` and collect evaluation metrics.

    Args:
        X_train, y_train: Training features / labels.
        X_val, y_val: Validation features / labels.
        X_test, y_test: Test features / labels.
        val_dates, val_prices: Validation dates and prices; needed only when
            ``optimize_thresholds`` is True.
        optimize_thresholds: When True (and dates/prices are given), tune the
            buy/sell thresholds per model on the validation split; otherwise
            use ``(0.55, 0.45)``.
        sample_weights: Optional per-row training weights. Models whose
            ``fit`` rejects ``sample_weight`` with ``TypeError`` are fitted
            unweighted.

    Returns:
        ``(results_df, models_trained, predictions, probabilities, thresholds)``
        where the last four are dicts keyed by model name (test-split
        predictions and probabilities). Models that raise are reported on
        stdout and skipped.
    """
    def _auc_or_chance(y_true, scores):
        """Return ROC-AUC, or 0.5 when it is undefined for this split."""
        try:
            return roc_auc_score(y_true, scores)
        except ValueError:
            return 0.5

    print("\n" + "="*100)
    print("MODEL TRAINING AND EVALUATION")
    print("="*100)

    models_config = get_all_models()

    results = []
    models_trained = {}
    predictions = {}
    probabilities = {}
    thresholds = {}

    print(f"\n{'Model':<30} {'Train Acc':<12} {'Val Acc':<12} {'Test Acc':<12} {'Test AUC':<12} Status")
    print("-" * 110)

    for name, model in models_config.items():
        # Best effort: a failing model is reported and skipped.
        try:
            if sample_weights is not None:
                try:
                    model.fit(X_train, y_train, sample_weight=sample_weights)
                except TypeError:
                    # fit() does not accept sample_weight: train unweighted.
                    model.fit(X_train, y_train)
            else:
                model.fit(X_train, y_train)

            train_pred = model.predict(X_train)
            val_pred = model.predict(X_val)
            test_pred = model.predict(X_test)

            if hasattr(model, 'predict_proba'):
                train_proba = model.predict_proba(X_train)[:, 1]
                val_proba = model.predict_proba(X_val)[:, 1]
                test_proba = model.predict_proba(X_test)[:, 1]
            else:
                # No probabilities available: fall back to the hard labels.
                train_proba = train_pred
                val_proba = val_pred
                test_proba = test_pred

            if optimize_thresholds and val_dates is not None and val_prices is not None:
                buy_th, sell_th = optimize_threshold_on_validation(
                    y_val, val_proba, val_dates, val_prices
                )
                thresholds[name] = (buy_th, sell_th)
            else:
                thresholds[name] = (0.55, 0.45)

            train_acc = accuracy_score(y_train, train_pred)
            val_acc = accuracy_score(y_val, val_pred)
            test_acc = accuracy_score(y_test, test_pred)

            test_precision = precision_score(y_test, test_pred, zero_division=0)
            test_recall = recall_score(y_test, test_pred, zero_division=0)
            test_f1 = f1_score(y_test, test_pred, zero_division=0)

            # Each split falls back independently, so an undefined AUC on one
            # split no longer overwrites the valid AUCs of the others.
            train_auc = _auc_or_chance(y_train, train_proba)
            val_auc = _auc_or_chance(y_val, val_proba)
            test_auc = _auc_or_chance(y_test, test_proba)

            results.append({
                'Model': name,
                'Train_Acc': train_acc,
                'Val_Acc': val_acc,
                'Test_Acc': test_acc,
                'Train_AUC': train_auc,
                'Val_AUC': val_auc,
                'Test_AUC': test_auc,
                'Test_Precision': test_precision,
                'Test_Recall': test_recall,
                'Test_F1': test_f1,
                'Overfit_Gap': train_acc - test_acc,
                'Buy_Threshold': thresholds[name][0],
                'Sell_Threshold': thresholds[name][1]
            })

            models_trained[name] = model
            predictions[name] = test_pred
            probabilities[name] = test_proba

            status = "OK"
            print(f"{name:<30} {train_acc:<12.4f} {val_acc:<12.4f} {test_acc:<12.4f} {test_auc:<12.4f} {status}")

        except Exception as e:
            print(f"{name:<30} {'ERROR':<12} {'ERROR':<12} {'ERROR':<12} {'ERROR':<12} {str(e)[:20]}")

    return pd.DataFrame(results), models_trained, predictions, probabilities, thresholds


# ============================================================================
# 9. 나머지 함수들 (기존 그대로)
# ============================================================================

def calculate_trading_performance_corrected(predictions, probabilities, dates, prices, y_true,
                                           initial_capital=10000, transaction_cost=0.002, slippage=0.001,
                                           buy_threshold=0.55, sell_threshold=0.45,
                                           periods_per_year=252):
    """Simulate a long-only, single-position strategy driven by model signals.

    The signal observed at bar ``t`` is executed at the price of bar ``t + 1``.
    A buy deploys 95% of the available cash; a sell liquidates the whole
    position. Both legs pay ``transaction_cost + slippage`` proportionally.
    A position still open at the end is marked to market, not sold.

    Args:
        predictions: Hard class labels (1 = up, 0 = down), one per bar.
        probabilities: Confidence of the positive class, one per bar.
        dates: Bar dates; only their length is checked.
        prices: Bar prices.
        y_true: True labels; only their length is checked.
        initial_capital: Starting cash.
        transaction_cost: Proportional fee per trade leg.
        slippage: Proportional slippage per trade leg.
        buy_threshold: Buy only when the confidence is strictly above this.
        sell_threshold: Sell when the signal is 0 or the confidence is below this.
        periods_per_year: Annualisation factor for the Sharpe ratio
            (252 keeps the previous behaviour).

    Returns:
        dict with ``final_value``, ``total_return`` (%), ``buy_hold_return``
        (%), ``sharpe_ratio``, ``max_drawdown`` (%, <= 0), ``n_trades`` and
        ``n_buys``. Empty input yields the untouched capital and zeros.

    Raises:
        ValueError: If the five input sequences differ in length.
    """
    # Plain arrays avoid building one pandas row object per bar in the loop.
    signals = np.asarray(predictions)
    confidences = np.asarray(probabilities)
    price_path = np.asarray(prices, dtype=float)
    n_obs = len(price_path)

    if len({len(signals), len(confidences), len(dates), len(y_true), n_obs}) != 1:
        raise ValueError(
            "predictions, probabilities, dates, prices and y_true must have the same length"
        )

    if n_obs == 0:
        # Nothing to trade.
        return {
            'final_value': initial_capital,
            'total_return': 0.0,
            'buy_hold_return': 0.0,
            'sharpe_ratio': 0,
            'max_drawdown': 0,
            'n_trades': 0,
            'n_buys': 0
        }

    capital = initial_capital
    eth_holdings = 0
    in_position = False
    n_buys = 0
    n_sells = 0
    portfolio_values = [initial_capital]

    total_cost = transaction_cost + slippage

    # Pair the signal of bar t with the execution price of bar t + 1.
    for signal, confidence, trade_price in zip(signals[:-1], confidences[:-1], price_path[1:]):
        if signal == 1 and not in_position and confidence > buy_threshold:
            eth_to_buy = (capital * 0.95) / trade_price
            cost = eth_to_buy * trade_price * (1 + total_cost)
            # Skip the buy if the fees would push the cost above the cash at hand.
            if cost <= capital:
                eth_holdings = eth_to_buy
                capital -= cost
                in_position = True
                n_buys += 1

        elif (signal == 0 or confidence < sell_threshold) and in_position:
            capital += eth_holdings * trade_price * (1 - total_cost)
            eth_holdings = 0
            in_position = False
            n_sells += 1

        # Mark the portfolio to market at the execution price.
        portfolio_values.append(capital + (eth_holdings * trade_price))

    final_value = portfolio_values[-1]
    total_return = (final_value - initial_capital) / initial_capital * 100
    buy_hold_return = (price_path[-1] - price_path[0]) / price_path[0] * 100

    portfolio_values = np.array(portfolio_values)
    if len(portfolio_values) > 1:
        returns = (portfolio_values[1:] / portfolio_values[:-1]) - 1
        returns = returns[~np.isnan(returns) & ~np.isinf(returns)]

        volatility = np.std(returns) if len(returns) > 0 else 0
        sharpe_ratio = np.mean(returns) / volatility * np.sqrt(periods_per_year) if volatility > 0 else 0

        running_peak = np.maximum.accumulate(portfolio_values)
        drawdown = (portfolio_values - running_peak) / running_peak
        max_drawdown = np.min(drawdown) * 100 if len(drawdown) > 0 else 0
    else:
        sharpe_ratio = 0
        max_drawdown = 0

    return {
        'final_value': final_value,
        'total_return': total_return,
        'buy_hold_return': buy_hold_return,
        'sharpe_ratio': sharpe_ratio,
        'max_drawdown': max_drawdown,
        'n_trades': n_buys + n_sells,
        'n_buys': n_buys
    }


def backtest_all_models(models, predictions, probabilities, test_dates, test_prices, y_test, thresholds=None):
    """Backtest every trained model on the test split and print a comparison table.

    Each model is simulated with ``calculate_trading_performance_corrected``
    (10,000 starting capital, 0.2% transaction cost, 0.1% slippage) and
    compared with a buy-and-hold baseline over the same prices.

    Args:
        models: Mapping whose keys are the model names to backtest.
        predictions: Model name -> test-split hard predictions.
        probabilities: Model name -> test-split probabilities.
        test_dates: Test-split dates.
        test_prices: Test-split prices (pandas Series).
        y_test: Test-split true labels.
        thresholds: Optional model name -> ``(buy, sell)`` thresholds; models
            without an entry use ``(0.55, 0.45)``.

    Returns:
        DataFrame with one row per successfully backtested model.
    """
    rule = "=" * 100
    print("\n" + rule)
    print("BACKTESTING RESULTS (Improved: Slippage 0.1% + Transaction Cost 0.2%)")
    print(rule)

    # Baseline: hold from the first to the last test price.
    last_price = test_prices.iloc[-1]
    first_price = test_prices.iloc[0]
    buy_hold_return = (last_price - first_price) / first_price * 100

    print(f"\n{'Model':<30} {'Final Value':<15} {'Return %':<12} {'vs B&H':<12} {'Sharpe':<10} {'Max DD %':<12} {'Trades':<10}")
    print("-" * 110)

    rows = []
    for name in models:
        # Best effort: a failing model is reported and skipped.
        try:
            has_custom = bool(thresholds) and name in thresholds
            buy_th, sell_th = thresholds[name] if has_custom else (0.55, 0.45)

            perf = calculate_trading_performance_corrected(
                predictions[name],
                probabilities[name],
                test_dates,
                test_prices,
                y_test,
                initial_capital=10000,
                transaction_cost=0.002,
                slippage=0.001,
                buy_threshold=buy_th,
                sell_threshold=sell_th
            )

            outperformance = perf['total_return'] - buy_hold_return

            row = {
                'Model': name,
                'Final_Value': perf['final_value'],
                'Total_Return': perf['total_return'],
                'Buy_Hold_Return': buy_hold_return,
                'Outperformance': outperformance,
                'Sharpe_Ratio': perf['sharpe_ratio'],
                'Max_Drawdown': perf['max_drawdown'],
                'N_Trades': perf['n_trades'],
                'N_Buys': perf['n_buys'],
                'Buy_Threshold': buy_th,
                'Sell_Threshold': sell_th
            }
            rows.append(row)

            print(f"{name:<30} ${perf['final_value']:<14,.2f} {perf['total_return']:<11.2f}% "
                  f"{outperformance:<11.2f}% {perf['sharpe_ratio']:<9.3f} "
                  f"{perf['max_drawdown']:<11.2f}% {perf['n_trades']:<10}")

        except Exception as e:
            print(f"{name:<30} Error: {str(e)[:50]}")

    print("-" * 110)
    print(f"{'Buy & Hold Baseline':<30} ${10000 * (1 + buy_hold_return/100):<14,.2f} {buy_hold_return:<11.2f}% "
          f"{'0.00':<11}% {'N/A':<9} {'N/A':<11} {'0':<10}")

    return pd.DataFrame(rows)


def create_comprehensive_report(results_df):
    """Print a ranked per-model metric table followed by summary statistics.

    Rows are ranked by ``Test_AUC`` (descending). The summary section reports
    the best model for accuracy, AUC and F1 separately, each looked up on its
    own metric, plus mean and standard deviation of accuracy, AUC and the
    overfit gap across all models.

    Args:
        results_df: DataFrame with one row per model and the columns
            ``Model``, ``Test_Acc``, ``Test_Precision``, ``Test_Recall``,
            ``Test_F1``, ``Test_AUC`` and ``Overfit_Gap``.

    Returns:
        The rows of ``results_df`` sorted by ``Test_AUC`` in descending order
        with a fresh 0-based index. An empty ``results_df`` is returned as-is
        after printing a short notice.
    """
    print("\n" + "="*100)
    print("DETAILED PERFORMANCE REPORT")
    print("="*100)

    # Nothing to rank: avoid KeyError/IndexError on the lookups below.
    if results_df.empty:
        print("No model results to report.")
        return results_df

    results_sorted = results_df.sort_values('Test_AUC', ascending=False).reset_index(drop=True)

    print(f"\n{'Rank':<6} {'Model':<30} {'Acc':<10} {'Prec':<10} {'Recall':<10} {'F1':<10} {'AUC':<10} {'Overfit':<10}")
    print("-" * 110)

    for rank, (_, row) in enumerate(results_sorted.iterrows(), start=1):
        print(f"{rank:<6} {row['Model']:<30} {row['Test_Acc']:<10.4f} {row['Test_Precision']:<10.4f} "
              f"{row['Test_Recall']:<10.4f} {row['Test_F1']:<10.4f} {row['Test_AUC']:<10.4f} {row['Overfit_Gap']:<10.4f}")

    # The table is AUC-ordered, so only the AUC winner is row 0; accuracy and
    # F1 winners must be looked up on their own columns.
    best_auc = results_sorted.iloc[0]
    best_acc = results_sorted.nlargest(1, 'Test_Acc').iloc[0]
    best_f1 = results_sorted.nlargest(1, 'Test_F1').iloc[0]

    print("\n" + "="*100)
    print("STATISTICAL SUMMARY")
    print("="*100)
    print(f"Best Test Accuracy:  {best_acc['Model']} ({best_acc['Test_Acc']:.4f})")
    print(f"Best Test AUC:       {best_auc['Model']} ({best_auc['Test_AUC']:.4f})")
    print(f"Best Test F1:        {best_f1['Model']} ({results_sorted['Test_F1'].max():.4f})")
    print(f"\nMean Test Accuracy:  {results_df['Test_Acc'].mean():.4f} +/- {results_df['Test_Acc'].std():.4f}")
    print(f"Mean Test AUC:       {results_df['Test_AUC'].mean():.4f} +/- {results_df['Test_AUC'].std():.4f}")
    print(f"Mean Overfit Gap:    {results_df['Overfit_Gap'].mean():.4f} +/- {results_df['Overfit_Gap'].std():.4f}")

    return results_sorted


# ============================================================================
# 10. 공통 전처리 함수 (새로 추가: 중복 제거)
# ============================================================================

def preprocess_fold(X_train, y_train, X_test, event_cols, n_features=100):
    """Shared per-fold preprocessing for the walk-forward routines.

    Runs feature selection and then scaling so that the walk-forward
    validation and backtest functions share a single implementation.
    A fold has no validation split, so the training frame is passed in the
    validation slot of both helpers and the validation outputs are discarded.

    Args:
        X_train: Training features for the fold.
        y_train: Training target for the fold.
        X_test: Test features for the fold.
        event_cols: Event column names forwarded to feature selection.
        n_features: Forwarded to ``feature_selection_before_scaling``.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    # Feature selection. The helper returns (train, val, test, features,
    # n_events) -- the order used by the main pipeline call, which passes
    # (X_train, y_train, X_val, X_test). The second output is therefore the
    # validation slot (here: a copy of train) and must be the one discarded;
    # taking it as the test matrix would evaluate models on training rows.
    X_train_sel, _, X_test_sel, _, n_events = feature_selection_before_scaling(
        X_train, y_train, X_train, X_test,  # val slot filled with train (no val in a fold)
        event_cols=event_cols,
        n_features=n_features
    )

    # Scaling. Same (train, val, test) convention; the val output and the
    # fitted scaler are not needed by the callers.
    X_train_scaled, _, X_test_scaled, _ = scale_features(
        X_train_sel, X_train_sel, X_test_sel,
        n_event_features=n_events
    )

    return X_train_scaled, X_test_scaled


# ============================================================================
# 11. walk_forward_validation (리팩토링: 중복 제거)
# ============================================================================

def walk_forward_validation(df, models_config, n_splits=5, n_features=100):
    """Score every model on successive time-ordered train/test folds.

    Features and target come from ``prepare_feature_target(df)`` and folds
    from ``TimeSeriesSplit(n_splits=n_splits)``. ``preprocess_fold`` is called
    once per fold with that fold's train/test rows, then each estimator in
    ``models_config`` is cloned, fitted on the fold's training data and
    scored on its test data.

    A model that raises during fit/predict/scoring is reported on stdout and
    skipped for that fold. When ROC AUC cannot be computed (``ValueError``,
    e.g. a single class in the fold's test target) it is recorded as 0.5.

    Args:
        df: Input frame accepted by ``prepare_feature_target``.
        models_config: Mapping of model name to an unfitted estimator that
            ``sklearn.base.clone`` can copy.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        n_features: Forwarded to ``preprocess_fold``.

    Returns:
        Tuple ``(results_df, summary)``. ``results_df`` has one row per
        (fold, model) with ``Fold``, ``Model``, ``Test_Acc``,
        ``Test_Precision``, ``Test_Recall``, ``Test_F1`` and ``Test_AUC``.
        ``summary`` holds the per-model mean/std of each metric rounded to
        4 decimals. If no (fold, model) pair succeeded, both are empty
        DataFrames.
    """
    # Imported once per call instead of once per (fold, model) iteration.
    from sklearn.base import clone

    print("\n" + "="*100)
    print("WALK-FORWARD VALIDATION")
    print("="*100)

    tscv = TimeSeriesSplit(n_splits=n_splits)
    # Dates, prices and the feature-name list are not needed for scoring.
    X, y, _dates, _prices, _feature_cols, event_cols = prepare_feature_target(df)
    all_results = []

    for fold_num, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
        print(f"\n--- Fold {fold_num}/{n_splits} ---")

        X_train_fold = X.iloc[train_idx]
        y_train_fold = y.iloc[train_idx]
        X_test_fold = X.iloc[test_idx]
        y_test_fold = y.iloc[test_idx]

        print(f"Train: {len(train_idx)} samples | Test: {len(test_idx)} samples")

        # Shared per-fold preprocessing (feature selection + scaling).
        X_train_scaled, X_test_scaled = preprocess_fold(
            X_train_fold, y_train_fold, X_test_fold,
            event_cols, n_features
        )

        for name, model in models_config.items():
            try:
                # Fresh, unfitted copy so folds never share fitted state.
                model_copy = clone(model)

                model_copy.fit(X_train_scaled, y_train_fold)
                test_pred = model_copy.predict(X_test_scaled)

                # Fall back to hard predictions when no probabilities exist.
                if hasattr(model_copy, 'predict_proba'):
                    test_proba = model_copy.predict_proba(X_test_scaled)[:, 1]
                else:
                    test_proba = test_pred

                test_acc = accuracy_score(y_test_fold, test_pred)
                test_precision = precision_score(y_test_fold, test_pred, zero_division=0)
                test_recall = recall_score(y_test_fold, test_pred, zero_division=0)
                test_f1 = f1_score(y_test_fold, test_pred, zero_division=0)

                try:
                    test_auc = roc_auc_score(y_test_fold, test_proba)
                except ValueError:
                    # AUC is undefined for this fold; use the chance level.
                    # (Was a bare ``except:``, which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    test_auc = 0.5

                all_results.append({
                    'Fold': fold_num,
                    'Model': name,
                    'Test_Acc': test_acc,
                    'Test_Precision': test_precision,
                    'Test_Recall': test_recall,
                    'Test_F1': test_f1,
                    'Test_AUC': test_auc
                })

            except Exception as e:
                # Best-effort: one failing model must not abort the whole run.
                print(f"  {name}: Error - {str(e)[:50]}")

    results_df = pd.DataFrame(all_results)

    print("\n" + "="*100)
    print("WALK-FORWARD VALIDATION SUMMARY")
    print("="*100)

    # With no successful rows there is no 'Model' column to group on.
    if results_df.empty:
        print("\nNo fold/model combination produced results.")
        return results_df, pd.DataFrame()

    summary = results_df.groupby('Model').agg({
        'Test_Acc': ['mean', 'std'],
        'Test_Precision': ['mean', 'std'],
        'Test_Recall': ['mean', 'std'],
        'Test_F1': ['mean', 'std'],
        'Test_AUC': ['mean', 'std']
    }).round(4)

    print("\n", summary)

    return results_df, summary


# ============================================================================
# 12. walk_forward_backtest (리팩토링: 중복 제거)
# ============================================================================

def walk_forward_backtest(df, models_config, n_splits=5, n_features=100):
    """Backtest every model on successive time-ordered train/test folds.

    Folds come from ``TimeSeriesSplit(n_splits=n_splits)`` over the output of
    ``prepare_feature_target(df)``. ``preprocess_fold`` is called once per
    fold, each estimator is cloned and fitted on the fold's training data,
    and its test predictions are passed to
    ``calculate_trading_performance_corrected`` with
    ``initial_capital=10000``, ``transaction_cost=0.002`` and
    ``slippage=0.001``.

    A model that raises during fit/predict/backtest is reported on stdout and
    skipped for that fold.

    Args:
        df: Input frame accepted by ``prepare_feature_target``.
        models_config: Mapping of model name to an unfitted estimator that
            ``sklearn.base.clone`` can copy.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        n_features: Forwarded to ``preprocess_fold``.

    Returns:
        Tuple ``(backtest_df, summary)``. ``backtest_df`` has one row per
        (fold, model) built from the backtest result plus ``Model`` and
        ``Fold``. ``summary`` is the per-model aggregate with flattened
        column names (e.g. ``total_return_mean``), rounded to 4 decimals and
        sorted by ``total_return_mean`` descending. If no (fold, model) pair
        succeeded, both are empty DataFrames.
    """
    # Imported once per call instead of once per (fold, model) iteration.
    from sklearn.base import clone

    print("\n" + "="*100)
    print("WALK-FORWARD BACKTESTING (Improved: Slippage 0.1% + Transaction Cost 0.2%)")
    print("="*100)

    tscv = TimeSeriesSplit(n_splits=n_splits)
    X, y, dates, prices, _feature_cols, event_cols = prepare_feature_target(df)
    all_backtest_results = []

    for fold_num, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
        print(f"\n--- Fold {fold_num}/{n_splits} Backtest ---")

        X_train_fold = X.iloc[train_idx]
        y_train_fold = y.iloc[train_idx]
        X_test_fold = X.iloc[test_idx]
        y_test_fold = y.iloc[test_idx]
        test_dates_fold = dates.iloc[test_idx]
        test_prices_fold = prices.iloc[test_idx]

        print(f"Train: {len(train_idx)} samples | Test: {len(test_idx)} samples")

        # Shared per-fold preprocessing (feature selection + scaling).
        X_train_scaled, X_test_scaled = preprocess_fold(
            X_train_fold, y_train_fold, X_test_fold,
            event_cols, n_features
        )

        for name, model in models_config.items():
            try:
                # Fresh, unfitted copy so folds never share fitted state.
                model_copy = clone(model)

                model_copy.fit(X_train_scaled, y_train_fold)
                test_pred = model_copy.predict(X_test_scaled)

                # Fall back to hard predictions when no probabilities exist.
                if hasattr(model_copy, 'predict_proba'):
                    test_proba = model_copy.predict_proba(X_test_scaled)[:, 1]
                else:
                    test_proba = test_pred

                bt_result = calculate_trading_performance_corrected(
                    test_pred,
                    test_proba,
                    test_dates_fold,
                    test_prices_fold,
                    y_test_fold,
                    initial_capital=10000,
                    transaction_cost=0.002,
                    slippage=0.001
                )

                # Tag the result so rows can be grouped per model/fold later.
                bt_result['Model'] = name
                bt_result['Fold'] = fold_num
                all_backtest_results.append(bt_result)

            except Exception as e:
                # Best-effort: one failing model must not abort the whole run.
                print(f"  {name}: Backtest Error - {str(e)[:50]}")

    backtest_df = pd.DataFrame(all_backtest_results)

    print("\n" + "="*100)
    print("WALK-FORWARD BACKTEST SUMMARY")
    print("="*100)

    # With no successful rows there is no 'Model' column to group on.
    if backtest_df.empty:
        print("\nNo fold/model combination produced backtest results.")
        return backtest_df, pd.DataFrame()

    by_model = backtest_df.groupby('Model')
    summary = by_model.agg({
        'total_return': ['mean', 'std'],
        'sharpe_ratio': ['mean', 'std'],
        'max_drawdown': ['mean', 'std'],
        'n_trades': ['mean', 'sum']
    }).round(4)

    # Flatten the (metric, statistic) MultiIndex into 'metric_statistic'.
    summary.columns = ['_'.join(col).strip() for col in summary.columns.values]
    summary = summary.sort_values('total_return_mean', ascending=False)

    print("\n", summary)

    print(f"\n{'Model':<30} {'Avg Return %':<15} {'Avg Sharpe':<12} {'Avg Max DD %':<15} {'Total Trades':<12}")
    print("-" * 110)
    for model_name in summary.index:
        # Computed from the raw rows rather than the rounded summary values.
        model_data = by_model.get_group(model_name)
        avg_return = model_data['total_return'].mean()
        avg_sharpe = model_data['sharpe_ratio'].mean()
        avg_dd = model_data['max_drawdown'].mean()
        total_trades = model_data['n_trades'].sum()
        print(f"{model_name:<30} {avg_return:<15.2f} {avg_sharpe:<12.3f} {avg_dd:<15.2f} {total_trades:<12.0f}")

    return backtest_df, summary





In [None]:
# ============================================================================
# [Execution example]
# End-to-end run: prepare splits -> leakage check -> feature selection ->
# scaling -> sample weights -> training -> report -> backtest -> walk-forward.
# NOTE(review): train_df, val_df, test_df and df_clean are not defined in this
# cell; they are assumed to come from earlier cells -- confirm before running.
# ============================================================================

# Data preparation (event columns included)
X_train, y_train, train_dates, train_prices, train_features, train_event_cols = \
    prepare_feature_target(train_df)
X_val, y_val, val_dates, val_prices, _, _ = prepare_feature_target(val_df)
X_test, y_test, test_dates, test_prices, _, _ = prepare_feature_target(test_df)

print(f"\nDataset: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")

check_data_leakage(X_train, X_val, X_test, train_dates, val_dates, test_dates)

# Feature selection (event columns are forcibly included)
X_train_sel, X_val_sel, X_test_sel, selected_features, n_events = \
    feature_selection_before_scaling(
        X_train, y_train, X_val, X_test, 
        event_cols=train_event_cols, 
        n_features=100
    )

# Scaling (event columns excluded)
X_train_scaled, X_val_scaled, X_test_scaled, scaler = scale_features(
    X_train_sel, X_val_sel, X_test_sel, 
    n_event_features=n_events
)

# Build sample weights
sample_weights = create_sample_weights(train_df, train_event_cols)

# Train models (sample weights applied)
results, models, preds, probs, thresholds = train_all_models(
    X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test,
    val_dates=val_dates, val_prices=val_prices,
    optimize_thresholds=False,
    sample_weights=sample_weights  # key point!
)

# Evaluation
report = create_comprehensive_report(results)

# Backtesting
backtest_results = backtest_all_models(
    models, preds, probs, test_dates, test_prices, y_test, thresholds
)

# Walk-forward validation
wf_results, wf_summary = walk_forward_validation(
    df_clean, get_all_models(), n_splits=5, n_features=100
)

# Walk-forward backtesting
wf_backtest, wf_bt_summary = walk_forward_backtest(
    df_clean, get_all_models(), n_splits=5, n_features=100
)

  총 피처: 380개 (이벤트: 76개)
  총 피처: 380개 (이벤트: 76개)
  총 피처: 380개 (이벤트: 76개)

Dataset: Train=1226, Val=263, Test=263

DATA LEAKAGE VERIFICATION

Train: 1226 samples, 2020-12-19 00:00:00 to 2024-04-27 00:00:00
Val:   263 samples, 2024-04-28 00:00:00 to 2025-01-15 00:00:00
Test:  263 samples, 2025-01-16 00:00:00 to 2025-10-05 00:00:00

✓ No data leakage detected

[FEATURE SELECTION]
  연속 피처: 304개 -> 100개
  이벤트 피처: 76개 (전부 유지)
  최종: 176개

[SCALING]
  연속 피처: 100개 스케일링
  이벤트 피처: 76개 유지 (0/1)

[SAMPLE WEIGHTS]
  가중 샘플: 246개 / 1226개 (20.1%)
  가중치 5.0: 246개
  가중치 2.0: 0개
  가중치 1.5: 0개

MODEL TRAINING AND EVALUATION

Model                          Train Acc    Val Acc      Test Acc     Test AUC     Status
--------------------------------------------------------------------------------------------------------------
RandomForest                   0.8728       0.5741       0.5475       0.5862       OK
GradientBoosting               0.8891       0.5627       0.5361       0.5655       OK
ExtraTrees      