In [103]:
import xgboost
import sklearn

print(xgboost.__version__)   # 3.0.5
print(sklearn.__version__)   # 1.0.x 이하이면 호환 문제


3.0.5
1.7.2


In [1]:

import os
import pandas as pd
import numpy as np
from datetime import datetime
from collections import Counter
import pickle
import warnings
import pandas_ta as ta
from sklearn.feature_selection import mutual_info_regression, RFECV
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score
warnings.filterwarnings('ignore')

def standardize_date_column(df):
    date_cols = [col for col in df.columns if col.lower() == 'date']
    if not date_cols:
        return df

    date_col = date_cols[0]
    if date_col != 'date':
        df.rename(columns={date_col: 'date'}, inplace=True)

    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=['date'])
    df['date'] = df['date'].dt.normalize()

    if pd.api.types.is_datetime64tz_dtype(df['date']):
        df['date'] = df['date'].dt.tz_convert(None)
    else:
        df['date'] = df['date'].dt.tz_localize(None)

    return df

def load_and_standardize_data(file_name, base_dir='./macro_data'):
    file_path = os.path.join(base_dir, file_name)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"{file_path} not found")

    df = pd.read_csv(file_path)
    df = standardize_date_column(df)
    return df

def create_sentiment_features_with_lags(news_df):
    sentiment_agg = news_df.groupby('date').agg(
        sentiment_mean=('label', 'mean'),
        sentiment_std=('label', 'std'),
        news_count=('label', 'count'),
        positive_ratio=('label', lambda x: (x == 1).sum() / len(x)),
        negative_ratio=('label', lambda x: (x == -1).sum() / len(x))
    ).reset_index()

    sentiment_agg['sentiment_std'] = sentiment_agg['sentiment_std'].fillna(0)

    for lag in [1, 3, 5, 7]:
        sentiment_agg[f'sentiment_mean_lag{lag}'] = sentiment_agg['sentiment_mean'].shift(lag)
        sentiment_agg[f'positive_ratio_lag{lag}'] = sentiment_agg['positive_ratio'].shift(lag)
        sentiment_agg[f'news_count_lag{lag}'] = sentiment_agg['news_count'].shift(lag)

    return sentiment_agg


def calculate_technical_indicators(df, price_col='ETH_Close', volume_col='ETH_Volume', high_col=None, low_col=None):
    """
    pandas_ta 기반 기술적 지표 계산 (논문 검증됨)
    참고: "Cryptocurrency Price Forecasting Using XGBoost Regressor" (2024)
    """
    df = df.sort_values('date').reset_index(drop=True)
    
    df['returns'] = df[price_col].pct_change()
    df['log_returns'] = np.log(df[price_col] / df[price_col].shift(1))
    
    
    # RSI (Momentum) - 논문에서 중요도 높음
    for length in [14, 30]:
        rsi_series = ta.rsi(df[price_col], length=length)
        df[f'rsi_{length}'] = rsi_series.shift(1)  
    
    # MACD (Momentum & Trend)
    macd_df = ta.macd(df[price_col], fast=12, slow=26, signal=9)
    if macd_df is not None and not macd_df.empty:
        for col in macd_df.columns:
            df[col] = macd_df[col].shift(1)  

    bb_df = ta.bbands(df[price_col], length=20, std=2)
    if bb_df is not None and not bb_df.empty:
        bb_cols = list(bb_df.columns)
        for col in bb_cols:
            df[col] = bb_df[col].shift(1)  
        
        # BB 파생 지표 계산 (논문 권장)
        bb_middle = bb_df.iloc[:, 1] if len(bb_df.columns) >= 2 else bb_df.iloc[:, 0]
        bb_upper = bb_df.iloc[:, 0]
        bb_lower = bb_df.iloc[:, 2] if len(bb_df.columns) >= 3 else bb_df.iloc[:, 1]
        
        df['bb_width_20'] = ((bb_upper - bb_lower) / bb_middle).shift(1)
        df['bb_position_20'] = ((df[price_col].shift(1) - bb_lower.shift(1)) / 
                                (bb_upper.shift(1) - bb_lower.shift(1)))
    
    # ADX (Trend Strength)
    if high_col and low_col:
        adx_df = ta.adx(df[high_col], df[low_col], df[price_col], length=14)
        if adx_df is not None and not adx_df.empty:
            for col in adx_df.columns:
                df[col] = adx_df[col].shift(1)  
    
    # ATR (Volatility) - 논문에서 중요도 높음
    if high_col and low_col:
        atr_series = ta.atr(df[high_col], df[low_col], df[price_col], length=14)
        if atr_series is not None:
            df['atr_14'] = atr_series.shift(1) 
    
    # MFI (Volume)
    if high_col and low_col and volume_col:
        mfi_series = ta.mfi(df[high_col], df[low_col], df[price_col], df[volume_col], length=14)
        if mfi_series is not None:
            df['mfi_14'] = mfi_series.shift(1)  
    
    # CCI (Momentum)
    if high_col and low_col:
        cci_series = ta.cci(df[high_col], df[low_col], df[price_col], length=20)
        if cci_series is not None:
            df['cci_20'] = cci_series.shift(1)  
    
    # OBV (Volume)
    obv_series = ta.obv(df[price_col], df[volume_col])
    if obv_series is not None:
        df['obv'] = obv_series.shift(1) 
    
    # VWAP
    if high_col and low_col:
        vwap_series = ta.vwap(df[high_col], df[low_col], df[price_col], df[volume_col])
        if vwap_series is not None:
            df['vwap'] = vwap_series.shift(1)
    
    # SMA/EMA (Trend) - 다양한 기간
    for window in [7, 14, 30, 60]:
        sma = ta.sma(df[price_col], length=window)
        ema = ta.ema(df[price_col], length=window)
        if sma is not None:
            df[f'sma_{window}'] = sma.shift(1)  
        if ema is not None:
            df[f'ema_{window}'] = ema.shift(1) 
        
        # Volatility & Volume indicators
        df[f'volatility_{window}'] = df['returns'].shift(1).rolling(window=window).std()
        df[f'volume_sma_{window}'] = df[volume_col].shift(1).rolling(window=window).mean()
        df[f'returns_sma_{window}'] = df['returns'].shift(1).rolling(window=window).mean()
        df[f'returns_ema_{window}'] = df['returns'].shift(1).ewm(span=window, adjust=False).mean()
        df[f'cumulative_returns_{window}'] = (1 + df['returns'].shift(1)).rolling(window=window).apply(lambda x: x.prod(), raw=True) - 1
    
    # Momentum & ROC
    for window in [10, 20]:
        df[f'momentum_{window}'] = df[price_col].shift(1) - df[price_col].shift(window + 1)
        df[f'roc_{window}'] = ((df[price_col].shift(1) - df[price_col].shift(window + 1)) / 
                               df[price_col].shift(window + 1)) * 100
    
    # Stochastic & Williams %R
    if high_col in df.columns and low_col in df.columns:
        high_roll = df[high_col].shift(1).rolling(window=14).max()
        low_roll = df[low_col].shift(1).rolling(window=14).min()
        df['stochastic_14'] = 100 * (df[price_col].shift(1) - low_roll) / (high_roll - low_roll)
        df['williams_r_14'] = -100 * (high_roll - df[price_col].shift(1)) / (high_roll - low_roll)
    
    # Lag features
    for lag in [1, 3, 7, 14]:
        df[f'price_lag_{lag}'] = df[price_col].shift(lag)
        df[f'volume_lag_{lag}'] = df[volume_col].shift(lag)
        df[f'returns_lag_{lag}'] = df['returns'].shift(lag)
    
    # Regime indicators
    df['volatility_30'] = df['returns'].shift(1).rolling(30).std()
    df['volatility_regime'] = (df['volatility_30'] > df['volatility_30'].shift(1).rolling(60).mean()).astype(int)
    df['price_trend'] = (df['sma_14'] > df['sma_60']).astype(int)
    
    return df


def merge_external_features(base_df, data_with_lags, sentiment_features, google_trends_df, eth_onchain_df):
    merged_df = base_df.copy()

    eth_onchain_df = standardize_date_column(eth_onchain_df)
    onchain_col_orig = [c for c in eth_onchain_df.columns if c != 'date']
    for col in onchain_col_orig:
        new_col = f'onchain_{col}'
        eth_onchain_df.rename(columns={col: new_col}, inplace=True)
        for lag in [1,2,3]:
            eth_onchain_df[f'{new_col}_lag{lag}'] = eth_onchain_df[new_col].shift(lag)

    merged_df = merged_df.merge(eth_onchain_df, on='date', how='left')

    for df, prefix, lag_list in data_with_lags:
        df_renamed = df.copy()
        df_renamed = standardize_date_column(df_renamed)

        orig_cols = [c for c in df_renamed.columns if c != 'date']
        for col in orig_cols:
            df_renamed.rename(columns={col: f'{prefix}_{col}'}, inplace=True)

        renamed_cols = [c for c in df_renamed.columns if c != 'date']
        for col in renamed_cols:
            for lag in lag_list:
                df_renamed[f'{col}_lag{lag}'] = df_renamed[col].shift(lag)

        merged_df = merged_df.merge(df_renamed, on='date', how='left')

    merged_df = merged_df.merge(sentiment_features, on='date', how='left')

    google_trends_df = standardize_date_column(google_trends_df)
    trend_cols = [c for c in google_trends_df.columns if c != 'date']
    for col in trend_cols:
        google_trends_df[f'{col}_lag7'] = google_trends_df[col].shift(7)
        google_trends_df[f'{col}_lag14'] = google_trends_df[col].shift(14)
    merged_df = merged_df.merge(google_trends_df.add_prefix('trends_').rename(columns={'trends_date': 'date'}),
                              on='date', how='left')

    return merged_df

print("="*70)
print("ETHEREUM PRICE PREDICTION - NO LEAKAGE PREPROCESSING")
print("="*70)

print("\nStep 1: Loading raw datasets...")
macro_df = load_and_standardize_data('macro_crypto_data.csv')
news_df = load_and_standardize_data('news_data.csv')
eth_onchain_df = load_and_standardize_data('eth_onchain.csv')
fear_greed_df = load_and_standardize_data('fear_greed.csv')
usdt_eth_mcap_df = load_and_standardize_data('usdt_eth_mcap.csv')
usdt_total_mcap_df = load_and_standardize_data('usdt_total_mcap.csv')
aave_tvl_df = load_and_standardize_data('aave_eth_tvl.csv')
lido_tvl_df = load_and_standardize_data('lido_eth_tvl.csv')
makerdao_tvl_df = load_and_standardize_data('makerdao_eth_tvl.csv')
eth_chain_tvl_df = load_and_standardize_data('eth_chain_tvl.csv')
eth_funding_df = load_and_standardize_data('eth_funding_rate.csv')
sp500_df = load_and_standardize_data('SP500.csv')
vix_df = load_and_standardize_data('VIX.csv')
gold_df = load_and_standardize_data('GOLD.csv')
dxy_df = load_and_standardize_data('DXY.csv')
google_trends_df = load_and_standardize_data('ethereum_google_trends_weekly_2017_2025_scaled.csv')

print(f"Macro: {macro_df.shape}, {macro_df['date'].min()} to {macro_df['date'].max()}")

print("\nStep 2: Creating sentiment features...")
sentiment_features = create_sentiment_features_with_lags(news_df)

print("\nStep 3: Unifying date range...")
common_start = macro_df['date'].min()
common_end = macro_df['date'].max()
for df in [news_df, eth_onchain_df, fear_greed_df]:
    common_start = max(common_start, df['date'].min())
    common_end = min(common_end, df['date'].max())

macro_df = macro_df[(macro_df['date'] >= common_start) & (macro_df['date'] <= common_end)].reset_index(drop=True)
print(f"Unified range: {common_start} to {common_end}")

print("\nStep 4: Split raw data FIRST (70-15-15)...")
train_size = int(len(macro_df) * 0.7)
val_size = int(len(macro_df) * 0.15)

train_raw = macro_df.iloc[:train_size].copy()
val_raw = macro_df.iloc[train_size:train_size+val_size].copy()
test_raw = macro_df.iloc[train_size+val_size:].copy()

print(f"Train raw: {train_raw.shape}, {train_raw['date'].min()} to {train_raw['date'].max()}")
print(f"Val raw: {val_raw.shape}, {val_raw['date'].min()} to {val_raw['date'].max()}")
print(f"Test raw: {test_raw.shape}, {test_raw['date'].min()} to {test_raw['date'].max()}")

data_with_lags = [
    (fear_greed_df, 'fg', [1]),
    (usdt_eth_mcap_df, 'usdt_eth', [1]),
    (usdt_total_mcap_df, 'usdt_total', [1]),
    (aave_tvl_df, 'aave', [1, 3, 7]),
    (lido_tvl_df, 'lido', [1, 3, 7]),
    (makerdao_tvl_df, 'maker', [1, 3, 7]),
    (eth_chain_tvl_df, 'chain_tvl', [1, 3, 7]),
    (eth_funding_df, 'funding', [1]),
    (sp500_df, 'sp500', [1]),
    (vix_df, 'vix', [1]),
    (gold_df, 'gold', [1]),
    (dxy_df, 'dxy', [1])
]

print("\nStep 5: Processing TRAIN set independently...")
eth_cols = [col for col in train_raw.columns if col.startswith('ETH_')]
train_eth = train_raw[['date'] + eth_cols].copy()
has_high_low = 'ETH_High' in train_eth.columns and 'ETH_Low' in train_eth.columns
train_eth = calculate_technical_indicators(train_eth, 'ETH_Close', 'ETH_Volume',
                                          'ETH_High' if has_high_low else None,
                                          'ETH_Low' if has_high_low else None)

btc_cols = [col for col in train_raw.columns if col.startswith('BTC_')]
if btc_cols:
    train_btc = train_raw[['date'] + btc_cols].copy()
    has_btc_hl = 'BTC_High' in train_btc.columns and 'BTC_Low' in train_btc.columns
    train_btc = calculate_technical_indicators(train_btc, 'BTC_Close', 'BTC_Volume',
                                              'BTC_High' if has_btc_hl else None,
                                              'BTC_Low' if has_btc_hl else None)
    train_btc = train_btc.add_prefix('BTC_').rename(columns={'BTC_date': 'date'})

    eth_shifted = train_eth['ETH_Close'].shift(1)
    btc_shifted = train_btc['BTC_BTC_Close'].shift(1)
    train_btc['btc_eth_correlation'] = eth_shifted.rolling(30).corr(btc_shifted)
    train_btc['btc_dominance'] = btc_shifted / (btc_shifted + eth_shifted)
    train_btc['eth_btc_ratio'] = eth_shifted / btc_shifted
    train_btc['eth_btc_ratio_sma_30'] = (eth_shifted / btc_shifted).rolling(30).mean()

    train_eth = train_eth.merge(train_btc[['date'] + [col for col in train_btc.columns if col != 'date']],
                                on='date', how='left')

altcoins = ['BNB', 'ADA']
for coin in altcoins:
    if f'{coin}_Close' in train_raw.columns:
        coin_shifted = train_raw[f'{coin}_Close'].shift(1)
        eth_shifted = train_eth['ETH_Close'].shift(1)
        train_eth[f'{coin.lower()}_eth_ratio'] = coin_shifted / eth_shifted
        train_eth[f'{coin.lower()}_eth_correlation'] = eth_shifted.rolling(30).corr(coin_shifted)

train_df = merge_external_features(train_eth, data_with_lags, sentiment_features, google_trends_df, eth_onchain_df)
train_df['target_next_log_return'] = np.log(train_df['ETH_Close'] / train_df['ETH_Close'].shift(1)).shift(-1)
train_df['target_direction'] = (train_df['target_next_log_return'] > 0).astype(int)

print(f"Train with features: {train_df.shape}")

print("\nStep 6: Processing VAL set (using train+val for rolling windows)...")
combined_for_val = pd.concat([train_raw, val_raw]).reset_index(drop=True)
val_eth = combined_for_val[['date'] + eth_cols].copy()
val_eth = calculate_technical_indicators(val_eth, 'ETH_Close', 'ETH_Volume',
                                        'ETH_High' if has_high_low else None,
                                        'ETH_Low' if has_high_low else None)

if btc_cols:
    val_btc = combined_for_val[['date'] + btc_cols].copy()
    val_btc = calculate_technical_indicators(val_btc, 'BTC_Close', 'BTC_Volume',
                                            'BTC_High' if has_btc_hl else None,
                                            'BTC_Low' if has_btc_hl else None)
    val_btc = val_btc.add_prefix('BTC_').rename(columns={'BTC_date': 'date'})

    eth_shifted = val_eth['ETH_Close'].shift(1)
    btc_shifted = val_btc['BTC_BTC_Close'].shift(1)
    val_btc['btc_eth_correlation'] = eth_shifted.rolling(30).corr(btc_shifted)
    val_btc['btc_dominance'] = btc_shifted / (btc_shifted + eth_shifted)
    val_btc['eth_btc_ratio'] = eth_shifted / btc_shifted
    val_btc['eth_btc_ratio_sma_30'] = (eth_shifted / btc_shifted).rolling(30).mean()

    val_eth = val_eth.merge(val_btc[['date'] + [col for col in val_btc.columns if col != 'date']],
                           on='date', how='left')

for coin in altcoins:
    if f'{coin}_Close' in combined_for_val.columns:
        coin_shifted = combined_for_val[f'{coin}_Close'].shift(1)
        eth_shifted = val_eth['ETH_Close'].shift(1)
        val_eth[f'{coin.lower()}_eth_ratio'] = coin_shifted / eth_shifted
        val_eth[f'{coin.lower()}_eth_correlation'] = eth_shifted.rolling(30).corr(coin_shifted)

val_df_full = merge_external_features(val_eth, data_with_lags, sentiment_features, google_trends_df, eth_onchain_df)
val_df_full['target_next_log_return'] = np.log(val_df_full['ETH_Close'] / val_df_full['ETH_Close'].shift(1)).shift(-1)
val_df_full['target_direction'] = (val_df_full['target_next_log_return'] > 0).astype(int)

val_df = val_df_full.iloc[len(train_raw):].reset_index(drop=True)
print(f"Val with features: {val_df.shape}")

print("\nStep 7: Processing TEST set (using train+val+test for rolling windows)...")
combined_for_test = pd.concat([train_raw, val_raw, test_raw]).reset_index(drop=True)
test_eth = combined_for_test[['date'] + eth_cols].copy()
test_eth = calculate_technical_indicators(test_eth, 'ETH_Close', 'ETH_Volume',
                                         'ETH_High' if has_high_low else None,
                                         'ETH_Low' if has_high_low else None)

if btc_cols:
    test_btc = combined_for_test[['date'] + btc_cols].copy()
    test_btc = calculate_technical_indicators(test_btc, 'BTC_Close', 'BTC_Volume',
                                             'BTC_High' if has_btc_hl else None,
                                             'BTC_Low' if has_btc_hl else None)
    test_btc = test_btc.add_prefix('BTC_').rename(columns={'BTC_date': 'date'})

    eth_shifted = test_eth['ETH_Close'].shift(1)
    btc_shifted = test_btc['BTC_BTC_Close'].shift(1)
    test_btc['btc_eth_correlation'] = eth_shifted.rolling(30).corr(btc_shifted)
    test_btc['btc_dominance'] = btc_shifted / (btc_shifted + eth_shifted)
    test_btc['eth_btc_ratio'] = eth_shifted / btc_shifted
    test_btc['eth_btc_ratio_sma_30'] = (eth_shifted / btc_shifted).rolling(30).mean()

    test_eth = test_eth.merge(test_btc[['date'] + [col for col in test_btc.columns if col != 'date']],
                             on='date', how='left')

for coin in altcoins:
    if f'{coin}_Close' in combined_for_test.columns:
        coin_shifted = combined_for_test[f'{coin}_Close'].shift(1)
        eth_shifted = test_eth['ETH_Close'].shift(1)
        test_eth[f'{coin.lower()}_eth_ratio'] = coin_shifted / eth_shifted
        test_eth[f'{coin.lower()}_eth_correlation'] = eth_shifted.rolling(30).corr(coin_shifted)

test_df_full = merge_external_features(test_eth, data_with_lags, sentiment_features, google_trends_df, eth_onchain_df)
test_df_full['target_next_log_return'] = np.log(test_df_full['ETH_Close'] / test_df_full['ETH_Close'].shift(1)).shift(-1)
test_df_full['target_direction'] = (test_df_full['target_next_log_return'] > 0).astype(int)

test_df = test_df_full.iloc[len(train_raw)+len(val_raw):].reset_index(drop=True)
print(f"Test with features: {test_df.shape}")

print("\nStep 8: Removing NaN rows from each set...")
max_lag = 60
train_df = train_df.iloc[max_lag:-1].reset_index(drop=True)
val_df = val_df.iloc[:-1].reset_index(drop=True)
test_df = test_df.iloc[:-1].reset_index(drop=True)

print(f"After NaN removal:")
print(f"Train: {train_df.shape}")
print(f"Val: {val_df.shape}")
print(f"Test: {test_df.shape}")


ETHEREUM PRICE PREDICTION - NO LEAKAGE PREPROCESSING

Step 1: Loading raw datasets...
Macro: (3199, 51), 2017-01-01 00:00:00 to 2025-10-05 00:00:00

Step 2: Creating sentiment features...

Step 3: Unifying date range...
Unified range: 2020-01-01 00:00:00 to 2025-10-04 00:00:00

Step 4: Split raw data FIRST (70-15-15)...
Train raw: (1472, 51), 2020-01-01 00:00:00 to 2024-01-11 00:00:00
Val raw: (315, 51), 2024-01-12 00:00:00 to 2024-11-21 00:00:00
Test raw: (316, 51), 2024-11-22 00:00:00 to 2025-10-03 00:00:00

Step 5: Processing TRAIN set independently...
Train with features: (1472, 265)

Step 6: Processing VAL set (using train+val for rolling windows)...
Val with features: (315, 265)

Step 7: Processing TEST set (using train+val+test for rolling windows)...
Test with features: (323, 265)

Step 8: Removing NaN rows from each set...
After NaN removal:
Train: (1411, 265)
Val: (314, 265)
Test: (322, 265)


In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression, RFECV
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import numpy as np
import pandas as pd


'''
최신 논문(특히 2025년도 "Optimizing Forecast Accuracy in Cryptocurrency Markets: 
Evaluating Feature Selection Techniques for Technical Indicators") 
및 여러 비교 논문들은 supervised+unsupervised 방법을 조합하되 
반드시 nested cross-validation과 시계열 보존(roll-forward 방식)으로 
데이터 누수를 막는 것을 기준으로 하고 있음. 따라서, 데이터 누수를 막기 위해서

'''
# [0] 데이터 분할 및 전처리(기술적지표, 외부지표 등 생성 단계는 기존 코드 동일)
exclude_cols = ['date', 'ETH_Close', 'ETH_High', 'ETH_Low', 'ETH_Open', 'ETH_Volume', 'target_next_log_return', 'target_direction']
feature_cols = [c for c in train_df.columns if c not in exclude_cols]

# [1] Unsupervised 필터 (variance, correlation) - 전체 dataset/validation/test에는 사용 가능
print("\n[Phase 1] Unsupervised filtering (variance/correlation) ...")
to_drop = [c for c in feature_cols if train_df[c].isnull().sum()>len(train_df)*0.5 or train_df[c].nunique()<=1]
feature_cols = [c for c in feature_cols if c not in to_drop]
for df in [train_df, val_df, test_df]:
    df.drop(columns=to_drop, inplace=True, errors='ignore')
for col in feature_cols:
    train_df[col] = train_df[col].interpolate().fillna(train_df[col].median())
    val_df[col] = val_df[col].interpolate().fillna(train_df[col].median())
    test_df[col] = test_df[col].interpolate().fillna(train_df[col].median())
corr_matrix = train_df[feature_cols].corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop_corr = [column for column in upper_tri.columns if any(upper_tri[column]>0.95)]
feature_cols = [c for c in feature_cols if c not in to_drop_corr]
for df in [train_df, val_df, test_df]:
    df.drop(columns=to_drop_corr, inplace=True, errors='ignore')

# [2] Nested CV 기반 supervised feature selection + 모델 튜닝
print("\n[Phase 2] Nested cross-validation: supervised feature selection & tuning ...")

X = train_df[feature_cols].values
y = train_df['target_next_log_return'].values

tscv_outer = TimeSeriesSplit(n_splits=5)
final_selected_features = []
best_param_list = []

for train_index, valid_index in tscv_outer.split(X):
    # Split indices for fold
    X_train_cv, X_valid_cv = X[train_index], X[valid_index]
    y_train_cv, y_valid_cv = y[train_index], y[valid_index]

    # [A] Supervised feature selection (MI + RFECV) - 내부CV만 사용
    mi_scores = mutual_info_regression(X_train_cv, y_train_cv, random_state=42, n_neighbors=5)
    mi_rank_idx = np.argsort(mi_scores)[::-1][:60]  # top 60개
    mi_features_idx = [feature_cols[i] for i in mi_rank_idx]
    X_train_mi = X_train_cv[:, mi_rank_idx]

    # RFECV with GridSearch for best model in fold
    xgb = XGBRegressor(tree_method='gpu_hist', n_jobs=-1, random_state=42)
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    rfecv = RFECV(estimator=xgb, step=1, cv=TimeSeriesSplit(3), scoring='neg_mean_squared_error', min_features_to_select=20)
    
    # RFECV 실행
    rfecv.fit(X_train_mi, y_train_cv)
    selected_idx = [i for i, s in enumerate(rfecv.support_) if s]
    selected_features_this_fold = [mi_features_idx[i] for i in selected_idx]
    final_selected_features.append(selected_features_this_fold)

    # 최적 파라미터
    grid_search = GridSearchCV(xgb, param_grid, scoring='neg_mean_squared_error', cv=TimeSeriesSplit(3), n_jobs=-1)
    grid_search.fit(X_train_mi[:,selected_idx], y_train_cv)
    best_param_list.append(grid_search.best_params_)

print(f"\nNested CV finished.\nAll folds selected features: {final_selected_features}")
print(f"\nBest params from each fold: {best_param_list}")

# 교차검증에서 가장 많이 선택된 feature만 retain

flat_features = [f for featlist in final_selected_features for f in featlist]
feature_freq = Counter(flat_features)
selected_final_features = [f for f, cnt in feature_freq.items() if cnt > (len(final_selected_features)//2)]
print(f"\nConsensus selected features (appearing in majority folds):\n{selected_final_features}")

print("\n[Step 3] Scaling selected features only (no leakage)...")
scaler = StandardScaler()
train_df[selected_final_features] = scaler.fit_transform(train_df[selected_final_features])
val_df[selected_final_features] = scaler.transform(val_df[selected_final_features])
test_df[selected_final_features] = scaler.transform(test_df[selected_final_features])

print(f"\nFinal feature selection complete. Model ready: {len(selected_final_features)} features.")



[Phase 1] Unsupervised filtering (variance/correlation) ...

[Phase 2] Nested cross-validation: supervised feature selection & tuning ...

Nested CV finished.
All folds selected features: [['BTC_DMP_14', 'BTC_bb_position_20', 'volume_sma_7', 'BTC_returns', 'positive_ratio', 'volume_lag_7', 'dxy_DXY', 'returns', 'BTC_returns_lag_7', 'bb_width_20', 'DMN_14', 'BTC_returns_lag_14', 'returns_lag_3', 'returns_ema_7', 'BTC_volume_lag_3', 'BTC_returns_lag_1', 'returns_sma_60', 'obv', 'returns_sma_7', 'stochastic_14', 'BTC_BTC_Open'], ['BTC_returns_ema_14', 'BTC_BTC_Volume', 'volume_lag_1', 'returns_lag_7', 'dxy_DXY', 'BTC_MACD_12_26_9', 'BTC_volume_sma_7', 'BTC_obv', 'BTC_bb_position_20', 'funding_fundingRate', 'volume_sma_7', 'returns', 'negative_ratio', 'bnb_eth_ratio', 'BTC_returns', 'BTC_rsi_14', 'BTC_bb_width_20', 'BTC_returns_ema_7', 'BTC_ADX_14', 'BTC_mfi_14', 'bb_position_20', 'sentiment_mean_lag1', 'vix_VIX', 'BTC_DMN_14', 'BTC_roc_20', 'BTC_atr_14', 'bnb_eth_correlation', 'ada_eth_r

In [None]:



# print("\nStep 9: Advanced Feature Selection (논문 기반)...")
# print("Reference: 'Optimizing Forecast Accuracy in Cryptocurrency Markets' (2025)")

# exclude_cols = ['date', 'ETH_Close', 'ETH_High', 'ETH_Low', 'ETH_Open', 'ETH_Volume',
#                 'target_next_log_return', 'target_direction']
# feature_cols = [col for col in train_df.columns if col not in exclude_cols]

# # PHASE 1: 결측치 제거
# print("\n[Phase 1] Variance-based filtering...")

# cols_to_drop = []
# for col in feature_cols:
#     if train_df[col].isnull().sum() > len(train_df) * 0.5:
#         cols_to_drop.append(col)

# if cols_to_drop:
#     print(f"Dropping {len(cols_to_drop)} features with >50% missing")
#     feature_cols = [c for c in feature_cols if c not in cols_to_drop]
#     train_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
#     val_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
#     test_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# for col in feature_cols:
#     train_df[col] = train_df[col].fillna(method='ffill').fillna(train_df[col].median())
#     train_median = train_df[col].median()
#     val_df[col] = val_df[col].fillna(method='ffill').fillna(train_median)
#     test_df[col] = test_df[col].fillna(method='ffill').fillna(train_median)

# print(f"Features after Phase 1: {len(feature_cols)}")

# # PHASE 2: 상관관계 0.95 이상 중복 제거
# print("\n[Phase 2] Correlation-based redundancy removal...")
# corr_matrix = train_df[feature_cols].corr().abs()
# upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop_corr = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]

# if to_drop_corr:
#     print(f"Dropping {len(to_drop_corr)} highly correlated features (>0.95)")
#     feature_cols = [c for c in feature_cols if c not in to_drop_corr]
#     train_df.drop(columns=to_drop_corr, inplace=True, errors='ignore')
#     val_df.drop(columns=to_drop_corr, inplace=True, errors='ignore')
#     test_df.drop(columns=to_drop_corr, inplace=True, errors='ignore')

# print(f"Features after Phase 2: {len(feature_cols)}")

# # PHASE 3: Mutual Information (Top 50~60)
# print("\n[Phase 3] Mutual Information feature selection...")
# print("Reference: MI effectively captures non-linear relationships in crypto markets")

# X_train = train_df[feature_cols].values
# y_train = train_df['target_next_log_return'].values
# X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
# y_train = np.nan_to_num(y_train, nan=0.0, posinf=0.0, neginf=0.0)

# mi_scores = mutual_info_regression(X_train, y_train, random_state=42, n_neighbors=5)
# mi_scores_series = pd.Series(mi_scores, index=feature_cols).sort_values(ascending=False)
# n_mi_features = min(60, len(feature_cols))
# top_mi_features = mi_scores_series.head(n_mi_features).index.tolist()
# print(f"Selected top {len(top_mi_features)} features by MI")
# print(f"Top 10 MI scores:\n{mi_scores_series.head(10)}")

# X_train_mi = train_df[top_mi_features].values
# X_train_mi = np.nan_to_num(X_train_mi, nan=0.0, posinf=0.0, neginf=0.0)

# # ====================하이퍼파라미터 자동 탐색 ====================
# print("\nStep 9.1: XGBoost Hyperparameter Optimization (Optuna, 논문 권장)")

# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 400),
#         'max_depth': trial.suggest_int('max_depth', 3, 8),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
#         'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 1.0),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 2.0),
#         'tree_method': 'gpu_hist',
#         'random_state': 42,
#         'n_jobs': -1
#     }
    
#     model = XGBRegressor(**params)
#     score = cross_val_score(model, X_train_mi, y_train, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
#     return -score.mean()


# # Pruner 적용
# study = optuna.create_study(
#     direction='minimize',
#     pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
# )
# study.optimize(objective, n_trials=50)  

# print("Best XGBoost params:", study.best_trial.params)

# best_xgb = XGBRegressor(**study.best_trial.params)

# # PHASE 4: RFECV with 최적 하이퍼파라미터 적용
# print("\n[Phase 4] Recursive Feature Elimination with Cross-Validation... (Optuna 최적 파라미터 적용)")

# selector = RFECV(
#     estimator=best_xgb,
#     step=1,
#     cv=5,
#     scoring='neg_mean_squared_error',
#     min_features_to_select=20,
#     n_jobs=-1
# )
# print("Training RFECV... (this may take a few minutes with GPU & Optuna-tuned params)")
# selector.fit(X_train_mi, y_train)
# selected_features = [top_mi_features[i] for i in range(len(top_mi_features)) if selector.support_[i]]
# print(f"\nOptimal number of features: {len(selected_features)}")
# print(f"Feature reduction: {100 * (1 - len(selected_features) / len(feature_cols)):.1f}%")
# print(f"\nSelected features:\n{selected_features}")

# # PHASE 5: Feature Importance 분석
# print("\n[Phase 5] XGBoost Feature Importance analysis...")
# feature_importance = pd.Series(
#     selector.estimator_.feature_importances_,
#     index=selected_features
# ).sort_values(ascending=False)
# print(f"\nTop 15 most important features:")
# print(feature_importance.head(15))

# # ====================  카테고리별 상세 분류 ====================

# def categorize_feature(f):
#     if any(x in f for x in ['rsi', 'macd', 'momentum', 'roc', 'cci', 'stochastic', 'williams']):
#         return 'Momentum'
#     if any(x in f for x in ['volatility', 'atr', 'bb_', 'bbands']):
#         return 'Volatility'
#     if any(x in f for x in ['volume', 'obv', 'mfi']):
#         return 'Volume'
#     if any(x in f for x in ['sma', 'ema', 'trend']):
#         return 'Trend'
#     if 'onchain_' in f:
#         return 'On-chain'
#     if any(x in f for x in ['sp500', 'vix', 'gold', 'dxy']):
#         return 'Macro'
#     if any(x in f for x in ['sentiment', 'news', 'positive_ratio', 'negative_ratio']):
#         return 'Sentiment'
#     if any(x in f for x in ['aave', 'lido', 'maker', 'chain_tvl', 'funding']):
#         return 'External'
#     return 'Other'

# category_map = {f: categorize_feature(f) for f in selected_features}

# category_counts = Counter(category_map.values())
# print("\nFeature category breakdown (detailed):")
# for cat, count in category_counts.items():
#     print(f"{cat}: {count}")

# print("\nDetailed feature list by category:")
# for cat in sorted(set(category_map.values())):
#     print(f"\n[{cat}]")
#     for f in [k for k, v in category_map.items() if v == cat]:
#         print(f"  - {f}")

# # ==================== Step 10: Scaling ====================
# print("\n[Step 10] Scaling selected features only...")
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# train_df[selected_features] = scaler.fit_transform(train_df[selected_features])
# val_df[selected_features] = scaler.transform(val_df[selected_features])
# test_df[selected_features] = scaler.transform(test_df[selected_features])

# print(f"\n{'='*70}")
# print(f"FINAL PREPROCESSED DATASETS")
# print(f"{'='*70}")
# print(f"Train: {train_df.shape}, {train_df['date'].min()} to {train_df['date'].max()}")
# print(f"Val: {val_df.shape}, {val_df['date'].min()} to {val_df['date'].max()}")
# print(f"Test: {test_df.shape}, {test_df['date'].min()} to {test_df['date'].max()}")
# print(f"Selected Features: {len(selected_features)}")
# print(f"\nFeature selection complete. Ready for model training.")

In [None]:
################################# 퍼플렉시티 버전인데 일단 주석 ##########
# print("\nStep 9: Feature selection and missing value handling...")
# exclude_cols = ['date', 'ETH_Close', 'ETH_High', 'ETH_Low', 'ETH_Open', 'ETH_Volume',
#                 'target_next_log_return', 'target_direction']
# feature_cols = [col for col in train_df.columns if col not in exclude_cols]

# cols_to_drop = []
# for col in feature_cols:
#     if train_df[col].isnull().sum() > len(train_df) * 0.5:
#         cols_to_drop.append(col)

# if cols_to_drop:
#     print(f"Dropping {len(cols_to_drop)} features with >50% missing in train")
#     feature_cols = [c for c in feature_cols if c not in cols_to_drop]
#     train_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
#     val_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
#     test_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# for col in feature_cols:
#     train_df[col] = train_df[col].fillna(method='ffill').fillna(train_df[col].median())
#     train_median = train_df[col].median()

#     val_df[col] = val_df[col].fillna(method='ffill').fillna(train_median)
#     test_df[col] = test_df[col].fillna(method='ffill').fillna(train_median)

# print("\nStep 10: Scaling with StandardScaler...")
# scaler = StandardScaler()
# train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])
# val_df[feature_cols] = scaler.transform(val_df[feature_cols])
# test_df[feature_cols] = scaler.transform(test_df[feature_cols])

# print(f"\nFinal datasets:")
# print(f"Train: {train_df.shape}, {train_df['date'].min()} to {train_df['date'].max()}")
# print(f"Val: {val_df.shape}, {val_df['date'].min()} to {val_df['date'].max()}")
# print(f"Test: {test_df.shape}, {test_df['date'].min()} to {test_df['date'].max()}")
# print(f"Features: {len(feature_cols)}")

In [None]:
###############################    클로드 버전 #############################

In [56]:

print("\nStep 11: Feature Selection...")

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold

exclude_cols = ['date', 'ETH_Close', 'ETH_High', 'ETH_Low', 'ETH_Open', 'ETH_Volume',
                'target_next_log_return', 'target_direction']
all_features = [col for col in train_df.columns if col not in exclude_cols]

print(f"Initial features: {len(all_features)}")

# Stage 1: Variance threshold
selector = VarianceThreshold(threshold=0.01)
selector.fit(train_df[all_features])
features_after_variance = [feat for feat, selected in zip(all_features, selector.get_support()) if selected]
print(f"After variance filter: {len(features_after_variance)} features")

# Stage 2: Correlation filter
corr_matrix = train_df[features_after_variance].corr().abs()
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper_triangle.columns if any(upper_triangle[col] > 0.95)]
features_after_corr = [f for f in features_after_variance if f not in to_drop]
print(f"After correlation filter: {len(features_after_corr)} features")

# Stage 3: Tree-based importance
X_train = train_df[features_after_corr].values
y_train = train_df['target_next_log_return'].values
mask = ~np.isnan(y_train)

rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train[mask], y_train[mask])

importances = pd.DataFrame({
    'feature': features_after_corr,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

TOP_K = 30
selected_features = importances.head(TOP_K)['feature'].tolist()

print(f"\nFinal selected features: {len(selected_features)}")
print(f"\nTop {TOP_K} most important:")
for idx, row in importances.head(TOP_K).iterrows():
    print(f"  {row['feature']}: {row['importance']:.4f}")

# Apply selection to all datasets
train_df = train_df[['date'] + selected_features + ['target_next_log_return', 'target_direction']].copy()
val_df = val_df[['date'] + selected_features + ['target_next_log_return', 'target_direction']].copy()
test_df = test_df[['date'] + selected_features + ['target_next_log_return', 'target_direction']].copy()

print(f"\nFinal dataset shapes:")
print(f"Train: {train_df.shape}")
print(f"Val: {val_df.shape}")
print(f"Test: {test_df.shape}")


targets = {
    'log_return': 'target_next_log_return',
    'direction': 'target_direction'
}




Step 11: Feature Selection...
Initial features: 208
After variance filter: 208 features
After correlation filter: 127 features

Final selected features: 30

Top 30 most important:
  BTC_obv: 0.0551
  returns_lag_3: 0.0467
  returns: 0.0431
  macd_signal_12_26_9: 0.0413
  BTC_BTC_Volume: 0.0244
  btc_dominance: 0.0237
  volume_lag_3: 0.0222
  lido_lido_eth_tvl: 0.0201
  BTC_roc_20: 0.0170
  BTC_returns: 0.0152
  volume_sma_7: 0.0151
  BTC_returns_lag_14: 0.0141
  volume_lag_14: 0.0137
  btc_eth_correlation: 0.0135
  BTC_macd_12_26: 0.0131
  BTC_mfi_14: 0.0119
  dxy_DXY: 0.0116
  returns_sma_60: 0.0111
  vix_VIX: 0.0110
  BTC_returns_ema_7: 0.0107
  BTC_macd_hist_12_26_9: 0.0106
  sentiment_mean: 0.0106
  returns_ema_7: 0.0105
  BTC_returns_lag_7: 0.0099
  bnb_eth_ratio: 0.0098
  BTC_returns_sma_14: 0.0097
  funding_fundingRate_lag1: 0.0096
  BTC_volatility_7: 0.0096
  vwap: 0.0093
  BTC_returns_lag_1: 0.0092

Final dataset shapes:
Train: (1411, 33)
Val: (314, 33)
Test: (322, 33)


In [None]:
################################### 퍼플렉시티 버전 ################################

In [60]:

print("\n" + "="*70)
print("Step 11: Advanced Feature Selection")
print("="*70)

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold, RFECV
from sklearn.model_selection import TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')

TOP_K = 30  
USE_RFECV = False 
CUMULATIVE_THRESHOLD = 0.95 
exclude_cols = ['date', 'ETH_Close', 'ETH_High', 'ETH_Low', 'ETH_Open', 'ETH_Volume',
                'target_next_log_return', 'target_direction']
all_features = [col for col in train_df.columns if col not in exclude_cols]

print(f"\nInitial features: {len(all_features)}")

# ============================================================================
# Stage 1: Variance Threshold
# ============================================================================
print("\n[Stage 1] Variance Threshold Filter...")
selector = VarianceThreshold(threshold=0.01)
selector.fit(train_df[all_features])
features_after_variance = [feat for feat, selected in zip(all_features, selector.get_support()) if selected]
print(f"  Remaining: {len(features_after_variance)} features")

# ============================================================================
# Stage 2: Correlation Filter (Removes redundant features)
# ============================================================================
print("\n[Stage 2] Correlation Filter (threshold=0.95)...")
corr_matrix = train_df[features_after_variance].corr().abs()
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper_triangle.columns if any(upper_triangle[col] > 0.95)]
features_after_corr = [f for f in features_after_variance if f not in to_drop]
print(f"  Dropped {len(to_drop)} highly correlated features")
print(f"  Remaining: {len(features_after_corr)} features")

# ============================================================================
# Stage 3: Tree-based Feature Importance
# ============================================================================
print("\n[Stage 3] Random Forest Feature Importance...")
X_train = train_df[features_after_corr].values
y_train = train_df['target_next_log_return'].values
mask = ~np.isnan(y_train)

rf = RandomForestRegressor(
    n_estimators=200,  # Increased for stability
    max_depth=15,
    min_samples_split=10,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train[mask], y_train[mask])

importances = pd.DataFrame({
    'feature': features_after_corr,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# ============================================================================
# Stage 4: Feature Selection Strategy
# ============================================================================
if USE_RFECV:
    print("\n[Stage 4] RFECV - Finding Optimal Feature Count...")
    print("  (This may take several minutes...)")

    rfecv = RFECV(
        estimator=RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42, n_jobs=-1),
        step=1,
        cv=TimeSeriesSplit(n_splits=3),
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=0
    )
    rfecv.fit(X_train[mask], y_train[mask])

    selected_features = [features_after_corr[i] for i in range(len(features_after_corr)) if rfecv.support_[i]]
    print(f"  RFECV selected {len(selected_features)} optimal features")
    print(f"  (Ignoring TOP_K={TOP_K} parameter)")

else:
    # Method 1: Top K selection
    print(f"\n[Stage 4] Top-K Selection (K={TOP_K})...")
    selected_features = importances.head(TOP_K)['feature'].tolist()

    # Calculate cumulative importance for reference
    importances['cumulative'] = importances['importance'].cumsum()
    importances['cumulative_pct'] = importances['cumulative'] / importances['importance'].sum()

    cum_at_k = importances.iloc[TOP_K-1]['cumulative_pct']
    print(f"  Cumulative importance at K={TOP_K}: {cum_at_k:.2%}")

    # Show how many features needed for 95% cumulative importance
    features_for_95 = len(importances[importances['cumulative_pct'] <= CUMULATIVE_THRESHOLD])
    print(f"  Features needed for {CUMULATIVE_THRESHOLD:.0%} importance: {features_for_95}")

# ============================================================================
# Display Top Features with Category Analysis
# ============================================================================
print(f"\n{'='*70}")
print(f"Selected {len(selected_features)} Features")
print(f"{'='*70}")

# Categorize features
def categorize_feature(feat):
    if feat.startswith('BTC_'):
        return 'BTC'
    elif any(x in feat for x in ['returns', 'log_returns', 'momentum', 'roc']):
        return 'Momentum'
    elif any(x in feat for x in ['volume', 'obv']):
        return 'Volume'
    elif any(x in feat for x in ['sma', 'ema', 'macd', 'rsi', 'bb_', 'atr', 'adx']):
        return 'Technical'
    elif any(x in feat for x in ['tvl', 'onchain', 'lido', 'aave', 'maker']):
        return 'DeFi/OnChain'
    elif any(x in feat for x in ['sentiment', 'news']):
        return 'Sentiment'
    elif any(x in feat for x in ['vix', 'sp500', 'gold', 'dxy']):
        return 'Macro'
    elif any(x in feat for x in ['correlation', 'ratio', 'dominance']):
        return 'Market Structure'
    else:
        return 'Other'

# Category distribution
categories = {}
for feat in selected_features:
    cat = categorize_feature(feat)
    categories[cat] = categories.get(cat, 0) + 1

print("\nCategory Distribution:")
for cat, count in sorted(categories.items(), key=lambda x: -x[1]):
    pct = count / len(selected_features) * 100
    print(f"  {cat:20s}: {count:2d} ({pct:5.1f}%)")

print(f"\n Top {min(30, len(selected_features))} Most Important Features:")
print(f"{'Rank':<6} {'Feature':<35} {'Importance':<12} {'Category':<20}")
print("-" * 75)
for idx, row in importances.head(min(30, len(selected_features))).iterrows():
    feat = row['feature']
    if feat in selected_features:
        rank = list(selected_features).index(feat) + 1
        cat = categorize_feature(feat)
        print(f"{rank:<6} {feat:<35} {row['importance']:.6f}    {cat:<20}")

# ============================================================================
# Check for Missing Critical Features
# ============================================================================
print("\n" + "="*70)
print("Critical Feature Check")
print("="*70)

critical_features = {
    'RSI': [f for f in features_after_corr if 'rsi' in f.lower()],
    'ADX': [f for f in features_after_corr if 'adx' in f.lower()],
    'Bollinger Bands': [f for f in features_after_corr if 'bb_' in f],
    'Stochastic': [f for f in features_after_corr if 'stochastic' in f],
}

for indicator, features in critical_features.items():
    if features:
        selected_count = len([f for f in features if f in selected_features])
        if selected_count > 0:
            print(f"  ✓ {indicator}: {selected_count}/{len(features)} selected")
        else:
            # Find rank of best feature in this category
            ranks = [importances[importances['feature']==f].index[0] for f in features if f in importances['feature'].values]
            if ranks:
                best_rank = min(ranks) + 1
                print(f"  ⚠ {indicator}: Not in top {len(selected_features)}, best rank: {best_rank}")
    else:
        print(f"  ✗ {indicator}: Not computed")

# ============================================================================
# Apply Selection to All Datasets
# ============================================================================
print("\n" + "="*70)
print("Applying Feature Selection to All Datasets")
print("="*70)

train_df = train_df[['date'] + selected_features + ['target_next_log_return', 'target_direction']].copy()
val_df = val_df[['date'] + selected_features + ['target_next_log_return', 'target_direction']].copy()
test_df = test_df[['date'] + selected_features + ['target_next_log_return', 'target_direction']].copy()

print(f"\nFinal Dataset Shapes:")
print(f"  Train: {train_df.shape}")
print(f"  Val:   {val_df.shape}")
print(f"  Test:  {test_df.shape}")


importances[importances['feature'].isin(selected_features)].to_csv('feature_importance.csv', index=False)

print("\n" + "="*70)
print("Feature Selection Completed Successfully!")
print("="*70)

print(f"  TOP_K: {TOP_K}")
print(f"  USE_RFECV: {USE_RFECV}")
print(f"  CUMULATIVE_THRESHOLD: {CUMULATIVE_THRESHOLD}")




Step 11: Advanced Feature Selection

Initial features: 208

[Stage 1] Variance Threshold Filter...
  Remaining: 208 features

[Stage 2] Correlation Filter (threshold=0.95)...
  Dropped 81 highly correlated features
  Remaining: 127 features

[Stage 3] Random Forest Feature Importance...

[Stage 4] Top-K Selection (K=30)...
  Cumulative importance at K=30: 52.82%
  Features needed for 95% importance: 105

Selected 30 Features

Category Distribution:
  BTC                 : 13 ( 43.3%)
  Momentum            :  5 ( 16.7%)
  Volume              :  5 ( 16.7%)
  Market Structure    :  3 ( 10.0%)
  Technical           :  1 (  3.3%)
  DeFi/OnChain        :  1 (  3.3%)
  Macro               :  1 (  3.3%)
  Other               :  1 (  3.3%)

 Top 30 Most Important Features:
Rank   Feature                             Importance   Category            
---------------------------------------------------------------------------
1      BTC_obv                             0.056564    BTC             

date                        0
BTC_obv                     0
returns                     0
returns_lag_3               0
macd_signal_12_26_9         0
btc_dominance               0
BTC_BTC_Volume              0
BTC_returns                 0
volume_lag_3                0
lido_lido_eth_tvl           0
returns_sma_60              0
returns_ema_7               0
volume_sma_7                0
returns_lag_1               0
volume_lag_14               0
BTC_returns_lag_14          0
BTC_roc_20                  0
BTC_macd_12_26              0
btc_eth_correlation         0
dxy_DXY                     0
BTC_returns_lag_7           0
volume_lag_7                0
BTC_returns_ema_7           0
BTC_returns_lag_1           0
bnb_eth_ratio               0
BTC_returns_lag_3           0
BTC_momentum_10             0
funding_fundingRate_lag1    0
BTC_cci_20                  0
volume_lag_1                0
BTC_mfi_14                  0
target_next_log_return      0
target_direction            0
dtype: int