In [None]:
!pip install ppscore


In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import yfinance as yf
from pytrends.request import TrendReq
import ppscore as pps
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_recall_curve, auc, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
tf.random.set_seed(42)

def load_onchain_data():
    df = pd.read_csv('eth_onchain.csv', index_col=0, parse_dates=True)
    df.index = pd.to_datetime(df.index)
    return df

def load_external_data(start_date, end_date):
    eth = yf.download('ETH-USD', start=start_date, end=end_date, progress=False)
    if isinstance(eth.columns, pd.MultiIndex):
        eth.columns = eth.columns.droplevel(1)
    eth_data = pd.DataFrame({
        'eth_close': eth['Close'],
        'eth_vol': eth['Volume']
    })
    
    spx = yf.download('^GSPC', start=start_date, end=end_date, progress=False)
    if isinstance(spx.columns, pd.MultiIndex):
        spx.columns = spx.columns.droplevel(1)
    spx_data = pd.DataFrame({'spx_close': spx['Close']})
    
    gold = yf.download('GC=F', start=start_date, end=end_date, progress=False)
    if isinstance(gold.columns, pd.MultiIndex):
        gold.columns = gold.columns.droplevel(1)
    gold_data = pd.DataFrame({'gold_close': gold['Close']})
    
    brent = yf.download('BZ=F', start=start_date, end=end_date, progress=False)
    if isinstance(brent.columns, pd.MultiIndex):
        brent.columns = brent.columns.droplevel(1)
    brent_data = pd.DataFrame({'brent_close': brent['Close']})
    
    external = pd.concat([eth_data, spx_data, gold_data, brent_data], axis=1)
    
    pytrends = TrendReq(hl='en-US', tz=0)
    keywords = ['ethereum', 'cryptocurrency', 'bitcoin']
    
    try:
        timeframe_str = f'{start_date} {end_date}'
        pytrends.build_payload(keywords, timeframe=timeframe_str)
        trends = pytrends.interest_over_time()
        if not trends.empty:
            trends = trends.drop('isPartial', axis=1, errors='ignore')
            trends.columns = ['g_ethereum', 'g_crypto', 'g_bitcoin']
            external = external.join(trends)
    except:
        pass
    
    return external

def merge_data(onchain_df, external_df):
    merged = onchain_df.join(external_df, how='inner')
    merged = merged.fillna(method='ffill').fillna(method='bfill')
    merged = merged.dropna()
    return merged

def feature_engineering(df, split_idx):
    df = df.copy()
    
    df['logret'] = np.log(df['eth_close']) - np.log(df['eth_close'].shift(1))
    df['ma7'] = df['eth_close'].rolling(window=7, min_periods=1).mean()
    df['ma21'] = df['eth_close'].rolling(window=21, min_periods=1).mean()
    df['vol7'] = df['logret'].rolling(window=7, min_periods=1).std()
    df['ma_cross'] = df['ma7'] - df['ma21']
    df['rsi'] = calculate_rsi(df['eth_close'], 14)
    df['dayofweek'] = df.index.dayofweek
    df['month'] = df.index.month
    
    base_features = [col for col in df.columns if col not in ['eth_close', 'dayofweek', 'month']]
    
    for feature in base_features:
        for lag in range(1, 8):
            df[f'{feature}_lag{lag}'] = df[feature].shift(lag)
    
    df = df.dropna()
    return df

def calculate_rsi(series, period=14):
    delta = series.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=period, min_periods=1).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period, min_periods=1).mean()
    rs = gain / (loss + 1e-10)
    rsi = 100 - (100 / (1 + rs))
    return rsi

def adaptive_feature_selection(df, price_col='eth_close', pps_thresh=0.10, pearson_thresh=0.005):
    feature_cols = [col for col in df.columns if col not in [price_col]]
    
    pps_scores = {}
    for col in feature_cols:
        try:
            score = pps.score(df, col, price_col)['ppscore']
            pps_scores[col] = score
        except:
            pps_scores[col] = 0
    
    selected_phase1 = [col for col, score in pps_scores.items() if score >= pps_thresh]
    
    if len(selected_phase1) == 0:
        pps_thresh = 0.05
        selected_phase1 = [col for col, score in pps_scores.items() if score >= pps_thresh]
    
    if len(selected_phase1) > 0:
        corr = df[selected_phase1].corrwith(df[price_col])
        selected_phase2 = [col for col in selected_phase1 if abs(corr[col]) >= pearson_thresh]
    else:
        selected_phase2 = []
    
    onchain_keywords = ['tx_count', 'active_addresses', 'new_addresses', 'large_eth_transfers', 
                        'token_transfers', 'contract_events', 'avg_gas_price', 'total_gas_used', 
                        'avg_block_size', 'avg_block_difficulty']
    
    selected_onchain = [col for col in selected_phase2 if any(kw in col for kw in onchain_keywords)]
    selected_external = [col for col in selected_phase2 if col not in selected_onchain]
    
    if len(selected_onchain) == 0:
        selected_onchain = [col for col in feature_cols if any(kw in col for kw in onchain_keywords)][:10]
    if len(selected_external) == 0:
        selected_external = [col for col in feature_cols if col not in selected_onchain][:10]
    
    return selected_onchain, selected_external

def make_sequences(df, selected_onchain, selected_external, price_col='eth_close', timesteps=7):
    df = df.copy()
    
    df['target_price'] = df[price_col].shift(-1)
    df['target_dir'] = (df[price_col].shift(-1) > df[price_col]).astype(int)
    
    df = df.dropna()
    
    split_idx = int(len(df) * 0.8)
    train_df = df.iloc[:split_idx]
    test_df = df.iloc[split_idx:]
    
    scaler_on = MinMaxScaler()
    scaler_ex = MinMaxScaler()
    scaler_price = MinMaxScaler()
    
    scaler_on.fit(train_df[selected_onchain])
    scaler_ex.fit(train_df[selected_external])
    scaler_price.fit(train_df[['target_price']])
    
    df_on_scaled = scaler_on.transform(df[selected_onchain])
    df_ex_scaled = scaler_ex.transform(df[selected_external])
    target_price_scaled = scaler_price.transform(df[['target_price']])
    
    X_on, X_ex, y_price, y_dir, indices = [], [], [], [], []
    
    for i in range(timesteps, len(df)):
        X_on.append(df_on_scaled[i-timesteps:i])
        X_ex.append(df_ex_scaled[i-timesteps:i])
        y_price.append(target_price_scaled[i, 0])
        y_dir.append(df['target_dir'].iloc[i])
        indices.append(i)
    
    X_on = np.array(X_on)
    X_ex = np.array(X_ex)
    y_price = np.array(y_price)
    y_dir = np.array(y_dir)
    
    train_size = split_idx - timesteps
    
    X_on_train, X_on_test = X_on[:train_size], X_on[train_size:]
    X_ex_train, X_ex_test = X_ex[:train_size], X_ex[train_size:]
    y_price_train, y_price_test = y_price[:train_size], y_price[train_size:]
    y_dir_train, y_dir_test = y_dir[:train_size], y_dir[train_size:]
    test_indices = indices[train_size:]
    
    class_weights = compute_class_weight('balanced', classes=np.unique(y_dir_train), y=y_dir_train)
    class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
    
    return (X_on_train, X_ex_train, y_price_train, y_dir_train,
            X_on_test, X_ex_test, y_price_test, y_dir_test, 
            scaler_price, class_weight_dict, df, test_indices)

def build_model_paper(n_on, n_ex, timesteps=7):
    input_on = Input(shape=(timesteps, n_on), name='onchain_input')
    x_on = GRU(256, return_sequences=True)(input_on)
    x_on = GRU(256, return_sequences=True)(x_on)
    x_on = GRU(256)(x_on)
    x_on = Dense(256, activation='relu')(x_on)
    x_on = Dense(64, activation='relu')(x_on)
    
    input_ex = Input(shape=(timesteps, n_ex), name='external_input')
    x_ex = LSTM(256, return_sequences=True)(input_ex)
    x_ex = LSTM(256, return_sequences=True)(x_ex)
    x_ex = LSTM(256)(x_ex)
    x_ex = Dense(256, activation='relu')(x_ex)
    x_ex = Dense(64, activation='relu')(x_ex)
    
    concat = Concatenate()([x_on, x_ex])
    x = Dense(128, activation='relu')(concat)
    x = Dropout(0.1)(x)
    x = Dense(64, activation='relu')(x)
    
    output_price = Dense(1, activation='linear', name='price')(x)
    output_dir = Dense(1, activation='sigmoid', name='dir')(x)
    
    model = Model(inputs=[input_on, input_ex], outputs=[output_price, output_dir])
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss={'price': 'mse', 'dir': 'binary_crossentropy'},
        loss_weights={'price': 1.0, 'dir': 2.0},
        metrics={'price': 'mae', 'dir': 'accuracy'}
    )
    
    return model

def train_model(model, X_on_train, X_ex_train, y_price_train, y_dir_train, 
                X_on_test, X_ex_test, y_price_test, y_dir_test, class_weight_dict):
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    
    history = model.fit(
        [X_on_train, X_ex_train],
        {'price': y_price_train, 'dir': y_dir_train},
        validation_data=([X_on_test, X_ex_test], {'price': y_price_test, 'dir': y_dir_test}),
        epochs=30,
        batch_size=64,
        callbacks=[early_stop],
        class_weight={'dir': class_weight_dict},
        verbose=0
    )
    
    return model, history

def evaluate_model(model, X_on_test, X_ex_test, y_price_test, y_dir_test, scaler_price):
    pred_price_scaled, pred_dir = model.predict([X_on_test, X_ex_test], verbose=0)
    
    pred_price = scaler_price.inverse_transform(pred_price_scaled.reshape(-1, 1)).flatten()
    true_price = scaler_price.inverse_transform(y_price_test.reshape(-1, 1)).flatten()
    
    rmse = np.sqrt(np.mean((pred_price - true_price)**2))
    
    epsilon = 1e-8
    mape = np.mean(np.abs((true_price - pred_price) / (true_price + epsilon))) * 100
    
    pred_dir_binary = (pred_dir.flatten() > 0.5).astype(int)
    acc = accuracy_score(y_dir_test, pred_dir_binary)
    
    precision, recall, _ = precision_recall_curve(y_dir_test, pred_dir.flatten())
    pr_auc = auc(recall, precision)
    
    return rmse, mape, acc, pr_auc, pred_dir.flatten()

def naive_backtest(pred_probs, df_original, test_indices, fee=0.001):
    capital = 1.0
    trades = 0
    wins = 0
    
    for idx, prob in enumerate(pred_probs[:-1]):
        current_idx = test_indices[idx]
        next_idx = test_indices[idx + 1] if idx + 1 < len(test_indices) else current_idx + 1
        
        if next_idx >= len(df_original):
            break
            
        if prob > 0.5:
            current_price = df_original['eth_close'].iloc[current_idx]
            next_price = df_original['eth_close'].iloc[next_idx]
            ret = (next_price - current_price) / current_price - fee
            capital *= (1 + ret)
            trades += 1
            if ret > 0:
                wins += 1
    
    total_return = (capital - 1.0) * 100
    win_rate = (wins / trades * 100) if trades > 0 else 0
    
    return total_return, trades, win_rate

onchain_df = load_onchain_data()
start_date = onchain_df.index.min().strftime('%Y-%m-%d')
end_date = onchain_df.index.max().strftime('%Y-%m-%d')

external_df = load_external_data(start_date, end_date)
merged_df = merge_data(onchain_df, external_df)

split_idx = int(len(merged_df) * 0.8)
df_features = feature_engineering(merged_df, split_idx)

selected_onchain, selected_external = adaptive_feature_selection(df_features)

(X_on_train, X_ex_train, y_price_train, y_dir_train,
 X_on_test, X_ex_test, y_price_test, y_dir_test, 
 scaler_price, class_weight_dict, df_original, test_indices) = make_sequences(
    df_features, selected_onchain, selected_external)

model = build_model_paper(len(selected_onchain), len(selected_external))

model, history = train_model(
    model, X_on_train, X_ex_train, y_price_train, y_dir_train,
    X_on_test, X_ex_test, y_price_test, y_dir_test, class_weight_dict
)

rmse, mape, acc, pr_auc, pred_probs = evaluate_model(
    model, X_on_test, X_ex_test, y_price_test, y_dir_test, scaler_price
)

total_return, trades, win_rate = naive_backtest(pred_probs, df_original, test_indices)

print(f"\n{'='*60}")
print(f"{'MODEL EVALUATION RESULTS':^60}")
print(f"{'='*60}")
print(f"\nData Period: {start_date} to {end_date}")
print(f"Total samples: {len(df_features)}, Train: {len(X_on_train)}, Test: {len(X_on_test)}")
print(f"\nFeature Selection:")
print(f"  On-chain features: {len(selected_onchain)}")
print(f"  External features: {len(selected_external)}")
print(f"\nRegression Metrics:")
print(f"  RMSE:  {rmse:>10.2f}")
print(f"  MAPE:  {mape:>10.2f}%")
print(f"\nClassification Metrics:")
print(f"  Accuracy:  {acc:>10.4f}")
print(f"  PR-AUC:    {pr_auc:>10.4f}")
print(f"\nBacktest Results:")
print(f"  Total Return:  {total_return:>10.2f}%")
print(f"  Trades:        {trades:>10}")
print(f"  Win Rate:      {win_rate:>10.2f}%")
print(f"{'='*60}\n")

2025-10-04 01:36:09.169481: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-04 01:36:09.169522: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-04 01:36:09.170685: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-04 01:36:09.177675: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-04 01:36:21.985915: I tensorflow/core

ValueError: Expected `class_weight` to be a dict with keys from 0 to one less than the number of classes, found {'dir': {0: 0.8427345187001839, 1: 1.229427549194991}}

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from pytrends.request import TrendReq
import ppscore as pps
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import time
import warnings

warnings.filterwarnings('ignore')

# 재현성을 위한 시드 고정
np.random.seed(42)
import tensorflow as tf
tf.random.set_seed(42)

def load_onchain_data():
    """eth_onchain.csv 파일에서 온체인 데이터를 로드합니다."""
    print("온체인 데이터 로드 중...")
    df = pd.read_csv('eth_onchain.csv', index_col=0, parse_dates=True)
    df.index = pd.to_datetime(df.index)
    print("온체인 데이터 로드 완료.")
    return df

def fetch_external_data(start_date, end_date):
    """논문에 언급된 외부 데이터를 API를 통해 직접 가져옵니다."""
    print("외부 데이터 수집 시작...")
    
    # 1. 금융 데이터 (YFinance)
    tickers = {
        'ETH-USD': 'eth', 
        '^GSPC': 'spx',    # S&P 500
        'GC=F': 'gold',    # 금 선물
        'BZ=F': 'brent'    # 브렌트유 선물
    }
    external_data = []
    for ticker, name in tickers.items():
        print(f"  - {ticker} 데이터 다운로드 중...")
        data = yf.download(ticker, start=start_date, end=end_date, progress=False)
        if not data.empty:
            # high, low 컬럼 추가
            temp_df = data[['Close', 'Volume', 'High', 'Low']].copy()
            temp_df.columns = [f'{name}_close', f'{name}_vol', f'{name}_high', f'{name}_low']
            external_data.append(temp_df)
        time.sleep(1) # API 과호출 방지

    df_financial = pd.concat(external_data, axis=1)

    # 2. 구글 트렌드 데이터 (Pytrends)
    try:
        print("  - Google Trends 데이터 다운로드 중...")
        pytrends = TrendReq(hl='en-US', tz=0)
        keywords = ['ethereum', 'cryptocurrency']
        timeframe = f'{start_date} {end_date}'
        pytrends.build_payload(keywords, cat=0, timeframe=timeframe, geo='', gprop='')
        df_trends = pytrends.interest_over_time()
        if not df_trends.empty:
            df_trends = df_trends.drop(columns=['isPartial'], errors='ignore')
            df_trends.columns = [f'gtrend_{kw}' for kw in keywords]
            df_financial = df_financial.join(df_trends, how='outer')
        time.sleep(5) # API 과호출 방지 시간 증가
    except Exception as e:
        print(f"Google Trends 데이터 수집 실패: {e}")

    # 데이터 병합 후 전처리
    df_financial = df_financial.fillna(method='ffill').fillna(method='bfill')
    print("외부 데이터 수집 완료.")
    return df_financial

def feature_engineering(df):
    """기술적 지표 및 시차(lag) 피처를 생성합니다. (강화된 버전)"""
    print("피처 엔지니어링 시작...")
    df_copy = df.copy()
    close = df_copy['eth_close']
    high = df_copy.get('eth_high', close)
    low = df_copy.get('eth_low', close)
    
    # 1. 기본 지표
    df_copy['ma7'] = close.rolling(window=7).mean()
    df_copy['ma21'] = close.rolling(window=21).mean()
    df_copy['logret'] = np.log(close).diff()
    df_copy['vol7'] = df_copy['logret'].rolling(window=7).std()

    # 2. MACD
    ema12 = close.ewm(span=12, adjust=False).mean()
    ema26 = close.ewm(span=26, adjust=False).mean()
    df_copy['macd'] = ema12 - ema26
    df_copy['macd_signal'] = df_copy['macd'].ewm(span=9, adjust=False).mean()

    # 3. Bollinger Bands
    ma20 = close.rolling(window=20).mean()
    std20 = close.rolling(window=20).std()
    df_copy['bollinger_upper'] = ma20 + (std20 * 2)
    df_copy['bollinger_lower'] = ma20 - (std20 * 2)

    # 4. RSI
    delta = close.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / (loss + 1e-10)
    df_copy['rsi'] = 100 - (100 / (1 + rs))

    # 5. 시차 피처
    for col in list(df.columns) + ['macd', 'rsi']:
        if col in df_copy.columns:
            df_copy[f'{col}_lag1'] = df_copy[col].shift(1)
            
    print("피처 엔지니어링 완료.")
    return df_copy

def adaptive_feature_selection(df, target_col='eth_close', pps_threshold=0.05, corr_threshold=0.05):
    """논문의 2단계 적응형 피처 선택을 구현합니다."""
    print("적응형 피처 선택 시작...")
    feature_candidates = [col for col in df.columns if col not in ['target_price', 'target_dir', target_col]]
    
    print(f"  - 1단계: PPS 계산 중 (대상 피처 {len(feature_candidates)}개)...")
    pps_scores = pps.matrix(df, output='df')[['x', 'y', 'ppscore']]
    pps_target_scores = pps_scores[pps_scores['y'] == target_col]
    
    selected_by_pps = pps_target_scores[pps_target_scores['ppscore'] > pps_threshold]['x'].tolist()
    print(f"  - 1단계: {len(selected_by_pps)}개 피처 선택됨 (PPS > {pps_threshold}).")

    if not selected_by_pps:
        print("경고: PPS를 통과한 피처가 없습니다. 상관계수만으로 진행합니다.")
        selected_by_pps = feature_candidates

    print("  - 2단계: 상관계수 계산 중...")
    correlations = df[selected_by_pps].corrwith(df[target_col]).abs()
    selected_features = correlations[correlations > corr_threshold].index.tolist()
    
    print(f"적응형 피처 선택 완료. 최종 {len(selected_features)}개 피처 선택됨.")
    return selected_features

def preprocess_and_create_sequences(df, feature_cols, target_price_col, target_dir_col, timesteps=7):
    """데이터 스케일링 및 시퀀스 생성을 수행합니다."""
    split_idx = int(len(df) * 0.8)
    train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]
    
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train_df[feature_cols])
    test_scaled = scaler.transform(test_df[feature_cols])
    
    scaler_price = MinMaxScaler()
    scaler_price.fit(train_df[[target_price_col]])

    def create_sequences_from_data(data, targets_price, targets_dir):
        X, y_price, y_dir = [], [], []
        for i in range(timesteps, len(data)):
            X.append(data[i-timesteps:i])
            y_price.append(targets_price[i])
            y_dir.append(targets_dir[i])
        return np.array(X), np.array(y_price), np.array(y_dir)

    X_train, y_price_train, y_dir_train = create_sequences_from_data(train_scaled, train_df[target_price_col].values, train_df[target_dir_col].values)
    X_test, y_price_test, y_dir_test = create_sequences_from_data(test_scaled, test_df[target_price_col].values, test_df[target_dir_col].values)
    
    return X_train, X_test, y_price_train, y_price_test, y_dir_train, y_dir_test, scaler, scaler_price

def build_adaptive_model(n_features, timesteps, learning_rate, loss_weights):
    """하이퍼파라미터를 입력받아 모델을 구축하고 컴파일합니다."""
    n_features_gru = n_features // 2
    n_features_lstm = n_features - n_features_gru
    
    input_gru = Input(shape=(timesteps, n_features_gru), name='gru_input')
    input_lstm = Input(shape=(timesteps, n_features_lstm), name='lstm_input')
    
    gru_stack = GRU(256, return_sequences=True)(input_gru)
    gru_stack = GRU(256)(gru_stack)
    
    lstm_stack = LSTM(256, return_sequences=True)(input_lstm)
    lstm_stack = LSTM(256)(lstm_stack)

    price_mlp = Dense(128, activation='relu')(gru_stack)
    price_output = Dense(1, activation='linear', name='price_output')(price_mlp)

    concatenated = Concatenate()([gru_stack, lstm_stack])
    
    dir_mlp = Dense(128, activation='relu')(concatenated)
    dir_mlp = Dropout(0.1)(dir_mlp)
    dir_output = Dense(1, activation='sigmoid', name='dir_output')(dir_mlp)

    model = Model(inputs=[input_gru, input_lstm], outputs=[price_output, dir_output])
    
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss={'price_output': 'mse', 'dir_output': 'binary_crossentropy'},
                  loss_weights=loss_weights,
                  metrics={'dir_output': 'accuracy'})
    return model

# --- 1. 데이터 준비 (한 번만 실행) ---
onchain_df = load_onchain_data()
start_date, end_date = onchain_df.index.min(), onchain_df.index.max()
external_df = fetch_external_data(start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'))
df_merged = onchain_df.join(external_df, how='inner')

df_merged['target_price'] = df_merged['eth_close'].shift(-1)
df_merged['target_dir'] = (df_merged['target_price'] > df_merged['eth_close']).astype(int)

df_features = feature_engineering(df_merged)
df_final = df_features.dropna()

selected_features = adaptive_feature_selection(df_final)

TIMESTEPS = 7
X_train, X_test, y_price_train, y_price_test, y_dir_train, y_dir_test, scaler, scaler_price = \
    preprocess_and_create_sequences(df_final, selected_features, 'target_price', 'target_dir', TIMESTEPS)

n_f = len(selected_features)
n_f_gru = n_f // 2
X_train_gru, X_train_lstm = X_train[:, :, :n_f_gru], X_train[:, :, n_f_gru:]
X_test_gru, X_test_lstm = X_test[:, :, :n_f_gru], X_test[:, :, n_f_gru:]


# --- 2. 하이퍼파라미터 탐색 ---
hyperparameter_space = [
    {'lr': 1e-4, 'weights': {'price_output': 1.0, 'dir_output': 1.0}},
    {'lr': 1e-4, 'weights': {'price_output': 1.0, 'dir_output': 100000.0}},
    {'lr': 1e-5, 'weights': {'price_output': 1.0, 'dir_output': 1000000.0}},
    {'lr': 1e-5, 'weights': {'price_output': 1.0, 'dir_output': 500000.0}},
]

results = []
best_accuracy = 0
best_params = None

for i, params in enumerate(hyperparameter_space):
    print("\n" + "="*60)
    print(f" 하이퍼파라미터 탐색 {i+1}/{len(hyperparameter_space)} ".center(60, "="))
    print(f"Learning Rate: {params['lr']}, Loss Weights: {params['weights']}")
    print("="*60)

    model = build_adaptive_model(
        n_features=len(selected_features),
        timesteps=TIMESTEPS,
        learning_rate=params['lr'],
        loss_weights=params['weights']
    )
    
    early_stop = EarlyStopping(monitor='val_dir_output_accuracy', mode='max', patience=15, restore_best_weights=True)

    history = model.fit(
        [X_train_gru, X_train_lstm],
        {'price_output': y_price_train, 'dir_output': y_dir_train},
        validation_data=([X_test_gru, X_test_lstm], {'price_output': y_price_test, 'dir_output': y_dir_test}),
        epochs=100,
        batch_size=64,
        callbacks=[early_stop],
        verbose=1
    )

    pred_price, pred_dir_prob = model.predict([X_test_gru, X_test_lstm])
    pred_dir = (pred_dir_prob > 0.5).astype(int)
    
    accuracy = accuracy_score(y_dir_test, pred_dir)
    mape = mean_absolute_percentage_error(y_price_test.reshape(-1, 1), scaler_price.transform(pred_price))
    
    print(f"\n결과: Accuracy = {accuracy*100:.2f}%, MAPE = {mape:.4f}%")
    
    results.append({'params': params, 'accuracy': accuracy, 'mape': mape})
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

# --- 3. 최종 결과 요약 ---
print("\n" + "="*60)
print(" 하이퍼파라미터 탐색 최종 결과 요약 ".center(60, "="))
print("="*60)

# 결과를 데이터프레임으로 보기 좋게 출력
results_df = pd.DataFrame({
    'Learning Rate': [r['params']['lr'] for r in results],
    'Direction Loss Weight': [r['params']['weights']['dir_output'] for r in results],
    'Accuracy (%)': [r['accuracy'] * 100 for r in results],
    'MAPE': [r['mape'] for r in results]
})
print(results_df.round(4))

print("\n" + "-"*60)
print(f"가장 높은 정확도: {best_accuracy * 100:.2f}%")
print(f"최적 하이퍼파라미터: {best_params}")
print("="*60)