In [None]:
### GPT 버전 ####

In [None]:
### 퍼플렉시티 버전 ####

In [20]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, mean_absolute_percentage_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam

# Input locations: multi-coin OHLCV CSV, ETH on-chain CSV, and the folder holding news CSV files.
TARGET_MACRO_FILE = 'macro_crypto_data.csv'
ONCHAIN_FILE = 'eth_onchain.csv'
NEWS_DIR = "./news_data"
# Records whether TensorFlow reports a GPU; nothing visible in this cell reads DEVICE afterwards.
DEVICE = 'GPU' if len(tf.config.list_physical_devices('GPU')) > 0 else 'CPU'

# Requested study window; it is later intersected with the dates actually present in each source.
START_TIME = '2021-01-01'
END_TIME = '2025-10-02'

# L: number of consecutive days in each model input window.
L = 7
# Mini-batch size, epoch count and Adam learning rate passed to training.
BATCH_SIZE = 64
EPOCHS = 12
LR = 5e-4
# Upper bound on how many non-ETH coins contribute OHLCV columns.
TOP_N = 5

def parse_date_from_filename(filename):
    """Extract a calendar date embedded in a file name.

    The base name is searched, in priority order, for ``YYYY-MM-DD``,
    ``YYYYMMDD``, ``DD-MM-YYYY`` and ``DDMMYYYY``. The first pattern whose
    match forms a real date wins; a match that is not a valid date (for
    example month 13) is skipped and the next pattern is tried.

    Args:
        filename: File name or path; only the base name is inspected.

    Returns:
        A ``pandas.Timestamp`` for the detected date, or ``None`` when no
        pattern yields a valid date.
    """
    patterns = (
        r'(\d{4})-(\d{2})-(\d{2})',
        r'(\d{4})(\d{2})(\d{2})',
        r'(\d{2})-(\d{2})-(\d{4})',
        r'(\d{2})(\d{2})(\d{4})',
    )
    basename = os.path.basename(filename)
    for pattern in patterns:
        match = re.search(pattern, basename)
        if match is None:
            continue
        first, month, last = match.groups()
        # A four-digit leading group means year-first order; otherwise day-first.
        year, day = (first, last) if len(first) == 4 else (last, first)
        try:
            return pd.to_datetime(f"{year}-{month}-{day}")
        except (ValueError, OverflowError):
            # The digits matched but are not a real date; only parsing errors
            # are swallowed so interrupts and programming errors still surface.
            continue
    return None

def load_all_news_data(root_dir):
    """Collect labelled news rows from every ``.csv`` file in ``root_dir``.

    Each file is read with the first of utf-8, cp949 or latin1 that pandas
    accepts; files that cannot be read with any of them are skipped. A
    missing ``date`` column is filled from the date found in the file name,
    and unparseable dates in an existing column fall back to that same date.
    Only ``date``, ``label`` and (when present) ``news`` are kept.

    When ``root_dir`` does not exist, or no file could be loaded, a synthetic
    frame is returned instead: one row per day between ``START_TIME`` and
    ``END_TIME`` with the text ``'test news'`` and a random label from
    {1, 0, -1}.

    Args:
        root_dir: Directory that holds the news CSV files.

    Returns:
        DataFrame with a ``date`` column (normalised to midnight for loaded
        files), a ``label`` column and possibly a ``news`` column.

    Raises:
        ValueError: If a readable file has no ``label`` column.
    """
    def placeholder_frame():
        # Synthetic stand-in used when there is no real news data to load.
        days = pd.date_range(START_TIME, END_TIME, freq='D')
        return pd.DataFrame({'date': days, 'news': ['test news'] * len(days), 'label': np.random.choice([1,0,-1], len(days))})

    if not os.path.exists(root_dir):
        return placeholder_frame()

    frames = []
    for filename in sorted([name for name in os.listdir(root_dir) if name.endswith('.csv')]):
        filepath = os.path.join(root_dir, filename)
        file_date = parse_date_from_filename(filename)

        # Try each candidate encoding until one parses.
        df = None
        for enc in ('utf-8', 'cp949', 'latin1'):
            try:
                df = pd.read_csv(filepath, encoding=enc)
            except Exception:
                continue
            break
        if df is None:
            continue

        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            if file_date is not None:
                df['date'] = df['date'].fillna(file_date)
        else:
            df['date'] = file_date

        if 'label' not in df.columns:
            raise ValueError(f"{filepath}에 'label' 컬럼이 필요합니다.")

        kept = ['date', 'news', 'label'] if 'news' in df.columns else ['date', 'label']
        frames.append(df[kept])

    if len(frames) == 0:
        return placeholder_frame()

    combined_df = pd.concat(frames, ignore_index=True)
    combined_df['date'] = pd.to_datetime(combined_df['date'], errors='coerce').dt.normalize()
    return combined_df

# Stage 1: load labelled news rows (load_all_news_data falls back to synthetic rows when no CSV is usable).
print("1/11 뉴스 로드 시작")
news_df = load_all_news_data(NEWS_DIR)
print(f"1/11 뉴스 로드 완료: {len(news_df)}건")

# Stage 2: collapse per-article sentiment labels into one row per calendar day.
print("2/11 뉴스 감성 집계 시작")
news_df = news_df.sort_values('date')
grouped = news_df.groupby('date')['label']
# Daily mean label and number of labelled rows, indexed by date.
daily = grouped.agg(sent_mean='mean', sent_count='count').reset_index().set_index('date')
# Per-day counts of labels equal to 1, 0 and -1 respectively.
pos = grouped.apply(lambda x: (x==1).sum())
neu = grouped.apply(lambda x: (x==0).sum())
neg = grouped.apply(lambda x: (x==-1).sum())
props = pd.DataFrame({'pos_cnt': pos, 'neu_cnt': neu, 'neg_cnt': neg})
daily = daily.join(props)
def day_entropy(row):
    """Return the Shannon entropy (natural log) of one day's label mix.

    Args:
        row: Mapping-like object exposing ``pos_cnt``, ``neu_cnt`` and
            ``neg_cnt``.

    Returns:
        ``0.0`` when the three counts sum to zero or less; otherwise
        ``-sum(p * ln p)`` over the categories with non-zero share.
    """
    tallies = np.array([row[key] for key in ('pos_cnt', 'neu_cnt', 'neg_cnt')], dtype=float)
    total = tallies.sum()
    if total <= 0:
        return 0.0
    shares = tallies / total
    # Empty categories are dropped so that log(0) is never evaluated.
    present = shares[shares > 0]
    return -(present * np.log(present)).sum()
# Entropy of the daily positive/neutral/negative mix.
daily['sent_entropy'] = daily.apply(day_entropy, axis=1)
# Sign of the rounded daily mean label (-1, 0 or 1).
daily['sent_majority'] = news_df.groupby('date')['label'].apply(lambda sub: int(np.sign(np.round(sub.mean()))) if len(sub)>0 else 0)
daily = daily.sort_index()
alpha = 0.4
# Exponentially weighted mean of the daily mean label.
# NOTE(review): this runs before the calendar is densified below, so days without news are
# skipped by the smoothing and then receive 0 rather than a carried-forward value.
daily['sent_mean_ewma'] = daily['sent_mean'].ewm(alpha=alpha, adjust=False).mean()
# Expand to a gap-free daily calendar; days without news get zeros in every column.
all_dates_news = pd.date_range(daily.index.min(), daily.index.max(), freq='D')
daily = daily.reindex(all_dates_news).fillna({'sent_mean':0.0,'sent_count':0,'pos_cnt':0,'neu_cnt':0,'neg_cnt':0,'sent_entropy':0.0,'sent_majority':0,'sent_mean_ewma':0.0}).fillna(0)
print("2/11 뉴스 감성 집계 완료")

# Stage 3: multi-coin price table, indexed by timezone-naive midnight dates.
print("3/11 macro 파일 로드 시작")
if not os.path.exists(TARGET_MACRO_FILE):
    raise FileNotFoundError(f"{TARGET_MACRO_FILE} 파일이 필요합니다.")
macro_raw = pd.read_csv(TARGET_MACRO_FILE, parse_dates=['Date'])
macro_raw['Date'] = pd.to_datetime(macro_raw['Date']).dt.tz_localize(None).dt.normalize()
macro_raw = macro_raw.set_index('Date').sort_index()
print("3/11 macro 파일 로드 완료")

# Stage 4: ETH on-chain table.
# NOTE(review): unlike the macro index, this index is not tz-stripped or normalised here —
# confirm the file already holds naive midnight dates, otherwise the reindex below will not align.
print("4/11 온체인 로드 시작")
if not os.path.exists(ONCHAIN_FILE):
    raise FileNotFoundError(f"{ONCHAIN_FILE} 파일이 필요합니다.")
onchain = pd.read_csv(ONCHAIN_FILE, parse_dates=['date']).set_index('date').sort_index()
onchain.index = pd.to_datetime(onchain.index)
print("4/11 온체인 로드 완료")

# Common window: the overlap of all three sources, clipped to [START_TIME, END_TIME].
start = max(macro_raw.index.min(), onchain.index.min(), daily.index.min(), pd.to_datetime(START_TIME))
end = min(macro_raw.index.max(), onchain.index.max(), daily.index.max(), pd.to_datetime(END_TIME))
date_index_full = pd.date_range(start, end, freq='D')

# Stage 5: put every table on the same daily index.
# Prices are forward- then back-filled; missing on-chain and sentiment days become 0.
print("5/11 날짜 정렬 및 리인덱스 시작")
macro_raw = macro_raw.reindex(date_index_full).ffill().bfill()
onchain = onchain.reindex(date_index_full).fillna(0)
daily = daily.reindex(date_index_full).fillna(0)
print("5/11 날짜 정렬 완료")

# Stage 6: pull the ETH OHLCV columns and rename them to lower-case generic names.
print("6/11 ETH 타깃 및 기술지표 준비 시작")
eth_cols = ['ETH_Open','ETH_High','ETH_Low','ETH_Close','ETH_Volume']
for c in eth_cols:
    if c not in macro_raw.columns:
        raise ValueError(f"{c} 컬럼이 macro 파일에 필요합니다.")
eth_price = macro_raw[eth_cols].rename(columns={'ETH_Open':'open','ETH_High':'high','ETH_Low':'low','ETH_Close':'close','ETH_Volume':'volume'})

def compute_technical_indicators(df):
    """Derive seven price-based indicators from an OHLC frame.

    Args:
        df: DataFrame with ``high``, ``low`` and ``close`` columns.

    Returns:
        DataFrame on the same index with columns ``stoch_k``, ``stoch_d``,
        ``williams_r``, ``ad_osc``, ``momentum``, ``disparity7`` and ``roc``.
        Rows inside a warm-up window (where a rolling or shifted value is
        undefined) are filled with 0.
    """
    close = df['close']
    window = 14
    lowest = df['low'].rolling(window).min()
    highest = df['high'].rolling(window).max()
    # The small epsilon keeps the division finite when the range is flat.
    spread = highest - lowest + 1e-9

    indicators = pd.DataFrame(index=df.index)
    # Position of the close inside the 14-day high/low range, as a percentage.
    indicators['stoch_k'] = (close - lowest) / spread * 100
    # 3-day average of %K.
    indicators['stoch_d'] = indicators['stoch_k'].rolling(3).mean()
    # Distance of the close below the 14-day high, on a positive 0-100 scale.
    indicators['williams_r'] = (highest - close) / spread * 100
    # One-day close change relative to the same day's high-low range.
    indicators['ad_osc'] = close.diff() / (df['high'] - df['low'] + 1e-9)
    # 10-day absolute change in close.
    indicators['momentum'] = close.diff(10)
    # Close as a percentage of its 7-day mean.
    indicators['disparity7'] = close / close.rolling(7).mean() * 100
    # Close as a percentage of the close 12 days earlier.
    indicators['roc'] = close / close.shift(12) * 100
    return indicators.fillna(0)

# ETH-side feature table: raw OHLCV, derived indicators and on-chain columns, with gaps set to 0.
tech = compute_technical_indicators(eth_price)
target_feats = pd.concat([eth_price, tech, onchain], axis=1).fillna(0)
print("6/11 ETH 타깃 및 기술지표 준비 완료")

# Stage 7: OHLCV of other coins as context features.
print("7/11 top-n macro 입력 생성 시작")
# Coin symbols are taken from the text before the first underscore, in column order.
cols = [c for c in macro_raw.columns if '_' in c]
coins = []
for c in cols:
    coin = c.split('_')[0]
    if coin not in coins:
        coins.append(coin)
coins = [c for c in coins if c.upper() != 'ETH']
# TOP_N is lowered when fewer coins are available.
# NOTE(review): "top" here is simply the first TOP_N symbols in column order; no ranking is applied.
if len(coins) < TOP_N:
    TOP_N = len(coins)
selected_coins = coins[:TOP_N]
macro_list = []
for coin in selected_coins:
    # Every selected coin must provide all five OHLCV columns.
    needed = [f"{coin}_Open", f"{coin}_Close", f"{coin}_High", f"{coin}_Low", f"{coin}_Volume"]
    for n in needed:
        if n not in macro_raw.columns:
            raise ValueError(f"{n} 컬럼이 macro 파일에 필요합니다.")
    arr = macro_raw[needed].values
    macro_list.append(arr)
# NOTE(review): with no non-ETH coins macro_list is empty and this concatenate call would fail — confirm inputs.
macro_array = np.concatenate(macro_list, axis=1)
# Column labels, generated in the same coin and field order as the arrays above.
feat_suffix = ['Open','Close','High','Low','Volume']
feature_names = []
for coin in selected_coins:
    for sfx in feat_suffix:
        feature_names.append(f"{coin}_{sfx}")
# Fallback to generic names if the label count ever disagrees with the matrix width.
if macro_array.shape[1] != len(feature_names):
    feature_names = [f"m{i}" for i in range(macro_array.shape[1])]
macro_df = pd.DataFrame(macro_array, index=date_index_full, columns=feature_names)
print("7/11 top-n macro 입력 생성 완료")

# Stage 8: sentiment columns passed on as model candidates ('sent_majority' is not included).
print("8/11 sentiment feature 준비 및 병합")
sent_cols = ['sent_mean','sent_count','pos_cnt','neu_cnt','neg_cnt','sent_entropy','sent_mean_ewma']
sent_df = daily[sent_cols].fillna(0)
print("8/11 sentiment 준비 완료")

print("9/11 PPS 기반 특성 선택 시작")
def compute_pps(df, target_col, threshold=0.3, sample_size=500):
    """Score every column by how well it alone predicts ``target_col``.

    For each non-target column a depth-4 decision-tree regressor is
    evaluated with 3-fold cross-validation on the most recent
    ``sample_size`` rows; the score is the mean R² clipped below at 0.

    Args:
        df: Feature table that also contains ``target_col``.
        target_col: Name of the column to predict.
        threshold: Minimum score a column needs to be selected.
        sample_size: Number of trailing rows used for scoring; the whole
            frame is used when it is not longer than this.

    Returns:
        ``(selected, scores)`` where ``selected`` lists the columns whose
        score is at least ``threshold`` and ``scores`` maps every scored
        column to its score.
    """
    df_sample = df.iloc[-sample_size:] if len(df) > sample_size else df
    target = df_sample[target_col]
    # cross_val_score fits clones, so one estimator instance can be reused.
    dt = DecisionTreeRegressor(max_depth=4, random_state=42)

    scores = {}
    for col in df.columns:
        if col == target_col:
            continue
        X_feature = df_sample[col].values.reshape(-1, 1)
        try:
            cv_scores = cross_val_score(dt, X_feature, target, cv=3, scoring='r2')
        except Exception:
            # Best effort: a column that cannot be scored counts as
            # uninformative. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            scores[col] = 0.0
        else:
            scores[col] = max(0, cv_scores.mean())

    selected = [name for name, value in scores.items() if value >= threshold]
    return selected, scores

# Candidate feature table: sentiment, other-coin OHLCV, then ETH price/indicator/on-chain columns.
macro_with_eth = pd.concat([macro_df, target_feats], axis=1)
sent_macro_all = pd.concat([sent_df, macro_with_eth], axis=1)

# First-pass filter: keep columns whose single-feature score against ETH 'close' is at least 0.01.
# NOTE(review): scoring uses the last 500 rows, which overlap the chronological test split made
# further down, so the selection sees test-period data.
pps_selected, pps_scores = compute_pps(sent_macro_all, 'close', threshold=0.01, sample_size=500)
print(f"9/11 PPS 선택 완료: {len(pps_selected)}개 특성")

print("10/11 정규화 및 교차 특성 생성")
scaler_features = MinMaxScaler()
scaler_target = MinMaxScaler()

target_cols = ['close']
feature_data = sent_macro_all[pps_selected].values
target_data = sent_macro_all[target_cols].values

# NOTE(review): both scalers are fitted on the full date range, including rows that later form
# the test split, so test-period extremes influence the scaled training inputs.
feature_normalized = scaler_features.fit_transform(feature_data)
target_normalized = scaler_target.fit_transform(target_data)

# Interaction terms: products of adjacent scaled columns, at most three of them.
cross_features = []
if feature_normalized.shape[1] >= 2:
    for i in range(min(3, feature_normalized.shape[1]-1)):
        cross = feature_normalized[:, i] * feature_normalized[:, i+1]
        cross_features.append(cross.reshape(-1, 1))

if cross_features:
    cross_features = np.concatenate(cross_features, axis=1)
    feature_normalized = np.concatenate([feature_normalized, cross_features], axis=1)

print("10/11 정규화 완료")

# Second-pass filter: keep columns whose absolute Pearson correlation with the scaled close
# is at least corr_threshold; a NaN correlation (e.g. constant column) counts as 0.
print("11/11 상관계수 기반 2차 필터링")
corr_threshold = 0.05
correlations = []
for i in range(feature_normalized.shape[1]):
    corr = np.corrcoef(feature_normalized[:, i], target_normalized[:, 0])[0, 1]
    correlations.append(abs(corr) if not np.isnan(corr) else 0.0)

selected_indices = [i for i, c in enumerate(correlations) if c >= corr_threshold]
# If nothing passes, fall back to the first (up to) ten columns.
if len(selected_indices) == 0:
    selected_indices = list(range(min(10, feature_normalized.shape[1])))

feature_normalized = feature_normalized[:, selected_indices]
print(f"11/11 최종 선택 특성 수: {feature_normalized.shape[1]}")

# Direction label per day: 1 when the close rose versus the previous day, else 0.
# The first day is compared with itself and therefore gets 0.
price_direction = np.sign(np.diff(target_data[:, 0], prepend=target_data[0, 0]))
price_direction = np.where(price_direction > 0, 1, 0)

# Sliding windows: rows i-L .. i-1 are the input; day i supplies both targets.
X_seq, y_price, y_direction = [], [], []
for i in range(L, len(feature_normalized)):
    X_seq.append(feature_normalized[i-L:i])
    y_price.append(target_normalized[i, 0])
    y_direction.append(price_direction[i])

X_seq = np.array(X_seq, dtype=np.float32)
y_price = np.array(y_price, dtype=np.float32)
y_direction = np.array(y_direction, dtype=np.float32)

# Chronological split: the first 80% of windows train, the remainder test.
split_idx = int(0.8 * len(X_seq))
X_train, X_test = X_seq[:split_idx], X_seq[split_idx:]
y_price_train, y_price_test = y_price[:split_idx], y_price[split_idx:]
y_dir_train, y_dir_test = y_direction[:split_idx], y_direction[split_idx:]

print(f"데이터 준비 완료 - 학습: {X_train.shape}, 테스트: {X_test.shape}")

# Two-headed recurrent network over windows of L days:
#   * price head: three stacked GRU layers -> Dense(128) -> linear output
#   * direction head: three stacked LSTM layers, concatenated with the GRU
#     summary vector -> Dense(128) -> Dropout -> sigmoid output
print("모델 구축 시작")
input_shape = (L, feature_normalized.shape[1])

input_layer = layers.Input(shape=input_shape)

# GRU branch; only the last layer collapses the time axis.
gru1 = layers.GRU(256, return_sequences=True)(input_layer)
gru2 = layers.GRU(256, return_sequences=True)(gru1)
gru3 = layers.GRU(256, return_sequences=False)(gru2)
dense_reg = layers.Dense(128, activation='relu')(gru3)
output_price = layers.Dense(1, activation='linear', name='price')(dense_reg)

# LSTM branch; the classifier sees both recurrent summaries.
lstm1 = layers.LSTM(256, return_sequences=True)(input_layer)
lstm2 = layers.LSTM(256, return_sequences=True)(lstm1)
lstm3 = layers.LSTM(256, return_sequences=False)(lstm2)
concat = layers.Concatenate()([lstm3, gru3])
dense_cls = layers.Dense(128, activation='relu')(concat)
dropout = layers.Dropout(0.1)(dense_cls)
output_direction = layers.Dense(1, activation='sigmoid', name='direction')(dropout)

model = Model(inputs=input_layer, outputs=[output_price, output_direction])

# Both losses are weighted equally; output names tie losses and metrics to heads.
model.compile(
    optimizer=Adam(learning_rate=LR),
    loss={'price': 'mse', 'direction': 'binary_crossentropy'},
    loss_weights={'price': 1.0, 'direction': 1.0},
    metrics={'price': 'mae', 'direction': 'accuracy'}
)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
print("모델 구축 완료")

# Train both heads jointly.
# NOTE(review): per the Keras docs, validation_split holds out the trailing fraction of the
# training arrays — confirm that is the intended validation period.
print(f"학습 시작 (Epochs: {EPOCHS}, Batch Size: {BATCH_SIZE})")
history = model.fit(
    X_train,
    {'price': y_price_train, 'direction': y_dir_train},
    validation_split=0.2,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1
)

# predictions[0] is the price head, predictions[1] the direction probability.
print("예측 및 평가")
predictions = model.predict(X_test, verbose=0)
y_price_pred = predictions[0].flatten()
# Probabilities above 0.5 are classified as "up".
y_dir_pred = (predictions[1].flatten() > 0.5).astype(int)

# Undo the target scaling so the error metrics are in original price units.
y_price_pred_rescaled = scaler_target.inverse_transform(y_price_pred.reshape(-1, 1)).flatten()
y_price_test_rescaled = scaler_target.inverse_transform(y_price_test.reshape(-1, 1)).flatten()

rmse = np.sqrt(mean_squared_error(y_price_test_rescaled, y_price_pred_rescaled))
mape = mean_absolute_percentage_error(y_price_test_rescaled, y_price_pred_rescaled) * 100
accuracy = accuracy_score(y_dir_test, y_dir_pred) * 100
# ROC-AUC is computed from the raw probabilities, not the thresholded labels.
roc_auc = roc_auc_score(y_dir_test, predictions[1].flatten())

print(f"\n=== 최종 결과 ===")
print(f"가격 예측 RMSE: ${rmse:.2f}")
print(f"가격 예측 MAPE: {mape:.2f}%")
print(f"방향 예측 Accuracy: {accuracy:.2f}%")
print(f"방향 예측 ROC-AUC: {roc_auc:.3f}")

# Per-sample test results kept for later inspection.
results_df = pd.DataFrame({
    'actual_price': y_price_test_rescaled,
    'predicted_price': y_price_pred_rescaled,
    'actual_direction': y_dir_test,
    'predicted_direction': y_dir_pred
})



1/11 뉴스 로드 시작
1/11 뉴스 로드 완료: 25947건
2/11 뉴스 감성 집계 시작
2/11 뉴스 감성 집계 완료
3/11 macro 파일 로드 시작
3/11 macro 파일 로드 완료
4/11 온체인 로드 시작
4/11 온체인 로드 완료
5/11 날짜 정렬 및 리인덱스 시작
5/11 날짜 정렬 완료
6/11 ETH 타깃 및 기술지표 준비 시작
6/11 ETH 타깃 및 기술지표 준비 완료
7/11 top-n macro 입력 생성 시작
7/11 top-n macro 입력 생성 완료
8/11 sentiment feature 준비 및 병합
8/11 sentiment 준비 완료
9/11 PPS 기반 특성 선택 시작
9/11 PPS 선택 완료: 4개 특성
10/11 정규화 및 교차 특성 생성
10/11 정규화 완료
11/11 상관계수 기반 2차 필터링
11/11 최종 선택 특성 수: 7
데이터 준비 완료 - 학습: (1383, 7, 7), 테스트: (346, 7, 7)
모델 구축 시작
Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_10 (InputLayer)       [(None, 7, 7)]               0         []                            
                                                                                                  
 gru_27 (GRU)                (None, 7, 256)               203520    ['input_10[0][0]

In [19]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, mean_absolute_percentage_error, classification_report
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Input locations: multi-coin OHLCV CSV and ETH on-chain CSV (this cell does not use news data).
TARGET_MACRO_FILE = 'macro_crypto_data.csv'
ONCHAIN_FILE = 'eth_onchain.csv'
# Records whether TensorFlow reports a GPU; nothing visible in this cell reads DEVICE afterwards.
DEVICE = 'GPU' if len(tf.config.list_physical_devices('GPU')) > 0 else 'CPU'

# Requested study window; it is later intersected with the dates present in both files.
START_TIME = '2021-01-01'
END_TIME = '2025-10-02'

# L: days per input window; then batch size, maximum epochs, Adam learning rate,
# and the upper bound on the number of non-ETH coins used.
L = 10
BATCH_SIZE = 32
EPOCHS = 100
LR = 1e-3
TOP_N = 10

# Stage 1: load both tables and align them on one daily index.
print("1/10 데이터 로드")
if not os.path.exists(TARGET_MACRO_FILE):
    raise FileNotFoundError(f"{TARGET_MACRO_FILE} 파일이 필요합니다.")
macro_raw = pd.read_csv(TARGET_MACRO_FILE, parse_dates=['Date'])
# Strip timezone information and truncate to midnight so the index holds plain dates.
macro_raw['Date'] = pd.to_datetime(macro_raw['Date']).dt.tz_localize(None).dt.normalize()
macro_raw = macro_raw.set_index('Date').sort_index()

if not os.path.exists(ONCHAIN_FILE):
    raise FileNotFoundError(f"{ONCHAIN_FILE} 파일이 필요합니다.")
# NOTE(review): this index is not tz-stripped or normalised like the macro index — confirm the file format.
onchain = pd.read_csv(ONCHAIN_FILE, parse_dates=['date']).set_index('date').sort_index()
onchain.index = pd.to_datetime(onchain.index)

# Common window: the overlap of both sources, clipped to [START_TIME, END_TIME].
start = max(macro_raw.index.min(), onchain.index.min(), pd.to_datetime(START_TIME))
end = min(macro_raw.index.max(), onchain.index.max(), pd.to_datetime(END_TIME))
date_index_full = pd.date_range(start, end, freq='D')

# Prices are forward- then back-filled; missing on-chain days become 0.
macro_raw = macro_raw.reindex(date_index_full).ffill().bfill()
onchain = onchain.reindex(date_index_full).fillna(0)
print("1/10 데이터 로드 완료")

# Stage 2: pull the ETH OHLCV columns and rename them to lower-case generic names.
print("2/10 특성 엔지니어링")
eth_cols = ['ETH_Open','ETH_High','ETH_Low','ETH_Close','ETH_Volume']
for c in eth_cols:
    if c not in macro_raw.columns:
        raise ValueError(f"{c} 컬럼이 macro 파일에 필요합니다.")
eth_price = macro_raw[eth_cols].rename(columns={'ETH_Open':'open','ETH_High':'high','ETH_Low':'low','ETH_Close':'close','ETH_Volume':'volume'})

def compute_all_features(df):
    """Build a table of trend, volatility, momentum and volume features.

    Args:
        df: DataFrame with ``high``, ``low``, ``close`` and ``volume``
            columns.

    Returns:
        DataFrame on the same index with 33 columns: SMA/EMA/rolling
        std/percentage return for 7, 14, 30, 60 and 90 days, Bollinger
        bands, stochastic %K/%D, RSI, MACD, ``atr``, ``adx``, ``obv`` and
        ``vwap``. Missing values are forward-filled and any that remain
        (leading warm-up rows) are set to 0.
    """
    out = pd.DataFrame(index=df.index)
    pt = df['close']

    for period in [7, 14, 30, 60, 90]:
        out[f'sma_{period}'] = pt.rolling(period).mean()
        out[f'ema_{period}'] = pt.ewm(span=period, adjust=False).mean()
        out[f'std_{period}'] = pt.rolling(period).std()
        out[f'returns_{period}'] = pt.pct_change(period)

    # Bollinger bands: 20-day mean +/- two rolling standard deviations.
    bb_mid = pt.rolling(20).mean()
    bb_width = 2 * pt.rolling(20).std()
    out['bb_upper'] = bb_mid + bb_width
    out['bb_lower'] = bb_mid - bb_width
    # Position of the close inside the band; epsilon guards a zero-width band.
    out['bb_position'] = (pt - out['bb_lower']) / (out['bb_upper'] - out['bb_lower'] + 1e-9)

    # Stochastic oscillator over a 14-day high/low range.
    N = 14
    lowN = df['low'].rolling(N).min()
    highN = df['high'].rolling(N).max()
    out['stoch_k'] = (pt - lowN) / (highN - lowN + 1e-9) * 100
    out['stoch_d'] = out['stoch_k'].rolling(3).mean()

    # RSI built from simple 14-day rolling means of gains and losses.
    delta = pt.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / (loss + 1e-9)
    out['rsi'] = 100 - (100 / (1 + rs))

    # MACD (12/26 EMA difference), its 9-day signal line and the histogram.
    exp12 = pt.ewm(span=12, adjust=False).mean()
    exp26 = pt.ewm(span=26, adjust=False).mean()
    out['macd'] = exp12 - exp26
    out['macd_signal'] = out['macd'].ewm(span=9, adjust=False).mean()
    out['macd_hist'] = out['macd'] - out['macd_signal']

    # 'atr' is the 14-day mean of the daily high-low range.
    out['atr'] = (df['high'] - df['low']).rolling(14).mean()
    # NOTE(review): 'adx' is a constant 50.0, not a computed indicator.
    out['adx'] = 50.0

    # Signed cumulative volume, and the volume-weighted mean close
    # accumulated from the first row of the frame.
    out['obv'] = (np.sign(pt.diff()) * df['volume']).cumsum()
    out['vwap'] = (pt * df['volume']).cumsum() / df['volume'].cumsum()

    # DataFrame.ffill() replaces fillna(method='ffill'), which pandas has
    # deprecated.
    return out.ffill().fillna(0)

# ETH-side feature table: raw OHLCV, derived indicators and on-chain columns.
# DataFrame.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
tech = compute_all_features(eth_price)
target_feats = pd.concat([eth_price, tech, onchain], axis=1).ffill().fillna(0)
print("2/10 특성 엔지니어링 완료")

# Stage 3: context features from other coins.
print("3/10 macro 특성 생성")
# Coin symbols are taken from the text before the first underscore, in column order.
cols = [c for c in macro_raw.columns if '_' in c]
coins = []
for c in cols:
    coin = c.split('_')[0]
    if coin not in coins:
        coins.append(coin)
coins = [c for c in coins if c.upper() != 'ETH']
# TOP_N is lowered when fewer coins are available; selection is by column
# order, not by any ranking.
if len(coins) < TOP_N:
    TOP_N = len(coins)
selected_coins = coins[:TOP_N]

macro_features = []
for coin in selected_coins:
    needed = [f"{coin}_Close", f"{coin}_Volume"]
    # Coins lacking either column are skipped silently rather than raising.
    if all(n in macro_raw.columns for n in needed):
        coin_data = macro_raw[needed].copy()
        # Daily percentage change of the close and 7-day mean volume.
        coin_data[f'{coin}_returns'] = coin_data[f"{coin}_Close"].pct_change()
        coin_data[f'{coin}_vol_ma'] = coin_data[f"{coin}_Volume"].rolling(7).mean()
        macro_features.append(coin_data)

if macro_features:
    macro_df = pd.concat(macro_features, axis=1).ffill().fillna(0)
else:
    # No usable coin: keep an empty frame on the shared index so the concat below still works.
    macro_df = pd.DataFrame(index=date_index_full)
print("3/10 macro 특성 생성 완료")

# Stage 4: merge all candidates, keeping the first of any duplicated column name.
print("4/10 특성 선택")
all_features = pd.concat([target_feats, macro_df], axis=1)
all_features = all_features.loc[:, ~all_features.columns.duplicated()]

def adaptive_feature_selection(df, target_col, initial_threshold=0.2, min_features=20):
    """Pick predictive columns, relaxing the score cut-off until enough remain.

    Every column except ``target_col`` and the raw ``open``/``high``/
    ``low``/``volume`` columns is scored with a depth-3 decision-tree
    regressor (mean 3-fold cross-validated R², clipped below at 0) on the
    last 1000 rows. Starting at ``initial_threshold`` the cut-off is lowered
    in steps of 0.02, while it stays above 0.01, until at least
    ``min_features`` columns pass. If that never happens, the
    ``min_features`` best-scoring columns are returned.

    Args:
        df: Feature table that also contains ``target_col``.
        target_col: Name of the column to predict.
        initial_threshold: First score cut-off to try.
        min_features: Minimum number of columns to return, when available.

    Returns:
        ``(selected, scores)`` where ``selected`` is ordered by descending
        score and ``scores`` maps every scored column to its score.
    """
    scores = {}
    sample_data = df.iloc[-1000:] if len(df) > 1000 else df

    for col in df.columns:
        if col == target_col or col in ['open', 'high', 'low', 'volume']:
            continue
        try:
            X = sample_data[col].values.reshape(-1, 1)
            y = sample_data[target_col].values
            dt = DecisionTreeRegressor(max_depth=3, random_state=42)
            score = cross_val_score(dt, X, y, cv=3, scoring='r2').mean()
            scores[col] = max(0, score)
        except Exception:
            # Best effort: a column that cannot be scored counts as
            # uninformative. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            scores[col] = 0.0

    sorted_features = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    def passing(cutoff):
        # Columns whose score reaches the cut-off, best first.
        return [name for name, value in sorted_features if value >= cutoff]

    threshold = initial_threshold
    # Evaluated before the loop so ``selected`` is always bound; previously an
    # initial_threshold <= 0.01 skipped the loop and raised UnboundLocalError.
    selected = passing(threshold)
    while len(selected) < min_features:
        threshold -= 0.02
        if not threshold > 0.01:
            break
        selected = passing(threshold)

    if len(selected) < min_features:
        selected = [name for name, _ in sorted_features[:min_features]]

    return selected, scores

# Keep at least 25 columns, relaxing the score cut-off from 0.2 as needed.
# NOTE(review): scoring uses the last 1000 rows, which overlap the chronological test split
# made further down, so the selection sees test-period data.
selected_features, feature_scores = adaptive_feature_selection(all_features, 'close', initial_threshold=0.2, min_features=25)
print(f"4/10 특성 선택 완료: {len(selected_features)}개")

# Stage 5: 7-day forward return of the close; the final 7 rows have no future price and stay 0.
print("5/10 타깃 생성")
close_prices = all_features['close'].values
returns_future = np.zeros(len(close_prices))
for i in range(len(close_prices) - 7):
    returns_future[i] = (close_prices[i+7] - close_prices[i]) / close_prices[i]

threshold_buy = 0.02
threshold_sell = -0.02

# Three-level label: 1 above +2%, 0 below -2%, 0.5 inside the band.
direction_labels = np.zeros(len(returns_future))
direction_labels[returns_future > threshold_buy] = 1
direction_labels[returns_future < threshold_sell] = 0
direction_labels[(returns_future >= threshold_sell) & (returns_future <= threshold_buy)] = 0.5

# NOTE(review): the 0.25 cut-off maps the neutral band (0.5) to class 1, so "up" here means
# "did not fall by more than 2%" — confirm this is intended.
binary_labels = (direction_labels > 0.25).astype(int)
print(f"5/10 타깃 생성 완료 - 상승: {(binary_labels==1).sum()}, 하락: {(binary_labels==0).sum()}")

# Stage 6: scale features (RobustScaler) and the close (MinMaxScaler).
# NOTE(review): both scalers are fitted on the full date range, including the later test split.
print("6/10 정규화")
scaler = RobustScaler()
scaler_target = MinMaxScaler()

feature_data = all_features[selected_features].values
target_data = all_features[['close']].values

feature_normalized = scaler.fit_transform(feature_data)
target_normalized = scaler_target.fit_transform(target_data)
print("6/10 정규화 완료")

# Stage 7: sliding windows. Rows i-L .. i-1 are the input; the price target is the scaled close
# on day i and the direction target is the 7-day-forward label of day i. The last 7 days are
# excluded because they have no forward return.
print("7/10 시퀀스 생성")
X_seq, y_price, y_direction = [], [], []
for i in range(L, len(feature_normalized) - 7):
    X_seq.append(feature_normalized[i-L:i])
    y_price.append(target_normalized[i, 0])
    y_direction.append(binary_labels[i])

X_seq = np.array(X_seq, dtype=np.float32)
y_price = np.array(y_price, dtype=np.float32)
y_direction = np.array(y_direction, dtype=np.float32)

# Chronological split: the first 75% of windows train, the remainder test.
# NOTE(review): labels of the last training windows depend on closes up to 7 days ahead, i.e.
# inside the test period.
split_idx = int(0.75 * len(X_seq))
X_train, X_test = X_seq[:split_idx], X_seq[split_idx:]
y_price_train, y_price_test = y_price[:split_idx], y_price[split_idx:]
y_dir_train, y_dir_test = y_direction[:split_idx], y_direction[split_idx:]
print(f"7/10 시퀀스 생성 완료 - 학습: {X_train.shape}, 테스트: {X_test.shape}")

# Stage 8: 'balanced' class weights from the training labels, expanded to one weight per sample.
print("8/10 클래스 가중치 계산")
class_weights_array = compute_class_weight('balanced', classes=np.unique(y_dir_train), y=y_dir_train)
class_weight_map = {int(cls): weight for cls, weight in zip(np.unique(y_dir_train), class_weights_array)}
sample_weights_train = np.array([class_weight_map[int(y)] for y in y_dir_train])
print(f"클래스 가중치: {class_weight_map}")

# Stage 9: two-headed recurrent network over windows of L days.
#   * price head: three stacked GRU layers -> linear output
#   * direction head: three stacked LSTM layers, concatenated with the GRU
#     summary vector -> Dense(64) -> Dropout -> sigmoid output
print("9/10 모델 구축")
input_shape = (L, feature_normalized.shape[1])
input_layer = layers.Input(shape=input_shape)

# GRU branch; only the last layer collapses the time axis.
x = layers.GRU(128, return_sequences=True, dropout=0.2)(input_layer)
x = layers.GRU(128, return_sequences=True, dropout=0.2)(x)
x = layers.GRU(128, return_sequences=False, dropout=0.2)(x)
price_output = layers.Dense(1, activation='linear', name='price')(x)

# LSTM branch; the classifier sees both recurrent summaries.
y = layers.LSTM(128, return_sequences=True, dropout=0.2)(input_layer)
y = layers.LSTM(128, return_sequences=True, dropout=0.2)(y)
y = layers.LSTM(128, return_sequences=False, dropout=0.2)(y)
combined = layers.Concatenate()([x, y])
dir_dense = layers.Dense(64, activation='relu')(combined)
dir_dense = layers.Dropout(0.3)(dir_dense)
direction_output = layers.Dense(1, activation='sigmoid', name='direction')(dir_dense)

model = Model(inputs=input_layer, outputs=[price_output, direction_output])

# The direction loss carries 70% of the total loss, the Huber price loss 30%.
model.compile(
    optimizer=Adam(learning_rate=LR),
    loss={'price': 'huber', 'direction': 'binary_crossentropy'},
    loss_weights={'price': 0.3, 'direction': 0.7},
    metrics={'price': 'mae', 'direction': ['accuracy', tf.keras.metrics.AUC(name='auc')]}
)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

# All three callbacks track validation accuracy of the direction head:
# stop after 15 stagnant epochs (restoring the best weights), halve the
# learning rate after 7, and save the best model to disk.
callbacks = [
    EarlyStopping(monitor='val_direction_accuracy', patience=15, mode='max', restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_direction_accuracy', factor=0.5, patience=7, mode='max', min_lr=1e-6),
    ModelCheckpoint('best_model.h5', monitor='val_direction_accuracy', mode='max', save_best_only=True)
]

# Stage 10: joint training. The price head uses uniform sample weights; the direction head uses
# the class-balancing weights.
# NOTE(review): per the Keras docs, validation_split holds out the trailing fraction of the
# training arrays — confirm that is the intended validation period.
print(f"10/10 학습 시작 (Epochs: {EPOCHS}, Batch: {BATCH_SIZE})")
history = model.fit(
    X_train,
    {'price': y_price_train, 'direction': y_dir_train},
    sample_weight={'price': np.ones(len(y_price_train)), 'direction': sample_weights_train},
    validation_split=0.15,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)

# predictions[0] is the price head, predictions[1] the direction probability.
print("예측 및 평가")
predictions = model.predict(X_test, verbose=0)
y_price_pred = predictions[0].flatten()
y_dir_pred_prob = predictions[1].flatten()
# Probabilities above 0.5 are classified as class 1.
y_dir_pred = (y_dir_pred_prob > 0.5).astype(int)

# Undo the target scaling so the error metrics are in original price units.
y_price_pred_rescaled = scaler_target.inverse_transform(y_price_pred.reshape(-1, 1)).flatten()
y_price_test_rescaled = scaler_target.inverse_transform(y_price_test.reshape(-1, 1)).flatten()

rmse = np.sqrt(mean_squared_error(y_price_test_rescaled, y_price_pred_rescaled))
mape = mean_absolute_percentage_error(y_price_test_rescaled, y_price_pred_rescaled) * 100
accuracy = accuracy_score(y_dir_test, y_dir_pred) * 100
# ROC-AUC is computed from the raw probabilities, not the thresholded labels.
roc_auc = roc_auc_score(y_dir_test, y_dir_pred_prob)

print(f"\n=== 최종 결과 ===")
print(f"학습 기간: {START_TIME} ~ {END_TIME}")
print(f"총 샘플 수: {len(X_seq)}개")
print(f"가격 예측 RMSE: ${rmse:.2f}")
print(f"가격 예측 MAPE: {mape:.2f}%")
print(f"방향 예측 Accuracy: {accuracy:.2f}%")
print(f"방향 예측 ROC-AUC: {roc_auc:.3f}")
print("\n분류 리포트:")
print(classification_report(y_dir_test, y_dir_pred, target_names=['하락', '상승']))

# Per-sample test results kept for later inspection.
results_df = pd.DataFrame({
    'actual_price': y_price_test_rescaled,
    'predicted_price': y_price_pred_rescaled,
    'actual_direction': y_dir_test,
    'predicted_direction': y_dir_pred,
    'predicted_probability': y_dir_pred_prob
})


1/10 데이터 로드
1/10 데이터 로드 완료
2/10 특성 엔지니어링
2/10 특성 엔지니어링 완료
3/10 macro 특성 생성
3/10 macro 특성 생성 완료
4/10 특성 선택
4/10 특성 선택 완료: 25개
5/10 타깃 생성
5/10 타깃 생성 완료 - 상승: 1093, 하락: 643
6/10 정규화
6/10 정규화 완료
7/10 시퀀스 생성
7/10 시퀀스 생성 완료 - 학습: (1289, 10, 25), 테스트: (430, 10, 25)
8/10 클래스 가중치 계산
클래스 가중치: {0: 1.3742004264392325, 1: 0.7859756097560976}
9/10 모델 구축
Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None, 10, 25)]             0         []                            
                                                                                                  
 gru_24 (GRU)                (None, 10, 128)              59520     ['input_9[0][0]']             
                                                                                                  
 lstm_24 (LSTM)              (None, 10, 128)   

  saving_api.save_model(


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
예측 및 평가

=== 최종 결과 ===
학습 기간: 2021-01-01 ~ 2025-10-02
총 샘플 수: 1719개
가격 예측 RMSE: $219.09
가격 예측 MAPE: 6.44%
방향 예측 Accuracy: 59.53%
방향 예측 ROC-AUC: 0.712

분류 리포트:
              precision    recall  f1-score   support

          하락       0.49      0.76      0.60       169
          상승       0.76      0.49      0.59       261

    accuracy                           0.60       430
   macro avg       0.63      0.62      0.60       430
weighted avg       0.65      0.60      0.59       430

