In [5]:
!pip install --upgrade ml-dtypes


Collecting ml-dtypes
  Using cached ml_dtypes-0.5.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (4.9 MB)
Installing collected packages: ml-dtypes
  Attempting uninstall: ml-dtypes
    Found existing installation: ml-dtypes 0.2.0
    Uninstalling ml-dtypes-0.2.0:
      Successfully uninstalled ml-dtypes-0.2.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.15.0 requires ml-dtypes~=0.2.0, but you have ml-dtypes 0.5.3 which is incompatible.[0m[31m
[0mSuccessfully installed ml-dtypes-0.5.3


In [7]:
pip show ml_dtypes onnx onnxruntime

Name: ml_dtypes
Version: 0.5.3
Summary: ml_dtypes is a stand-alone implementation of several NumPy dtype extensions used in machine learning.
Home-page: 
Author: 
Author-email: ml_dtypes authors <ml_dtypes@google.com>
License: 
Location: /raid/invigoworks/anaconda3/lib/python3.10/site-packages
Requires: numpy, numpy
Required-by: jax, jaxlib, onnx, tensorflow
---
Name: onnx
Version: 1.19.0
Summary: Open Neural Network Exchange
Home-page: 
Author: 
Author-email: ONNX Contributors <onnx-technical-discuss@lists.lfaidata.foundation>
License: Apache License v2.0
Location: /raid/invigoworks/anaconda3/lib/python3.10/site-packages
Requires: ml_dtypes, numpy, protobuf, typing_extensions
Required-by: onnxmltools, skl2onnx
---
Name: onnxruntime
Version: 1.22.1
Summary: ONNX Runtime is a runtime accelerator for Machine Learning models
Home-page: https://onnxruntime.ai
Author: Microsoft Corporation
Author-email: onnxruntime@microsoft.com
License: MIT License
Location: /raid/invigoworks/anaconda3/lib

In [2]:
import os, re, math
from datetime import timedelta
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

TARGET_MACRO_FILE = 'macro_crypto_data.csv'   
ONCHAIN_FILE = 'eth_onchain.csv'             
NEWS_DIR = "./news_data"                       
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
START_TIME='2020-01-01'
END_TIME='2025-10-02'

L = 7
BATCH_SIZE = 64
EPOCHS = 12
LR = 5e-4
TOP_N = 5

def parse_date_from_filename(filename):
    patterns = [r'(\d{4})-(\d{2})-(\d{2})', r'(\d{4})(\d{2})(\d{2})', r'(\d{2})-(\d{2})-(\d{4})', r'(\d{2})(\d{2})(\d{4})']
    basename = os.path.basename(filename)
    for pattern in patterns:
        match = re.search(pattern, basename)
        if match:
            try:
                if len(match.group(1)) == 4:
                    year, month, day = match.groups()
                else:
                    day, month, year = match.groups()
                return pd.to_datetime(f"{year}-{month}-{day}")
            except:
                continue
    return None

def load_all_news_data(root_dir):
    all_data = []
    if not os.path.exists(root_dir):
        print(f"경고: 디렉토리가 존재하지 않습니다: {root_dir} -> 더미 뉴스 생성")
        dates = pd.date_range(START_TIME, '2025-10-02', freq='D')
        return pd.DataFrame({'date': dates, 'news': ['test news'] * len(dates), 'label': np.random.choice([1,0,-1], len(dates))})
    csv_files = sorted([f for f in os.listdir(root_dir) if f.endswith('.csv')])
    for filename in csv_files:
        filepath = os.path.join(root_dir, filename)
        file_date = parse_date_from_filename(filename)
        for enc in ['utf-8','cp949','latin1']:
            try:
                df = pd.read_csv(filepath, encoding=enc)
                break
            except Exception:
                continue
        else:
            print(f"읽기 실패: {filepath}")
            continue
        if 'date' not in df.columns:
            df['date'] = file_date
        else:
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            if file_date is not None:
                df['date'] = df['date'].fillna(file_date)
        if 'label' not in df.columns:
            raise ValueError(f"{filepath}에 'label' 컬럼이 필요합니다.")
        if 'news' in df.columns:
            df = df[['date','news','label']]
        else:
            df = df[['date','label']]
        all_data.append(df)
    if len(all_data) == 0:
        print("경고: CSV 없음 -> 더미 뉴스 생성")
        dates = pd.date_range(START_TIME, END_TIME, freq='D')
        return pd.DataFrame({'date': dates, 'news': ['test news'] * len(dates), 'label': np.random.choice([1,0,-1], len(dates))})
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df['date'] = pd.to_datetime(combined_df['date'], errors='coerce').dt.normalize()
    return combined_df

print("1/11 뉴스 로드 시작")
news_df = load_all_news_data(NEWS_DIR)
print(f"1/11 뉴스 로드 완료: {len(news_df)}건, 기간 {news_df['date'].min()} ~ {news_df['date'].max()}")

print("2/11 뉴스 감성 집계 시작")
news_df = news_df.sort_values('date')
grouped = news_df.groupby('date')['label']
daily = grouped.agg(sent_mean='mean', sent_count='count').reset_index().set_index('date')
pos = grouped.apply(lambda x: (x==1).sum())
neu = grouped.apply(lambda x: (x==0).sum())
neg = grouped.apply(lambda x: (x==-1).sum())
props = pd.DataFrame({'pos_cnt': pos, 'neu_cnt': neu, 'neg_cnt': neg})
daily = daily.join(props)
def day_entropy(row):
    counts = np.array([row['pos_cnt'], row['neu_cnt'], row['neg_cnt']], dtype=float)
    s = counts.sum()
    if s <= 0:
        return 0.0
    p = counts / s
    p_nonzero = p[p>0]
    return -np.sum(p_nonzero * np.log(p_nonzero))
daily['sent_entropy'] = daily.apply(day_entropy, axis=1)
daily['sent_majority'] = news_df.groupby('date')['label'].apply(lambda sub: int(np.sign(np.round(sub.mean()))) if len(sub)>0 else 0)
daily = daily.sort_index()
alpha = 0.4
daily['sent_mean_ewma'] = daily['sent_mean'].ewm(alpha=alpha, adjust=False).mean()
all_dates_news = pd.date_range(daily.index.min(), daily.index.max(), freq='D')
daily = daily.reindex(all_dates_news).fillna({'sent_mean':0.0,'sent_count':0,'pos_cnt':0,'neu_cnt':0,'neg_cnt':0,'sent_entropy':0.0,'sent_majority':0,'sent_mean_ewma':0.0}).fillna(0)
print("2/11 뉴스 감성 집계 완료")

print("3/11 macro 파일 로드 시작")
if not os.path.exists(TARGET_MACRO_FILE):
    raise FileNotFoundError(f"{TARGET_MACRO_FILE} 파일이 필요합니다.")
macro_raw = pd.read_csv(TARGET_MACRO_FILE, parse_dates=['Date'])
macro_raw['Date'] = pd.to_datetime(macro_raw['Date']).dt.tz_convert(None).dt.normalize()
macro_raw = macro_raw.set_index('Date').sort_index()
print("3/11 macro 파일 로드 완료")

print("4/11 온체인 로드 시작")
if not os.path.exists(ONCHAIN_FILE):
    raise FileNotFoundError(f"{ONCHAIN_FILE} 파일이 필요합니다.")
onchain = pd.read_csv(ONCHAIN_FILE, parse_dates=['date']).set_index('date').sort_index()
onchain.index = pd.to_datetime(onchain.index)
print("4/11 온체인 로드 완료")

start = max(macro_raw.index.min(), onchain.index.min(), daily.index.min())
end = min(macro_raw.index.max(), onchain.index.max(), daily.index.max())
date_index = pd.date_range(start, end, freq='D')

print("5/11 날짜 정렬 및 리인덱스 시작")
macro_raw = macro_raw.reindex(date_index).ffill().bfill()
onchain = onchain.reindex(date_index).fillna(0)
daily = daily.reindex(date_index).fillna(0)
print("5/11 날짜 정렬 완료")

print("6/11 ETH 타깃 및 기술지표 준비 시작")
eth_cols = ['ETH_Open','ETH_High','ETH_Low','ETH_Close','ETH_Volume']
for c in eth_cols:
    if c not in macro_raw.columns:
        raise ValueError(f"{c} 컬럼이 macro 파일에 필요합니다.")
eth_price = macro_raw[eth_cols].rename(columns={'ETH_Open':'open','ETH_High':'high','ETH_Low':'low','ETH_Close':'close','ETH_Volume':'volume'})

def compute_technical_indicators(df):
    out = pd.DataFrame(index=df.index)
    pt = df['close']
    N = 14
    lowN = df['low'].rolling(N).min()
    highN = df['high'].rolling(N).max()
    out['stoch_k'] = (pt - lowN) / (highN - lowN + 1e-9) * 100
    out['stoch_d'] = out['stoch_k'].rolling(3).mean()
    out['williams_r'] = (highN - pt) / (highN - lowN + 1e-9) * 100
    out['ad_osc'] = (pt - pt.shift(1)) / (df['high'] - df['low'] + 1e-9)
    out['momentum'] = pt - pt.shift(10)
    out['disparity7'] = pt / pt.rolling(7).mean() * 100
    out['roc'] = pt / pt.shift(12) * 100
    return out.fillna(0)

tech = compute_technical_indicators(eth_price)
target_feats = pd.concat([eth_price, tech, onchain], axis=1).fillna(0)
print("6/11 ETH 타깃 및 기술지표 준비 완료")

print("7/11 top-n macro 입력 생성 시작")
# macro_raw의 컬럼명에서 코인 접두사 추출
cols = [c for c in macro_raw.columns if '_' in c]
coins = []
for c in cols:
    coin = c.split('_')[0]
    if coin not in coins:
        coins.append(coin)
# 타깃 ETH 제외
coins = [c for c in coins if c.upper() != 'ETH']
if len(coins) < TOP_N:
    TOP_N = len(coins)
selected_coins = coins[:TOP_N]
macro_list = []
for coin in selected_coins:
    needed = [f"{coin}_Open", f"{coin}_Close", f"{coin}_High", f"{coin}_Low", f"{coin}_Volume"]
    for n in needed:
        if n not in macro_raw.columns:
            raise ValueError(f"{n} 컬럼이 macro 파일에 필요합니다.")
    arr = macro_raw[needed].values
    macro_list.append(arr)
macro_array = np.concatenate(macro_list, axis=1)
macro_df = pd.DataFrame(macro_array, index=date_index, columns=[f"{c}" for c in range(macro_array.shape[1])])
print("7/11 top-n macro 입력 생성 완료")

print("8/11 sentiment feature 준비 및 병합")
sent_cols = ['sent_mean','sent_count','pos_cnt','neu_cnt','neg_cnt','sent_entropy','sent_mean_ewma']
sent_df = daily[sent_cols].fillna(0)
print("8/11 sentiment 준비 완료")

print("9/11 정규화 및 데이터 분할 준비")
xg_all = target_feats.loc[date_index]
xm_all = macro_df.loc[date_index]
s_all = sent_df.loc[date_index]
p_all = eth_price.loc[date_index][['close']]

n_total = len(date_index)
n_train = int(n_total * 0.7)
n_val = int(n_total * 0.1)
n_test = n_total - n_train - n_val
dates = list(date_index)
train_dates = dates[L-1 : n_train]
val_dates = dates[n_train : n_train + n_val]
test_dates = dates[n_train + n_val : ]

scaler_xg = StandardScaler().fit(xg_all.iloc[:n_train].values)
scaler_xm = StandardScaler().fit(xm_all.iloc[:n_train].values)
scaler_s = StandardScaler().fit(s_all.iloc[:n_train].values)
scaler_p = StandardScaler().fit(p_all.iloc[:n_train].values)
print("9/11 정규화 및 분할 준비 완료")

class CryptoDataset(Dataset):
    def __init__(self, dates_list, xg_df, xm_df, s_df, p_df, L, scalers):
        self.dates = dates_list
        self.xg = xg_df
        self.xm = xm_df
        self.s = s_df
        self.p = p_df
        self.L = L
        self.scaler_xg, self.scaler_xm, self.scaler_s, self.scaler_p = scalers
    def __len__(self):
        return len(self.dates)
    def __getitem__(self, idx):
        t = self.dates[idx]
        start = t - timedelta(days=self.L-1)
        idxs = pd.date_range(start=start, end=t, freq='D')
        xg_win = self.xg.loc[idxs].values.astype(np.float32)
        xm_win = self.xm.loc[idxs].values.astype(np.float32)
        s_win = self.s.loc[idxs].values.astype(np.float32)
        p_last = float(self.p.loc[t]['close'])
        next_day = t + timedelta(days=1)
        y = float(self.p.loc[next_day]['close']) if next_day in self.p.index else p_last
        Bxg = self.scaler_xg.transform(xg_win)
        Bxm = self.scaler_xm.transform(xm_win)
        Bs = self.scaler_s.transform(s_win)
        p_last_s = self.scaler_p.transform([[p_last]])[0,0]
        y_s = self.scaler_p.transform([[y]])[0,0]
        return Bxg, Bxm, Bs, np.float32(p_last_s), np.float32(y_s)

train_ds = CryptoDataset(train_dates, xg_all, xm_all, s_all, p_all, L, (scaler_xg, scaler_xm, scaler_s, scaler_p))
val_ds = CryptoDataset(val_dates, xg_all, xm_all, s_all, p_all, L, (scaler_xg, scaler_xm, scaler_s, scaler_p))
test_ds = CryptoDataset(test_dates, xg_all, xm_all, s_all, p_all, L, (scaler_xg, scaler_xm, scaler_s, scaler_p))
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)
print("10/11 데이터셋 및 DataLoader 생성 완료")

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
    def forward(self, x):
        Lx = x.size(1)
        return x + self.pe[:Lx, :].unsqueeze(0)

class TimeEmbed(nn.Module):
    def __init__(self, in_c, d_model, kernel_size=3):
        super().__init__()
        padding = kernel_size // 2
        self.conv = nn.Conv1d(in_channels=in_c, out_channels=d_model, kernel_size=kernel_size, padding=padding)
        self.pos = PositionalEncoding(d_model, max_len=500)
    def forward(self, x):
        x_t = x.transpose(1,2)
        y = self.conv(x_t).transpose(1,2)
        y = self.pos(y)
        return y

class Zeta(nn.Module):
    def __init__(self, d_model, hidden=128, L=L):
        super().__init__()
        self.ff = nn.Sequential(nn.Linear(d_model, hidden), nn.ReLU(), nn.Linear(hidden, d_model))
        self.linear_time = nn.Linear(L, 1)
        self.linear_feat = nn.Linear(d_model, d_model)
    def forward(self, x):
        y = self.ff(x)
        y_t = self.linear_time(y.transpose(1,2)).squeeze(2)
        out = self.linear_feat(y_t)
        return out

def roll_tensor(x, shift):
    if shift == 0:
        return x
    return torch.cat([x[:, -shift:, :], x[:, :-shift, :]], dim=1)

def compute_macro_h(xembg, xembm):
    B, Lx, d = xembg.size()
    attn_scores = []
    rolled = []
    for tau in range(Lx):
        r = roll_tensor(xembm, tau)
        num = (xembg * r).sum(dim=2)
        den = (xembg.norm(dim=2) * r.norm(dim=2) + 1e-9)
        sim = (num / den).mean(dim=1)
        attn_scores.append(sim.unsqueeze(1))
        rolled.append(r.unsqueeze(1))
    attn = torch.cat(attn_scores, dim=1)
    a = F.softmax(attn, dim=1)
    rolled_stack = torch.cat(rolled, dim=1)
    a_exp = a.unsqueeze(-1).unsqueeze(-1)
    hm = (a_exp * rolled_stack).sum(dim=1)
    return hm

class PriceDynamics(nn.Module):
    def __init__(self, in_c, L=L):
        super().__init__()
        self.norm = nn.LayerNorm(in_c)
        self.lin = nn.Linear(L, 1)
    def forward(self, x):
        x = self.norm(x)
        x_t = x.transpose(1,2)
        out = self.lin(x_t)
        out = out.squeeze(-1)
        delta = out.mean(dim=1, keepdim=False)
        return delta

class CryptoPulseModel(nn.Module):
    def __init__(self, in_target_c, in_macro_c, in_sent_c, d_model=64, L=L):
        super().__init__()
        self.L = L
        self.embed_g = TimeEmbed(in_target_c, d_model)
        self.embed_m = TimeEmbed(in_macro_c, d_model)
        self.embed_s = TimeEmbed(in_sent_c, d_model)
        self.zeta = Zeta(d_model, hidden=128, L=L)
        self.price_dyn = PriceDynamics(in_target_c, L=L)
        self.macro_pred_head = nn.Sequential(nn.Linear(d_model, d_model//2), nn.ReLU(), nn.Linear(d_model//2, 1))
        self.dyn_from_emb = nn.Linear(d_model, 1)
        self.gamma_head = nn.Sequential(nn.Linear(2*d_model, 64), nn.ReLU(), nn.Linear(64,1))
    def forward(self, xg, xm, s, p_last):
        xg = xg.to(DEVICE); xm = xm.to(DEVICE); s = s.to(DEVICE); p_last = p_last.to(DEVICE)
        xembg = self.embed_g(xg)
        xembm = self.embed_m(xm)
        semb = self.embed_s(s)
        hm = compute_macro_h(xembg, xembm)
        z = self.zeta(hm)
        delta_macro = self.macro_pred_head(z).squeeze(-1)
        delta_dyn_scale = self.price_dyn(xg)
        delta_dyn_emb = self.dyn_from_emb(xembg.mean(dim=1)).squeeze(-1)
        delta_dyn = 0.5 * delta_dyn_scale + 0.5 * delta_dyn_emb
        kappa_vec = self.zeta(semb)
        kappa = torch.tanh(kappa_vec.mean(dim=1))
        if kappa.dim() > 1:
            kappa = kappa.mean(dim=1)
        cat = torch.cat([xembg.mean(dim=1), semb.mean(dim=1)], dim=1)
        gamma = torch.sigmoid(self.gamma_head(cat)).squeeze(-1)
        p1 = p_last + kappa * delta_macro
        p2 = p_last + kappa * delta_dyn
        p_hat = gamma * p1 + (1.0 - gamma) * p2
        return p_hat, p1, p2, delta_macro, delta_dyn, gamma, kappa

in_target_c = xg_all.shape[1]
in_macro_c = xm_all.shape[1]
in_sent_c = s_all.shape[1]
model = CryptoPulseModel(in_target_c, in_macro_c, in_sent_c, d_model=64, L=L).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
print("11/11 모델 초기화 완료")

def inverse_scale_p(x_scaled):
    arr = np.array(x_scaled).reshape(-1,1)
    return scaler_p.inverse_transform(arr).reshape(-1)

def evaluate_model(model, loader):
    model.eval()
    preds = []
    trues = []
    with torch.no_grad():
        for xg,xm,s,p_last,y in loader:
            xg = torch.tensor(xg).to(DEVICE)
            xm = torch.tensor(xm).to(DEVICE)
            s = torch.tensor(s).to(DEVICE)
            p_last_t = torch.tensor(p_last).to(DEVICE).float()
            y_t = torch.tensor(y).to(DEVICE).float()
            p_hat_s, *_ = model(xg, xm, s, p_last_t)
            preds.append(p_hat_s.cpu().numpy())
            trues.append(y_t.cpu().numpy())
    preds = np.concatenate(preds).ravel()
    trues = np.concatenate(trues).ravel()
    preds_inv = inverse_scale_p(preds)
    trues_inv = inverse_scale_p(trues)
    mae = np.mean(np.abs(preds_inv - trues_inv))
    mse = np.mean((preds_inv - trues_inv)**2)
    corr = np.corrcoef(preds_inv, trues_inv)[0,1] if len(preds_inv)>1 and np.std(preds_inv)>0 and np.std(trues_inv)>0 else 0.0
    return mae, mse, corr

print("학습 시작")
for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    count = 0
    for xg,xm,s,p_last,y in train_loader:
        xg = xg.to(DEVICE)
        xm = xm.to(DEVICE)
        s = s.to(DEVICE)
        p_last_t = p_last.to(DEVICE).float()
        y_t = y.to(DEVICE).float()
        optimizer.zero_grad()
        p_hat_s, *_ = model(xg, xm, s, p_last_t)
        loss = F.mse_loss(p_hat_s, y_t)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xg.size(0)
        count += xg.size(0)
    avg_loss = total_loss / count if count>0 else 0.0
    for param_group in optimizer.param_groups:
        param_group['lr'] *= 0.5
    val_mae, val_mse, val_corr = evaluate_model(model, val_loader)
    print(f"Epoch {epoch}/{EPOCHS} - train_mse(scaled):{avg_loss:.6f} val_mae:{val_mae:.6f} val_mse:{val_mse:.6f} val_corr:{val_corr:.4f}")

test_mae, test_mse, test_corr = evaluate_model(model, test_loader)
print(f"테스트 결과 - MAE:{test_mae:.6f} MSE:{test_mse:.6f} CORR:{test_corr:.4f}")
print("파이프라인 전체 완료")

1/11 뉴스 로드 시작
1/11 뉴스 로드 완료: 25947건, 기간 2020-01-01 00:00:00 ~ 2025-10-03 00:00:00
2/11 뉴스 감성 집계 시작
2/11 뉴스 감성 집계 완료
3/11 macro 파일 로드 시작
3/11 macro 파일 로드 완료
4/11 온체인 로드 시작
4/11 온체인 로드 완료
5/11 날짜 정렬 및 리인덱스 시작
5/11 날짜 정렬 완료
6/11 ETH 타깃 및 기술지표 준비 시작
6/11 ETH 타깃 및 기술지표 준비 완료
7/11 top-n macro 입력 생성 시작
7/11 top-n macro 입력 생성 완료
8/11 sentiment feature 준비 및 병합
8/11 sentiment 준비 완료
9/11 정규화 및 데이터 분할 준비
9/11 정규화 및 분할 준비 완료
10/11 데이터셋 및 DataLoader 생성 완료
11/11 모델 초기화 완료
학습 시작


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 1/12 - train_mse(scaled):0.006682 val_mae:76.331551 val_mse:11725.232422 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 2/12 - train_mse(scaled):0.006639 val_mae:76.915329 val_mse:11743.152344 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 3/12 - train_mse(scaled):0.006605 val_mae:76.788208 val_mse:11728.221680 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 4/12 - train_mse(scaled):0.006584 val_mae:76.923264 val_mse:11736.145508 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 5/12 - train_mse(scaled):0.006573 val_mae:76.940117 val_mse:11734.514648 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 6/12 - train_mse(scaled):0.006567 val_mae:76.975281 val_mse:11737.759766 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 7/12 - train_mse(scaled):0.006562 val_mae:76.968071 val_mse:11736.279297 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 8/12 - train_mse(scaled):0.006561 val_mae:76.974152 val_mse:11736.921875 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 9/12 - train_mse(scaled):0.006560 val_mae:76.974068 val_mse:11736.773438 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 10/12 - train_mse(scaled):0.006560 val_mae:76.974472 val_mse:11736.765625 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 11/12 - train_mse(scaled):0.006559 val_mae:76.975166 val_mse:11736.826172 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


Epoch 12/12 - train_mse(scaled):0.006559 val_mae:76.975174 val_mse:11736.814453 val_corr:0.9719


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


테스트 결과 - MAE:76.985764 MSE:12178.814453 CORR:0.9908
파이프라인 전체 완료


  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()
  xg = torch.tensor(xg).to(DEVICE)
  xm = torch.tensor(xm).to(DEVICE)
  s = torch.tensor(s).to(DEVICE)
  p_last_t = torch.tensor(p_last).to(DEVICE).float()
  y_t = torch.tensor(y).to(DEVICE).float()


In [5]:
import os, re, math
from datetime import timedelta
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

TARGET_MACRO_FILE = 'macro_crypto_data.csv'
ONCHAIN_FILE = 'eth_onchain.csv'
NEWS_DIR = "./news_data"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

START_TIME = '2021-01-01'
END_TIME = '2025-10-02'

L = 7
BATCH_SIZE = 64
EPOCHS = 12
LR = 5e-4
TOP_N = 5

def parse_date_from_filename(filename):
    patterns = [r'(\d{4})-(\d{2})-(\d{2})', r'(\d{4})(\d{2})(\d{2})', r'(\d{2})-(\d{2})-(\d{4})', r'(\d{2})(\d{2})(\d{4})']
    basename = os.path.basename(filename)
    for pattern in patterns:
        match = re.search(pattern, basename)
        if match:
            try:
                if len(match.group(1)) == 4:
                    year, month, day = match.groups()
                else:
                    day, month, year = match.groups()
                return pd.to_datetime(f"{year}-{month}-{day}")
            except:
                continue
    return None

def load_all_news_data(root_dir):
    all_data = []
    if not os.path.exists(root_dir):
        dates = pd.date_range(START_TIME, END_TIME, freq='D')
        return pd.DataFrame({'date': dates, 'news': ['test news'] * len(dates), 'label': np.random.choice([1,0,-1], len(dates))})
    csv_files = sorted([f for f in os.listdir(root_dir) if f.endswith('.csv')])
    for filename in csv_files:
        filepath = os.path.join(root_dir, filename)
        file_date = parse_date_from_filename(filename)
        for enc in ['utf-8','cp949','latin1']:
            try:
                df = pd.read_csv(filepath, encoding=enc)
                break
            except Exception:
                continue
        else:
            continue
        if 'date' not in df.columns:
            df['date'] = file_date
        else:
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            if file_date is not None:
                df['date'] = df['date'].fillna(file_date)
        if 'label' not in df.columns:
            raise ValueError(f"{filepath}에 'label' 컬럼이 필요합니다.")
        if 'news' in df.columns:
            df = df[['date','news','label']]
        else:
            df = df[['date','label']]
        all_data.append(df)
    if len(all_data) == 0:
        dates = pd.date_range(START_TIME, END_TIME, freq='D')
        return pd.DataFrame({'date': dates, 'news': ['test news'] * len(dates), 'label': np.random.choice([1,0,-1], len(dates))})
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df['date'] = pd.to_datetime(combined_df['date'], errors='coerce').dt.normalize()
    return combined_df

print("1/11 뉴스 로드 시작")
news_df = load_all_news_data(NEWS_DIR)
print(f"1/11 뉴스 로드 완료: {len(news_df)}건, 기간 {news_df['date'].min()} ~ {news_df['date'].max()}")

print("2/11 뉴스 감성 집계 시작")
news_df = news_df.sort_values('date')
grouped = news_df.groupby('date')['label']
daily = grouped.agg(sent_mean='mean', sent_count='count').reset_index().set_index('date')
pos = grouped.apply(lambda x: (x==1).sum())
neu = grouped.apply(lambda x: (x==0).sum())
neg = grouped.apply(lambda x: (x==-1).sum())
props = pd.DataFrame({'pos_cnt': pos, 'neu_cnt': neu, 'neg_cnt': neg})
daily = daily.join(props)
def day_entropy(row):
    counts = np.array([row['pos_cnt'], row['neu_cnt'], row['neg_cnt']], dtype=float)
    s = counts.sum()
    if s <= 0:
        return 0.0
    p = counts / s
    p_nonzero = p[p>0]
    return -np.sum(p_nonzero * np.log(p_nonzero))
daily['sent_entropy'] = daily.apply(day_entropy, axis=1)
daily['sent_majority'] = news_df.groupby('date')['label'].apply(lambda sub: int(np.sign(np.round(sub.mean()))) if len(sub)>0 else 0)
daily = daily.sort_index()
alpha = 0.4
daily['sent_mean_ewma'] = daily['sent_mean'].ewm(alpha=alpha, adjust=False).mean()
all_dates_news = pd.date_range(daily.index.min(), daily.index.max(), freq='D')
daily = daily.reindex(all_dates_news).fillna({'sent_mean':0.0,'sent_count':0,'pos_cnt':0,'neu_cnt':0,'neg_cnt':0,'sent_entropy':0.0,'sent_majority':0,'sent_mean_ewma':0.0}).fillna(0)
print("2/11 뉴스 감성 집계 완료")

print("3/11 macro 파일 로드 시작")
if not os.path.exists(TARGET_MACRO_FILE):
    raise FileNotFoundError(f"{TARGET_MACRO_FILE} 파일이 필요합니다.")
macro_raw = pd.read_csv(TARGET_MACRO_FILE, parse_dates=['Date'])
macro_raw['Date'] = pd.to_datetime(macro_raw['Date']).dt.tz_convert(None).dt.normalize()
macro_raw = macro_raw.set_index('Date').sort_index()
print("3/11 macro 파일 로드 완료")

print("4/11 온체인 로드 시작")
if not os.path.exists(ONCHAIN_FILE):
    raise FileNotFoundError(f"{ONCHAIN_FILE} 파일이 필요합니다.")
onchain = pd.read_csv(ONCHAIN_FILE, parse_dates=['date']).set_index('date').sort_index()
onchain.index = pd.to_datetime(onchain.index)
print("4/11 온체인 로드 완료")

start = max(macro_raw.index.min(), onchain.index.min(), daily.index.min(), pd.to_datetime(START_TIME))
end = min(macro_raw.index.max(), onchain.index.max(), daily.index.max(), pd.to_datetime(END_TIME))
date_index_full = pd.date_range(start, end, freq='D')

print("5/11 날짜 정렬 및 리인덱스 시작")
macro_raw = macro_raw.reindex(date_index_full).ffill().bfill()
onchain = onchain.reindex(date_index_full).fillna(0)
daily = daily.reindex(date_index_full).fillna(0)
print("5/11 날짜 정렬 완료")

print("6/11 ETH 타깃 및 기술지표 준비 시작")
eth_cols = ['ETH_Open','ETH_High','ETH_Low','ETH_Close','ETH_Volume']
for c in eth_cols:
    if c not in macro_raw.columns:
        raise ValueError(f"{c} 컬럼이 macro 파일에 필요합니다.")
eth_price = macro_raw[eth_cols].rename(columns={'ETH_Open':'open','ETH_High':'high','ETH_Low':'low','ETH_Close':'close','ETH_Volume':'volume'})

def compute_technical_indicators(df):
    out = pd.DataFrame(index=df.index)
    pt = df['close']
    N = 14
    lowN = df['low'].rolling(N).min()
    highN = df['high'].rolling(N).max()
    out['stoch_k'] = (pt - lowN) / (highN - lowN + 1e-9) * 100
    out['stoch_d'] = out['stoch_k'].rolling(3).mean()
    out['williams_r'] = (highN - pt) / (highN - lowN + 1e-9) * 100
    out['ad_osc'] = (pt - pt.shift(1)) / (df['high'] - df['low'] + 1e-9)
    out['momentum'] = pt - pt.shift(10)
    out['disparity7'] = pt / pt.rolling(7).mean() * 100
    out['roc'] = pt / pt.shift(12) * 100
    return out.fillna(0)

tech = compute_technical_indicators(eth_price)
target_feats = pd.concat([eth_price, tech, onchain], axis=1).fillna(0)
print("6/11 ETH 타깃 및 기술지표 준비 완료")

print("7/11 top-n macro 입력 생성 시작")
cols = [c for c in macro_raw.columns if '_' in c]
coins = []
for c in cols:
    coin = c.split('_')[0]
    if coin not in coins:
        coins.append(coin)
coins = [c for c in coins if c.upper() != 'ETH']
if len(coins) < TOP_N:
    TOP_N = len(coins)
selected_coins = coins[:TOP_N]
macro_list = []
for coin in selected_coins:
    needed = [f"{coin}_Open", f"{coin}_Close", f"{coin}_High", f"{coin}_Low", f"{coin}_Volume"]
    for n in needed:
        if n not in macro_raw.columns:
            raise ValueError(f"{n} 컬럼이 macro 파일에 필요합니다.")
    arr = macro_raw[needed].values
    macro_list.append(arr)
macro_array = np.concatenate(macro_list, axis=1)
feat_suffix = ['Open','Close','High','Low','Volume']
feature_names = []
for coin in selected_coins:
    for sfx in feat_suffix:
        feature_names.append(f"{coin}_{sfx}")
if macro_array.shape[1] != len(feature_names):
    feature_names = [f"m{i}" for i in range(macro_array.shape[1])]
macro_df = pd.DataFrame(macro_array, index=date_index_full, columns=feature_names)
print("7/11 top-n macro 입력 생성 완료")

print("8/11 sentiment feature 준비 및 병합")
sent_cols = ['sent_mean','sent_count','pos_cnt','neu_cnt','neg_cnt','sent_entropy','sent_mean_ewma']
sent_df = daily[sent_cols].fillna(0)
print("8/11 sentiment 준비 완료")

print("9/11 리턴(target) 생성 및 정규화 준비")
p_all = eth_price.loc[date_index_full][['close']].copy()
r_all = (p_all['close'].shift(-1) - p_all['close']) / (p_all['close'] + 1e-9)
# drop last date because no next-day label
date_index = date_index_full[:-1]
xg_all = target_feats.loc[date_index]
xm_all = macro_df.loc[date_index]
s_all = sent_df.loc[date_index]
p_all = p_all.loc[date_index]
r_all = r_all.loc[date_index]

n_total = len(date_index)
n_train = int(n_total * 0.7)
n_val = int(n_total * 0.1)
n_test = n_total - n_train - n_val
dates = list(date_index)
train_dates = dates[L-1 : n_train]
val_dates = dates[n_train : n_train + n_val]
test_dates = dates[n_train + n_val : ]

scaler_xg = StandardScaler().fit(xg_all.iloc[:n_train].values)
scaler_xm = StandardScaler().fit(xm_all.iloc[:n_train].values)
scaler_s = StandardScaler().fit(s_all.iloc[:n_train].values)
scaler_r = StandardScaler().fit(r_all.iloc[:n_train].values.reshape(-1,1))
print("9/11 정규화 및 분할 준비 완료")

class CryptoDataset(Dataset):
    def __init__(self, dates_list, xg_df, xm_df, s_df, p_df, r_series, L, scalers):
        self.dates = dates_list
        self.xg = xg_df
        self.xm = xm_df
        self.s = s_df
        self.p = p_df
        self.r = r_series
        self.L = L
        self.scaler_xg, self.scaler_xm, self.scaler_s, self.scaler_r = scalers
    def __len__(self):
        return len(self.dates)
    def __getitem__(self, idx):
        t = self.dates[idx]
        start = t - timedelta(days=self.L-1)
        idxs = pd.date_range(start=start, end=t, freq='D')
        xg_win = self.xg.loc[idxs].values.astype(np.float32)
        xm_win = self.xm.loc[idxs].values.astype(np.float32)
        s_win = self.s.loc[idxs].values.astype(np.float32)
        p_last = float(self.p.loc[t]['close'])
        y_raw = float(self.r.loc[t])  # scaled return (or scaled target)

        Bxg = self.scaler_xg.transform(xg_win)
        Bxm = self.scaler_xm.transform(xm_win)
        Bs  = self.scaler_s.transform(s_win)
        y_s = self.scaler_r.transform([[y_raw]])[0,0]

        # numpy -> torch (batch-window 차원: (L, feat))
        Bxg_t = torch.from_numpy(Bxg).float()        # shape (L, in_target_c)
        Bxm_t = torch.from_numpy(Bxm).float()        # shape (L, in_macro_c)
        Bs_t  = torch.from_numpy(Bs).float()         # shape (L, in_sent_c)
        p_last_f = torch.tensor(p_last, dtype=torch.float32)  # scalar tensor
        y_s_f = torch.tensor(y_s, dtype=torch.float32)       # scalar tensor

        return Bxg_t, Bxm_t, Bs_t, p_last_f, y_s_f

train_ds = CryptoDataset(train_dates, xg_all, xm_all, s_all, p_all, r_all, L, (scaler_xg, scaler_xm, scaler_s, scaler_r))
val_ds = CryptoDataset(val_dates, xg_all, xm_all, s_all, p_all, r_all, L, (scaler_xg, scaler_xm, scaler_s, scaler_r))
test_ds = CryptoDataset(test_dates, xg_all, xm_all, s_all, p_all, r_all, L, (scaler_xg, scaler_xm, scaler_s, scaler_r))
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)
print("10/11 데이터셋 및 DataLoader 생성 완료")

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
    def forward(self, x):
        Lx = x.size(1)
        return x + self.pe[:Lx, :].unsqueeze(0)

class TimeEmbed(nn.Module):
    def __init__(self, in_c, d_model, kernel_size=3):
        super().__init__()
        padding = kernel_size // 2
        self.conv = nn.Conv1d(in_channels=in_c, out_channels=d_model, kernel_size=kernel_size, padding=padding)
        self.pos = PositionalEncoding(d_model, max_len=500)
    def forward(self, x):
        x_t = x.transpose(1,2)
        y = self.conv(x_t).transpose(1,2)
        y = self.pos(y)
        return y

class Zeta(nn.Module):
    def __init__(self, d_model, hidden=128, L=L):
        super().__init__()
        self.ff = nn.Sequential(nn.Linear(d_model, hidden), nn.ReLU(), nn.Linear(hidden, d_model))
        self.linear_time = nn.Linear(L, 1)
        self.linear_feat = nn.Linear(d_model, d_model)
    def forward(self, x):
        y = self.ff(x)
        y_t = self.linear_time(y.transpose(1,2)).squeeze(2)
        out = self.linear_feat(y_t)
        return out

def roll_tensor(x, shift):
    if shift == 0:
        return x
    return torch.cat([x[:, -shift:, :], x[:, :-shift, :]], dim=1)

def compute_macro_h(xembg, xembm):
    B, Lx, d = xembg.size()
    attn_scores = []
    rolled = []
    for tau in range(Lx):
        r = roll_tensor(xembm, tau)
        num = (xembg * r).sum(dim=2)
        den = (xembg.norm(dim=2) * r.norm(dim=2) + 1e-9)
        sim = (num / den).mean(dim=1)
        attn_scores.append(sim.unsqueeze(1))
        rolled.append(r.unsqueeze(1))
    attn = torch.cat(attn_scores, dim=1)
    a = F.softmax(attn, dim=1)
    rolled_stack = torch.cat(rolled, dim=1)
    a_exp = a.unsqueeze(-1).unsqueeze(-1)
    hm = (a_exp * rolled_stack).sum(dim=1)
    return hm

class PriceDynamics(nn.Module):
    def __init__(self, in_c, L=L):
        super().__init__()
        self.norm = nn.LayerNorm(in_c)
        self.lin = nn.Linear(L, 1)
    def forward(self, x):
        x = self.norm(x)
        x_t = x.transpose(1,2)
        out = self.lin(x_t)
        out = out.squeeze(-1)
        delta = out.mean(dim=1, keepdim=False)
        return delta

class CryptoPulseModel(nn.Module):
    def __init__(self, in_target_c, in_macro_c, in_sent_c, d_model=64, L=L):
        super().__init__()
        self.L = L
        self.embed_g = TimeEmbed(in_target_c, d_model)
        self.embed_m = TimeEmbed(in_macro_c, d_model)
        self.embed_s = TimeEmbed(in_sent_c, d_model)
        self.zeta = Zeta(d_model, hidden=128, L=L)
        self.price_dyn = PriceDynamics(in_target_c, L=L)
        self.macro_pred_head = nn.Sequential(nn.Linear(d_model, d_model//2), nn.ReLU(), nn.Linear(d_model//2, 1))
        self.dyn_from_emb = nn.Linear(d_model, 1)
        self.gamma_head = nn.Sequential(nn.Linear(2*d_model, 64), nn.ReLU(), nn.Linear(64,1))
    def forward(self, xg, xm, s, p_last_raw):
        xg = xg.to(DEVICE); xm = xm.to(DEVICE); s = s.to(DEVICE)
        xembg = self.embed_g(xg)
        xembm = self.embed_m(xm)
        semb = self.embed_s(s)
        hm = compute_macro_h(xembg, xembm)
        z = self.zeta(hm)
        delta_macro = self.macro_pred_head(z).squeeze(-1)  # scaled return space
        delta_dyn_scale = self.price_dyn(xg)
        delta_dyn_emb = self.dyn_from_emb(xembg.mean(dim=1)).squeeze(-1)
        delta_dyn = 0.5 * delta_dyn_scale + 0.5 * delta_dyn_emb
        kappa_vec = self.zeta(semb)
        kappa = torch.tanh(kappa_vec.mean(dim=1))
        if kappa.dim() > 1:
            kappa = kappa.mean(dim=1)
        cat = torch.cat([xembg.mean(dim=1), semb.mean(dim=1)], dim=1)
        gamma = torch.sigmoid(self.gamma_head(cat)).squeeze(-1)
        # predict scaled return
        r_hat_s = gamma * (kappa * delta_macro) + (1.0 - gamma) * (kappa * delta_dyn)
        return r_hat_s, delta_macro, delta_dyn, gamma, kappa

in_target_c = xg_all.shape[1]
in_macro_c = xm_all.shape[1]
in_sent_c = s_all.shape[1]
model = CryptoPulseModel(in_target_c, in_macro_c, in_sent_c, d_model=64, L=L).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
print("11/11 모델 초기화 완료")

def inverse_scale_r(x_scaled):
    arr = np.array(x_scaled).reshape(-1,1)
    return scaler_r.inverse_transform(arr).reshape(-1)

def evaluate_model(model, loader):
    model.eval()
    preds_prices = []
    trues_prices = []
    direction_hits = []
    return_maes = []
    with torch.no_grad():
        for xg, xm, s, p_last_raw, y_s in loader:
            xg = xg.to(DEVICE)
            xm = xm.to(DEVICE)
            s  = s.to(DEVICE)
            p_last_arr = p_last_raw.cpu().numpy().astype(float)  # 필요한 경우 NumPy로 변환
            y_s_t = y_s.to(DEVICE).float()
            r_hat_s, *_ = model(xg, xm, s, p_last_raw.to(DEVICE).float())
            r_hat_s = r_hat_s.detach().cpu().numpy().ravel()
            r_true_s = y_s_t.detach().cpu().numpy().ravel()
            r_hat = inverse_scale_r(r_hat_s)
            r_true = inverse_scale_r(r_true_s)
            p_hat = p_last_arr * (1.0 + r_hat)
            p_true = p_last_arr * (1.0 + r_true)
            preds_prices.append(p_hat)
            trues_prices.append(p_true)
            direction_hits.append((np.sign(r_hat) == np.sign(r_true)).astype(float))
            return_maes.append(np.abs(r_hat - r_true))
    preds_prices = np.concatenate(preds_prices)
    trues_prices = np.concatenate(trues_prices)
    direction_hits = np.concatenate(direction_hits)
    return_maes = np.concatenate(return_maes)
    mae = np.mean(np.abs(preds_prices - trues_prices))
    mse = np.mean((preds_prices - trues_prices)**2)
    corr = np.corrcoef(preds_prices, trues_prices)[0,1] if len(preds_prices)>1 and np.std(preds_prices)>0 and np.std(trues_prices)>0 else 0.0
    dir_acc = direction_hits.mean() if len(direction_hits)>0 else 0.0
    ret_mae = return_maes.mean() if len(return_maes)>0 else 0.0
    return mae, mse, corr, dir_acc, ret_mae

print("학습 시작")
for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    count = 0
    for xg,xm,s,p_last_raw,y_s in train_loader:
        xg = xg.to(DEVICE)
        xm = xm.to(DEVICE)
        s = s.to(DEVICE)
        p_last_arr = np.array(p_last_raw).astype(float)
        y_t = y_s.to(DEVICE).float()
        optimizer.zero_grad()
        r_hat_s, *_ = model(xg, xm, s, torch.tensor(p_last_arr).to(DEVICE))
        loss = F.mse_loss(r_hat_s, y_t)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xg.size(0)
        count += xg.size(0)
    avg_loss = total_loss / count if count>0 else 0.0
    if epoch % 4 == 0:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= 0.5
    val_mae, val_mse, val_corr, val_dir_acc, val_ret_mae = evaluate_model(model, val_loader)
    print(f"Epoch {epoch}/{EPOCHS} - train_mse(scaled):{avg_loss:.6f} val_price_mae:{val_mae:.6f} val_price_mse:{val_mse:.6f} val_corr:{val_corr:.4f} val_dir_acc:{val_dir_acc:.4f} val_ret_mae:{val_ret_mae:.6f}")

test_mae, test_mse, test_corr, test_dir_acc, test_ret_mae = evaluate_model(model, test_loader)
print(f"테스트 결과 - Price MAE:{test_mae:.6f} MSE:{test_mse:.6f} CORR:{test_corr:.4f} DirAcc:{test_dir_acc:.4f} ReturnMAE:{test_ret_mae:.6f}")
print("파이프라인 전체 완료")

1/11 뉴스 로드 시작
1/11 뉴스 로드 완료: 25947건, 기간 2020-01-01 00:00:00 ~ 2025-10-03 00:00:00
2/11 뉴스 감성 집계 시작
2/11 뉴스 감성 집계 완료
3/11 macro 파일 로드 시작
3/11 macro 파일 로드 완료
4/11 온체인 로드 시작
4/11 온체인 로드 완료
5/11 날짜 정렬 및 리인덱스 시작
5/11 날짜 정렬 완료
6/11 ETH 타깃 및 기술지표 준비 시작
6/11 ETH 타깃 및 기술지표 준비 완료
7/11 top-n macro 입력 생성 시작
7/11 top-n macro 입력 생성 완료
8/11 sentiment feature 준비 및 병합
8/11 sentiment 준비 완료
9/11 리턴(target) 생성 및 정규화 준비
9/11 정규화 및 분할 준비 완료
10/11 데이터셋 및 DataLoader 생성 완료
11/11 모델 초기화 완료
학습 시작
Epoch 1/12 - train_mse(scaled):0.966564 val_price_mae:66.650810 val_price_mse:9396.792590 val_corr:0.9799 val_dir_acc:0.5145 val_ret_mae:0.022765
Epoch 2/12 - train_mse(scaled):0.966473 val_price_mae:66.543676 val_price_mse:9362.161196 val_corr:0.9799 val_dir_acc:0.5145 val_ret_mae:0.022736
Epoch 3/12 - train_mse(scaled):0.965660 val_price_mae:66.420509 val_price_mse:9312.860802 val_corr:0.9800 val_dir_acc:0.5260 val_ret_mae:0.022703
Epoch 4/12 - train_mse(scaled):0.963134 val_price_mae:66.296595 val_price_mse:9175.0022