In [18]:
import os
import time
import json
import warnings
import gc
import pickle
import joblib
import glob
import traceback
from dotenv import load_dotenv
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import pandas_ta as ta
from collections import Counter
from numba import jit
from sklearn.cluster import DBSCAN
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.feature_selection import (
    SelectKBest, RFE,
    mutual_info_classif, mutual_info_regression
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
    ,log_loss
)
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
    AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier,
    GradientBoostingClassifier, HistGradientBoostingClassifier,
    RandomForestClassifier, StackingClassifier, VotingClassifier,
    AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor,
    GradientBoostingRegressor, RandomForestRegressor,
    StackingRegressor, VotingRegressor
)
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from lightgbm.callback import early_stopping
from xgboost import XGBClassifier, XGBRegressor
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import (
    Input, Dense, Flatten, Dropout, Activation,
    LSTM, GRU, SimpleRNN, Bidirectional,
    Conv1D, MaxPooling1D, AveragePooling1D,
    GlobalAveragePooling1D, GlobalMaxPooling1D,
    BatchNormalization, LayerNormalization,
    Attention, MultiHeadAttention,
    Concatenate, Add, Multiply, Lambda,
    Reshape, Permute, RepeatVector, TimeDistributed
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.vector_ar.var_model import VAR
try:
    import torch
    import torch.nn as nn
except ImportError:
    pass
# ============================================================================
# 환경 설정 및 경고 무시
# ============================================================================

# GPU 메모리 증가 허용 설정
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

warnings.filterwarnings('ignore')


DATA_DIR_MAIN = './macro_data'
DATA_DIR_NEW = './macro_data/macro_data'

TRAIN_START_DATE = pd.to_datetime('2020-01-01')
LOOKBACK_DAYS = 200
LOOKBACK_START_DATE = TRAIN_START_DATE - timedelta(days=LOOKBACK_DAYS)


def standardize_date_column(df,file_name):
    """날짜 컬럼 자동 탐지 + datetime 통일 + tz 제거 + 시각 제거"""

    date_cols = [col for col in df.columns if 'date' in col.lower()]
    if not date_cols:
        print("[Warning] 날짜 컬럼을 찾을 수 없습니다.")
        return df
    date_col = date_cols[0]
    
    if date_col != 'date':
        df.rename(columns={date_col: 'date'}, inplace=True)
    

    if file_name == 'eth_onchain.csv':
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
    else:
        df['date'] = pd.to_datetime(df['date'], errors='coerce', infer_datetime_format=True)
    
    df = df.dropna(subset=['date'])
    df['date'] = df['date'].dt.normalize()  
    if pd.api.types.is_datetime64tz_dtype(df['date']):
        df['date'] = df['date'].dt.tz_convert(None)
    else:
        df['date'] = df['date'].dt.tz_localize(None)
    print(df.shape,file_name)
    return df


def load_csv(directory, filename):
    filepath = os.path.join(directory, filename)
    if not os.path.exists(filepath):
        print(f"[Warning] {filename} not found")
        return pd.DataFrame()
    df = pd.read_csv(filepath)
    return standardize_date_column(df, filename)


def add_prefix(df, prefix):
    if df.empty:
        return df
    df.columns = [f"{prefix}_{col}" if col != 'date' else col for col in df.columns]
    return df


def create_sentiment_features(news_df):
    if news_df.empty:
        return pd.DataFrame(columns=['date'])
    
    agg = news_df.groupby('date').agg(
        sentiment_mean=('label', 'mean'),
        sentiment_std=('label', 'std'),
        news_count=('label', 'count'),
        positive_ratio=('label', lambda x: (x == 1).sum() / len(x)),
        negative_ratio=('label', lambda x: (x == -1).sum() / len(x)),
        extreme_positive_count=('label', lambda x: (x == 1).sum()),
        extreme_negative_count=('label', lambda x: (x == -1).sum()),
        sentiment_sum=('label', 'sum'),
    ).reset_index().fillna(0)
    
    agg['sentiment_polarity'] = agg['positive_ratio'] - agg['negative_ratio']
    agg['sentiment_intensity'] = agg['positive_ratio'] + agg['negative_ratio']
    agg['sentiment_disagreement'] = agg['positive_ratio'] * agg['negative_ratio']
    agg['bull_bear_ratio'] = agg['positive_ratio'] / (agg['negative_ratio'] + 1e-10)
    agg['weighted_sentiment'] = agg['sentiment_mean'] * np.log1p(agg['news_count'])
    agg['extremity_index'] = (agg['extreme_positive_count'] + agg['extreme_negative_count']) / (agg['news_count'] + 1e-10)
    
    for window in [3,7]:
        agg[f'sentiment_ma{window}'] = agg['sentiment_mean'].rolling(window=window, min_periods=1).mean()
        agg[f'sentiment_volatility_{window}'] = agg['sentiment_mean'].rolling(window=window, min_periods=1).std()
    
    agg['sentiment_trend'] = agg['sentiment_mean'].diff()
    agg['sentiment_acceleration'] = agg['sentiment_trend'].diff()
    agg['news_volume_change'] = agg['news_count'].pct_change()
    
    for window in [7, 14]:
        agg[f'news_volume_ma{window}'] = agg['news_count'].rolling(window=window, min_periods=1).mean()
    
    return agg.fillna(0)


def smart_fill_missing(df_merged):
    REFERENCE_START_DATE = pd.to_datetime('2020-01-01')
    
    for col in df_merged.columns:
        if col == 'date':
            continue
        
        if df_merged[col].isnull().sum() == 0:
            continue
        
        non_null_idx = df_merged[col].first_valid_index()
        
        if non_null_idx is None:
            df_merged[col] = df_merged[col].fillna(0)
            continue
        
        first_date = df_merged.loc[non_null_idx, 'date']
        
        before_mask = df_merged['date'] < first_date
        after_mask = df_merged['date'] >= first_date
        
        df_merged.loc[before_mask, col] = df_merged.loc[before_mask, col].fillna(0)
        df_merged.loc[after_mask, col] = df_merged.loc[after_mask, col].fillna(method='ffill')
        
        remaining = df_merged.loc[after_mask, col].isnull().sum()
        if remaining > 0:
            df_merged.loc[after_mask, col] = df_merged.loc[after_mask, col].fillna(0)
    
    return df_merged


print("="*80)
print("DATA LOADING")
print("="*80)

#news_df = load_csv(DATA_DIR_MAIN, 'news_data.csv')
#eth_onchain_df = load_csv(DATA_DIR_MAIN, 'eth_onchain.csv')
macro_df = load_csv(DATA_DIR_NEW, 'macro_crypto_data.csv')
sp500_df = load_csv(DATA_DIR_NEW, 'SP500.csv')
vix_df = load_csv(DATA_DIR_NEW, 'VIX.csv')
gold_df = load_csv(DATA_DIR_NEW, 'GOLD.csv')
dxy_df = load_csv(DATA_DIR_NEW, 'DXY.csv')
fear_greed_df = load_csv(DATA_DIR_NEW, 'fear_greed.csv')
eth_funding_df = load_csv(DATA_DIR_NEW, 'eth_funding_rate.csv')
usdt_eth_mcap_df = load_csv(DATA_DIR_NEW, 'usdt_eth_mcap.csv')
aave_tvl_df = load_csv(DATA_DIR_NEW, 'aave_eth_tvl.csv')
lido_tvl_df = load_csv(DATA_DIR_NEW, 'lido_eth_tvl.csv')
makerdao_tvl_df = load_csv(DATA_DIR_NEW, 'makerdao_eth_tvl.csv')
uniswap_tvl_df = load_csv(DATA_DIR_NEW, 'uniswap_eth_tvl.csv')
curve_tvl_df = load_csv(DATA_DIR_NEW, 'curve-dex_eth_tvl.csv')
eth_chain_tvl_df = load_csv(DATA_DIR_NEW, 'eth_chain_tvl.csv')
layer2_tvl_df = load_csv(DATA_DIR_NEW, 'layer2_tvl.csv')

print(f"macro_df last date: {macro_df['date'].iloc[-1]}")
print(f"sp500_df last date: {sp500_df['date'].iloc[-1]}")
print(f"vix_df last date: {vix_df['date'].iloc[-1]}")
print(f"gold_df last date: {gold_df['date'].iloc[-1]}")
print(f"dxy_df last date: {dxy_df['date'].iloc[-1]}")
print(f"fear_greed_df last date: {fear_greed_df['date'].iloc[-1]}")
print(f"eth_funding_df last date: {eth_funding_df['date'].iloc[-1]}")
print(f"usdt_eth_mcap_df last date: {usdt_eth_mcap_df['date'].iloc[-1]}")
print(f"aave_tvl_df last date: {aave_tvl_df['date'].iloc[-1]}")
print(f"lido_tvl_df last date: {lido_tvl_df['date'].iloc[-1]}")
print(f"makerdao_tvl_df last date: {makerdao_tvl_df['date'].iloc[-1]}")
print(f"uniswap_tvl_df last date: {uniswap_tvl_df['date'].iloc[-1]}")
print(f"curve_tvl_df last date: {curve_tvl_df['date'].iloc[-1]}")
print(f"eth_chain_tvl_df last date: {eth_chain_tvl_df['date'].iloc[-1]}")
print(f"layer2_tvl_df last date: {layer2_tvl_df['date'].iloc[-1]}")


print(f"Loaded {len([df for df in [fear_greed_df, eth_funding_df, usdt_eth_mcap_df, aave_tvl_df, lido_tvl_df, makerdao_tvl_df, uniswap_tvl_df, curve_tvl_df, eth_chain_tvl_df, layer2_tvl_df] if not df.empty])} files")

all_dataframes = [
    macro_df, fear_greed_df, usdt_eth_mcap_df,
    aave_tvl_df, lido_tvl_df, makerdao_tvl_df, uniswap_tvl_df, curve_tvl_df,
    eth_chain_tvl_df, eth_funding_df, layer2_tvl_df, 
    sp500_df, vix_df, gold_df, dxy_df#,news_df, eth_onchain_df
]

last_dates = [
    pd.to_datetime(df['date']).max() 
    for df in all_dataframes 
    if not df.empty and 'date' in df.columns
]

end_date = min(last_dates) if last_dates else pd.Timestamp.today()
print(end_date)
print("\n" + "="*80)
print("SENTIMENT FEATURES")
print("="*80)
#sentiment_features = create_sentiment_features(news_df)
#print(f"Generated {sentiment_features.shape[1]-1} features")

print("\n" + "="*80)
print("DATA MERGING")
print("="*80)

#eth_onchain_df = add_prefix(eth_onchain_df, 'eth')
fear_greed_df = add_prefix(fear_greed_df, 'fg')
usdt_eth_mcap_df = add_prefix(usdt_eth_mcap_df, 'usdt')
aave_tvl_df = add_prefix(aave_tvl_df, 'aave')
lido_tvl_df = add_prefix(lido_tvl_df, 'lido')
makerdao_tvl_df = add_prefix(makerdao_tvl_df, 'makerdao')
uniswap_tvl_df = add_prefix(uniswap_tvl_df, 'uniswap')
curve_tvl_df = add_prefix(curve_tvl_df, 'curve')
eth_chain_tvl_df = add_prefix(eth_chain_tvl_df, 'chain')
eth_funding_df = add_prefix(eth_funding_df, 'funding')
layer2_tvl_df = add_prefix(layer2_tvl_df, 'l2')
sp500_df = add_prefix(sp500_df, 'sp500')
vix_df = add_prefix(vix_df, 'vix')
gold_df = add_prefix(gold_df, 'gold')
dxy_df = add_prefix(dxy_df, 'dxy')

date_range = pd.date_range(start=LOOKBACK_START_DATE, end=end_date, freq='D')
df_merged = pd.DataFrame(date_range, columns=['date'])

dataframes_to_merge = [
    macro_df,  fear_greed_df, usdt_eth_mcap_df,
    aave_tvl_df, lido_tvl_df, makerdao_tvl_df, uniswap_tvl_df, curve_tvl_df,
    eth_chain_tvl_df, eth_funding_df, layer2_tvl_df,
    sp500_df, vix_df, gold_df, dxy_df#,sentiment_features,eth_onchain_df,
]

for df in dataframes_to_merge:
    if not df.empty:
        df_merged = pd.merge(df_merged, df, on='date', how='left')

print(f"Merged shape: {df_merged.shape}")
print(f"Missing before fill: {df_merged.isnull().sum().sum():,}")

print("\n" + "="*80)
print("MISSING VALUE HANDLING")
print("="*80)

df_merged = smart_fill_missing(df_merged)

missing_after = df_merged.isnull().sum().sum()
print(f"Missing after fill: {missing_after:,}")

if missing_after > 0:
    df_merged = df_merged.fillna(0)
    print(f"Remaining filled with 0")

lookback_df = df_merged[df_merged['date'] < TRAIN_START_DATE]
cols_to_drop = [
    col for col in lookback_df.columns 
    if lookback_df[col].isnull().all() and col != 'date'
]

if cols_to_drop:
    print(f"\nDropping {len(cols_to_drop)} fully missing columns")
    df_merged = df_merged.drop(columns=cols_to_drop)

print(f"Shape: {df_merged.shape}")
print(f"Period: {df_merged['date'].min().date()} ~ {df_merged['date'].max().date()}")
print(f"Missing: {df_merged.isnull().sum().sum()}")

df_merged.to_csv("merge_data.csv",index=False)

display(df_merged)

DATA LOADING
(2980, 41) macro_crypto_data.csv
(2235, 2) SP500.csv
(2236, 2) VIX.csv
(2237, 2) GOLD.csv
(2238, 2) DXY.csv
(2847, 2) fear_greed.csv
(2187, 2) eth_funding_rate.csv
(2915, 6) usdt_eth_mcap.csv
(2013, 2) aave_eth_tvl.csv
(1799, 2) lido_eth_tvl.csv
(2515, 2) makerdao_eth_tvl.csv
(2577, 2) uniswap_eth_tvl.csv
(2107, 2) curve-dex_eth_tvl.csv
(2978, 2) eth_chain_tvl.csv
(1605, 5) layer2_tvl.csv
macro_df last date: 2025-11-21 00:00:00
sp500_df last date: 2025-11-20 00:00:00
vix_df last date: 2025-11-21 00:00:00
gold_df last date: 2025-11-21 00:00:00
dxy_df last date: 2025-11-21 00:00:00
fear_greed_df last date: 2025-11-21 00:00:00
eth_funding_df last date: 2025-11-21 00:00:00
usdt_eth_mcap_df last date: 2025-11-21 00:00:00
aave_tvl_df last date: 2025-11-21 00:00:00
lido_tvl_df last date: 2025-11-21 00:00:00
makerdao_tvl_df last date: 2025-11-21 00:00:00
uniswap_tvl_df last date: 2025-11-21 00:00:00
curve_tvl_df last date: 2025-11-21 00:00:00
eth_chain_tvl_df last date: 2025-11-21

Unnamed: 0,date,BTC_Open,BTC_High,BTC_Low,BTC_Close,BTC_Volume,ETH_Open,ETH_High,ETH_Low,ETH_Close,...,chain_eth_chain_tvl,funding_fundingRate,l2_arbitrum_tvl,l2_optimism_tvl,l2_base_tvl,l2_zksync era_tvl,sp500_SP500,vix_VIX,gold_GOLD,dxy_DXY
0,2019-06-15,10462000.0,10600000.0,10311000.0,10528000.0,6509.174133,317900.0,326500.0,313000.0,319500.0,...,484243010,0.000000,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.000000,0.000000,0.000000
1,2019-06-16,10523000.0,11150000.0,10451000.0,10710000.0,12029.902237,319500.0,330500.0,317000.0,320150.0,...,495664501,0.000000,0.000000e+00,0.0,0.000000e+00,0.0,0.000000,0.000000,0.000000,0.000000
2,2019-06-17,10710000.0,11187000.0,10694000.0,11055000.0,8591.802320,320200.0,326050.0,319550.0,324100.0,...,509130301,0.000000,0.000000e+00,0.0,0.000000e+00,0.0,2889.669922,15.350000,1338.699951,97.559998
3,2019-06-18,11055000.0,11100000.0,10750000.0,10902000.0,7165.309059,324350.0,324900.0,314650.0,317800.0,...,513188682,0.000000,0.000000e+00,0.0,0.000000e+00,0.0,2917.750000,15.150000,1346.599976,97.639999
4,2019-06-19,10901000.0,11100000.0,10845000.0,11063000.0,5153.060822,317800.0,322500.0,316150.0,320400.0,...,504467906,0.000000,0.000000e+00,0.0,0.000000e+00,0.0,2926.459961,14.330000,1344.599976,97.120003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346,2025-11-16,144265000.0,144888000.0,138663000.0,140320000.0,3306.730953,4786000.0,4876000.0,4500000.0,4613000.0,...,149718439825,0.000093,2.868912e+09,300706531.0,4.483150e+09,41005406.0,6734.109863,19.830000,4087.600098,99.269997
2347,2025-11-17,140320000.0,142926000.0,137127000.0,137706000.0,3402.729073,4613000.0,4799000.0,4461000.0,4528000.0,...,146706008886,0.000087,2.833770e+09,296461366.0,4.380960e+09,41025892.0,6672.410156,22.379999,4068.300049,99.589996
2348,2025-11-18,137706000.0,139184000.0,132200000.0,137750000.0,6342.520244,4528000.0,4695000.0,4386000.0,4627000.0,...,143734941269,0.000083,2.770454e+09,269449790.0,4.277518e+09,39182638.0,6617.319824,24.690001,4061.300049,99.550003
2349,2025-11-19,137750000.0,137807000.0,132449000.0,136500000.0,2862.635006,4629000.0,4635000.0,4301000.0,4510000.0,...,146360583931,0.000066,2.825563e+09,295610162.0,4.340401e+09,39802577.0,6642.160156,23.660000,4077.699951,100.230003


In [7]:
import ccxt
import pandas as pd
import time
from datetime import datetime

def fetch_eth_hourly_data(start_date='2019-12-31', end_date=None):
    exchange = ccxt.binance({
        'enableRateLimit': True,
    })
    
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')
    
    start_ts = exchange.parse8601(f"{start_date}T00:00:00Z")
    end_ts = exchange.parse8601(f"{end_date}T23:59:59Z")
    
    symbol = 'ETH/USDT'
    timeframe = '1h'
    limit = 1000
    
    all_data = []
    current_ts = start_ts
    
    print(f"Fetching ETH 1H data: {start_date} ~ {end_date}")
    
    while current_ts < end_ts:
        try:
            ohlcv = exchange.fetch_ohlcv(symbol, timeframe, since=current_ts, limit=limit)
            
            if not ohlcv:
                break
                
            all_data.extend(ohlcv)
            current_ts = ohlcv[-1][0] + 1
            
            print(f"  Fetched {len(all_data)} candles... (last: {pd.to_datetime(ohlcv[-1][0], unit='ms')})")
            
            time.sleep(0.1)
            
        except Exception as e:
            print(f"  Error: {e}, retrying in 5s...")
            time.sleep(5)
    
    df = pd.DataFrame(all_data, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
    df['datetime'] = pd.to_datetime(df['timestamp'], unit='ms')
    df = df.drop_duplicates(subset='timestamp').reset_index(drop=True)
    
    print(f"Done! Total {len(df)} hourly candles")
    
    return df

# 실행
eth_hourly = fetch_eth_hourly_data('2019-12-31')
eth_hourly.to_csv('eth_hour.csv', index=False)

Fetching ETH 1H data: 2019-12-31 ~ 2025-11-22
  Fetched 1000 candles... (last: 2020-02-10 16:00:00)
  Fetched 2000 candles... (last: 2020-03-23 14:00:00)
  Fetched 3000 candles... (last: 2020-05-04 08:00:00)
  Fetched 4000 candles... (last: 2020-06-15 00:00:00)
  Fetched 5000 candles... (last: 2020-07-26 19:00:00)
  Fetched 6000 candles... (last: 2020-09-06 11:00:00)
  Fetched 7000 candles... (last: 2020-10-18 03:00:00)
  Fetched 8000 candles... (last: 2020-11-28 19:00:00)
  Fetched 9000 candles... (last: 2021-01-09 17:00:00)
  Fetched 10000 candles... (last: 2021-02-20 10:00:00)
  Fetched 11000 candles... (last: 2021-04-03 03:00:00)
  Fetched 12000 candles... (last: 2021-05-15 00:00:00)
  Fetched 13000 candles... (last: 2021-06-25 16:00:00)
  Fetched 14000 candles... (last: 2021-08-06 08:00:00)
  Fetched 15000 candles... (last: 2021-09-17 04:00:00)
  Fetched 16000 candles... (last: 2021-10-28 22:00:00)
  Fetched 17000 candles... (last: 2021-12-09 14:00:00)
  Fetched 18000 candles... (

In [2]:
eth_hourly.tail(10)

Unnamed: 0,timestamp,open,high,low,close,volume,datetime
51608,1763740800000,2705.31,2764.0,2675.0,2762.8,60752.9527,2025-11-21 16:00:00
51609,1763744400000,2762.81,2806.84,2754.89,2780.48,41775.1804,2025-11-21 17:00:00
51610,1763748000000,2780.49,2794.49,2716.49,2779.15,58923.6202,2025-11-21 18:00:00
51611,1763751600000,2779.15,2787.96,2740.22,2752.24,24520.2215,2025-11-21 19:00:00
51612,1763755200000,2752.23,2756.21,2708.55,2737.86,29540.1573,2025-11-21 20:00:00
51613,1763758800000,2737.83,2777.01,2731.49,2767.23,22201.3102,2025-11-21 21:00:00
51614,1763762400000,2767.22,2774.09,2720.72,2721.88,12754.972,2025-11-21 22:00:00
51615,1763766000000,2721.89,2769.8,2713.15,2765.85,17494.4632,2025-11-21 23:00:00
51616,1763769600000,2765.86,2780.59,2758.21,2766.35,11148.7247,2025-11-22 00:00:00
51617,1763773200000,2766.36,2784.12,2761.49,2781.77,5257.386,2025-11-22 01:00:00


In [6]:
eth_hourly.head(10)


Unnamed: 0,timestamp,open,high,low,close,volume,datetime
0,1577836800000,129.16,129.19,128.68,128.87,7769.17336,2020-01-01 00:00:00
1,1577840400000,128.87,130.65,128.78,130.64,11344.65516,2020-01-01 01:00:00
2,1577844000000,130.63,130.98,130.35,130.85,7603.35623,2020-01-01 02:00:00
3,1577847600000,130.85,130.89,129.94,130.2,4968.55433,2020-01-01 03:00:00
4,1577851200000,130.21,130.74,130.15,130.2,3397.90747,2020-01-01 04:00:00
5,1577854800000,130.2,130.47,130.11,130.3,4243.6064,2020-01-01 05:00:00
6,1577858400000,130.31,130.75,130.26,130.44,3668.90166,2020-01-01 06:00:00
7,1577862000000,130.47,130.71,130.14,130.24,4147.17413,2020-01-01 07:00:00
8,1577865600000,130.24,130.41,129.87,130.36,7541.44497,2020-01-01 08:00:00
9,1577869200000,130.4,130.62,130.13,130.17,4808.20496,2020-01-01 09:00:00


In [1]:
import requests
import pandas as pd
import time

def fetch_upbit_hourly(market='KRW-ETH', start_date='2020-01-01'):
    all_data = []
    to_date = pd.Timestamp.now()
    start_ts = pd.to_datetime(start_date)
    
    print(f"Fetching Upbit {market} 1H data: {start_date} ~ now")
    
    while to_date > start_ts:
        url = "https://api.upbit.com/v1/candles/minutes/60"
        params = {
            'market': market,
            'to': to_date.strftime('%Y-%m-%dT%H:%M:%S'),
            'count': 200
        }
        
        try:
            response = requests.get(url, params=params)
            data = response.json()
            
            if not data or isinstance(data, dict):
                break
            
            all_data.extend(data)
            
            oldest = pd.to_datetime(data[-1]['candle_date_time_kst'])
            to_date = oldest
            
            print(f"  Fetched {len(all_data)} candles... (oldest: {oldest})")
            
            time.sleep(0.15)
            
        except Exception as e:
            print(f"  Error: {e}, retrying...")
            time.sleep(1)
    
    df = pd.DataFrame(all_data)
    df = df.rename(columns={
        'candle_date_time_kst': 'datetime',
        'opening_price': 'open',
        'high_price': 'high',
        'low_price': 'low',
        'trade_price': 'close',
        'candle_acc_trade_volume': 'volume'
    })
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.sort_values('datetime').reset_index(drop=True)
    df = df[['datetime', 'open', 'high', 'low', 'close', 'volume']]
    df = df.drop_duplicates(subset='datetime').reset_index(drop=True)
    
    mask = df['datetime'] >= start_date
    df = df[mask].reset_index(drop=True)
    
    print(f"Done! Total {len(df)} hourly candles")
    print(f"Period: {df['datetime'].min()} ~ {df['datetime'].max()}")
    
    return df

eth_hourly_krw = fetch_upbit_hourly('KRW-ETH', '2020-01-01')
eth_hourly_krw.to_csv('eth_hour.csv', index=False)

Fetching Upbit KRW-ETH 1H data: 2020-01-01 ~ now
  Fetched 200 candles... (oldest: 2025-11-19 07:00:00)
  Fetched 400 candles... (oldest: 2025-11-11 08:00:00)
  Fetched 600 candles... (oldest: 2025-11-03 09:00:00)
  Fetched 800 candles... (oldest: 2025-10-26 10:00:00)
  Fetched 1000 candles... (oldest: 2025-10-18 06:00:00)
  Fetched 1200 candles... (oldest: 2025-10-10 07:00:00)
  Fetched 1400 candles... (oldest: 2025-10-02 08:00:00)
  Fetched 1600 candles... (oldest: 2025-09-24 09:00:00)
  Fetched 1800 candles... (oldest: 2025-09-16 10:00:00)
  Fetched 2000 candles... (oldest: 2025-09-08 11:00:00)
  Fetched 2200 candles... (oldest: 2025-08-31 12:00:00)
  Fetched 2400 candles... (oldest: 2025-08-23 13:00:00)
  Fetched 2600 candles... (oldest: 2025-08-15 14:00:00)
  Fetched 2800 candles... (oldest: 2025-08-07 15:00:00)
  Fetched 3000 candles... (oldest: 2025-07-30 16:00:00)
  Fetched 3200 candles... (oldest: 2025-07-22 17:00:00)
  Fetched 3400 candles... (oldest: 2025-07-14 18:00:00)
  F

In [9]:
eth_hourly_krw.head(1)

Unnamed: 0,datetime,open,high,low,close,volume
0,2020-01-01,149900.0,150100.0,148300.0,148550.0,3676.176695


In [10]:
eth_hourly_krw.tail(4)

Unnamed: 0,datetime,open,high,low,close,volume
51585,2025-11-22 20:00:00,4077000.0,4119000.0,4062000.0,4106000.0,1373.884989
51586,2025-11-22 21:00:00,4105000.0,4110000.0,4075000.0,4091000.0,1138.311529
51587,2025-11-22 22:00:00,4091000.0,4110000.0,4077000.0,4087000.0,1126.442067
51588,2025-11-22 23:00:00,4087000.0,4134000.0,4087000.0,4122000.0,945.695091
