In [None]:
import os
import time
import warnings
import pandas as pd
import requests
import yfinance as yf
from datetime import datetime, timedelta
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from binance.client import Client
import pytz

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# --- Settings ---
START_DATE = "2019-01-01"
# End of the requested range: tomorrow's date (today + 1 day) as YYYY-MM-DD.
END_DATE = (datetime.today() + timedelta(days=1)).strftime('%Y-%m-%d')
# Directory that receives every CSV written by the collectors; created if missing.
OUTPUT_DIR = "./macro_data_4h"
os.makedirs(OUTPUT_DIR, exist_ok=True)

KST = pytz.timezone('Asia/Seoul')
UTC = pytz.UTC

# Upbit market code -> (symbol used in column names, second field that the
# collectors unpack into `_` and ignore).
UPBIT_CRYPTO_TICKERS = {
    'KRW-BTC': ('BTC', 'BTC'), 'KRW-ETH': ('ETH', 'ETH'), 'KRW-XRP': ('XRP', 'XRP'),
    'KRW-SOL': ('SOL', 'SOL'), 'KRW-ADA': ('ADA', 'ADA'), 'KRW-DOGE': ('DOGE', 'DOGE'),
    'KRW-AVAX': ('AVAX', 'AVAX'), 'KRW-DOT': ('DOT', 'DOT')
}

# yfinance ticker -> short name used for the output column and file name.
MACRO_TICKERS = {
    'DX-Y.NYB': 'DXY', 'GC=F': 'GOLD', '^VIX': 'VIX', '^GSPC': 'SP500'
}

# The addresses are used as-is
# Output name -> DefiLlama endpoint fetched by collect_defi_tvl_safe().
DEFI_PROTOCOLS = {
    'eth_chain': 'https://api.llama.fi/v2/historicalChainTvl/Ethereum',
    'makerdao': 'https://api.llama.fi/protocol/makerdao',
    'lido': 'https://api.llama.fi/protocol/lido',
    'aave': 'https://api.llama.fi/protocol/aave',
    'uniswap': 'https://api.llama.fi/protocol/uniswap-v3', # large payload
    'curve-dex': 'https://api.llama.fi/protocol/curve-dex' # large payload
}

# --- 공통 함수 ---
def get_session():
    """Create a ``requests`` session with browser-like headers and HTTPS retries.

    Returns:
        requests.Session: A session whose ``https://`` adapter retries up to
        five times (backoff factor 2) on HTTP 429/500/502/503/504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=2,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    default_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "application/json",
    }

    http = requests.Session()
    http.headers.update(default_headers)
    # Only the https:// prefix gets the retrying adapter.
    http.mount('https://', HTTPAdapter(max_retries=retry_policy))
    return http

def save_df(df, filename):
    """Write *df* to ``OUTPUT_DIR/filename`` as CSV, or warn when there is nothing to write.

    Args:
        df: DataFrame to persist; ``None`` or an empty frame is reported and skipped.
        filename: File name (not a path) created inside ``OUTPUT_DIR``.

    Returns:
        None. The outcome is reported on stdout.
    """
    # Guard clause: nothing to persist.
    if df is None or df.empty:
        print(f"  - Warning: {filename} is empty or failed.")
        return

    target = os.path.join(OUTPUT_DIR, filename)
    df.to_csv(target, index=False)
    print(f"  - Saved: {filename} ({len(df)} rows)")

# --- [Revised] 2. Binance collection (timestamp error fixed) ---
def collect_binance_crypto_4h():
    """Download 4-hour Binance klines for eight USDT pairs and save one merged CSV.

    For every symbol the kline open time is converted from UTC to naive KST and
    only the close price and volume are kept (as ``<COIN>_Binance_Close`` /
    ``<COIN>_Binance_Volume``). Per-symbol frames are outer-joined on
    ``timestamp`` and written to ``binance_4h_kst.csv`` through ``save_df``.
    A failure for one symbol is printed and the loop moves on.
    """
    print(f"[2/7] Collecting Binance Data (UTC->KST)...")
    client = Client("", "")  # constructed with empty key/secret
    kline_columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'ct', 'qv', 'tr', 'tbb', 'tbq', 'ig']
    combined = None

    for symbol in ('BTCUSDT', 'ETHUSDT', 'XRPUSDT', 'SOLUSDT', 'ADAUSDT', 'DOGEUSDT', 'AVAXUSDT', 'DOTUSDT'):
        try:
            raw = client.get_historical_klines(symbol, Client.KLINE_INTERVAL_4HOUR, START_DATE, END_DATE)
            if not raw:
                print(f"  - {symbol}: No data")
                continue

            frame = pd.DataFrame(raw, columns=kline_columns)

            # Epoch milliseconds -> UTC -> KST, then drop the tz info.
            opened_at = pd.to_datetime(frame['timestamp'], unit='ms')
            opened_at = opened_at.dt.tz_localize(UTC).dt.tz_convert(KST).dt.tz_localize(None)

            # Convert only the columns that are kept; unparseable values become NaN.
            coin = symbol.replace('USDT', '')
            per_coin = pd.DataFrame({
                'timestamp': opened_at,
                f'{coin}_Binance_Close': pd.to_numeric(frame['close'], errors='coerce'),
                f'{coin}_Binance_Volume': pd.to_numeric(frame['volume'], errors='coerce'),
            })

            combined = (
                per_coin
                if combined is None
                else pd.merge(combined, per_coin, on='timestamp', how='outer')
            )
        except Exception as e:
            print(f"  - {symbol} Failed: {e}")

    if combined is not None:
        save_df(combined.sort_values('timestamp'), "binance_4h_kst.csv")


def collect_defi_tvl_safe():
    """Fetch DeFi TVL series and the USDT-on-Ethereum supply, interpolated to 4H.

    For each entry of ``DEFI_PROTOCOLS`` the endpoint is downloaded, the
    Ethereum TVL records are converted to naive KST timestamps, interpolated
    onto a 4-hour grid and written to ``<name>_tvl_4h.csv`` via ``save_df``.
    The same is then done for the stablecoin chart, written to
    ``usdt_eth_mcap_4h.csv``. Each request is preceded by a 2-second pause;
    a failure for one source is printed and the remaining sources still run.

    Why not ``resample(...).interpolate()``: that call first reindexes the data
    onto the target grid and only then interpolates, so observations that are
    not exactly on a grid point (e.g. daily points that land on 09:00 after the
    UTC->KST shift) are discarded and the result is NaN. Here the grid is
    unioned with the observed timestamps, interpolated in time, and only then
    reduced to the grid.
    """
    print(f"[7/7] Collecting DeFi TVL (Direct API)...")
    session = get_session()

    def _to_kst(raw_dates):
        """Turn epoch-second values (ints or numeric strings) into naive KST timestamps."""
        # Cast first: parsing *strings* together with `unit=` is deprecated in pandas.
        epoch = pd.to_numeric(raw_dates, errors='coerce')
        stamps = pd.to_datetime(epoch, unit='s')
        return stamps.dt.tz_localize(UTC).dt.tz_convert(KST).dt.tz_localize(None)

    def _interpolate_4h(series):
        """Time-interpolate a timestamp-indexed series onto a midnight-anchored 4H grid."""
        values = pd.to_numeric(series, errors='coerce').copy()
        values = values[values.index.notna()]
        values.index = values.index.astype('datetime64[ns]')
        values = values[~values.index.duplicated(keep='last')].sort_index()
        if values.empty:
            return values
        grid = pd.date_range(
            values.index[0].normalize(), values.index[-1], freq=pd.Timedelta(hours=4)
        )
        dense = values.reindex(values.index.union(grid)).interpolate(method='time')
        # Grid points before the first observation cannot be interpolated; drop them.
        return dense.reindex(grid).dropna()

    for name, url in DEFI_PROTOCOLS.items():
        try:
            print(f"  - Fetching {name}...", end=" ")
            time.sleep(2)  # spacing between requests

            # Long timeout because the Uniswap/Curve payloads are large.
            resp = session.get(url, timeout=120)
            resp.raise_for_status()
            data = resp.json()

            if name == 'eth_chain':
                records = data
                val_key = 'tvl'
            else:
                # Protocol payloads cover every chain; keep only Ethereum.
                records = data.get('chainTvls', {}).get('Ethereum', {}).get('tvl', [])
                val_key = 'totalLiquidityUSD'

            if not records:
                print("No data found.")
                continue

            df = pd.DataFrame(records)
            df['timestamp'] = _to_kst(df['date'])
            df_4h = _interpolate_4h(df.set_index('timestamp')[val_key]).reset_index()
            df_4h.columns = ['timestamp', f'{name}_tvl']

            save_df(df_4h, f"{name}_tvl_4h.csv")

        except Exception as e:
            print(f"Failed: {str(e)[:50]}")  # keep the error message short

    # USDT market cap
    try:
        print(f"  - Fetching USDT Mcap...", end=" ")
        time.sleep(2)
        resp = session.get("https://stablecoins.llama.fi/stablecoincharts/ethereum?stablecoin=1", timeout=60)
        resp.raise_for_status()
        df = pd.DataFrame(resp.json())
        df['timestamp'] = _to_kst(df['date'])
        # NOTE(review): this endpoint appears to return `totalCirculating` as a
        # mapping such as {'peggedUSD': <number>} rather than a scalar - confirm
        # against the DefiLlama API docs. Scalars pass through unchanged.
        circulating = df.set_index('timestamp')['totalCirculating'].map(
            lambda v: v.get('peggedUSD') if isinstance(v, dict) else v
        )
        df = _interpolate_4h(circulating).reset_index()
        df.columns = ['timestamp', 'usdt_eth_mcap']
        save_df(df, "usdt_eth_mcap_4h.csv")
    except Exception as e:
        print(f"Failed: {str(e)[:50]}")

# --- Remaining functions (existing code that already works) ---
def collect_upbit_crypto_prices_4h():
    """Page through Upbit 240-minute candles for every configured market.

    Candles are requested 200 at a time, walking backwards with the ``to``
    parameter until a page reaches ``START_DATE`` or comes back empty. Close
    price and volume are kept per symbol, frames are outer-joined on the KST
    ``timestamp`` and the result is saved as ``crypto_4h_kst.csv``.
    """
    print(f"[1/7] Collecting Upbit Data (KST)...")
    session = get_session()
    start_dt = pd.to_datetime(START_DATE)
    combined = None

    for market, (symbol, _) in UPBIT_CRYPTO_TICKERS.items():
        try:
            pages = []
            cursor = None
            while True:
                query = {'market': market, 'count': 200}
                if cursor:
                    query['to'] = cursor
                resp = session.get("https://api.upbit.com/v1/candles/minutes/240", params=query, timeout=10)
                resp.raise_for_status()
                batch = resp.json()
                if not batch:
                    break
                pages.extend(batch)
                # The last element of a page is used as the cursor for the next request.
                tail = batch[-1]
                if pd.to_datetime(tail['candle_date_time_kst']) <= start_dt:
                    break
                cursor = tail['candle_date_time_utc']
                time.sleep(0.05)

            if not pages:
                continue

            close_col = f'{symbol}_Close'
            volume_col = f'{symbol}_Volume'
            frame = pd.DataFrame(pages)
            frame['timestamp'] = pd.to_datetime(frame['candle_date_time_kst']).dt.tz_localize(None)
            frame = frame.rename(columns={'trade_price': close_col, 'candle_acc_trade_volume': volume_col})
            frame = frame[['timestamp', close_col, volume_col]].sort_values('timestamp')
            frame = frame[frame['timestamp'] >= start_dt]

            if combined is None:
                combined = frame
            else:
                combined = pd.merge(combined, frame, on='timestamp', how='outer')
        except Exception as e:
            print(f"  - {symbol} Failed: {e}")

    if combined is not None:
        save_df(combined.sort_values('timestamp'), "crypto_4h_kst.csv")

def collect_macro_indicators_4h():
    """Download daily macro closes from yfinance and forward-fill them to 4H.

    For every ``MACRO_TICKERS`` entry the daily close is fetched, a tz-aware
    index is converted to naive KST, the series is up-sampled to 4 hours with
    forward fill, trimmed to ``START_DATE`` and saved as ``<name>_4h.csv``.

    The CSV carries a ``timestamp`` column (like the other collectors' files),
    both MultiIndex and flat yfinance column layouts are handled, and failures
    are printed instead of being silently discarded.
    """
    print(f"[3/7] Collecting Macro Indicators (yfinance)...")
    for ticker, name in MACRO_TICKERS.items():
        try:
            df = yf.download(ticker, start=START_DATE, end=END_DATE, interval='1d', progress=False)
            if df.empty:
                print(f"  - {name}: No data")
                continue

            if isinstance(df.columns, pd.MultiIndex):
                df = df.xs('Close', level=0, axis=1)
            else:
                # Flat columns: keep only Close so the single-name rename below fits.
                df = df[['Close']]

            # A naive index is used as-is; an aware one is moved to KST and stripped.
            if df.index.tz is not None:
                df.index = df.index.tz_convert(KST).tz_localize(None)

            df_4h = df.resample('4H').ffill()
            df_4h = df_4h[df_4h.index >= pd.to_datetime(START_DATE)]
            df_4h.columns = [name]
            # Name the index so reset_index() emits a `timestamp` column.
            df_4h.index.name = 'timestamp'
            save_df(df_4h.reset_index(), f"{name}_4h.csv")
        except Exception as e:
            print(f"  - {name} Failed: {e}")

def collect_fear_greed_4h():
    """Fetch the Fear & Greed index and forward-fill it onto a 4H KST grid.

    The result is written to ``fear_greed_4h.csv`` with the columns
    ``timestamp`` and ``fear_greed``. HTTP errors are raised explicitly and any
    failure is printed rather than silently swallowed.
    """
    print(f"[4/7] Collecting Fear & Greed Index...")
    try:
        resp = get_session().get("https://api.alternative.me/fng/?limit=4000&format=json", timeout=10)
        resp.raise_for_status()
        df = pd.DataFrame(resp.json()['data'])

        # Cast to numbers before applying `unit='s'`: parsing strings together
        # with `unit=` is deprecated in pandas. No-op if already numeric.
        epoch_seconds = pd.to_numeric(df['timestamp'])
        df['timestamp'] = (
            pd.to_datetime(epoch_seconds, unit='s')
            .dt.tz_localize(UTC).dt.tz_convert(KST).dt.tz_localize(None)
        )
        df = df.set_index('timestamp').sort_index()
        df_4h = (
            df['value'].astype(float)
            .resample('4H').ffill()
            .reset_index()
            .rename(columns={'value': 'fear_greed'})
        )
        save_df(df_4h, "fear_greed_4h.csv")
    except Exception as e:
        print(f"  - Fear & Greed Failed: {e}")

def collect_funding_rate_4h():
    """Download the ETHUSDT funding-rate history and save it with KST timestamps.

    Records are requested 1000 at a time, advancing ``startTime`` past the last
    ``fundingTime`` received, until ``END_DATE`` is reached or a page is empty.
    The result (``timestamp``, ``fundingRate``) goes to
    ``eth_funding_rate_8h.csv``. An empty history and any failure are reported
    on stdout instead of being silently discarded.
    """
    print(f"[5/7] Collecting ETH Funding Rate...")
    try:
        client = Client("", "")
        funding_rates = []
        start_ts = int(pd.to_datetime(START_DATE).timestamp() * 1000)
        end_ts = int(pd.to_datetime(END_DATE).timestamp() * 1000)
        while start_ts < end_ts:
            rates = client.futures_funding_rate(symbol='ETHUSDT', startTime=start_ts, limit=1000)
            if not rates:
                break
            funding_rates.extend(rates)
            # Continue one millisecond after the last record already received.
            start_ts = rates[-1]['fundingTime'] + 1
            time.sleep(0.1)

        if not funding_rates:
            # Without this guard the column lookup below fails with a KeyError.
            print("  - Warning: eth_funding_rate_8h.csv is empty or failed.")
            return

        df = pd.DataFrame(funding_rates)
        df['timestamp'] = (
            pd.to_datetime(df['fundingTime'], unit='ms')
            .dt.tz_localize(UTC).dt.tz_convert(KST).dt.tz_localize(None)
        )
        df = df[['timestamp', 'fundingRate']].astype({'fundingRate': float}).sort_values('timestamp')
        save_df(df, "eth_funding_rate_8h.csv")
    except Exception as e:
        print(f"  - Funding Rate Failed: {e}")

def collect_eth_metrics_safe():
    """Download daily ETH-USD data from yfinance and forward-fill it to 4H.

    With MultiIndex columns only the ``Close`` level is kept; otherwise all
    downloaded columns are kept. The frame is saved as ``eth_metrics_4h.csv``
    through ``save_df`` with a leading ``timestamp`` column, matching the other
    collectors' files. Empty downloads and failures are reported on stdout.
    """
    print(f"[6/7] Collecting ETH Metrics (yfinance)...")
    try:
        df = yf.download("ETH-USD", start=START_DATE, end=END_DATE, interval="1d", progress=False)
        if df.empty:
            print("  - Warning: eth_metrics_4h.csv is empty or failed.")
            return

        if isinstance(df.columns, pd.MultiIndex):
            df = df.xs('Close', level=0, axis=1)

        # A naive index is used as-is; an aware one is moved to KST and stripped.
        if df.index.tz is not None:
            df.index = df.index.tz_convert(KST).tz_localize(None)

        df_4h = df.resample('4H').ffill()
        df_4h = df_4h[df_4h.index >= pd.to_datetime(START_DATE)]
        # Name the index so the saved CSV has a `timestamp` column.
        df_4h.index.name = 'timestamp'
        save_df(df_4h.reset_index(), "eth_metrics_4h.csv")
    except Exception as e:
        print(f"  - ETH Metrics Failed: {e}")

# --- Main execution ---
print("="*60)
print("Starting Data Collection Pipeline (Fix V3)")
print("="*60)

# Run the seven collectors in the order of their [n/7] progress labels.
collect_upbit_crypto_prices_4h()
collect_binance_crypto_4h() # run the revised version
collect_macro_indicators_4h()
collect_fear_greed_4h()
collect_funding_rate_4h()
collect_eth_metrics_safe()
collect_defi_tvl_safe() # run the revised version

print("\nPipeline Completed.")



Starting Data Collection Pipeline (Fix V3)
[1/7] Collecting Upbit Data (KST)...


In [4]:
import os
import time
import warnings
from datetime import datetime, timedelta
import pytz

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import yfinance as yf
from binance.client import Client
from defillama2 import DefiLlama

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

START_DATE = "2019-01-01"
# End of the requested range: tomorrow's date (today + 1 day) as YYYY-MM-DD.
END_DATE = (datetime.today() + timedelta(days=1)).strftime('%Y-%m-%d')
# Directory that receives every CSV written by the collectors; created if missing.
OUTPUT_DIR = "./macro_data_4h"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Korean time zone setup
KST = pytz.timezone('Asia/Seoul')
UTC = pytz.UTC

# Upbit market code -> (symbol used in column names, second field that the
# collector unpacks into `_` and ignores).
UPBIT_CRYPTO_TICKERS = {
    'KRW-BTC': ('BTC', 'BTC'),
    'KRW-ETH': ('ETH', 'ETH'),
    'KRW-XRP': ('XRP', 'XRP'),
    'KRW-SOL': ('SOL', 'SOL'),
    'KRW-ADA': ('ADA', 'ADA'),
    'KRW-DOGE': ('DOGE', 'DOGE'),
    'KRW-AVAX': ('AVAX', 'AVAX'),
    'KRW-DOT': ('DOT', 'DOT')
}

# yfinance ticker -> short name used for the output column and file name.
MACRO_TICKERS = {
    'DX-Y.NYB': 'DXY',
    'GC=F': 'GOLD',
    '^VIX': 'VIX',
    '^GSPC': 'SP500'
}

# Protocol names passed to the DefiLlama client in collect_defi_tvl_4h().
DEFI_PROTOCOLS = ['makerdao', 'lido', 'aave', 'uniswap', 'curve-dex']

# Session factory with retry logic (also sets a User-Agent)
def get_retry_session(retries=5, backoff_factor=2, status_forcelist=(429, 500, 502, 503, 504)):
    """Return a ``requests`` session that retries failed requests.

    Args:
        retries: Retry budget, applied to the total, read and connect counters.
        backoff_factor: Backoff factor handed to ``Retry``.
        status_forcelist: HTTP status codes that trigger a retry.

    Returns:
        requests.Session: Session with the retrying adapter mounted for both
        ``http://`` and ``https://`` and a browser-like User-Agent header.
    """
    policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=policy)

    session = requests.Session()
    # A User-Agent header is added to avoid being blocked.
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    })
    for scheme in ('http://', 'https://'):
        session.mount(scheme, adapter)
    return session

def collect_upbit_crypto_prices_4h():
    """Collect Upbit 4-hour OHLCV candles (KST) for every configured market.

    Candles are fetched 200 per request, walking backwards through the ``to``
    parameter until a page reaches ``START_DATE`` or is empty. Each symbol
    contributes ``<SYM>_Open/High/Low/Close/Volume`` columns; frames are
    outer-joined on ``timestamp`` and written to ``crypto_4h_kst.csv``.
    """
    print(f"\n[1/7] Collecting 4H cryptocurrency prices from Upbit (KST)...")
    session = get_retry_session()

    endpoint = f"https://api.upbit.com/v1/candles/minutes/240"
    # API field -> suffix of the per-symbol output column (order = column order).
    field_suffix = {
        'opening_price': 'Open',
        'high_price': 'High',
        'low_price': 'Low',
        'trade_price': 'Close',
        'candle_acc_trade_volume': 'Volume',
    }
    start_dt = pd.to_datetime(START_DATE)
    combined = None

    for market, (symbol, _) in UPBIT_CRYPTO_TICKERS.items():
        try:
            collected = []
            cursor = None

            while True:
                query = {'market': market, 'count': 200}
                if cursor:
                    query['to'] = cursor

                reply = session.get(endpoint, params=query, timeout=10)
                reply.raise_for_status()
                page = reply.json()
                if not page:
                    break

                collected.extend(page)
                # The last element of a page is used as the cursor for the next request.
                tail = page[-1]
                if pd.to_datetime(tail['candle_date_time_kst']) <= start_dt:
                    break

                cursor = tail['candle_date_time_utc']
                time.sleep(0.1)

            if not collected:
                print(f"  - {symbol}: No data")
                continue

            frame = pd.DataFrame(collected)
            frame['timestamp'] = pd.to_datetime(frame['candle_date_time_kst']).dt.tz_localize(None)

            renamed = {src: f'{symbol}_{suffix}' for src, suffix in field_suffix.items()}
            frame = frame.rename(columns=renamed)
            frame = frame[['timestamp', *renamed.values()]].sort_values('timestamp')
            frame = frame[frame['timestamp'] >= start_dt]

            combined = (
                frame
                if combined is None
                else pd.merge(combined, frame, on='timestamp', how='outer')
            )
            print(f"  - {symbol}: {len(frame)} 4H candles")

        except Exception as e:
            print(f"  - {symbol}: Failed ({str(e)[:50]})")

    if combined is None:
        return

    output_path = os.path.join(OUTPUT_DIR, "crypto_4h_kst.csv")
    combined.sort_values('timestamp').to_csv(output_path, index=False)
    print(f"  Saved: {output_path}")
def collect_binance_crypto_4h():
    """Collect Binance 4-hour OHLCV klines and convert open times from UTC to KST.

    Each USDT pair contributes ``<COIN>_Binance_Open/High/Low/Close/Volume``
    columns as floats. Frames are outer-joined on ``timestamp`` and written to
    ``binance_4h_kst.csv``. A failure for one symbol is printed and skipped.
    """
    print(f"\n[2/7] Collecting 4H data from Binance (UTC->KST)...")

    # Public data may be retrievable without an API key, though limits may apply.
    client = Client("", "")
    kline_fields = [
        'timestamp', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'quote_volume', 'trades', 'taker_buy_base',
        'taker_buy_quote', 'ignore'
    ]
    ohlcv = ['open', 'high', 'low', 'close', 'volume']
    combined = None

    for symbol in ('BTCUSDT', 'ETHUSDT', 'XRPUSDT', 'SOLUSDT', 'ADAUSDT',
                   'DOGEUSDT', 'AVAXUSDT', 'DOTUSDT'):
        try:
            coin_name = symbol.replace('USDT', '')
            raw = client.get_historical_klines(
                symbol, Client.KLINE_INTERVAL_4HOUR, START_DATE, END_DATE
            )
            frame = pd.DataFrame(raw, columns=kline_fields)

            # Epoch milliseconds -> UTC -> KST, then drop the tz info.
            opened = pd.to_datetime(frame['timestamp'], unit='ms')
            frame['timestamp'] = opened.dt.tz_localize(UTC).dt.tz_convert(KST).dt.tz_localize(None)

            frame = frame[['timestamp', *ohlcv]].astype({field: float for field in ohlcv})
            frame = frame.rename(
                columns={field: f'{coin_name}_Binance_{field.capitalize()}' for field in ohlcv}
            )

            combined = (
                frame
                if combined is None
                else pd.merge(combined, frame, on='timestamp', how='outer')
            )
            print(f"  - {coin_name}: {len(frame)} 4H candles")

        except Exception as e:
            print(f"  - {symbol}: Failed ({str(e)[:50]})")

    if combined is None:
        return

    output_path = os.path.join(OUTPUT_DIR, "binance_4h_kst.csv")
    combined.to_csv(output_path, index=False)
    print(f"  Saved: {output_path}")

def collect_macro_indicators_4h():
    """Collect daily macro closes and up-sample them to 4H (1 day -> 4 hours).

    For each ``MACRO_TICKERS`` entry the daily ``Close`` is downloaded, the
    index is treated as UTC when naive (converted to UTC otherwise), shifted to
    naive KST, forward-filled onto a 4-hour grid, trimmed to ``START_DATE`` and
    written to ``<name>_4h.csv`` with ``timestamp`` as the index column.
    """
    print(f"\n[3/7] Collecting and resampling macro indicators to 4H (KST)...")
    range_start = pd.to_datetime(START_DATE)

    for ticker, name in MACRO_TICKERS.items():
        try:
            daily = yf.download(ticker, start=START_DATE, end=END_DATE,
                                interval='1d', progress=False)
            if daily.empty:
                print(f"  - {name}: No data available")
                continue

            has_ticker_level = isinstance(daily.columns, pd.MultiIndex)
            daily = daily.xs('Close', level=0, axis=1) if has_ticker_level else daily[['Close']]

            # Normalise to UTC first, then express the index in naive KST.
            stamps = pd.to_datetime(daily.index)
            stamps = stamps.tz_localize(UTC) if stamps.tz is None else stamps.tz_convert(UTC)
            daily.index = stamps.tz_convert(KST).tz_localize(None)

            four_hourly = daily.resample('4H').ffill()
            four_hourly = four_hourly[four_hourly.index >= range_start]
            four_hourly.columns = [name]
            four_hourly.index.name = 'timestamp'

            four_hourly.to_csv(os.path.join(OUTPUT_DIR, f"{name}_4h.csv"))
            print(f"  - {name}: {len(four_hourly)} 4H candles")
        except Exception as e:
            print(f"  - {name}: Failed ({str(e)[:50]})")
def collect_fear_greed_4h():
    """Fetch the Fear & Greed index and forward-fill it onto a 4H KST grid.

    Writes ``fear_greed_4h.csv`` with the columns ``timestamp`` and
    ``fear_greed``. HTTP error statuses are raised explicitly (as the Upbit and
    CoinGecko collectors do) so the printed failure names the real cause.
    """
    print(f"\n[4/7] Collecting Fear & Greed Index (interpolated to 4H, KST)...")
    session = get_retry_session()

    try:
        url = "https://api.alternative.me/fng/?limit=4000&format=json"
        response = session.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()['data']

        df = pd.DataFrame(data)
        # Cast to numbers before applying `unit='s'`: parsing strings together
        # with `unit=` is deprecated in pandas. No-op if already numeric.
        epoch_seconds = pd.to_numeric(df['timestamp'])
        df['timestamp'] = pd.to_datetime(epoch_seconds, unit='s')
        df['timestamp'] = df['timestamp'].dt.tz_localize(UTC).dt.tz_convert(KST)
        df['timestamp'] = df['timestamp'].dt.tz_localize(None)
        df = df[['timestamp', 'value']].rename(columns={'value': 'fear_greed'})
        df['fear_greed'] = df['fear_greed'].astype(float)
        df = df.sort_values('timestamp').reset_index(drop=True)

        df = df.set_index('timestamp')
        df_4h = df.resample('4H').ffill().reset_index()

        output_path = os.path.join(OUTPUT_DIR, "fear_greed_4h.csv")
        df_4h.to_csv(output_path, index=False)
        print(f"  - Fear & Greed: {len(df_4h)} 4H candles")
    except Exception as e:
        print(f"  - Fear & Greed: Failed ({str(e)[:50]})")
def collect_funding_rate_4h():
    """Collect the ETHUSDT funding-rate history (8H -> KST timestamps).

    Pages of up to 1000 records are requested, each starting one millisecond
    after the last ``fundingTime`` received, until ``END_DATE`` is passed or a
    page is empty. The result is written to ``eth_funding_rate_8h.csv``.
    """
    print(f"\n[5/7] Collecting ETH funding rate (8H intervals, KST)...")

    def _epoch_ms(day):
        # A naive datetime's .timestamp() is interpreted in the machine's local time zone.
        return int(datetime.strptime(day, "%Y-%m-%d").timestamp() * 1000)

    try:
        client = Client("", "")
        cursor_ms = _epoch_ms(START_DATE)
        stop_ms = _epoch_ms(END_DATE)

        collected = []
        while cursor_ms < stop_ms:
            page = client.futures_funding_rate(
                symbol='ETHUSDT', startTime=cursor_ms, limit=1000
            )
            if not page:
                break
            collected.extend(page)
            cursor_ms = page[-1]['fundingTime'] + 1
            time.sleep(0.1)

        records = pd.DataFrame(collected)
        funded_at = pd.to_datetime(records['fundingTime'], unit='ms')
        records['timestamp'] = funded_at.dt.tz_localize(UTC).dt.tz_convert(KST).dt.tz_localize(None)
        records['fundingRate'] = records['fundingRate'].astype(float)
        records = records[['timestamp', 'fundingRate']].sort_values('timestamp')

        records.to_csv(os.path.join(OUTPUT_DIR, "eth_funding_rate_8h.csv"), index=False)
        print(f"  - Funding Rate: {len(records)} 8H intervals")
    except Exception as e:
        print(f"  - Funding Rate: Failed ({str(e)[:50]})")
def collect_coingecko_eth_metrics():
    """Collect ETH market cap and volume from CoinGecko, interpolated to 4H (KST).

    The daily ``market_chart`` series is converted to naive KST timestamps,
    time-interpolated onto a midnight-anchored 4-hour grid and written to
    ``eth_coingecko_4h.csv`` (``timestamp``, ``market_cap``, ``total_volume``).

    Why not ``resample('4H').interpolate()``: that call reindexes onto the grid
    *before* interpolating, so observations that are not exactly on a grid
    point (daily points shifted to 09:00 by the UTC->KST conversion) are
    dropped and the output is NaN. Here the grid is unioned with the observed
    timestamps first and reduced to the grid afterwards.
    """
    print(f"\n[6/7] Collecting ETH metrics from CoinGecko (KST)...")
    session = get_retry_session()

    try:
        url = "https://api.coingecko.com/api/v3/coins/ethereum/market_chart"
        params = {
            'vs_currency': 'usd',
            'days': 'max',
            'interval': 'daily'
        }

        # 30-second timeout
        response = session.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        # Timestamps come from `prices`; values are paired with them by position.
        df = pd.DataFrame({
            'timestamp': [pd.to_datetime(x[0], unit='ms') for x in data['prices']],
            'market_cap': [x[1] for x in data['market_caps']],
            'total_volume': [x[1] for x in data['total_volumes']]
        })
        if df.empty:
            print("  - CoinGecko: No data")
            return

        df['timestamp'] = df['timestamp'].dt.tz_localize(UTC).dt.tz_convert(KST)
        df['timestamp'] = df['timestamp'].dt.tz_localize(None)

        df = df.set_index('timestamp').sort_index()
        df.index = df.index.astype('datetime64[ns]')
        # reindex() needs unique labels; keep the latest value per timestamp.
        df = df[~df.index.duplicated(keep='last')]

        grid = pd.date_range(
            df.index[0].normalize(), df.index[-1], freq=pd.Timedelta(hours=4)
        )
        dense = df.reindex(df.index.union(grid)).interpolate(method='time')
        # Grid points before the first observation cannot be interpolated; drop them.
        df_4h = (
            dense.reindex(grid)
            .dropna(how='all')
            .rename_axis('timestamp')
            .reset_index()
        )

        output_path = os.path.join(OUTPUT_DIR, "eth_coingecko_4h.csv")
        df_4h.to_csv(output_path, index=False)
        print(f"  - ETH Market Data: {len(df_4h)} 4H candles")

    except Exception as e:
        print(f"  - CoinGecko: Failed ({str(e)[:100]})")

def safe_tz_convert(date_series, target_tz):
    """Convert a datetime Series to *target_tz* whether or not it already has a tz.

    Args:
        date_series: Series of datetimes; naive values are taken to be UTC.
        target_tz: Time zone accepted by ``Series.dt.tz_convert``.

    Returns:
        The Series expressed in *target_tz* (timezone-aware).
    """
    if date_series.dt.tz is not None:
        aware = date_series
    else:
        # Naive input: attach UTC before converting.
        aware = date_series.dt.tz_localize(UTC)
    return aware.dt.tz_convert(target_tz)

def collect_defi_tvl_4h():
    """Collect DeFi TVL series via the DefiLlama client, interpolated to 4H (KST).

    Three groups are fetched, each wrapped in a retry helper: the Ethereum
    chain TVL, the Ethereum TVL of every protocol in ``DEFI_PROTOCOLS`` and the
    USDT supply on Ethereum. Every series is converted to naive KST,
    time-interpolated onto a midnight-anchored 4-hour grid and written to its
    own CSV with a ``timestamp`` column. A failure in one source is printed and
    the remaining sources still run.

    Why not ``resample('4H').interpolate()``: that call reindexes onto the grid
    *before* interpolating, so observations that are not exactly on a grid
    point (e.g. daily points shifted to 09:00 by the UTC->KST conversion) are
    dropped and the output is NaN. Here the grid is unioned with the observed
    timestamps first and reduced to the grid afterwards.
    """
    print(f"\n[7/7] Collecting DeFi TVL (interpolated to 4H, KST)...")

    obj = DefiLlama()

    def run_with_retry(func, *args, retries=5):
        """Call ``func(*args)``, retrying with a linearly growing pause."""
        for attempt in range(retries):
            try:
                return func(*args)
            except Exception as e:
                if attempt == retries - 1:
                    raise  # bare raise keeps the original traceback
                # Waits 5, 10, 15, 20 s between the five attempts (large
                # payloads such as Uniswap can time out).
                sleep_time = 5 * (attempt + 1)
                print(f"    ...Retry {attempt+1}/{retries} after {sleep_time}s ({str(e)[:30]}...)")
                time.sleep(sleep_time)

    def to_4h_kst(frame, column_names):
        """Rename value columns, shift ``date`` to naive KST and interpolate to 4H."""
        frame = frame.reset_index().rename(columns=column_names)
        frame['date'] = pd.to_datetime(frame['date'])
        frame['date'] = safe_tz_convert(frame['date'], KST).dt.tz_localize(None)
        frame = frame.set_index('date').sort_index()
        if frame.empty:
            raise ValueError("no data returned")
        frame.index = frame.index.astype('datetime64[ns]')
        # reindex() needs unique labels; keep the latest value per timestamp.
        frame = frame[~frame.index.duplicated(keep='last')]

        grid = pd.date_range(
            frame.index[0].normalize(), frame.index[-1], freq=pd.Timedelta(hours=4)
        )
        dense = frame.reindex(frame.index.union(grid)).interpolate(method='time')
        # Grid points before the first observation cannot be interpolated; drop them.
        return (
            dense.reindex(grid)
            .dropna(how='all')
            .rename_axis('timestamp')
            .reset_index()
        )

    # 1. Ethereum chain TVL
    try:
        df = run_with_retry(obj.get_chain_hist_tvl, 'Ethereum')
        df_4h = to_4h_kst(df, {'tvl': 'eth_chain_tvl'})

        output_path = os.path.join(OUTPUT_DIR, 'eth_chain_tvl_4h.csv')
        df_4h.to_csv(output_path, index=False)
        print(f"  - ETH Chain TVL: {len(df_4h)} 4H candles")
    except Exception as e:
        print(f"  - ETH Chain TVL: Failed ({str(e)[:50]})")

    # 2. Protocol TVL
    for protocol in DEFI_PROTOCOLS:
        try:
            tvl_dict = run_with_retry(obj.get_protocol_hist_tvl_by_chain, protocol)

            if 'Ethereum' not in tvl_dict:
                print(f"  - {protocol}: No Ethereum data")
                continue

            df_4h = to_4h_kst(tvl_dict['Ethereum'], {'tvl': f'{protocol}_eth_tvl'})

            output_path = os.path.join(OUTPUT_DIR, f'{protocol}_eth_tvl_4h.csv')
            df_4h.to_csv(output_path, index=False)
            print(f"  - {protocol}: {len(df_4h)} 4H candles")
        except Exception as e:
            print(f"  - {protocol}: Failed ({str(e)[:50]})")

    # 3. USDT market cap
    try:
        df = run_with_retry(obj.get_stablecoin_hist_mcap_on_a_chain, 1, 'ethereum')
        df_4h = to_4h_kst(df, {'mcap': 'usdt_eth_mcap'})

        output_path = os.path.join(OUTPUT_DIR, 'usdt_eth_mcap_4h.csv')
        df_4h.to_csv(output_path, index=False)
        print(f"  - USDT ETH Mcap: {len(df_4h)} 4H candles")
    except Exception as e:
        print(f"  - USDT ETH Mcap: Failed ({str(e)[:50]})")

# Banner, then the seven collectors in the order of their [n/7] progress labels.
# NOTE(review): the banner says "UTC Aligned" while the collectors in this cell
# convert their timestamps to KST - confirm which wording is intended.
print("=" * 80)
print("ETH Price Prediction - 4H Data Collection Pipeline (UTC Aligned)")
print("=" * 80)

collect_upbit_crypto_prices_4h()
collect_binance_crypto_4h()
collect_macro_indicators_4h()
collect_fear_greed_4h()
collect_funding_rate_4h()
collect_coingecko_eth_metrics()
collect_defi_tvl_4h()

print("\n" + "=" * 80)
print("4H Data collection completed!")
print(f"Output directory: {OUTPUT_DIR}")
print("=" * 80)


ETH Price Prediction - 4H Data Collection Pipeline (UTC Aligned)

[1/7] Collecting 4H cryptocurrency prices from Upbit (KST)...
  - BTC: 15134 4H candles
  - ETH: 15134 4H candles
  - XRP: 15134 4H candles
  - SOL: 9023 4H candles
  - ADA: 15134 4H candles
  - DOGE: 10421 4H candles
  - AVAX: 8326 4H candles
  - DOT: 11273 4H candles
  Saved: ./macro_data_4h/crypto_4h_kst.csv

[2/7] Collecting 4H data from Binance (UTC->KST)...
  - BTC: 15128 4H candles
  - ETH: 15128 4H candles
  - XRP: 15128 4H candles
  - SOL: 11604 4H candles
  - ADA: 15128 4H candles
  - DOGE: 14018 4H candles
  - AVAX: 11352 4H candles
  - DOT: 11558 4H candles
  Saved: ./macro_data_4h/binance_4h_kst.csv

[3/7] Collecting and resampling macro indicators to 4H (KST)...
  - DXY: 15121 4H candles
  - GOLD: 15121 4H candles
  - VIX: 15121 4H candles
  - SP500: 15121 4H candles

[4/7] Collecting Fear & Greed Index (interpolated to 4H, KST)...
  - Fear & Greed: 17137 4H candles

[5/7] Collecting ETH funding rate (8H in

In [None]:
import pandas as pd
import os
import glob

# Directory whose *.csv files are merged by merge_all_datasets().
OUTPUT_DIR = "./macro_data_4h"

def merge_all_datasets(output_dir=None, final_path="./final_dataset_4h_kst.csv"):
    """Merge every per-source CSV onto the ``crypto_4h_kst.csv`` timeline.

    The base file defines the output rows. Every other ``*.csv`` in the
    directory is attached with a backward as-of join on ``timestamp``: each
    base row receives the latest source row at or before it. An exact-equality
    join would leave a source entirely NaN whenever its 4-hour grid has a
    different phase from the base candles; where timestamps do coincide the
    as-of join returns the same row an exact join would. Remaining gaps are
    forward-filled and the result is written to *final_path*.

    Args:
        output_dir: Directory holding the CSV files. Defaults to the
            module-level ``OUTPUT_DIR``.
        final_path: Destination of the merged CSV.

    Returns:
        None. Progress and per-file failures are printed; a file that cannot be
        merged (for example one without a ``timestamp`` column) is skipped.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    def _load(path):
        """Read one CSV as a frame sorted by a unique, non-null ``timestamp``."""
        frame = pd.read_csv(path)
        # merge_asof needs identical key dtypes on both sides and sorted keys.
        frame['timestamp'] = pd.to_datetime(frame['timestamp']).astype('datetime64[ns]')
        frame = frame.dropna(subset=['timestamp']).sort_values('timestamp')
        return frame.drop_duplicates(subset='timestamp', keep='last')

    print("=" * 50)
    print("Starting Data Merge Process...")
    print("=" * 50)

    base_file = os.path.join(output_dir, "crypto_4h_kst.csv")
    if not os.path.exists(base_file):
        print(f"Error: Base file {base_file} not found!")
        return

    df_master = _load(base_file)

    # crypto_4h_kst.csv is the Upbit candle file, not the Binance one.
    print(f"Base Dataset (Upbit): {df_master.shape} rows loaded.")
    print(f"Range: {df_master['timestamp'].min()} ~ {df_master['timestamp'].max()}")

    # Skip the base file, and the output file in case it lives in the same directory.
    skipped = {os.path.abspath(base_file), os.path.abspath(final_path)}

    # Sorted so the column order of the result is reproducible.
    for file_path in sorted(glob.glob(os.path.join(output_dir, "*.csv"))):
        if os.path.abspath(file_path) in skipped:
            continue

        file_name = os.path.basename(file_path)
        print(f"Merging {file_name}...", end=" ")

        try:
            df_temp = _load(file_path)
            df_master = pd.merge_asof(
                df_master, df_temp, on='timestamp', direction='backward'
            )
            print("Done.")
        except Exception as e:
            print(f"Failed! ({str(e)})")

    print("\nFilling missing values (Forward Fill)...")
    df_master = df_master.ffill()

    # Final save
    df_master.to_csv(final_path, index=False)

    print("=" * 50)
    print(f"Merge Completed! Final Shape: {df_master.shape}")
    print(f"Saved to: {final_path}")
    print("=" * 50)

# Run the merge
merge_all_datasets()