In [7]:
import os
import time
import warnings
from datetime import datetime, timedelta
import pandas as pd
import requests
import yfinance as yf
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from binance.client import Client

warnings.filterwarnings('ignore')

START_DATE = "2017-01-01"
OUTPUT_DIR = "./macro_data_4h"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# [ÏõêÏπô] Ïã§Ìñâ ÏãúÏ†êÎ≥¥Îã§ ÎØ∏ÎûòÏù∏ Îç∞Ïù¥ÌÑ∞Îäî Î¨¥Ï°∞Í±¥ ÏÇ≠Ï†ú
NOW = pd.Timestamp.now()

UPBIT_TICKERS = {
    'KRW-BTC': ('BTC', 'BTC'), 'KRW-ETH': ('ETH', 'ETH'), 'KRW-XRP': ('XRP', 'XRP'),
    'KRW-SOL': ('SOL', 'SOL'), 'KRW-ADA': ('ADA', 'ADA'), 'KRW-DOGE': ('DOGE', 'DOGE'),
    'KRW-AVAX': ('AVAX', 'AVAX'), 'KRW-DOT': ('DOT', 'DOT')
}

BINANCE_SYMBOLS = {
    'BTCUSDT': 'BTC', 'ETHUSDT': 'ETH', 'XRPUSDT': 'XRP', 'SOLUSDT': 'SOL',
    'ADAUSDT': 'ADA', 'DOGEUSDT': 'DOGE', 'AVAXUSDT': 'AVAX', 'DOTUSDT': 'DOT'
}

MACRO_TICKERS = {
    'DX-Y.NYB': 'DXY', 'GC=F': 'GOLD', '^VIX': 'VIX', '^GSPC': 'SP500'
}

DEFI_PROTOCOLS = ['makerdao', 'lido', 'aave', 'uniswap', 'curve-dex']
L2_CHAINS = ['Arbitrum', 'Optimism', 'Base', 'zkSync Era']

def get_session():
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/json"
    })
    retry = Retry(total=5, backoff_factor=2, status_forcelist=[429, 500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session

# -----------------------------------------------------------------------------
# [1] Upbit (Í∏∞Ï§Ä Îç∞Ïù¥ÌÑ∞)
# -----------------------------------------------------------------------------
def collect_upbit_crypto_prices_4h():
    print(f"\n[1/7] Collecting Upbit (Safe Cut: <= NOW)...")
    session = get_session()
    start_dt = pd.to_datetime(START_DATE)
    merged_df = None
    
    for market, (symbol, _) in UPBIT_TICKERS.items():
        try:
            all_candles = []
            to_date = None
            while True:
                url = "https://api.upbit.com/v1/candles/minutes/240"
                params = {'market': market, 'count': 200}
                if to_date: params['to'] = to_date
                resp = session.get(url, params=params, timeout=10)
                candles = resp.json()
                if not candles: break
                all_candles.extend(candles)
                if pd.to_datetime(candles[-1]['candle_date_time_kst']) <= start_dt: break
                to_date = candles[-1]['candle_date_time_utc']
                time.sleep(0.1)
            
            if not all_candles: continue
            
            df = pd.DataFrame(all_candles)
            df['timestamp'] = pd.to_datetime(df['candle_date_time_kst'])
            
            # [ÏõêÏπô Ï†ÅÏö©] ÎØ∏Îûò Îç∞Ïù¥ÌÑ∞ ÏÇ≠Ï†ú
            df = df[df['timestamp'] <= NOW]
            
            df = df.rename(columns={
                'opening_price': f'{symbol}_Open', 'high_price': f'{symbol}_High',
                'low_price': f'{symbol}_Low', 'trade_price': f'{symbol}_Close',
                'candle_acc_trade_volume': f'{symbol}_Volume'
            })
            df = df[['timestamp', f'{symbol}_Open', f'{symbol}_High', f'{symbol}_Low', f'{symbol}_Close', f'{symbol}_Volume']]
            df = df.sort_values('timestamp').drop_duplicates('timestamp')
            df = df[df['timestamp'] >= start_dt]
            
            merged_df = df if merged_df is None else pd.merge(merged_df, df, on='timestamp', how='outer')
            print(f"  - {symbol}: {len(df)} rows")
        except Exception as e: print(f"  - {symbol}: Failed {e}")
    
    if merged_df is not None:
        merged_df = merged_df.sort_values('timestamp')
        merged_df.to_csv(os.path.join(OUTPUT_DIR, "crypto_4h_kst.csv"), index=False)

# -----------------------------------------------------------------------------
# [2] Binance (KST Aligned & Safe Cut)
# -----------------------------------------------------------------------------
def collect_binance_crypto_prices_4h():
    print(f"\n[2/7] Collecting Binance (KST Aligned & Safe Cut)...")
    client = Client("", "")
    start_ms = int(datetime.strptime(START_DATE, "%Y-%m-%d").timestamp() * 1000)
    merged_df = None

    for symbol, base in BINANCE_SYMBOLS.items():
        try:
            klines = client.get_historical_klines(symbol, Client.KLINE_INTERVAL_4HOUR, start_ms)
            df = pd.DataFrame(klines, columns=["open_time", "o", "h", "l", "c", "v", "ct", "qav", "nt", "tbb", "tbq", "ig"])
            
            # UTC -> KST (+9h)
            df["timestamp"] = pd.to_datetime(df["open_time"], unit="ms") + pd.Timedelta(hours=9)
            
            # [ÏõêÏπô Ï†ÅÏö©] ÎØ∏Îûò Îç∞Ïù¥ÌÑ∞ ÏÇ≠Ï†ú
            df = df[df['timestamp'] <= NOW]
            
            # ÏóÖÎπÑÌä∏ ÏãúÍ∞Ñ(1, 5, 9...) Ï†ïÎ†¨
            df = df[df['timestamp'].dt.hour % 4 == 1]
            
            df = df.rename(columns={
                "o": f"{base}_Bin_Open", "h": f"{base}_Bin_High",
                "l": f"{base}_Bin_Low", "c": f"{base}_Bin_Close",
                "v": f"{base}_Bin_Vol"
            })
            df = df[['timestamp', f'{base}_Bin_Open', f'{base}_Bin_High', f'{base}_Bin_Low', f'{base}_Bin_Close', f'{base}_Bin_Vol']]
            df = df[df["timestamp"] >= pd.to_datetime(START_DATE)]
            
            merged_df = df if merged_df is None else pd.merge(merged_df, df, on="timestamp", how="outer")
            print(f"  - {symbol}: {len(df)} rows")
        except: pass
            
    if merged_df is not None:
        merged_df = merged_df.sort_values("timestamp")
        merged_df.to_csv(os.path.join(OUTPUT_DIR, "crypto_binance_4h_kst.csv"), index=False)

# -----------------------------------------------------------------------------
# [3] Macro (Shifted & Safe Cut)
# -----------------------------------------------------------------------------
def collect_macro_indicators_4h():
    print(f"\n[3/7] Collecting Macro (Shifted & Safe Cut)...")
    for ticker, name in MACRO_TICKERS.items():
        try:
            df = yf.download(ticker, start=START_DATE, end=None, progress=False, interval='1d')
            if isinstance(df.columns, pd.MultiIndex): df = df.xs('Close', level=0, axis=1)
            else: df = df[['Close']]
            
            df.index = pd.to_datetime(df.index).tz_localize(None)
            df.columns = [name]
            df.index.name = 'timestamp'

            # 1Ïùº + 9ÏãúÍ∞Ñ Shift
            df.index = df.index + pd.Timedelta(days=1, hours=9)
            
            # [ÏõêÏπô Ï†ÅÏö©] Shift ÌõÑ ÎØ∏ÎûòÎ°ú ÎÑòÏñ¥Í∞Ñ Îç∞Ïù¥ÌÑ∞(ÎÇ¥Ïùº ÏïÑÏπ® Îì±) ÏÇ≠Ï†ú
            # Ïò§Îäò Ïû•Ï§ë Îç∞Ïù¥ÌÑ∞ -> ÎÇ¥Ïùº ÏïÑÏπ®ÏúºÎ°ú Î∞ÄÎ¶º -> Ïó¨Í∏∞ÏÑú ÏûòÎ¶º -> ÏïàÏ†Ñ
            df = df[df.index <= NOW]
            
            # ffillÎ°ú ÌòÑÏû¨ ÏãúÏ†êÍπåÏßÄ Ï±ÑÏõÄ
            full_idx = pd.date_range(start=df.index[0], end=NOW, freq='1H')
            df = df.reindex(full_idx, method='ffill')
            
            # ÏóÖÎπÑÌä∏ ÏãúÍ∞Ñ Ï∂îÏ∂ú
            df_4h = df[df.index.hour % 4 == 1].copy()
            
            df_4h = df_4h[df_4h.index >= pd.to_datetime(START_DATE)]
            df_4h.to_csv(os.path.join(OUTPUT_DIR, f"{name}_4h.csv"))
            print(f"  - {name}: {len(df_4h)} rows")
        except Exception as e:
            print(f"  - {name}: Failed {e}")

# -----------------------------------------------------------------------------
# [4] Fear & Greed (Shifted & Safe Cut)
# -----------------------------------------------------------------------------
def collect_fear_greed_4h():
    print(f"\n[4/7] Collecting Fear & Greed (Shifted & Safe Cut)...")
    try:
        session = get_session()
        resp = session.get("https://api.alternative.me/fng/?limit=4000&format=json", timeout=10)
        data = resp.json()['data']
        
        df = pd.DataFrame(data)
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
        df = df[['timestamp', 'value']].rename(columns={'value': 'fear_greed'})
        df['fear_greed'] = df['fear_greed'].astype(float)
        df = df.set_index('timestamp').sort_index()
        
        df.index = df.index + pd.Timedelta(days=1, hours=9)
        
        # [ÏõêÏπô Ï†ÅÏö©] ÎØ∏Îûò Îç∞Ïù¥ÌÑ∞ ÏÇ≠Ï†ú
        df = df[df.index <= NOW]
        
        full_idx = pd.date_range(start=df.index[0], end=NOW, freq='1H')
        df = df.reindex(full_idx, method='ffill')
        
        df_4h = df[df.index.hour % 4 == 1].copy()
        df_4h = df_4h[df_4h.index >= pd.to_datetime(START_DATE)].reset_index()
        
        df_4h.to_csv(os.path.join(OUTPUT_DIR, "fear_greed_4h.csv"), index=False)
        print(f"  - Fear & Greed: {len(df_4h)} rows")
    except Exception as e:
        print(f"  - Fear & Greed: Failed {e}")

# -----------------------------------------------------------------------------
# [5] Funding Rate (KST Aligned & Safe Cut)
# -----------------------------------------------------------------------------
def collect_funding_rate_4h():
    print(f"\n[5/7] Collecting ETH Funding Rate (KST Aligned & Safe Cut)...")
    try:
        client = Client("", "")
        funding_rates = []
        start_ts = int(datetime.strptime(START_DATE, "%Y-%m-%d").timestamp() * 1000)
        end_ts = int(datetime.now().timestamp() * 1000) # ÌòÑÏû¨ÍπåÏßÄÎßå
        
        while start_ts < end_ts:
            rates = client.futures_funding_rate(symbol='ETHUSDT', startTime=start_ts, limit=1000)
            if not rates: break
            funding_rates.extend(rates)
            start_ts = rates[-1]['fundingTime'] + 1
            time.sleep(0.1)
        
        df = pd.DataFrame(funding_rates)
        df['timestamp'] = pd.to_datetime(df['fundingTime'], unit='ms') + pd.Timedelta(hours=9)
        df['fundingRate'] = df['fundingRate'].astype(float)
        df = df[['timestamp', 'fundingRate']].sort_values('timestamp').set_index('timestamp')
        
        # [ÏõêÏπô Ï†ÅÏö©] ÎØ∏Îûò Îç∞Ïù¥ÌÑ∞ ÏÇ≠Ï†ú
        df = df[df.index <= NOW]
        
        full_idx = pd.date_range(start=df.index[0], end=NOW, freq='1H')
        df = df.reindex(full_idx, method='ffill')
        
        df_4h = df[df.index.hour % 4 == 1].copy()
        df_4h = df_4h[df_4h.index >= pd.to_datetime(START_DATE)].reset_index()
        
        df_4h.to_csv(os.path.join(OUTPUT_DIR, "eth_funding_rate_4h.csv"), index=False)
        print(f"  - Funding Rate: {len(df_4h)} rows")
    except Exception as e:
        print(f"  - Funding Rate: Failed {e}")

# -----------------------------------------------------------------------------
# [6, 7] TVL (Shifted & Safe Cut)
# -----------------------------------------------------------------------------
def collect_tvl_all():
    print(f"\n[6~7/7] Collecting TVL (Shifted & Safe Cut)...")
    
    def get_data_manual(url):
        session = get_session()
        for i in range(3):
            try:
                time.sleep(2) 
                resp = session.get(url, timeout=30)
                if resp.status_code == 200: return resp.json()
                time.sleep(5)
            except: time.sleep(5)
        return None

    def process_and_save(df, filename, col_name):
        if df is None or df.empty: return
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        if df['timestamp'].dt.tz is not None:
            df['timestamp'] = df['timestamp'].dt.tz_localize(None)
            
        # 1Ïùº + 9ÏãúÍ∞Ñ Shift
        df['timestamp'] = df['timestamp'] + pd.Timedelta(days=1, hours=9)
        
        df = df.sort_values('timestamp').drop_duplicates(subset=['timestamp']).set_index('timestamp')
        
        # [ÏõêÏπô Ï†ÅÏö©] ÎØ∏Îûò Îç∞Ïù¥ÌÑ∞ ÏÇ≠Ï†ú
        df = df[df.index <= NOW]
        
        # ffillÎ°ú ÌòÑÏû¨ÍπåÏßÄ Ï±ÑÏõÄ
        full_idx = pd.date_range(start=df.index[0], end=NOW, freq='1H')
        df = df.reindex(full_idx, method='ffill')
        
        df_4h = df[df.index.hour % 4 == 1].copy()
        
        df_4h = df_4h[df_4h.index >= pd.to_datetime(START_DATE)].reset_index()
        df_4h.to_csv(os.path.join(OUTPUT_DIR, filename), index=False)
        print(f"  - {col_name}: {len(df_4h)} rows")

    # ETH Chain
    try:
        data = get_data_manual("https://api.llama.fi/v2/historicalChainTvl/Ethereum")
        if data:
            df = pd.DataFrame(data)
            df['timestamp'] = pd.to_datetime(df['date'], unit='s')
            df = df.rename(columns={'tvl': 'eth_chain_tvl'})[['timestamp', 'eth_chain_tvl']]
            process_and_save(df, 'eth_chain_tvl_4h.csv', 'ETH Chain TVL')
    except: pass

    # Protocols
    for protocol in DEFI_PROTOCOLS:
        try:
            data = get_data_manual(f"https://api.llama.fi/protocol/{protocol}")
            if data:
                chain_data = data.get('chainTvls', {}).get('Ethereum', {}).get('tvl', [])
                if not chain_data: chain_data = data.get('tvl', [])
                if chain_data:
                    df = pd.DataFrame(chain_data)
                    df['timestamp'] = pd.to_datetime(df['date'], unit='s')
                    val_col = 'totalLiquidityUSD' if 'totalLiquidityUSD' in df.columns else 'tvl'
                    df = df.rename(columns={val_col: f'{protocol}_eth_tvl'})
                    process_and_save(df[['timestamp', f'{protocol}_eth_tvl']], f'{protocol}_eth_tvl_4h.csv', protocol)
        except: pass

    # L2 Chains
    for chain in L2_CHAINS:
        try:
            chain_name = chain.replace(' ', '_').lower()
            data = get_data_manual(f"https://api.llama.fi/v2/historicalChainTvl/{chain}")
            if data:
                df = pd.DataFrame(data)
                df['timestamp'] = pd.to_datetime(df['date'], unit='s')
                df = df.rename(columns={'tvl': f'{chain_name}_tvl'})
                process_and_save(df[['timestamp', f'{chain_name}_tvl']], f'{chain_name}_tvl_4h.csv', chain)
        except: pass
    
    # USDT Mcap
    try:
        data = get_data_manual("https://stablecoins.llama.fi/stablecoincharts/Ethereum?stablecoin=1")
        if data:
            df = pd.DataFrame(data)

            if 'totalCirculatingUSD' in df.columns:
                df['usdt_eth_mcap'] = df['totalCirculatingUSD'].apply(
                    lambda x: x.get('peggedUSD') if isinstance(x, dict) else x
                )
                
                # 2. ÌÉÄÏûÑÏä§ÌÉ¨ÌîÑ Ï≤òÎ¶¨
                df['timestamp'] = pd.to_datetime(df['date'], unit='s')
                
                # 3. Ï†ÄÏû•
                process_and_save(df[['timestamp', 'usdt_eth_mcap']], 'usdt_eth_mcap_4h.csv', 'USDT Mcap')
    except Exception as e:
        print(f"  - USDT Mcap: Failed ({str(e)[:50]})")

# -----------------------------------------------------------------------------
# Run All
# -----------------------------------------------------------------------------
print("üöÄ Starting Final Pipeline (Safe Cut Rule Applied)...")
collect_upbit_crypto_prices_4h()
collect_binance_crypto_prices_4h()
collect_macro_indicators_4h()
collect_fear_greed_4h()
collect_funding_rate_4h()
collect_tvl_all()
print("‚úÖ Done.")



üöÄ Starting Final Pipeline (Safe Cut Rule Applied)...

[1/7] Collecting Upbit (Safe Cut: <= NOW)...
  - BTC: 17909 rows
  - ETH: 17886 rows
  - XRP: 17885 rows
  - SOL: 9032 rows
  - ADA: 17825 rows
  - DOGE: 10430 rows
  - AVAX: 8335 rows
  - DOT: 11282 rows

[2/7] Collecting Binance (KST Aligned & Safe Cut)...
  - BTCUSDT: 18137 rows
  - ETHUSDT: 18137 rows
  - XRPUSDT: 16583 rows
  - SOLUSDT: 11613 rows
  - ADAUSDT: 16686 rows
  - DOGEUSDT: 14027 rows
  - AVAXUSDT: 11361 rows
  - DOTUSDT: 11567 rows

[3/7] Collecting Macro (Shifted & Safe Cut)...
  - DXY: 19504 rows
  - GOLD: 19504 rows
  - VIX: 19504 rows
  - SP500: 19504 rows

[4/7] Collecting Fear & Greed (Shifted & Safe Cut)...
  - Fear & Greed: 17140 rows

[5/7] Collecting ETH Funding Rate (KST Aligned & Safe Cut)...
  - Funding Rate: 13160 rows

[6~7/7] Collecting TVL (Shifted & Safe Cut)...
  - ETH Chain TVL: 17902 rows
  - makerdao: 15118 rows
  - lido: 10822 rows
  - aave: 12106 rows
  - uniswap: 15490 rows
  - curve-dex:

In [17]:
def get_data_manual(url):
    session = get_session()
    for i in range(3):
        try:
            time.sleep(2) 
            resp = session.get(url, timeout=30)
            if resp.status_code == 200: return resp.json()
            time.sleep(5)
        except: time.sleep(5)
    return None

def process_and_save(df, filename, col_name):
    if df is None or df.empty: return
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    if df['timestamp'].dt.tz is not None:
        df['timestamp'] = df['timestamp'].dt.tz_localize(None)

    # 1Ïùº + 9ÏãúÍ∞Ñ Shift
    df['timestamp'] = df['timestamp'] + pd.Timedelta(days=1, hours=9)

    df = df.sort_values('timestamp').drop_duplicates(subset=['timestamp']).set_index('timestamp')

    # [ÏõêÏπô Ï†ÅÏö©] ÎØ∏Îûò Îç∞Ïù¥ÌÑ∞ ÏÇ≠Ï†ú
    df = df[df.index <= NOW]

    # ffillÎ°ú ÌòÑÏû¨ÍπåÏßÄ Ï±ÑÏõÄ
    full_idx = pd.date_range(start=df.index[0], end=NOW, freq='1H')
    df = df.reindex(full_idx, method='ffill')

    df_4h = df[df.index.hour % 4 == 1].copy()

    df_4h = df_4h[df_4h.index >= pd.to_datetime(START_DATE)].reset_index()
    df_4h.to_csv(os.path.join(OUTPUT_DIR, filename), index=False)
    print(f"  - {col_name}: {len(df_4h)} rows")

# ETH Chain
try:
    data = get_data_manual("https://api.llama.fi/v2/historicalChainTvl/Ethereum")
    if data:
        df = pd.DataFrame(data)
        df['timestamp'] = pd.to_datetime(df['date'], unit='s')
        df = df.rename(columns={'tvl': 'eth_chain_tvl'})[['timestamp', 'eth_chain_tvl']]
        process_and_save(df, 'eth_chain_tvl_4h.csv', 'ETH Chain TVL')
except: pass
    
data = get_data_manual("https://stablecoins.llama.fi/stablecoincharts/Ethereum?stablecoin=1")

df = pd.DataFrame(data)

if 'totalCirculatingUSD' in df.columns:
    df['usdt_eth_mcap'] = df['totalCirculatingUSD'].apply(
        lambda x: x.get('peggedUSD') if isinstance(x, dict) else x)
    # 2. ÌÉÄÏûÑÏä§ÌÉ¨ÌîÑ Ï≤òÎ¶¨
    df['timestamp'] = pd.to_datetime(df['date'], unit='s')

    # 3. Ï†ÄÏû•
    process_and_save(df[['timestamp', 'usdt_eth_mcap']], 'usdt_eth_mcap_4h.csv', 'USDT Mcap')

  - ETH Chain TVL: 17902 rows
  - USDT Mcap: 17524 rows


In [18]:
import pandas as pd
import os
import glob

# ÏÑ§Ï†ï
OUTPUT_DIR = "./macro_data_4h"
FINAL_FILENAME = "eth_4hour.csv"

def create_eth_dataset():
    print("üîÑ Starting Final Merge (Target: eth_4hour.csv)...")
    
    # 1. Í∏∞Ï§Ä ÌååÏùº Î°úÎìú (Upbit Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞)
    base_path = os.path.join(OUTPUT_DIR, "crypto_4h_kst.csv")
    
    if not os.path.exists(base_path):
        print("‚ùå Error: Base file 'crypto_4h_kst.csv' not found.")
        return
        
    print(f"üìç Base Timeline: {os.path.basename(base_path)}")
    base_df = pd.read_csv(base_path)
    
    # Timestamp Ï†ïÍ∑úÌôî
    if 'timestamp' not in base_df.columns:
        base_df.rename(columns={base_df.columns[0]: 'timestamp'}, inplace=True)
    base_df['timestamp'] = pd.to_datetime(base_df['timestamp'])
    base_df = base_df.sort_values('timestamp')
    
    print(f"   -> Start Date: {base_df['timestamp'].min()}")
    print(f"   -> End Date:   {base_df['timestamp'].max()}")
    print(f"   -> Total Rows: {len(base_df)}")

    # 2. Î≥ëÌï©Ìï† ÌååÏùº Î¶¨Ïä§Ìä∏ÏóÖ
    all_files = glob.glob(os.path.join(OUTPUT_DIR, "*.csv"))

    exclude_keywords = ['crypto_4h_kst.csv', 'eth_4hour.csv', 'final_', 'merged', 'upbit_']
    
    files_to_merge = []
    for f in all_files:
        fname = os.path.basename(f)
        if not any(k in fname for k in exclude_keywords):
            files_to_merge.append(f)
            
    print(f"üìÇ Files to merge: {len(files_to_merge)}")

    # 3. Left Join Î∞òÎ≥µ
    # (Left JoinÏùÑ Ïì∞Î©¥ Base(Upbit)Î≥¥Îã§ Í≥ºÍ±∞Ïù∏ Îç∞Ïù¥ÌÑ∞Îäî ÏûêÎèôÏúºÎ°ú ÏûòÎ†§ÎÇòÍ∞ëÎãàÎã§)
    for file_path in files_to_merge:
        try:
            fname = os.path.basename(file_path)
            df = pd.read_csv(file_path)
            
            # Ïª¨ÎüºÎ™Ö Î≥¥Ï†ï (Ï≤´Î≤àÏß∏ Ïª¨Îüº -> timestamp)
            if 'timestamp' not in df.columns:
                df.rename(columns={df.columns[0]: 'timestamp'}, inplace=True)
            
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.drop_duplicates('timestamp')
            
            # Î≥ëÌï© ÏàòÌñâ
            # on='timestamp' Ïô∏Ïóê Í≤πÏπòÎäî Ïª¨ÎüºÏù¥ ÏóÜÎã§Í≥† Í∞ÄÏ†ï (Left Join)
            base_df = pd.merge(base_df, df, on='timestamp', how='left')
            print(f"  ‚úÖ Merged: {fname}")
            
        except Exception as e:
            print(f"  ‚ö†Ô∏è Error merging {fname}: {e}")

    # 4. Í≤∞Ï∏°Ïπò Ï≤òÎ¶¨
    # (1) ffill: Ï£ºÎßê/Ìú¥Ïùº Îì±ÏúºÎ°ú Ïù∏Ìïú Macro Îç∞Ïù¥ÌÑ∞ Í≥µÎ∞± Î©îÍøà
    # (2) fillna(0): ÏÉÅÏû• Ï†ÑÏù¥Îùº ÏïÑÏòà Îç∞Ïù¥ÌÑ∞Í∞Ä ÏóÜÎäî Íµ¨Í∞Ñ 0ÏúºÎ°ú Ï±ÑÏõÄ
    print("üõ†  Handling Missing Values...")
    base_df = base_df.ffill().fillna(0)

    # 5. ÏµúÏ¢Ö Ï†ÄÏû•
    save_path = os.path.join(OUTPUT_DIR, FINAL_FILENAME)
    base_df.to_csv(save_path, index=False)
    
    print("\n" + "="*50)
    print(f"üéâ COMPLETE! File created: {FINAL_FILENAME}")
    print(f"üìä Final Shape: {base_df.shape}")
    print("="*50)

if __name__ == "__main__":
    create_eth_dataset()



üîÑ Starting Final Merge (Target: eth_4hour.csv)...
üìç Base Timeline: crypto_4h_kst.csv
   -> Start Date: 2017-09-26 17:00:00
   -> End Date:   2025-11-28 21:00:00
   -> Total Rows: 17909
üìÇ Files to merge: 18
  ‚úÖ Merged: usdt_eth_mcap_4h.csv
  ‚úÖ Merged: zksync_era_tvl_4h.csv
  ‚úÖ Merged: base_tvl_4h.csv
  ‚úÖ Merged: lido_eth_tvl_4h.csv
  ‚úÖ Merged: eth_chain_tvl_4h.csv
  ‚úÖ Merged: aave_eth_tvl_4h.csv
  ‚úÖ Merged: DXY_4h.csv
  ‚úÖ Merged: VIX_4h.csv
  ‚úÖ Merged: curve-dex_eth_tvl_4h.csv
  ‚úÖ Merged: makerdao_eth_tvl_4h.csv
  ‚úÖ Merged: fear_greed_4h.csv
  ‚úÖ Merged: SP500_4h.csv
  ‚úÖ Merged: arbitrum_tvl_4h.csv
  ‚úÖ Merged: uniswap_eth_tvl_4h.csv
  ‚úÖ Merged: optimism_tvl_4h.csv
  ‚úÖ Merged: eth_funding_rate_4h.csv
  ‚úÖ Merged: GOLD_4h.csv
  ‚úÖ Merged: crypto_binance_4h_kst.csv
üõ†  Handling Missing Values...

üéâ COMPLETE! File created: eth_4hour.csv
üìä Final Shape: (17909, 98)


In [11]:
import requests
import pandas as pd
import time

def fetch_upbit_hourly(market='KRW-ETH', start_date='2020-01-01'):
    all_data = []
    to_date = pd.Timestamp.now()
    start_ts = pd.to_datetime(start_date)
    
    print(f"Fetching Upbit {market} 1H data: {start_date} ~ now")
    
    while to_date > start_ts:
        url = "https://api.upbit.com/v1/candles/minutes/60"
        params = {
            'market': market,
            'to': to_date.strftime('%Y-%m-%dT%H:%M:%S'),
            'count': 200
        }
        
        try:
            response = requests.get(url, params=params)
            data = response.json()
            
            if not data or isinstance(data, dict):
                break
            
            all_data.extend(data)
            
            oldest = pd.to_datetime(data[-1]['candle_date_time_kst'])
            to_date = oldest
            
            print(f"  Fetched {len(all_data)} candles... (oldest: {oldest})")
            
            time.sleep(0.15)
            
        except Exception as e:
            print(f"  Error: {e}, retrying...")
            time.sleep(1)
    
    df = pd.DataFrame(all_data)
    df = df.rename(columns={
        'candle_date_time_kst': 'datetime',
        'opening_price': 'open',
        'high_price': 'high',
        'low_price': 'low',
        'trade_price': 'close',
        'candle_acc_trade_volume': 'volume'
    })
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.sort_values('datetime').reset_index(drop=True)
    df = df[['datetime', 'open', 'high', 'low', 'close', 'volume']]
    df = df.drop_duplicates(subset='datetime').reset_index(drop=True)
    
    mask = df['datetime'] >= start_date
    df = df[mask].reset_index(drop=True)
    
    print(f"Done! Total {len(df)} hourly candles")
    print(f"Period: {df['datetime'].min()} ~ {df['datetime'].max()}")
    
    return df

eth_hourly_krw = fetch_upbit_hourly('KRW-ETH', '2020-01-01')
eth_hourly_krw.to_csv('eth_hour.csv', index=False)

Fetching Upbit KRW-ETH 1H data: 2020-01-01 ~ now
  Fetched 200 candles... (oldest: 2025-11-20 15:00:00)
  Fetched 400 candles... (oldest: 2025-11-12 16:00:00)
  Fetched 600 candles... (oldest: 2025-11-04 17:00:00)
  Fetched 800 candles... (oldest: 2025-10-27 18:00:00)
  Fetched 1000 candles... (oldest: 2025-10-19 14:00:00)
  Fetched 1200 candles... (oldest: 2025-10-11 15:00:00)
  Fetched 1400 candles... (oldest: 2025-10-03 16:00:00)
  Fetched 1600 candles... (oldest: 2025-09-25 17:00:00)
  Fetched 1800 candles... (oldest: 2025-09-17 18:00:00)
  Fetched 2000 candles... (oldest: 2025-09-09 19:00:00)
  Fetched 2200 candles... (oldest: 2025-09-01 20:00:00)
  Fetched 2400 candles... (oldest: 2025-08-24 21:00:00)
  Fetched 2600 candles... (oldest: 2025-08-16 22:00:00)
  Fetched 2800 candles... (oldest: 2025-08-08 23:00:00)
  Fetched 3000 candles... (oldest: 2025-08-01 00:00:00)
  Fetched 3200 candles... (oldest: 2025-07-24 01:00:00)
  Fetched 3400 candles... (oldest: 2025-07-16 02:00:00)
  F

  Fetched 29000 candles... (oldest: 2022-09-29 05:00:00)
  Fetched 29200 candles... (oldest: 2022-09-21 06:00:00)
  Fetched 29400 candles... (oldest: 2022-09-13 07:00:00)
  Fetched 29600 candles... (oldest: 2022-09-05 08:00:00)
  Fetched 29800 candles... (oldest: 2022-08-28 09:00:00)
  Fetched 30000 candles... (oldest: 2022-08-20 10:00:00)
  Fetched 30200 candles... (oldest: 2022-08-12 11:00:00)
  Fetched 30400 candles... (oldest: 2022-08-04 12:00:00)
  Fetched 30600 candles... (oldest: 2022-07-27 13:00:00)
  Fetched 30800 candles... (oldest: 2022-07-19 14:00:00)
  Fetched 31000 candles... (oldest: 2022-07-11 15:00:00)
  Fetched 31200 candles... (oldest: 2022-07-03 16:00:00)
  Fetched 31400 candles... (oldest: 2022-06-25 14:00:00)
  Fetched 31600 candles... (oldest: 2022-06-17 15:00:00)
  Fetched 31800 candles... (oldest: 2022-06-09 16:00:00)
  Fetched 32000 candles... (oldest: 2022-06-01 17:00:00)
  Fetched 32200 candles... (oldest: 2022-05-24 18:00:00)
  Fetched 32400 candles... (old