In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Binance OHLCV Minute Data Explorer
----------------------------------
1) Load file CSV mới nhất cho mỗi symbol: {SYMBOL}_{candle_level}_*.csv
2) Chuẩn hoá tên cột về schema thống nhất:
   ["open_time","Open","High","Low","Close","Volume","QuoteVolume","Trades",
    "TakerBuyBase","TakerBuyQuote"]
3) Kiểm tra chất lượng dữ liệu: duplicates, NaN, giá âm/0, thiếu mốc thời gian (gaps)
4) Thống kê cơ bản & enrich (returns, log-returns, rolling vol, dollar volume)
5) Resample ví dụ (5m, 1h, 1d)
6) Vẽ: Price+MA, Volume, Histogram log-returns, Corr heatmap giữa các symbol
"""

import os, glob, argparse
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# ---------- CONFIG ----------
DEFAULT_CANDLE = "1m"
DEFAULT_DATA_DIR = f"../../work/data/binance/spot/{DEFAULT_CANDLE}"
DEFAULT_SYMBOLS = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT"]
OUT_DIR = "./_out"  # nơi lưu hình nếu --save-plots

CANDLE_TO_NS = {
    "1m": 60_000_000_000,
    "3m": 3*60_000_000_000,
    "5m": 5*60_000_000_000,
    "15m": 15*60_000_000_000,
    "30m": 30*60_000_000_000,
    "1h": 60*60_000_000_000,
    "4h": 4*60*60_000_000_000,
    "1d": 24*60*60_000_000_000,
}

In [3]:

# ---------- I/O & NORMALIZE ----------

def find_latest_file(sym: str, data_dir: str, candle_level: str) -> str:
    pattern = os.path.join(data_dir, f"{sym}_{candle_level}_*.csv")
    files = sorted(glob.glob(pattern))
    assert files, f"[ERROR] Không thấy file cho {sym} trong {data_dir} với pattern {os.path.basename(pattern)}"
    return files[-1]

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    rename_map = {
        "open": "Open",
        "high": "High",
        "low": "Low",
        "close": "Close",
        "volume": "Volume",
        "quote_asset_volume": "QuoteVolume",
        "number_of_trades": "Trades",
        "taker_buy_base_asset_volume": "TakerBuyBase",
        "taker_buy_quote_asset_volume": "TakerBuyQuote",
    }
    if "Open time" in df.columns and "open_time" not in df.columns:
        df = df.rename(columns={"Open time": "open_time"})
    if "open_time" not in df.columns and "open_time_ms" in df.columns:
        df["open_time"] = pd.to_datetime(df["open_time_ms"], unit="ms", utc=True)
    df = df.rename(columns=rename_map)
    return df

def ensure_schema(df: pd.DataFrame) -> pd.DataFrame:
    core_required = ["open_time", "Open", "High", "Low", "Close", "Volume"]
    for c in core_required:
        assert c in df.columns, f"[ERROR] Thiếu cột bắt buộc: {c}"

    if not np.issubdtype(df["open_time"].dtype, np.datetime64):
        df["open_time"] = pd.to_datetime(df["open_time"], errors="coerce", utc=True)

    df = df.sort_values("open_time").reset_index(drop=True)

    for c in df.columns:
        if c in ["Open", "High", "Low", "Close", "Volume", "QuoteVolume", "Trades", "TakerBuyBase", "TakerBuyQuote"]:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df


In [4]:
# ---------- ENRICH FEATURES ----------

def enrich_features(df: pd.DataFrame, ret_horizons: List[int] = [1], roll_N: int = 60) -> pd.DataFrame:
    df = df.copy()
    # returns
    df["Ret_1"] = df["Close"].pct_change()
    df["LogRet_1"] = np.log(df["Close"]).diff()

    # multi-horizon (optional; forward returns để xem ACE baseline)
    for h in ret_horizons:
        if h <= 0: continue
        df[f"FwdLogRet_{h}"] = np.log(df["Close"]).shift(-h) - np.log(df["Close"])

    # rolling vol (realized)
    df["RollVol_{}".format(roll_N)] = df["LogRet_1"].rolling(roll_N, min_periods=roll_N//3).std() * np.sqrt(roll_N)

    # dollar volume (xấp xỉ)
    if "QuoteVolume" in df.columns and df["QuoteVolume"].notna().any():
        df["DollarVol"] = df["QuoteVolume"]
    else:
        df["DollarVol"] = df["Close"] * df["Volume"]
    return df

In [5]:
# ---------- RESAMPLE ----------

def resample_ohlcv(df: pd.DataFrame, rule: str) -> pd.DataFrame:
    """
    Resample OHLCV theo rule (vd: '5T','1H','1D'). open_time làm index.
    """
    x = df.set_index("open_time")
    agg = {
        "Open": "first",
        "High": "max",
        "Low": "min",
        "Close": "last",
        "Volume": "sum",
    }
    if "QuoteVolume" in x.columns: agg["QuoteVolume"] = "sum"
    if "Trades" in x.columns: agg["Trades"] = "sum"
    if "TakerBuyBase" in x.columns: agg["TakerBuyBase"] = "sum"
    if "TakerBuyQuote" in x.columns: agg["TakerBuyQuote"] = "sum"
    y = x.resample(rule, label="right", closed="right").agg(agg).dropna(subset=["Open","High","Low","Close"])
    y = y.reset_index()
    return y

In [6]:
# ---------- PLOTS ----------

def ensure_outdir(save_plots: bool, out_dir: str):
    if save_plots and not os.path.isdir(out_dir):
        os.makedirs(out_dir, exist_ok=True)

def plot_price_ma(df: pd.DataFrame, sym: str, save: bool):
    plt.figure(figsize=(10,4))
    plt.plot(df["open_time"], df["Close"], label="Close")
    if len(df) > 100:
        ma_fast = df["Close"].rolling(20).mean()
        ma_slow = df["Close"].rolling(100).mean()
        plt.plot(df["open_time"], ma_fast, label="MA20")
        plt.plot(df["open_time"], ma_slow, label="MA100")
    plt.title(f"{sym} - Price & MA")
    plt.legend(); plt.tight_layout()
    if save:
        plt.savefig(os.path.join(OUT_DIR, f"{sym}_price_ma.png"))
    else:
        plt.show()
    plt.close()

def plot_volume(df: pd.DataFrame, sym: str, save: bool):
    plt.figure(figsize=(10,3))
    plt.plot(df["open_time"], df["DollarVol"], label="DollarVol")
    plt.title(f"{sym} - Dollar Volume")
    plt.tight_layout()
    if save:
        plt.savefig(os.path.join(OUT_DIR, f"{sym}_dollarvol.png"))
    else:
        plt.show()
    plt.close()

def plot_logret_hist(df: pd.DataFrame, sym: str, save: bool):
    plt.figure(figsize=(6,4))
    x = df["LogRet_1"].dropna().clip(lower=-0.2, upper=0.2)  # tránh outlier phá biểu đồ
    plt.hist(x.values, bins=100, density=True)
    plt.title(f"{sym} - Histogram(LogRet_1)")
    plt.tight_layout()
    if save:
        plt.savefig(os.path.join(OUT_DIR, f"{sym}_logret_hist.png"))
    else:
        plt.show()
    plt.close()

def plot_corr_heatmap(close_by_sym: Dict[str, pd.Series], save: bool):
    # tính corr của LogRet_1 giữa các symbol
    df = pd.DataFrame({s: np.log(v).diff() for s, v in close_by_sym.items()}).dropna(how="any")
    corr = df.corr()
    plt.figure(figsize=(5+0.6*len(corr), 4+0.4*len(corr)))
    im = plt.imshow(corr.values, interpolation="nearest")
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.xticks(range(len(corr)), corr.columns, rotation=45, ha="right")
    plt.yticks(range(len(corr)), corr.index)
    plt.title("Log-Return Correlation Heatmap")
    plt.tight_layout()
    if save:
        plt.savefig(os.path.join(OUT_DIR, f"corr_heatmap.png"))
    else:
        plt.show()
    plt.close()

Namespace(data_dir='../../work/data/binance/spot/1m', candle_level='1m', symbols=['BTCUSDT', 'ETHUSDT', 'BNBUSDT', 'SOLUSDT'], save_plots=True, resample=['5T', '1H', '1D'], show_qc=True)

In [9]:
# ---------- QUALITY CHECKS ----------

def qc_report(df: pd.DataFrame, candle_level: str) -> Dict[str, int]:
    rep = {}
    rep["rows"] = len(df)
    rep["dupe_open_time"] = int(df["open_time"].duplicated().sum())
    rep["n_nan_any"] = int(df.isna().any(axis=1).sum())
    rep["n_nan_price"] = int(df[["Open","High","Low","Close"]].isna().any(axis=1).sum())
    rep["n_zero_or_neg_price"] = int(((df[["Open","High","Low","Close"]] <= 0).any(axis=1)).sum())

    # gaps: mong đợi sai khác open_time đều nhau theo candle
    if len(df) > 1:
        deltas = df["open_time"].view("int64").diff().dropna()  # ns
        expect = CANDLE_TO_NS[candle_level]
        rep["gaps"] = int((deltas != expect).sum())
    else:
        rep["gaps"] = 0
    return rep

In [10]:
def main(args):

    ensure_outdir(args.save_plots, OUT_DIR)

    per_symbol: Dict[str, pd.DataFrame] = {}
    close_by_sym: Dict[str, pd.Series] = {}

    print(f"[INFO] Loading symbols: {args.symbols} from {args.data_dir} ({args.candle_level})")

    for sym in args.symbols:
        path = find_latest_file(sym, args.data_dir, args.candle_level)
        df = pd.read_csv(path)
        df = normalize_columns(df)
        df = ensure_schema(df)

        qc = qc_report(df, args.candle_level)
        if args.show_qc:
            print(f"[QC] {sym}: {qc}")

        df = enrich_features(df, ret_horizons=[1, 5, 15, 60], roll_N=60)
        per_symbol[sym] = df
        close_by_sym[sym] = df.set_index("open_time")["Close"]

        # plots per-symbol
        plot_price_ma(df, sym, args.save_plots)
        plot_volume(df, sym, args.save_plots)
        plot_logret_hist(df, sym, args.save_plots)

        # demo resample
        for rule in args.resample:
            r = resample_ohlcv(df, rule)
            if args.save_plots:
                # Lưu CSV resample demo (tuỳ nhu cầu)
                outp = os.path.join(OUT_DIR, f"{sym}_resample_{rule}.csv")
                r.to_csv(outp, index=False)

    # multi-symbol corr
    if len(close_by_sym) >= 2:
        plot_corr_heatmap(close_by_sym, args.save_plots)

    print("[DONE] Explorer hoàn tất. Xem hình ở", OUT_DIR if args.save_plots else "on-screen.")


In [12]:
import argparse

ap = argparse.ArgumentParser()
ap.add_argument("--data-dir", type=str, default=DEFAULT_DATA_DIR)
ap.add_argument("--candle-level", type=str, default=DEFAULT_CANDLE, choices=CANDLE_TO_NS.keys())
ap.add_argument("--symbols", nargs="+", default=DEFAULT_SYMBOLS)
ap.add_argument("--save-plots", action="store_true")
ap.add_argument("--resample", nargs="*", default=["5T","1H","1D"])
ap.add_argument("--show-qc", action="store_true")

args = ap.parse_args([
    "--data-dir", "../../../work/data/binance/spot/1m",
    "--candle-level", "1m",
    "--symbols", "BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT",
    "--save-plots",
    "--resample", "5T", "1H", "1D",
    "--show-qc",
])

args


Namespace(data_dir='../../../work/data/binance/spot/1m', candle_level='1m', symbols=['BTCUSDT', 'ETHUSDT', 'BNBUSDT', 'SOLUSDT'], save_plots=True, resample=['5T', '1H', '1D'], show_qc=True)

In [13]:
main(args)

[INFO] Loading symbols: ['BTCUSDT', 'ETHUSDT', 'BNBUSDT', 'SOLUSDT'] from ../../../work/data/binance/spot/1m (1m)


  deltas = df["open_time"].view("int64").diff().dropna()  # ns


[QC] BTCUSDT: {'rows': 3507322, 'dupe_open_time': 0, 'n_nan_any': 0, 'n_nan_price': 0, 'n_zero_or_neg_price': 0, 'gaps': 22}


  plt.legend(); plt.tight_layout()
  plt.savefig(os.path.join(OUT_DIR, f"{sym}_price_ma.png"))
  y = x.resample(rule, label="right", closed="right").agg(agg).dropna(subset=["Open","High","Low","Close"])
  y = x.resample(rule, label="right", closed="right").agg(agg).dropna(subset=["Open","High","Low","Close"])
  deltas = df["open_time"].view("int64").diff().dropna()  # ns


[QC] ETHUSDT: {'rows': 3507344, 'dupe_open_time': 0, 'n_nan_any': 0, 'n_nan_price': 0, 'n_zero_or_neg_price': 0, 'gaps': 22}


  plt.legend(); plt.tight_layout()
  plt.savefig(os.path.join(OUT_DIR, f"{sym}_price_ma.png"))
  y = x.resample(rule, label="right", closed="right").agg(agg).dropna(subset=["Open","High","Low","Close"])
  y = x.resample(rule, label="right", closed="right").agg(agg).dropna(subset=["Open","High","Low","Close"])
  deltas = df["open_time"].view("int64").diff().dropna()  # ns


[QC] BNBUSDT: {'rows': 3507365, 'dupe_open_time': 0, 'n_nan_any': 0, 'n_nan_price': 0, 'n_zero_or_neg_price': 0, 'gaps': 22}


  plt.savefig(os.path.join(OUT_DIR, f"{sym}_price_ma.png"))
  y = x.resample(rule, label="right", closed="right").agg(agg).dropna(subset=["Open","High","Low","Close"])
  y = x.resample(rule, label="right", closed="right").agg(agg).dropna(subset=["Open","High","Low","Close"])
  deltas = df["open_time"].view("int64").diff().dropna()  # ns


[QC] SOLUSDT: {'rows': 2662968, 'dupe_open_time': 0, 'n_nan_any': 0, 'n_nan_price': 0, 'n_zero_or_neg_price': 0, 'gaps': 10}


  plt.legend(); plt.tight_layout()
  plt.savefig(os.path.join(OUT_DIR, f"{sym}_price_ma.png"))
  y = x.resample(rule, label="right", closed="right").agg(agg).dropna(subset=["Open","High","Low","Close"])
  y = x.resample(rule, label="right", closed="right").agg(agg).dropna(subset=["Open","High","Low","Close"])


[DONE] Explorer hoàn tất. Xem hình ở ./_out
