In [1]:
# 0) Imports
import math, time, requests, io, os, json, warnings
import numpy as np
import pandas as pd
from datetime import datetime, timedelta, timezone
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt


In [2]:
warnings.filterwarnings("ignore")
pd.set_option("display.width", 120)
pd.set_option("display.max_columns", 50)

In [3]:


# ---------------------------------------------------------
# 1) LOAD DATA from Binance public API (no auth)
# ---------------------------------------------------------
# Binance REST endpoint for candlestick ("kline") data; fetch_klines sends its GET requests here.
BINANCE = "https://api.binance.com/api/v3/klines"

def fetch_klines(symbol="BTCUSDT", interval="1h", start=None, end=None, limit=1000, sleep=0.2):
    """
    Download candlesticks (klines) from the ``BINANCE`` endpoint, page by page.

    Parameters
    ----------
    symbol : str
        Sent as the ``symbol`` query parameter.
    interval : str
        Sent as the ``interval`` query parameter.
    start, end : datetime.datetime or None
        Bounds of the requested window, in UTC. A naive datetime (no UTC
        offset) is interpreted as UTC, never as local time. ``start`` defaults
        to 90 days before the current time and ``end`` to the current time.
    limit : int
        Sent as the ``limit`` query parameter (rows requested per page).
    sleep : float
        Seconds to pause between two consecutive requests.

    Returns
    -------
    pandas.DataFrame
        One row per returned candle with the columns ``open_time, open, high,
        low, close, volume, quote_vol, num_trades, taker_base, taker_quote,
        ignore``. Price/volume columns are cast to float, ``open_time`` is a
        UTC timestamp, and the frame is indexed by and sorted on ``close_time``
        (UTC). The frame is empty when no candle was returned.

    Raises
    ------
    Exceptions raised by ``requests.get`` or ``Response.raise_for_status()``
    are not caught here.
    """
    def _epoch_ms(moment):
        # datetime.timestamp() reads a naive datetime as *local* time, which would
        # silently shift the window on any machine whose clock is not set to UTC.
        # The contract of this function is UTC, so pin naive values to UTC first.
        if moment.utcoffset() is None:
            moment = moment.replace(tzinfo=timezone.utc)
        return int(moment.timestamp() * 1000)

    params = {"symbol": symbol, "interval": interval, "limit": limit}
    if start is None:
        # default: 90 days back
        start = datetime.now(timezone.utc) - timedelta(days=90)
    if end is None:
        end = datetime.now(timezone.utc)
    start_ms = _epoch_ms(start)
    end_ms = _epoch_ms(end)

    out = []
    cur = start_ms
    while True:
        p = {**params, "startTime": cur, "endTime": end_ms}
        r = requests.get(BINANCE, params=p, timeout=30)
        r.raise_for_status()
        batch = r.json()
        if not batch:
            # nothing (more) in the window
            break
        out.extend(batch)
        # Resume 1 ms after the last candle's open time (element 0 of each row),
        # so that candle is not requested a second time.
        next_open = batch[-1][0] + 1
        if next_open >= end_ms:
            break
        cur = next_open
        time.sleep(sleep)
    # columns per docs
    cols = [
        "open_time","open","high","low","close","volume",
        "close_time","quote_vol","num_trades","taker_base","taker_quote","ignore"
    ]
    df = pd.DataFrame(out, columns=cols)
    # numeric fields are cast explicitly so downstream math works on floats
    for c in ["open","high","low","close","volume","quote_vol","taker_base","taker_quote"]:
        df[c] = df[c].astype(float)
    df["open_time"] = pd.to_datetime(df["open_time"], unit="ms", utc=True)
    df["close_time"] = pd.to_datetime(df["close_time"], unit="ms", utc=True)
    df = df.set_index("close_time").sort_index()
    return df


In [4]:
import os
from datetime import datetime, timezone

def download_binance_data():
    """
    Download hourly Binance price history for a fixed token list over a fixed two-year window.

    For every entry of the token table, candles between 2023-10-30 00:00:00 UTC
    and 2025-10-30 23:59:59 UTC are fetched with ``fetch_klines`` and written as
    a Parquet file under the ``data`` directory (created if missing). A failure
    for one token is printed and does not stop the remaining downloads.

    Returns
    -------
    dict
        Maps each token name whose download and save succeeded to the DataFrame
        returned by ``fetch_klines``. Tokens that raised are left out.
    """

    # Fixed two-year window: 2023-10-30 -> 2025-10-30
    start_dt = datetime(2023, 10, 30, 0, 0, 0, tzinfo=timezone.utc)
    end_dt   = datetime(2025, 10, 30, 23, 59, 59, tzinfo=timezone.utc)  # include all of 2025-10-30
    interval = "1h"
    print(f"Khoảng thời gian: {start_dt.isoformat()} → {end_dt.isoformat()} (interval={interval})")

    # Display name -> Binance spot symbol (only tokens with a spot pair on Binance)
    token_to_symbol = {
        "BTC": "BTCUSDT",
        "BNB": "BNBUSDT",
        "SOL": "SOLUSDT",
        "ETH": "ETHUSDT",
        "AVAX": "AVAXUSDT",
        "TRON": "TRXUSDT",
        "ARBtrium": "ARBUSDT",
        "Sui": "SUIUSDT",
        # NOTE(review): Polygon's token was rebranded MATIC -> POL during 2024; verify that
        # MATICUSDT still returns candles for the whole window (POLUSDT may be needed).
        "Polygon": "MATICUSDT",
    }

    os.makedirs("data", exist_ok=True)

    results = {}
    for name, sym in token_to_symbol.items():
        try:
            df = fetch_klines(symbol=sym, interval=interval, start=start_dt, end=end_dt)
            print(f"- {name} ({sym}): {len(df)} dòng")
            fn = f"data/binance_{sym.lower()}_{interval}_{start_dt.strftime('%Y%m%d')}_{end_dt.strftime('%Y%m%d')}.parquet"
            df.to_parquet(fn)
            print(f"  -> saved: {fn}")
            results[name] = df
        except Exception as e:
            # best effort: report the failure and continue with the next token
            print(f"- Lỗi khi tải {name} ({sym}): {e}")

    # Previously the collected frames were built and then discarded.
    return results


# Download step is disabled by default; uncomment to (re)create the Parquet files under data/.
# download_binance_data()


In [5]:
# Helpers
from pretty_helpers import pretty_print_sample, read_and_pretty_print_all_downloaded

# 1) Pretty-print all downloaded token data (saved in the previous step)
#    Adjust data_dir if needed (e.g., "simplest_ml/w1/data" when running from repo root)
read_and_pretty_print_all_downloaded(
    data_dir="data",      # or "simplest_ml/w1/data"
    interval="1h",
    start="20231030",
    end="20251030",
    head=3,
    tail=3,
)

# 2) If you already have a DataFrame `df` in memory:
# pretty_print_sample(df, head=3, tail=3)



=== ARBUSDT | 1h | 20231030->20251030 | rows=17568 ===
+-------+---------------------------+--------+--------+--------+--------+-----------+
| Index |         Open Time         |  Open  |  High  |  Low   | Close  |   Volume  |
+-------+---------------------------+--------+--------+--------+--------+-----------+
|   0   | 2023-10-30 00:00:00+00:00 | 0.9483 | 0.9525 | 0.9403 | 0.9442 | 1038785.4 |
|   1   | 2023-10-30 01:00:00+00:00 | 0.9441 | 0.9515 | 0.944  | 0.9445 |  990870.1 |
|   2   | 2023-10-30 02:00:00+00:00 | 0.9442 | 0.9495 | 0.9363 | 0.941  | 1624302.9 |
| 17565 | 2025-10-30 21:00:00+00:00 | 0.2857 | 0.2877 | 0.2841 | 0.2867 | 3109658.4 |
| 17566 | 2025-10-30 22:00:00+00:00 | 0.2867 | 0.2874 | 0.2856 | 0.2871 | 2477814.7 |
| 17567 | 2025-10-30 23:00:00+00:00 | 0.2872 | 0.2904 | 0.2868 | 0.2902 | 4050488.8 |
+-------+---------------------------+--------+--------+--------+--------+-----------+

=== AVAXUSDT | 1h | 20231030->20251030 | rows=17568 ===
+-------+------------------