In [2]:
# 0) Imports
import math, time, requests, io, os, json, warnings
import numpy as np
import pandas as pd
from datetime import datetime, timedelta, timezone
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt


In [3]:
warnings.filterwarnings("ignore")
pd.set_option("display.width", 120)
pd.set_option("display.max_columns", 50)

In [None]:
import os
from datetime import datetime, timezone

# Fixed two-year study window in UTC: 2023-10-30 through 2025-10-30.
start_dt = datetime(year=2023, month=10, day=30, tzinfo=timezone.utc)
# The end bound is the last whole second of 2025-10-30, so that day is
# included in full.
end_dt = datetime(
    year=2025, month=10, day=30,
    hour=23, minute=59, second=59,
    tzinfo=timezone.utc,
)
interval = "1h"
print(f"Khoảng thời gian: {start_dt.isoformat()} → {end_dt.isoformat()} (interval={interval})")

# Token label -> trading-pair symbol. Per the original author's note, the
# list is restricted to tokens that have a spot pair on Binance.
# NOTE(review): Polygon's MATIC was rebranded to POL during this window, so
# "MATICUSDT" may not cover the full date range — confirm against the data.
token_to_symbol = dict(
    [
        ("BTC", "BTCUSDT"),
        ("BNB", "BNBUSDT"),
        ("SOL", "SOLUSDT"),
        ("ETH", "ETHUSDT"),
        ("AVAX", "AVAXUSDT"),
        ("TRON", "TRXUSDT"),
        ("ARBtrium", "ARBUSDT"),
        ("Sui", "SUIUSDT"),
        ("Polygon", "MATICUSDT"),
    ]
)

In [1]:
# Multi-token modeling runner
#
# Imports the per-token pipeline, locates the data directory, runs the
# pipeline for every configured token and displays the cross-token summary.

import os
import sys

# 1) Import pipeline (robust to different working directories)
try:
    from multi_token_modeling import run_per_token_pipeline, summarize_results
except ImportError:
    # Only a failed import triggers the fallback. Any other exception raised
    # while importing the module is a real bug and must surface unchanged.
    _fallback_module_dir = "simplest_ml/w1"
    # Guard the append so re-running this cell does not stack duplicate entries.
    if _fallback_module_dir not in sys.path:
        sys.path.append(_fallback_module_dir)
    from multi_token_modeling import run_per_token_pipeline, summarize_results

# 2) Detect data directory (works whether CWD is repo root or simplest_ml/w1)
data_dir = "data"
if not os.path.isdir(data_dir) and os.path.isdir("simplest_ml/w1/data"):
    data_dir = "simplest_ml/w1/data"
print(f"Using data_dir = {data_dir}")

# 3) Configure tokens and params
tokens = ["BTC", "BNB", "SOL", "ETH", "AVAX", "TRON", "ARBtrium", "Sui", "Polygon"]
interval = "1h"
# Date bounds as compact strings — presumably YYYYMMDD matching the stored
# data files; confirm against multi_token_modeling.
start = "20231030"
end   = "20251030"
# Window sizes handed to the feature step (units presumably bars of
# `interval` — confirm in multi_token_modeling).
windows = (3, 6, 12, 24)

# 4) Run pipeline
results = run_per_token_pipeline(
    tokens=tokens,
    data_dir=data_dir,
    interval=interval,
    start=start,
    end=end,
    windows=windows,
    n_splits=5,       # reported by the pipeline log as "TSCV n_splits"
    C=0.5,
    max_iter=2000,
    show_eda=False,   # set True to print head/tail + describe()
    verbose=True      # step-by-step logs
)

# 5) Summarize across tokens (kept as the last expression so the notebook
#    displays whatever summarize_results returns)
summarize_results(results)


Using data_dir = data

[BTC] ==== PIPELINE START ====
[BTC] Step 1/4 LOAD: rows=17568, cols=11, span=2023-10-30 00:59:59.999000+00:00 → 2025-10-30 23:59:59.999000+00:00, cols_req_present=True
[BTC] Step 3/4 FEATURES: windows=(3, 6, 12, 24)
[BTC] prepare_Xy: input rows=17568 [2023-10-30 00:59:59.999000+00:00 → 2025-10-30 23:59:59.999000+00:00]
[BTC] add_features: start, rows=17568, windows=(3, 6, 12, 24)
[BTC] add_features: generated 29 features; total NaNs after lag=288
[BTC] make_label: positives=8939 / 17568
[BTC] prepare_Xy: dropped NaNs -> rows 17568 → 17543; features=29
[BTC] Step 4/4 TRAIN: TSCV n_splits=5, C=0.5, max_iter=2000
[BTC] train: fold 1/5 | train=2928 test=2923
[BTC] train: fold 1 metrics | AUC=0.5235 ACC=0.5180
[BTC] train: fold 2/5 | train=5851 test=2923
[BTC] train: fold 2 metrics | AUC=0.5292 ACC=0.5293
[BTC] train: fold 3/5 | train=8774 test=2923
[BTC] train: fold 3 metrics | AUC=0.5220 ACC=0.5084
[BTC] train: fold 4/5 | train=11697 test=2923
[BTC] train: fold 4 m