# AutoGluon 滾動訓練（前兩年訓練、預測第三年）

- **輸入**：`merged_for_autogluon_0900.csv`（由 `merge_and_train.py` 產出）。
- **流程**：依「年」滾動 — 每年用**前兩年**資料訓練，預測**第三年**；輸出模型、完整 leaderboard、每年各模型表現（RMSE/Sharpe）、回測 Sharpe（與 working/1129 相同邏輯）到 `data/models/`，供本地分析。
- **Colab**：掛載 Drive 後將 `DATA_ROOT` 設為專案 `data/`；**本機**：設為專案 `data/` 或指定 `MERGED_CSV_PATH`。

## 1. 掛載 Google Drive（Colab 必跑；本機可略）

In [1]:
# Detect Google Colab by importing its Drive helper. Only a failed import is
# treated as "not running on Colab"; an error raised by the mount itself is
# allowed to propagate so the notebook does not silently fall back to local
# paths while actually running on Colab.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    drive.mount("/content/drive")
    IN_COLAB = True
print("Colab:", IN_COLAB)

Mounted at /content/drive
Colab: True


## 2. 路徑與參數

In [None]:
from pathlib import Path

# Colab: point at the project's data directory on Drive; local: the project's data/ directory.
DATA_ROOT = Path("/content/drive/MyDrive/Thesis-AutoGluon-TXF-Research/data") if IN_COLAB else Path.cwd().resolve().parent.parent / "data"

# Explicit location of the merged CSV. On Colab the file lives at a fixed Drive
# path; elsewhere leave it as None so the default under DATA_ROOT is used.
# NOTE(review): the original line fused an assignment with an `if` header and
# did not parse; the Colab-only condition is a reconstruction — confirm intent.
MERGED_CSV_PATH = "/content/drive/MyDrive/2026/論文/Thesis-AutoGluon-TXF-Research/data/merged_for_autogluon_0900/merged_for_autogluon_0900.csv" if IN_COLAB else None
if MERGED_CSV_PATH is None:
    MERGED_CSV_PATH = DATA_ROOT / "output_0900" / "merged_for_autogluon_0900" / "merged_for_autogluon_0900.csv"
MERGED_CSV_PATH = Path(MERGED_CSV_PATH)

# Rolling setup: train on the previous TRAIN_YEARS years, predict the next year.
TRAIN_YEARS = 2
LABEL = "target_return"
TIME_LIMIT = 600  # seconds; training time budget for each roll
ROLL_OUTPUT = DATA_ROOT / "models"  # each roll is stored under data/models/roll_YYYY/
ROLL_OUTPUT.mkdir(parents=True, exist_ok=True)

print("MERGED_CSV_PATH:", MERGED_CSV_PATH)
print("ROLL_OUTPUT:", ROLL_OUTPUT)
print("TRAIN_YEARS:", TRAIN_YEARS)

MERGED_CSV_PATH: /content/drive/MyDrive/2026/論文/Thesis-AutoGluon-TXF-Research/data/merged_for_autogluon_0900/merged_for_autogluon_0900.csv
MODEL_SAVE_DIR: /content/drive/MyDrive/Thesis-AutoGluon-TXF-Research/data/output_0900/models/autogluon_merged


## 3. 安裝 AutoGluon（Colab 通常需執行一次）

In [4]:
# Install the AutoGluon tabular package quietly (IPython shell escape; per the
# section heading this typically needs to be run once on Colab).
!pip install autogluon.tabular --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/515.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m358.4/515.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.9/98.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.4/74.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

## 4. 載入資料、保留 date 以依年切分、dropna、決定預測年

In [5]:
import pandas as pd
import numpy as np

df = pd.read_csv(MERGED_CSV_PATH)
# Fail fast on a CSV without the label column, before any preprocessing.
if LABEL not in df.columns:
    raise ValueError(f"No column '{LABEL}' in CSV.")

# Keep `date` so rows can be split by calendar year; unparseable dates become
# NaT and are dropped. The `datetime` column, if present, is not used.
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"]).drop(columns=["datetime"], errors="ignore")
df["year"] = df["date"].dt.year
df = df.dropna()

# Plain Python ints (not NumPy scalars) so the years print cleanly and can be
# serialised to JSON later on.
years = sorted(int(y) for y in df["year"].unique())
available_years = set(years)
# A year can be predicted only when each of the TRAIN_YEARS calendar years
# immediately before it is present in the data.
predict_years = [
    y for y in years
    if all((y - back) in available_years for back in range(1, TRAIN_YEARS + 1))
]

print("Shape:", df.shape)
print("Years in data:", years)
print("Predict years (train 2yr → predict 1yr):", predict_years)

Shape after drop date + dropna: (2271, 50)
Train: 1362, Val: 454, Test: 455


In [None]:
# Sharpe backtest (same logic as working/1129 "intraday_macro_features_test sharpe.ipynb").
COST_PER_TRADE = 0.0005
TRADE_THRESHOLD = 0.0001


def compute_sharpe_backtest(y_true, y_pred, *, cost_per_trade=None, trade_threshold=None):
    """Return the Sharpe ratio of a sign-following strategy, scaled by sqrt(252).

    A position of +1 is taken where the prediction exceeds the threshold, -1
    where it is below the negative threshold, and 0 otherwise. The strategy
    return of each observation is ``position * y_true``.

    Args:
        y_true: Realised returns (array-like).
        y_pred: Predicted returns (array-like, same length as ``y_true``).
        cost_per_trade: Cost charged per position change; defaults to the
            module-level ``COST_PER_TRADE`` when ``None``.
        trade_threshold: Absolute prediction size required to take a position;
            defaults to the module-level ``TRADE_THRESHOLD`` when ``None``.

    Returns:
        The ratio as a float. ``0.0`` is returned for empty input and when the
        strategy returns have zero standard deviation.
    """
    cost = COST_PER_TRADE if cost_per_trade is None else cost_per_trade
    threshold = TRADE_THRESHOLD if trade_threshold is None else trade_threshold

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Empty input would otherwise give std = NaN (plus a NumPy RuntimeWarning)
    # and slip past the zero-std guard below.
    if y_pred.size == 0:
        return 0.0

    positions = np.zeros_like(y_pred, dtype=float)
    positions[y_pred > threshold] = 1.0
    positions[y_pred < -threshold] = -1.0
    strategy_returns = positions * y_true

    # A trade is any change of position, counted from an initial flat (0) state.
    trades = np.sum(np.abs(np.diff(np.insert(positions, 0, 0))) > 0)
    total_cost = trades * cost

    # Costs reduce the mean; the standard deviation is taken on gross returns.
    mean_return_with_cost = (np.sum(strategy_returns) - total_cost) / len(strategy_returns)
    std_return = np.std(strategy_returns)
    if std_return == 0:
        return 0.0
    return float(mean_return_with_cost / (std_return + 1e-9) * np.sqrt(252))

## 5. 滾動訓練：每段「前兩年訓練 → 預測第三年」並輸出到 data/models/roll_YYYY/

（執行下方一格即完成滾動訓練與輸出）

In [7]:
from autogluon.tabular import TabularPredictor
import shutil
import json

def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between two equal-length arrays."""
    return float(np.sqrt(np.mean((np.asarray(y_pred) - np.asarray(y_true)) ** 2)))


# Rolling evaluation: for each predict year, fit on the TRAIN_YEARS preceding
# years, score on the predict year, and write all artefacts for that roll to
# ROLL_OUTPUT / roll_<year>/. One metrics dict per completed roll is collected
# in `summary_list`.
summary_list = []
for predict_year in predict_years:
    # Years derived from a pandas column are NumPy integers, which json.dump
    # rejects; use a plain int for everything written to metrics.json.
    predict_year = int(predict_year)
    first_train_year = predict_year - TRAIN_YEARS

    train_df = df[(df["year"] >= first_train_year) & (df["year"] < predict_year)].copy()
    test_df = df[df["year"] == predict_year].copy()
    # The model only sees features + label; calendar columns are removed.
    train_ag = train_df.drop(columns=["date", "datetime", "year"], errors="ignore").dropna()
    test_ag = test_df.drop(columns=["date", "datetime", "year"], errors="ignore").dropna()
    if len(train_ag) < 20 or len(test_ag) < 5:
        print(f"Skip {predict_year}: train={len(train_ag)}, test={len(test_ag)}")
        continue

    # Start every roll from an empty directory so stale artefacts never mix in.
    path_roll = ROLL_OUTPUT / f"roll_{predict_year}"
    if path_roll.exists():
        shutil.rmtree(path_roll)
    path_roll.mkdir(parents=True, exist_ok=True)

    predictor = TabularPredictor(
        label=LABEL,
        problem_type="regression",
        eval_metric="rmse",
        path=str(path_roll),
    ).fit(
        train_ag,
        time_limit=TIME_LIMIT,
        presets="best_quality",
        dynamic_stacking=False,
        num_stack_levels=1,
    )

    y_test = test_ag[LABEL].values
    preds = predictor.predict(test_ag)
    rmse = _rmse(y_test, preds)
    sharpe = compute_sharpe_backtest(y_test, np.asarray(preds))

    leaderboard = predictor.leaderboard(test_ag, silent=True)
    leaderboard.to_csv(path_roll / "leaderboard.csv", index=False)

    fi = predictor.feature_importance()
    if fi is not None and not fi.empty:
        fi.to_csv(path_roll / "feature_importance.csv", index=True)

    # Per-model test performance. RMSE is computed directly from each model's
    # predictions rather than read from a leaderboard column: leaderboard
    # scores are reported in higher-is-better form, so an RMSE score there is
    # sign-flipped.
    model_perf = []
    for model_name in leaderboard["model"].tolist():
        try:
            model_preds = np.asarray(predictor.predict(test_ag, model=model_name))
        except Exception as exc:
            # Best effort: one unusable model must not abort the roll, but the
            # failure is reported instead of being silently dropped.
            print(f"Warning {predict_year}: could not evaluate model {model_name}: {exc}")
            continue
        model_perf.append({
            "model": model_name,
            "rmse": _rmse(y_test, model_preds),
            "sharpe": compute_sharpe_backtest(y_test, model_preds),
        })
    if model_perf:
        pd.DataFrame(model_perf).to_csv(path_roll / "models_performance.csv", index=False)

    # Re-attach dates to the rows that survived dropna() for the predictions file.
    out_pred = test_df.loc[test_ag.index].copy()
    out_pred["pred"] = preds.values
    out_pred[["date", LABEL, "pred"]].to_csv(path_roll / "predictions.csv", index=False)

    metrics = {
        "predict_year": predict_year,
        "train_years": [first_train_year, predict_year - 1],
        "n_train": len(train_ag),
        "n_test": len(test_ag),
        "rmse": rmse,
        "sharpe": sharpe,
        "best_model": predictor.model_best,
    }
    with open(path_roll / "metrics.json", "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2, ensure_ascii=False)
    summary_list.append(metrics)
    print(f"Done {predict_year}: rmse={rmse:.6f}, sharpe={sharpe:.4f}, best={predictor.model_best}")

if summary_list:
    pd.DataFrame(summary_list).to_csv(ROLL_OUTPUT / "roll_summary.csv", index=False)
    print("Roll summary saved to:", ROLL_OUTPUT / "roll_summary.csv")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          2
Pytorch Version:    2.9.0+cpu
CUDA Version:       CUDA is not available
Memory Avail:       11.35 GB / 12.67 GB (89.6%)
Disk Space Avail:   82.12 GB / 107.72 GB (76.2%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of t

AssertionError: Learner is already fit.

## 6. 彙總表（數值輸出供本地分析）

In [None]:
# All outputs live under data/models/: each roll_YYYY/ holds leaderboard.csv,
# predictions.csv, metrics.json and models_performance.csv; roll_summary.csv
# aggregates the per-year rmse/sharpe.
if not summary_list:
    print("No rolls completed. Check predict_years and data.")
else:
    summary_table = pd.DataFrame(summary_list)
    display(summary_table)