# Final Method Full Notebook

この notebook は **単体完結** で、以下を Run All だけで実行します。

1. yfinance からテーマETFの価格・出来高を取得（キャッシュ再利用あり）
2. 最終採択 proxy（`price_only_mms`）を計算
3. Top-K テーマ選抜とETFウェイト構築
4. `neutral_mode = none / A_weight_adjust / B_eqw_hedge` を同一条件で比較
5. バックテスト（gross/net、turnover、cost、rolling beta）
6. train/test 指標表と図表を `outputs/` に保存

制約:
- `src/` の関数・クラスは **使用しない**（importしない）
- `target volatility` は実装しない
- 先読み回避（signal窓、beta推定窓、exec_lag）を厳守する


## (1) Problem Setup / Notation

本 notebook では、テーマETFローテーションを以下の記号で定義します。

- テーマ集合: $\Theta$
- ETF集合: $\mathcal{I}$
- 日次時点: $t$
- リバランス時点: $t_r$
- テーマ $\theta$ に属するETF集合: $\mathcal{I}_\theta \subset \mathcal{I}$

ETF $i$ の調整後終値を $P_{i,t}$、日次リターンを

$$
r_{i,t} = \frac{P_{i,t}}{P_{i,t-1}} - 1
$$

とします。

取引可能集合はローンチ日制約込みで

$$
\mathcal{I}(t)=\{i\in\mathcal{I}\mid t\ge \tau_i^{\text{launch}},\ P_{i,t}\text{ が有効}\}
$$

と定義します。ここで $\tau_i^{\text{launch}}$ は ETF $i$ の価格が最初に有効（非欠損）となる日で、実装では `first_valid_date` として保持します。


In [None]:
# 2) Config

# Theme name -> ETF tickers that represent the theme.
THEME_TO_TICKERS = {
    "Ageing Society Opportunities": ["AGNG"],
    "Autonomous Tech & Industrial Innovation": ["ARKQ", "ROBO"],
    "Digital Health": ["EDOC"],
    "Efficient Energy": ["QCLN"],
    "Fintech Innovation": ["FINX"],
    "Future Education": ["LRNZ"],
    "Future Mobility": ["DRIV"],
    "Genomic Innovation": ["ARKG"],
    "Millennials": ["MILN"],
    "Next Gen Internet Innovation": ["ARKW"],
    "Robotics": ["BOTZ"],
    "Smart Cities": ["KOMP"],
    "Blockchain Economy": ["BLOK"],
    "Clean Energy Infrastructure": ["ICLN"],
    "Cybersecurity": ["HACK"],
    "Food Revolution": ["KROP"],
    "Natural Resources Stewardship": ["WOOD"],
    "Renewables & Energy Efficiency": ["TAN", "FAN"],
    "Sharing Economy": ["GIGE"],
    "Space Exploration": ["ARKX"],
    "Sustainable Water Transition": ["PHO"],
}

# De-duplicated, alphabetically sorted ticker universe across all themes.
ALL_TICKERS = sorted(set().union(*THEME_TO_TICKERS.values()))

# Run parameters. The key names mirror the keyword arguments of the utility
# functions defined in this notebook (presumably passed through by the
# pipeline cells further down -- confirm against the call sites).
CONFIG = {
    "start": "2014-01-01",
    "end": None,
    "force_download": False,
    "rebalance": "M",           # "M" or "Q"
    "split_date": "2021-01-01",
    "fee_bps": 10.0,
    "slippage_bps": 1.0,
    "exec_lag": 1,
    "top_k": 4,
    "lookback_3m": 63,
    "beta_target": 0.0,
    "beta_lookback": 60,
    "allow_short_hedge": False,
    "max_leverage": 1.5,
    "neutral_modes": ["none", "A_weight_adjust", "B_eqw_hedge"],
    "force_ff_download": False,
}

print("tickers:", len(ALL_TICKERS))
print("neutral_modes:", CONFIG["neutral_modes"])


In [None]:
# 3) Imports & Utilities

import os
from pathlib import Path

# Project root is the current working directory at execution time.
ROOT = Path(".").resolve()
# Give Matplotlib a config/cache directory under outputs/. This is set before
# matplotlib is imported below; setdefault keeps any value already present in
# the environment.
MPL_DIR = ROOT / "outputs" / ".mplconfig"
MPL_DIR.mkdir(parents=True, exist_ok=True)
os.environ.setdefault("MPLCONFIGDIR", str(MPL_DIR))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf

# Cache locations read/written by the data-loading helpers.
DATA_RAW_DIR = ROOT / "data" / "raw"
DATA_PROCESSED_DIR = ROOT / "data" / "processed"
# Output locations for figures, tables and logs (created by ensure_dirs()).
OUT_FIG_DIR = ROOT / "outputs" / "figures"
OUT_TABLE_DIR = ROOT / "outputs" / "tables"
OUT_LOG_DIR = ROOT / "outputs" / "logs"


def ensure_dirs() -> None:
    """Create the data and output directories used by this notebook.

    Safe to call repeatedly: missing parents are created and existing
    directories are left untouched.
    """
    for directory in (
        DATA_RAW_DIR,
        DATA_PROCESSED_DIR,
        OUT_FIG_DIR,
        OUT_TABLE_DIR,
        OUT_LOG_DIR,
    ):
        directory.mkdir(parents=True, exist_ok=True)


def align_to_index(target, index, method="ffill"):
    """Snap ``target`` onto a date in ``index``.

    Args:
        target: Anything ``pd.Timestamp`` accepts.
        index: Index to align against (passed to ``get_indexer``).
        method: ``get_indexer`` fill method; ``"ffill"`` picks the latest
            entry at or before ``target``, ``"bfill"`` the earliest entry at
            or after it.

    Returns:
        The matched entry as a ``pd.Timestamp``, or ``None`` when ``index`` is
        empty or no entry satisfies the requested direction.
    """
    if not len(index):
        return None
    position = index.get_indexer([pd.Timestamp(target)], method=method)[0]
    return None if position == -1 else pd.Timestamp(index[position])


def _load_cached_data(expected_tickers):
    """Load previously cached price data from disk, if it is usable.

    Args:
        expected_tickers: Tickers the caller needs; also the column order of
            the returned frames.

    Returns:
        ``(close, volume, returns, first_valid)`` restricted to
        ``expected_tickers``, or ``None`` when any cache file is missing or
        the cached close prices do not cover every expected ticker.
    """
    close_path = DATA_RAW_DIR / "close.csv"
    volume_path = DATA_RAW_DIR / "volume.csv"
    first_valid_path = DATA_RAW_DIR / "first_valid_date.csv"
    returns_path = DATA_PROCESSED_DIR / "returns.csv"

    if any(not path.exists() for path in (close_path, volume_path, first_valid_path, returns_path)):
        return None

    def _read_dated_frame(path):
        # Date-indexed CSV, sorted chronologically.
        return pd.read_csv(path, index_col=0, parse_dates=True).sort_index()

    close = _read_dated_frame(close_path)
    volume = _read_dated_frame(volume_path)
    returns = _read_dated_frame(returns_path)

    # One row per ticker; the first column holds the launch date (blank cells
    # are read back as NaN and mapped to None).
    launch_table = pd.read_csv(first_valid_path, index_col=0)
    launch_by_ticker = {}
    for ticker in launch_table.index:
        raw_value = launch_table.loc[ticker].iloc[0]
        launch_by_ticker[ticker] = pd.Timestamp(raw_value) if pd.notna(raw_value) else None

    # A cache built for a smaller universe cannot serve this request.
    if set(expected_tickers) - set(close.columns):
        return None

    close = close.reindex(columns=expected_tickers)
    volume = volume.reindex(index=close.index, columns=expected_tickers)
    returns = returns.reindex(index=close.index, columns=expected_tickers)
    first_valid = {ticker: launch_by_ticker.get(ticker) for ticker in expected_tickers}
    return close, volume, returns, first_valid


def _download_data_yf(tickers, start=None, end=None):
    """Download daily auto-adjusted close and volume from yfinance.

    Args:
        tickers: List of ticker symbols; also the column order of the result.
        start: Start date forwarded to ``yf.download``.
        end: End date forwarded to ``yf.download``.

    Returns:
        ``(close, volume)`` DataFrames with one column per ticker. Close
        prices are forward-filled after dropping rows where every ticker is
        missing; volume is aligned to the close index.

    Raises:
        ValueError: If yfinance returns an empty frame.
    """
    raw = yf.download(
        tickers=tickers,
        start=start,
        end=end,
        interval="1d",
        auto_adjust=True,
        progress=False,
        group_by="column",
        threads=True,
    )
    if raw.empty:
        raise ValueError("yfinance returned no data.")

    if isinstance(raw.columns, pd.MultiIndex):
        prices = raw["Close"].copy()
        traded = raw["Volume"].copy()
    else:
        # Flat columns: the frame describes a single ticker, so label it.
        only_ticker = tickers[0]
        prices = raw[["Close"]].copy()
        traded = raw[["Volume"]].copy()
        prices.columns = [only_ticker]
        traded.columns = [only_ticker]

    prices = prices.sort_index().dropna(how="all").ffill()
    traded = traded.reindex(prices.index)

    prices = prices.reindex(columns=tickers)
    traded = traded.reindex(columns=tickers)
    return prices, traded


def _first_valid_dates(close):
    out = {}
    for col in close.columns:
        d = close[col].first_valid_index()
        out[col] = pd.Timestamp(d) if d is not None else None
    return out


def _apply_launch_mask(close, volume, first_valid):
    close_m = close.copy()
    volume_m = volume.copy()
    for ticker, launch_dt in first_valid.items():
        if launch_dt is None:
            continue
        mask = close_m.index < launch_dt
        close_m.loc[mask, ticker] = np.nan
        volume_m.loc[mask, ticker] = np.nan
    return close_m, volume_m


def _save_cache(close, volume, returns, first_valid):
    """Write prices, volumes, launch dates and returns to the CSV cache.

    Launch dates are stored as ISO strings; a ticker with no launch date is
    stored as an empty string.
    """
    ensure_dirs()
    close.to_csv(DATA_RAW_DIR / "close.csv")
    volume.to_csv(DATA_RAW_DIR / "volume.csv")

    launch_strings = {}
    for ticker, launch_dt in first_valid.items():
        launch_strings[ticker] = "" if launch_dt is None else launch_dt.isoformat()
    pd.Series(launch_strings).to_csv(
        DATA_RAW_DIR / "first_valid_date.csv",
        header=["first_valid_date"],
    )

    returns.to_csv(DATA_PROCESSED_DIR / "returns.csv")


def load_or_download_data(tickers, start=None, end=None, force_download=False):
    """Return price data for ``tickers``, from the CSV cache when possible.

    Args:
        tickers: Iterable of ticker symbols (de-duplicated and sorted here).
        start: Download start date; only used when data is actually downloaded.
        end: Download end date; only used when data is actually downloaded.
        force_download: Skip the cache and always download.

    Returns:
        ``(close, volume, returns, first_valid)``; pre-launch cells are NaN
        and ``returns`` is the unfilled percentage change of ``close``.

    Note:
        A cache hit is returned as-is: ``start`` and ``end`` are not compared
        with the cached date range.
    """
    universe = sorted(set(tickers))
    ensure_dirs()

    cached = None if force_download else _load_cached_data(universe)
    if cached is not None:
        return cached

    raw_close, raw_volume = _download_data_yf(tickers=universe, start=start, end=end)
    first_valid = _first_valid_dates(raw_close)
    close, volume = _apply_launch_mask(raw_close, raw_volume, first_valid)
    returns = close.pct_change(fill_method=None)

    _save_cache(close, volume, returns, first_valid)
    return close, volume, returns, first_valid


def zscore_cross_section(series: pd.Series) -> pd.Series:
    """Standardise ``series`` using its population mean and std (NaNs ignored).

    When the standard deviation is not finite or is below 1e-12, the result
    is ``series * 0.0`` (zeros, with NaNs preserved).
    """
    spread = series.std(skipna=True, ddof=0)
    if np.isfinite(spread) and spread >= 1e-12:
        centre = series.mean(skipna=True)
        return (series - centre) / spread
    return series * 0.0


def winsorize(series: pd.Series, k: float = 3.0) -> pd.Series:
    """Clip every value of ``series`` into the closed interval ``[-k, k]``."""
    lower_bound, upper_bound = -k, k
    return series.clip(lower=lower_bound, upper=upper_bound)


def build_theme_series(close: pd.DataFrame, volume: pd.DataFrame, theme_to_tickers: dict):
    """Aggregate ETF-level data into theme-level daily series.

    Args:
        close: ETF close prices (dates x tickers).
        volume: ETF volumes on the same layout.
        theme_to_tickers: Mapping theme -> list of member tickers.

    Returns:
        ``(theme_returns, theme_dollar_volume)``. A theme's return is the
        mean of its members' daily returns with missing members ignored; its
        dollar volume is the sum of close * volume, NaN when no member has
        data. Themes with no member present in ``close`` are omitted.
    """
    daily_returns = close.pct_change(fill_method=None)

    returns_by_theme = {}
    dollar_volume_by_theme = {}

    for theme, tickers in theme_to_tickers.items():
        members = [ticker for ticker in tickers if ticker in close.columns]
        if members:
            returns_by_theme[theme] = daily_returns[members].mean(axis=1, skipna=True)
            traded_value = close[members] * volume[members]
            dollar_volume_by_theme[theme] = traded_value.sum(axis=1, min_count=1)

    theme_returns_df = pd.DataFrame(returns_by_theme).sort_index()
    theme_dv_df = pd.DataFrame(dollar_volume_by_theme)
    theme_dv_df = theme_dv_df.reindex(theme_returns_df.index).sort_index()
    return theme_returns_df, theme_dv_df


def compute_price_only_scores(theme_returns: pd.DataFrame, lookback_3m: int = 63, eps: float = 1e-12):
    """Compute month-end momentum-shift scores and momentum levels per theme.

    For every signal date two adjacent windows of ``lookback_3m`` rows are
    used: the most recent one (ending on the signal date) and the one just
    before it. In each window, momentum is the compounded return divided by
    the daily standard deviation (plus ``eps``). The score combines the
    absolute and relative change in momentum between the windows via
    cross-sectional z-scores, is winsorised at +/-3 and rounded to one decimal.

    Signal dates are the last *trading* day of each calendar month found in
    ``theme_returns.index``. (Using calendar month-end labels and keeping only
    those present in the index would silently drop every month that ends on a
    weekend or holiday.) A trailing month is skipped when the next business
    day after the final row still belongs to it, i.e. the month is incomplete.

    Args:
        theme_returns: Daily theme returns (dates x themes), sorted by date.
        lookback_3m: Window length in rows.
        eps: Small constant added to denominators.

    Returns:
        ``(score_m, mom_m)``: frames indexed by signal date with the columns
        of ``theme_returns``. Themes without a return on the signal date or
        without a full history in both windows are NaN.
    """
    idx = theme_returns.index

    if len(idx) == 0:
        month_ends = pd.DatetimeIndex([])
    else:
        last_trading_day = pd.Series(idx, index=idx).groupby(idx.to_period("M")).max()
        month_ends = pd.DatetimeIndex(last_trading_day.to_numpy())
        # Do not emit a signal for a final month that is still in progress.
        if (idx[-1] + pd.offsets.BDay(1)).to_period("M") == idx[-1].to_period("M"):
            month_ends = month_ends[:-1]

    score_rows = []
    mom_rows = []
    prev_rank = None

    for date in month_ends:
        loc = idx.get_loc(date)
        # Require both windows plus a small buffer of history.
        if loc < (2 * lookback_3m + 5):
            continue

        # w0: most recent window (includes the signal date); w1: the window before it.
        w0 = slice(loc - lookback_3m + 1, loc + 1)
        w1 = slice(loc - 2 * lookback_3m + 1, loc - lookback_3m + 1)

        ret_w0 = theme_returns.iloc[w0]
        ret_w1 = theme_returns.iloc[w1]

        valid_now = theme_returns.loc[date].notna()
        valid_history = (
            (ret_w0.notna().sum(axis=0) >= lookback_3m)
            & (ret_w1.notna().sum(axis=0) >= lookback_3m)
        )
        valid_theme = valid_now & valid_history

        r0 = (1.0 + ret_w0).prod(axis=0, skipna=True) - 1.0
        s0 = ret_w0.std(axis=0, ddof=0)
        mom0 = r0 / (s0 + eps)

        r1 = (1.0 + ret_w1).prod(axis=0, skipna=True) - 1.0
        s1 = ret_w1.std(axis=0, ddof=0)
        mom1 = r1 / (s1 + eps)

        mom_abs = mom0 - mom1
        mom_pct = (mom0 - mom1) / (mom1.abs() + eps)

        raw = zscore_cross_section(mom_abs) + zscore_cross_section(mom_pct)
        score = zscore_cross_section(raw)
        score = winsorize(score, k=3.0).round(1)

        # Invalid themes are masked only after the cross-sectional statistics
        # have been computed over all columns.
        score = score.where(valid_theme)
        mom0 = mom0.where(valid_theme)

        # Track this month's ranking (ties broken by last month's rank).
        # NOTE(review): the ordering only feeds ``prev_rank``; it is not part
        # of the returned frames.
        if score.notna().any():
            if prev_rank is None:
                ordered = score.dropna().sort_values(ascending=False, kind="mergesort").index
            else:
                rank_df = pd.DataFrame({
                    "score": score,
                    "prev": prev_rank.reindex(score.index),
                })
                max_prev = float(prev_rank.max()) if len(prev_rank) > 0 else 0.0
                rank_df["prev"] = rank_df["prev"].fillna(max_prev + 1.0)
                ordered = (
                    rank_df.dropna(subset=["score"])
                    .sort_values(["score", "prev"], ascending=[False, True], kind="mergesort")
                    .index
                )
            prev_rank = pd.Series(np.arange(1, len(ordered) + 1), index=ordered, dtype=float)

        score_rows.append(score.rename(date))
        mom_rows.append(mom0.rename(date))

    score_m = pd.DataFrame(score_rows).sort_index().reindex(columns=theme_returns.columns)
    mom_m = pd.DataFrame(mom_rows).sort_index().reindex(columns=theme_returns.columns)
    return score_m, mom_m


def _normalize_rebalance_freq(freq: str) -> str:
    f = str(freq).upper()
    if f == "M":
        return "ME"
    if f == "Q":
        return "QE"
    return f


def _theme_has_active_etf(theme: str, date: pd.Timestamp, close: pd.DataFrame, theme_to_tickers: dict) -> bool:
    tickers = [t for t in theme_to_tickers.get(theme, []) if t in close.columns]
    if not tickers:
        return False
    row = close.loc[date, tickers]
    return bool(row.notna().any())


def build_rebalance_theme_weights(score_m, mom_m, close, theme_to_tickers, rebalance="M", top_k=4):
    """Pick the top-K themes on each rebalance date and weight them equally.

    Scores and momentum are resampled to the rebalance frequency (last value
    per period). On each date, themes without an active ETF are excluded,
    the ``top_k`` highest scores are selected, and any remaining slots are
    filled by the highest momentum values among themes not yet chosen.

    Args:
        score_m: Theme scores indexed by signal date.
        mom_m: Theme momentum on the same layout.
        close: ETF close prices, used to decide which themes are investable.
        theme_to_tickers: Mapping theme -> member tickers.
        rebalance: ``"M"`` or ``"Q"`` (other values are passed to resample).
        top_k: Number of themes to hold.

    Returns:
        Frame of theme weights indexed by rebalance date; rows with no
        selectable theme (or no price date at or before them) stay at zero.
    """
    freq = _normalize_rebalance_freq(rebalance)
    score_by_period = score_m.resample(freq).last()
    mom_by_period = mom_m.resample(freq).last()

    theme_weights = pd.DataFrame(0.0, index=score_by_period.index, columns=score_m.columns)

    for date in score_by_period.index:
        decision_date = align_to_index(date, close.index, method="ffill")
        if decision_date is None:
            continue

        score = score_by_period.loc[date].copy()
        mom = mom_by_period.loc[date].copy()

        # Themes with no priced ETF on the decision date cannot be selected.
        for theme in score.index:
            if _theme_has_active_etf(theme, decision_date, close, theme_to_tickers):
                continue
            score.loc[theme] = np.nan
            mom.loc[theme] = np.nan

        ranked_scores = score.dropna().sort_values(ascending=False, kind="mergesort")
        selected = list(ranked_scores.head(top_k).index)

        # Not enough scored themes: top up by momentum.
        if len(selected) < top_k:
            ranked_mom = mom.dropna().sort_values(ascending=False, kind="mergesort")
            for theme in ranked_mom.index:
                if len(selected) >= top_k:
                    break
                if theme not in selected:
                    selected.append(theme)

        if selected:
            theme_weights.loc[date, selected] = 1.0 / len(selected)

    return theme_weights


def map_theme_to_etf_weights(theme_weights, close, theme_to_tickers):
    """Spread each theme weight equally over the theme's priced ETFs.

    Args:
        theme_weights: Theme weights indexed by rebalance date.
        close: ETF close prices; an ETF is available on a date when its
            price on the aligned decision date is not NaN.
        theme_to_tickers: Mapping theme -> member tickers.

    Returns:
        ETF weights indexed like ``theme_weights``. Each row with a positive
        total is rescaled to sum to one; other rows remain zero.
    """
    etf_weights = pd.DataFrame(0.0, index=theme_weights.index, columns=close.columns)

    for date in theme_weights.index:
        decision_date = align_to_index(date, close.index, method="ffill")
        if decision_date is None:
            continue

        for theme, theme_weight in theme_weights.loc[date].items():
            if theme_weight <= 0:
                continue

            members = [ticker for ticker in theme_to_tickers.get(theme, []) if ticker in close.columns]
            if not members:
                continue

            priced = close.loc[decision_date, members].dropna().index.tolist()
            if not priced:
                continue

            etf_weights.loc[date, priced] += theme_weight / len(priced)

        # Renormalise so weight lost to unavailable themes is redistributed.
        row_total = etf_weights.loc[date].sum()
        if row_total > 0:
            etf_weights.loc[date] = etf_weights.loc[date] / row_total

    return etf_weights


def expand_weights_daily(rebalance_weights, daily_index, exec_lag_days=1):
    """Turn rebalance-date weights into a daily weight path with an execution lag.

    Each rebalance row is anchored to the first date in ``daily_index`` at or
    after the rebalance date and takes effect ``exec_lag_days`` rows later.
    Weights are then held until the next rebalance takes effect; days before
    the first effective rebalance are zero.

    A rebalance whose lagged execution date falls beyond the end of
    ``daily_index`` is skipped. Clamping it onto the last available day would
    apply it with a shorter lag than requested, i.e. introduce look-ahead.

    Args:
        rebalance_weights: Weights indexed by rebalance date.
        daily_index: Sorted daily DatetimeIndex to expand onto.
        exec_lag_days: Non-negative number of rows between anchor and execution.

    Returns:
        DataFrame of daily weights indexed by ``daily_index``.

    Raises:
        ValueError: If ``exec_lag_days`` is negative.
    """
    lag = int(exec_lag_days)
    if lag < 0:
        raise ValueError("exec_lag_days must be non-negative.")

    daily_weights = pd.DataFrame(np.nan, index=daily_index, columns=rebalance_weights.columns)
    n_days = len(daily_index)

    for date in rebalance_weights.index:
        # Position of the first daily date >= rebalance date (n_days if none).
        anchor_loc = int(daily_index.searchsorted(pd.Timestamp(date), side="left"))
        apply_loc = anchor_loc + lag
        if apply_loc >= n_days:
            continue

        apply_date = daily_index[apply_loc]
        daily_weights.loc[apply_date] = rebalance_weights.loc[date]

    daily_weights = daily_weights.ffill().fillna(0.0)
    return daily_weights


def compute_eqw_weights(close):
    """Equal weights over the columns that have a price on each row.

    Rows with no priced column get all-zero weights.
    """
    has_price = close.notna().astype(float)
    n_priced = has_price.sum(axis=1)
    # Hide zero counts so the division yields NaN (then 0) instead of inf.
    safe_counts = n_priced.where(n_priced > 0)
    return has_price.div(safe_counts, axis=0).fillna(0.0)


def backtest_from_daily_weights(daily_weights, returns, fee_bps=10.0, slippage_bps=1.0):
    """Run a daily backtest from a weight path and asset returns.

    Each day's gross return is the weight row multiplied by the same day's
    asset returns (missing weights and returns count as zero). Turnover is
    the sum of absolute weight changes versus the previous day, and cost is
    turnover times ``(fee_bps + slippage_bps)`` basis points.

    The portfolio is assumed to start in cash, so a non-zero allocation on
    the first row is charged as turnover. (Taking a plain ``diff`` leaves the
    first row undefined and lets the initial trade go uncosted.)

    Args:
        daily_weights: Daily weights (dates x assets).
        returns: Daily asset returns; its index defines the output index.
        fee_bps: Fee per unit of turnover, in basis points.
        slippage_bps: Slippage per unit of turnover, in basis points.

    Returns:
        DataFrame with columns ``gross_ret``, ``net_ret``, ``turnover``,
        ``cost``, ``gross_nav``, ``net_nav``, ``cum_cost`` and ``leverage``.
    """
    weights = daily_weights.reindex(returns.index).fillna(0.0)
    ret = returns.reindex(weights.index).fillna(0.0)

    gross_ret = (weights * ret).sum(axis=1)

    # Previous-day holdings; the row before the first day is an all-cash book.
    previous_weights = weights.shift(1).fillna(0.0)
    turnover = (weights - previous_weights).abs().sum(axis=1)

    cost = turnover * (fee_bps + slippage_bps) * 1e-4
    net_ret = gross_ret - cost

    out = pd.DataFrame(
        {
            "gross_ret": gross_ret,
            "net_ret": net_ret,
            "turnover": turnover,
            "cost": cost,
            "gross_nav": (1.0 + gross_ret).cumprod(),
            "net_nav": (1.0 + net_ret).cumprod(),
            "cum_cost": cost.cumsum(),
            "leverage": weights.abs().sum(axis=1),
        },
        index=weights.index,
    )
    return out


def compute_eqw_backtest(close, returns, fee_bps=10.0, slippage_bps=1.0):
    """Backtest the equal-weight portfolio of all ETFs priced on each day."""
    benchmark_weights = compute_eqw_weights(close)
    benchmark_weights = benchmark_weights.reindex(returns.index).fillna(0.0)
    return backtest_from_daily_weights(
        benchmark_weights,
        returns,
        fee_bps=fee_bps,
        slippage_bps=slippage_bps,
    )


def estimate_beta_ols(port_ret: pd.Series, factor_ret: pd.Series) -> float:
    """Estimate beta as cov(portfolio, factor) / var(factor).

    Rows where either series is missing are dropped. Returns 0.0 when fewer
    than two paired observations remain or when the factor variance is not
    finite or is below 1e-12.
    """
    paired = pd.concat([port_ret.rename("p"), factor_ret.rename("f")], axis=1).dropna()
    if len(paired) < 2:
        return 0.0

    portfolio, factor = paired["p"], paired["f"]
    factor_var = factor.var(ddof=0)
    if np.isfinite(factor_var) and factor_var >= 1e-12:
        portfolio_dev = portfolio - portfolio.mean()
        factor_dev = factor - factor.mean()
        covariance = (portfolio_dev * factor_dev).mean()
        return float(covariance / factor_var)
    return 0.0


def _extract_lookback_window(returns, factor_ret, decision_date, lookback):
    end_loc = returns.index.get_loc(decision_date) - 1
    start_loc = end_loc - int(lookback) + 1

    if start_loc < 0 or end_loc < 0:
        return pd.DataFrame(columns=returns.columns), pd.Series(dtype=float), start_loc, end_loc

    returns_window = returns.iloc[start_loc : end_loc + 1].fillna(0.0)
    factor_window = factor_ret.iloc[start_loc : end_loc + 1].fillna(0.0)
    return returns_window, factor_window, start_loc, end_loc


def _eqw_vector_on_date(close, date):
    vector = pd.Series(0.0, index=close.columns)
    row = close.loc[date]
    available = row.dropna().index.tolist()
    if not available:
        return vector
    vector.loc[available] = 1.0 / len(available)
    return vector


def _project_weights(weights, allow_short_hedge, max_leverage, enforce_sum_one):
    projected = weights.copy()

    if not allow_short_hedge:
        projected = projected.clip(lower=0.0)

    if enforce_sum_one:
        total = projected.sum()
        if abs(total) > 1e-12:
            projected = projected / total
        else:
            projected[:] = 0.0

    gross = projected.abs().sum()
    if gross > max_leverage and gross > 1e-12:
        projected = projected * (max_leverage / gross)

    return projected


def _portfolio_beta(weights, returns_window, factor_window):
    """Beta of the fixed-weight portfolio's returns against the factor over the window."""
    weighted_returns = returns_window.mul(weights, axis=1)
    portfolio_returns = weighted_returns.sum(axis=1)
    return estimate_beta_ols(portfolio_returns, factor_window)


def _line_search_lambda(
    base_weight,
    eqw_vector,
    returns_window,
    factor_window,
    beta_target,
    lambda_init,
    allow_short_hedge,
    max_leverage,
    enforce_sum_one,
):
    """Search for the hedge intensity lambda that brings beta closest to target.

    Each candidate is ``base_weight - lambda * eqw_vector`` passed through
    ``_project_weights``; its beta is measured over the lookback window and
    the candidate with the smallest ``|beta - beta_target|`` wins. A coarse
    grid built around ``lambda_init`` is tried first, followed by three points
    around the best coarse value.

    Returns:
        ``(best_weight, best_beta, best_lambda)``.
    """
    # Coarse grid: lambda = 0 (no hedge) is always evaluated first.
    if abs(lambda_init) < 1e-10:
        coarse = [0.0, 0.05, -0.05]
    else:
        coarse = [0.0, 0.5 * lambda_init, lambda_init, 1.5 * lambda_init, -0.5 * lambda_init]

    best_weight = None
    best_lambda = 0.0
    best_beta = 0.0
    best_error = np.inf

    def _evaluate(lmbd):
        # Build, project and score one candidate.
        candidate = base_weight - lmbd * eqw_vector
        candidate = _project_weights(
            candidate,
            allow_short_hedge=allow_short_hedge,
            max_leverage=max_leverage,
            enforce_sum_one=enforce_sum_one,
        )
        beta = _portfolio_beta(candidate, returns_window, factor_window)
        err = abs(beta - beta_target)
        return candidate, beta, err

    # Strict '<' means the earliest candidate wins ties.
    for lmbd in coarse:
        w, beta, err = _evaluate(lmbd)
        if err < best_error:
            best_weight = w
            best_lambda = float(lmbd)
            best_beta = float(beta)
            best_error = float(err)

    # Refinement: step is 25% of the best lambda's magnitude, at least 0.05.
    step = max(abs(best_lambda) * 0.25, 0.05)
    fine = [best_lambda - step, best_lambda, best_lambda + step]
    for lmbd in fine:
        w, beta, err = _evaluate(lmbd)
        if err < best_error:
            best_weight = w
            best_lambda = float(lmbd)
            best_beta = float(beta)
            best_error = float(err)

    # No candidate ever beat the initial infinite error (e.g. every error was
    # NaN): fall back to the projected, unhedged base weights.
    if best_weight is None:
        best_weight = _project_weights(
            base_weight,
            allow_short_hedge=allow_short_hedge,
            max_leverage=max_leverage,
            enforce_sum_one=enforce_sum_one,
        )
        best_beta = float(_portfolio_beta(best_weight, returns_window, factor_window))
        best_lambda = 0.0

    return best_weight, best_beta, best_lambda


def apply_beta_neutralization(
    rebalance_weights,
    daily_weights_unhedged,
    close,
    returns,
    eqw_factor_ret,
    neutral_mode="none",
    beta_target=0.0,
    beta_lookback=60,
    allow_short_hedge=False,
    max_leverage=1.5,
):
    """Adjust rebalance weights so the portfolio beta to the EQW factor nears a target.

    Modes:
      * ``"none"``: weights are returned unchanged (betas NaN, lambda 0).
      * ``"A_weight_adjust"``: hedged weights are renormalised to sum to one.
      * ``"B_eqw_hedge"``: hedged weights are not renormalised.

    For each rebalance date a lookback window ending the day before the
    decision date is used. The historical beta of the unhedged strategy seeds
    the line search over the hedge intensity.

    Args:
        rebalance_weights: ETF weights indexed by rebalance date.
        daily_weights_unhedged: Daily weight path of the unhedged strategy.
        close: ETF close prices (used for the equal-weight hedge vector).
        returns: Daily ETF returns.
        eqw_factor_ret: Daily return series of the equal-weight factor.
        neutral_mode: One of the modes listed above.
        beta_target: Desired portfolio beta.
        beta_lookback: Number of rows in the beta estimation window.
        allow_short_hedge: Whether negative weights may remain after hedging.
        max_leverage: Cap on the sum of absolute weights.

    Returns:
        ``(neutral_weights, beta_before, beta_after, lambda_used)``, each
        indexed by rebalance date.

    Raises:
        ValueError: If ``neutral_mode`` is not one of the supported modes.
    """
    if neutral_mode not in {"none", "A_weight_adjust", "B_eqw_hedge"}:
        raise ValueError(f"Unsupported neutral_mode: {neutral_mode}")

    neutral_weights = rebalance_weights.copy()
    beta_before = pd.Series(index=rebalance_weights.index, dtype=float)
    beta_after = pd.Series(index=rebalance_weights.index, dtype=float)
    lambda_used = pd.Series(index=rebalance_weights.index, dtype=float)

    if neutral_mode == "none":
        beta_before[:] = np.nan
        beta_after[:] = np.nan
        lambda_used[:] = 0.0
        return neutral_weights, beta_before, beta_after, lambda_used

    # Daily return of the unhedged strategy (missing asset returns count as 0).
    unhedged_ret = (
        daily_weights_unhedged
        * returns.reindex(daily_weights_unhedged.index).fillna(0.0)
    ).sum(axis=1)

    for date in rebalance_weights.index:
        decision_date = align_to_index(date, returns.index, method="ffill")
        if decision_date is None:
            # No return data at or before this date: keep the weights as they are.
            neutral_weights.loc[date] = rebalance_weights.loc[date]
            beta_before.loc[date] = 0.0
            beta_after.loc[date] = 0.0
            lambda_used.loc[date] = 0.0
            continue

        returns_window, factor_window, start_loc, end_loc = _extract_lookback_window(
            returns=returns,
            factor_ret=eqw_factor_ret,
            decision_date=decision_date,
            lookback=beta_lookback,
        )

        if start_loc < 0 or end_loc < 0:
            beta_hist = 0.0
        else:
            # NOTE(review): start_loc/end_loc are positions in returns.index but
            # are applied positionally to unhedged_ret and eqw_factor_ret; this
            # assumes all three share the same row order -- confirm.
            y = unhedged_ret.iloc[start_loc : end_loc + 1]
            x = eqw_factor_ret.iloc[start_loc : end_loc + 1]
            beta_hist = estimate_beta_ols(y, x)

        base_weight = rebalance_weights.loc[date].copy()
        eqw_vector = _eqw_vector_on_date(close, decision_date)

        # Only mode A forces the hedged weights to stay fully invested.
        enforce_sum_one = neutral_mode == "A_weight_adjust"

        if returns_window.empty or factor_window.empty or eqw_vector.abs().sum() < 1e-12:
            # Not enough history or nothing to hedge with: project only.
            adjusted = _project_weights(
                base_weight,
                allow_short_hedge=allow_short_hedge,
                max_leverage=max_leverage,
                enforce_sum_one=enforce_sum_one,
            )
            beta_new = 0.0
            lambda_star = 0.0
        else:
            # Beta of the hedge vector itself scales the initial lambda guess.
            beta_u = _portfolio_beta(eqw_vector, returns_window, factor_window)
            if abs(beta_u) < 1e-12:
                lambda_init = 0.0
            else:
                lambda_init = (beta_hist - beta_target) / beta_u

            adjusted, beta_new, lambda_star = _line_search_lambda(
                base_weight=base_weight,
                eqw_vector=eqw_vector,
                returns_window=returns_window,
                factor_window=factor_window,
                beta_target=beta_target,
                lambda_init=lambda_init,
                allow_short_hedge=allow_short_hedge,
                max_leverage=max_leverage,
                enforce_sum_one=enforce_sum_one,
            )

        neutral_weights.loc[date] = adjusted
        beta_before.loc[date] = beta_hist
        beta_after.loc[date] = beta_new
        lambda_used.loc[date] = lambda_star

    return neutral_weights, beta_before, beta_after, lambda_used


def annualized_return(returns: pd.Series, periods_per_year: int = 252) -> float:
    """Geometric annualised return of a periodic return series.

    Missing values are dropped; NaN is returned when nothing remains.
    """
    observed = returns.dropna()
    if observed.empty:
        return np.nan
    growth = (1.0 + observed).prod()
    n_years = len(observed) / float(periods_per_year)
    if n_years <= 0:
        return np.nan
    return float(growth ** (1.0 / n_years) - 1.0)


def annualized_volatility(returns: pd.Series, periods_per_year: int = 252) -> float:
    """Population standard deviation of returns scaled by sqrt(periods_per_year).

    Missing values are dropped; NaN is returned when nothing remains.
    """
    observed = returns.dropna()
    if observed.empty:
        return np.nan
    per_period_vol = observed.std(ddof=0)
    return float(per_period_vol * np.sqrt(periods_per_year))


def sharpe_ratio(returns: pd.Series, periods_per_year: int = 252) -> float:
    """Annualised return divided by annualised volatility (no risk-free rate).

    Returns NaN when either input is not finite or volatility is below 1e-12.
    """
    growth = annualized_return(returns, periods_per_year=periods_per_year)
    risk = annualized_volatility(returns, periods_per_year=periods_per_year)
    usable = np.isfinite(growth) and np.isfinite(risk) and not risk < 1e-12
    return float(growth / risk) if usable else np.nan


def max_drawdown(nav: pd.Series) -> float:
    """Largest peak-to-trough decline of ``nav`` as a non-positive fraction.

    Missing values are dropped; NaN is returned when nothing remains.
    """
    path = nav.dropna()
    if path.empty:
        return np.nan
    running_peak = path.cummax()
    drawdowns = path / running_peak - 1.0
    return float(drawdowns.min())


def rolling_beta(port_ret: pd.Series, factor_ret: pd.Series, lookback: int = 60) -> pd.Series:
    """Rolling beta: rolling covariance with the factor over rolling factor variance.

    The two series are aligned on their index first; the result is named
    ``"rolling_beta"``.
    """
    pair = pd.concat([port_ret.rename("p"), factor_ret.rename("f")], axis=1)
    portfolio, factor = pair["p"], pair["f"]
    rolling_cov = portfolio.rolling(lookback).cov(factor)
    rolling_var = factor.rolling(lookback).var()
    beta = rolling_cov / rolling_var
    return beta.rename("rolling_beta")


def summarize_train_test(results_by_mode, eqw_factor_ret, split_date="2021-01-01", beta_lookback=60):
    """Tabulate net performance per neutralisation mode for all/train/test periods.

    Args:
        results_by_mode: Mapping mode -> payload whose ``"bt"`` entry is a
            backtest frame with ``net_ret``, ``net_nav``, ``turnover`` and
            ``cost`` columns.
        eqw_factor_ret: Factor return series for the rolling beta.
        split_date: First date of the test period; earlier dates are train.
        beta_lookback: Rolling-beta window length.

    Returns:
        DataFrame with one row per (mode, period); metrics are NaN for a
        period that contains no rows.
    """
    split_ts = pd.Timestamp(split_date)
    metric_names = ("cagr", "ann_vol", "sharpe", "mdd", "avg_turnover", "total_cost", "avg_beta")
    rows = []

    for mode, payload in results_by_mode.items():
        bt = payload["bt"]
        # Rolling beta is computed on the full history, then averaged per period.
        beta_series = rolling_beta(bt["net_ret"], eqw_factor_ret, lookback=beta_lookback)

        periods = (
            ("all", bt.index),
            ("train", bt.index[bt.index < split_ts]),
            ("test", bt.index[bt.index >= split_ts]),
        )

        for period_name, idx in periods:
            frame = bt.loc[idx]
            row = {"neutral_mode": mode, "period": period_name}

            if frame.empty:
                row.update(dict.fromkeys(metric_names, np.nan))
            else:
                net = frame["net_ret"]
                row.update(
                    {
                        "cagr": annualized_return(net),
                        "ann_vol": annualized_volatility(net),
                        "sharpe": sharpe_ratio(net),
                        "mdd": max_drawdown(frame["net_nav"]),
                        "avg_turnover": float(frame["turnover"].mean()),
                        "total_cost": float(frame["cost"].sum()),
                        "avg_beta": float(beta_series.reindex(frame.index).mean(skipna=True)),
                    }
                )

            rows.append(row)

    return pd.DataFrame(rows)


def summarize_mode_all(results_by_mode, eqw_factor_ret, beta_lookback=60):
    """Summarize full-period performance, one row per neutral mode.

    Args:
        results_by_mode: Mapping of mode name to a payload whose ``"bt"`` entry
            is a backtest frame with ``net_ret``, ``gross_ret``, ``net_nav``,
            ``turnover``, ``cost`` and ``leverage`` columns.
        eqw_factor_ret: Factor return series passed to ``rolling_beta``.
        beta_lookback: Window length forwarded to ``rolling_beta``.

    Returns:
        DataFrame sorted by ``neutral_mode`` with a fresh integer index.
    """

    def _summary_row(mode, bt):
        # Rolling beta first, then the scalar statistics in column order.
        beta_path = rolling_beta(bt["net_ret"], eqw_factor_ret, lookback=beta_lookback)
        net = bt["net_ret"]
        nav = bt["net_nav"]
        return {
            "neutral_mode": mode,
            "ann_return_net": annualized_return(net),
            "ann_return_gross": annualized_return(bt["gross_ret"]),
            "sharpe_net": sharpe_ratio(net),
            "mdd": max_drawdown(nav),
            "avg_turnover": float(bt["turnover"].mean()),
            "total_cost": float(bt["cost"].sum()),
            "avg_beta": float(beta_path.mean(skipna=True)),
            "avg_leverage": float(bt["leverage"].mean()),
            "final_nav_net": float(nav.iloc[-1]),
        }

    table = pd.DataFrame(
        [_summary_row(mode, payload["bt"]) for mode, payload in results_by_mode.items()]
    )
    ordered = table.sort_values("neutral_mode")
    return ordered.reset_index(drop=True)


# Run the directory setup helper once, before any later cell writes output.
# NOTE(review): ensure_dirs is defined earlier in the notebook; presumably it
# creates the outputs/ sub-directories used below — confirm.
ensure_dirs()
print("utility functions ready")


## (2) ETF Launch Date Handling（投資可能性）

各ETFについて `first_valid_date` を

$$
\tau_i^{\text{launch}} = \min\{t\mid P_{i,t}\text{ が欠損でない}\}
$$

として定義し、$t<\tau_i^{\text{launch}}$ は **投資不可** として価格・出来高を欠損（NaN）のまま保持します。

- ローンチ前を 0 リターンで埋めると、非存在期間を「無変動資産」と誤認してバイアスが入るため採用しません。
- 実装ではローンチ前を欠損で保持し、ウェイト計算時に「その時点で有効なETFのみ」を対象化します。


In [None]:
# 4) Download Data

# Load the close/volume/return panels for every ticker in the theme map.
# The loader is defined earlier in the notebook; CONFIG["force_download"] is
# passed through so a refresh can be requested from the config.
close, volume, returns, first_valid_date = load_or_download_data(
    tickers=ALL_TICKERS,
    start=CONFIG["start"],
    end=CONFIG["end"],
    force_download=CONFIG["force_download"],
)

# Quick sanity output: panel shapes and the ten earliest first-valid dates.
print("close shape:", close.shape)
print("volume shape:", volume.shape)
print("returns shape:", returns.shape)
print("sample first_valid_date:")
print(pd.Series(first_valid_date).sort_values().head(10))


## (3) Signal / Proxy Definition（最終採択proxy）

最終採択proxyは `price_only_mms` です（`docs/notes/70_final_spec.md` 準拠）。

まずテーマ日次リターンを、投資可能ETFの等ウェイト平均で作ります。

$$
r^{\text{theme}}_{\theta,t} = \frac{1}{|\mathcal{I}_\theta(t)|}\sum_{i\in\mathcal{I}_\theta(t)} r_{i,t}
$$

月末 $m$ で直近窓 $W_0$（63営業日）と一つ前の窓 $W_1$（63営業日）を使い、

$$
R_{\theta,j}=\prod_{t\in W_j}(1+r^{\text{theme}}_{\theta,t})-1,\quad
\sigma_{\theta,j}=\mathrm{std}_{t\in W_j}(r^{\text{theme}}_{\theta,t}),\quad
mom_{\theta,j}=\frac{R_{\theta,j}}{\sigma_{\theta,j}+\varepsilon}
$$

$$
\Delta^{abs}_{\theta}=mom_{\theta,0}-mom_{\theta,1},\quad
\Delta^{pct}_{\theta}=\frac{mom_{\theta,0}-mom_{\theta,1}}{|mom_{\theta,1}|+\varepsilon}
$$

とし、横断標準化後に

$$
score_{\theta,m}=\mathrm{round}_{0.1}\left(\mathrm{clip}\left(z\left(z(\Delta^{abs}_{\theta})+z(\Delta^{pct}_{\theta})\right),-3,3\right)\right)
$$

をテーマスコアとして使います。

一般形としては

$$
\mathrm{ThemeScore}_{\theta,t}=\frac{1}{|\mathcal{I}_\theta(t)|}\sum_{i\in\mathcal{I}_\theta(t)} \mathrm{Score}_{i,t}
$$

ですが、本 notebook 実装では ETF個別スコアを明示計算せず、$r^{\text{theme}}_{\theta,t}$ を先に構成して $score_{\theta,m}$ を直接求めます。

**直感**: 「最近3か月のリスク調整モメンタムが、その前3か月に対してどれだけ加速したか」を計る proxy です。


In [None]:
# 5) Build Signals (final adopted proxy: price_only_mms)

# Collapse ETF-level data into per-theme series using the theme -> tickers map.
theme_returns, theme_dollar_volume = build_theme_series(
    close=close,
    volume=volume,
    theme_to_tickers=THEME_TO_TICKERS,
)

# Score themes from the theme return series; the helper returns both the
# score table and the momentum table that later cells pass on as `mom_m`.
score_m, mom_m = compute_price_only_scores(
    theme_returns=theme_returns,
    lookback_3m=CONFIG["lookback_3m"],
)

print("theme_returns shape:", theme_returns.shape)
print("score_m shape:", score_m.shape)
print("latest score snapshot:")
# Transpose the last row so themes become rows, then show the ten highest scores.
print(score_m.tail(1).T.sort_values(score_m.tail(1).index[0], ascending=False).head(10))


## (4) Selection Rule（Top-K とリバランス）

リバランス時点 $t_r$ は `rebalance` で決まり、デフォルトは月次（`M`）です。

有効テーマ集合を $\mathcal{K}(t_r)$ とすると、選抜集合は

$$
\mathcal{S}(t_r)=\mathrm{TopK}_{\theta\in\mathcal{K}(t_r)}\left(score_{\theta,t_r}\right),\quad K=\text{top\_k}
$$

です。実装では同点や欠損で不足時、当月の $mom_{\theta,0}$ 順で補完します。

先読み回避のため `exec_lag` を適用し、$t_r$ で決めたウェイトは

$$
\text{first tradable date at/after } t_r + \text{lag}
$$

から反映されます（決定日当日に即時適用しません）。


In [None]:
# 6) Build Monthly/Period Rebalance Schedule

# Turn scores into theme weights; the frequency and the number of selected
# themes come from CONFIG["rebalance"] and CONFIG["top_k"].
theme_weights = build_rebalance_theme_weights(
    score_m=score_m,
    mom_m=mom_m,
    close=close,
    theme_to_tickers=THEME_TO_TICKERS,
    rebalance=CONFIG["rebalance"],
    top_k=CONFIG["top_k"],
)

# One row of theme_weights per rebalance point; show the latest selection.
print("rebalance points:", len(theme_weights))
print("non-zero themes on last rebalance:")
print(theme_weights.iloc[-1][theme_weights.iloc[-1] > 0])


## (5) Portfolio Weight Construction（等ウェイト）

選抜テーマ間は等ウェイト、テーマ内ETFも等ウェイトです。

$$
w^{\text{theme}}_{\theta,t_r}=\begin{cases}
\frac{1}{|\mathcal{S}(t_r)|}, & \theta\in\mathcal{S}(t_r)\\
0, & \text{otherwise}
\end{cases}
$$

ETFレベルでは

$$
w^{\text{raw}}_{i,t_r}=\sum_{\theta\in\mathcal{S}(t_r)} w^{\text{theme}}_{\theta,t_r}
\cdot \frac{\mathbf{1}\{i\in\mathcal{I}_\theta(t_r)\}}{|\mathcal{I}_\theta(t_r)|}
$$

を計算し、合計が正の場合は $\sum_i w^{\text{raw}}_{i,t_r}=1$ に正規化します。

ベンチマークEQWは投資可能ETFのみで

$$
b_{i,t}=\begin{cases}
\frac{1}{|\mathcal{I}(t)|}, & i\in\mathcal{I}(t)\\
0, & \text{otherwise}
\end{cases},\quad
r^{EQW}_t=\sum_i b_{i,t}r_{i,t}
$$

と定義します。


In [None]:
# 7) Construct Weights (Top-K -> ETF equal weight)

# Map theme weights at each rebalance point to ETF-level weights.
etf_weights_raw = map_theme_to_etf_weights(
    theme_weights=theme_weights,
    close=close,
    theme_to_tickers=THEME_TO_TICKERS,
)

# Expand rebalance-date weights onto the daily return index, applying the
# configured execution lag (CONFIG["exec_lag"]).
daily_weights_unhedged = expand_weights_daily(
    rebalance_weights=etf_weights_raw,
    daily_index=returns.index,
    exec_lag_days=CONFIG["exec_lag"],
)

print("rebalance ETF weight rows:", etf_weights_raw.shape)
print("daily unhedged weights shape:", daily_weights_unhedged.shape)
# Diagnostic: the markdown above states raw weights are normalized to sum to 1.
print("last rebalance weight sum:", float(etf_weights_raw.iloc[-1].sum()))


## (6) Beta Estimation vs EQW（先読み回避）

各リバランス意思決定時点で、EQWリターンを因子として過去窓のみで beta を推定します。

- ルックバック長: $L_\beta =$ `beta_lookback`
- 使用データ: $[t_r-L_\beta,\ t_r-1]$（実装上は取引日で前方詰めした decision date の前日まで）

$$
\beta_{t_r}=\frac{\mathrm{Cov}\left(r^{port}_{t_r-L_\beta:t_r-1},\ r^{EQW}_{t_r-L_\beta:t_r-1}\right)}{\mathrm{Var}\left(r^{EQW}_{t_r-L_\beta:t_r-1}\right)}
$$

未来のリターンは一切使いません。


## (7) Neutralization Modes（none / A / B）

EQW方向ベクトルを $u(t_r)$ とし、投資可能ETFにのみ等ウェイトを置きます。

$$
u_i(t_r)=\begin{cases}
\frac{1}{|\mathcal{I}(t_r)|}, & i\in\mathcal{I}(t_r)\\
0, & \text{otherwise}
\end{cases}
$$

初期ヘッジ係数は

$$
\lambda^{init}_{t_r}=\frac{\beta^{hist}_{t_r}-\beta^{target}}{\beta_u},\quad
\beta_u=\beta(u, r^{EQW})
$$

で作り、実装では coarse/fine の簡易ラインサーチで $|\beta-\beta^{target}|$ を最小化する $\lambda$ を選びます。

### mode = `none`

$$
w'_{t_r}=w^{raw}_{t_r}
$$

（調整なし）。

### mode = `A_weight_adjust`

$$
w'_{t_r}=\Pi_A\left(w^{raw}_{t_r}-\lambda_{t_r}u(t_r)\right)
$$

- `allow_short_hedge=False` のとき負ウェイトを 0 にクリップ
- 合計を 1 に再正規化（`enforce_sum_one=True`）
- 総レバレッジ $\|w\|_1$ が `max_leverage` 超過時は比例縮小

### mode = `B_eqw_hedge`

$$
w'_{t_r}=\Pi_B\left(w^{raw}_{t_r}-\lambda_{t_r}u(t_r)\right)
$$

- 負ウェイト許容は `allow_short_hedge` で制御
- `B` では合計1への強制正規化を行わない（`enforce_sum_one=False`）
- ただし $\|w\|_1 \le \text{max\_leverage}$ は常に満たすよう射影

制約で厳密一致できない場合は、上記射影後に「最も近い feasible」解を採用します。


In [None]:
# 8) Apply Neutralization (none / A_weight_adjust / B_eqw_hedge)

# EQW factor for beta estimation (costless factor return)
eqw_factor_bt = compute_eqw_backtest(
    close=close,
    returns=returns,
    fee_bps=0.0,
    slippage_bps=0.0,
)
eqw_factor_ret = eqw_factor_bt["net_ret"]

# Per-mode results keyed by neutral mode name; consumed by the backtest and
# weight-export cells.
neutralized_payload = {}
for mode in CONFIG["neutral_modes"]:
    # Every mode starts from the same raw rebalance weights and the same
    # unhedged daily weights, so the modes are compared under equal inputs.
    rebalance_w, beta_before, beta_after, lambda_used = apply_beta_neutralization(
        rebalance_weights=etf_weights_raw,
        daily_weights_unhedged=daily_weights_unhedged,
        close=close,
        returns=returns,
        eqw_factor_ret=eqw_factor_ret,
        neutral_mode=mode,
        beta_target=CONFIG["beta_target"],
        beta_lookback=CONFIG["beta_lookback"],
        allow_short_hedge=CONFIG["allow_short_hedge"],
        max_leverage=CONFIG["max_leverage"],
    )

    # Re-expand the adjusted rebalance weights to daily frequency with the
    # same execution lag used for the unhedged weights.
    daily_w = expand_weights_daily(
        rebalance_weights=rebalance_w,
        daily_index=returns.index,
        exec_lag_days=CONFIG["exec_lag"],
    )

    neutralized_payload[mode] = {
        "rebalance_weights": rebalance_w,
        "daily_weights": daily_w,
        "beta_before": beta_before,
        "beta_after": beta_after,
        "lambda_used": lambda_used,
    }

# Diagnostics table: one row per mode (after the transpose) holding the mean
# beta before/after neutralization and the mean lambda, NaNs skipped.
beta_diag = pd.DataFrame(
    {
        mode: {
            "mean_beta_before": float(payload["beta_before"].mean(skipna=True)),
            "mean_beta_after": float(payload["beta_after"].mean(skipna=True)),
            "mean_lambda": float(payload["lambda_used"].mean(skipna=True)),
        }
        for mode, payload in neutralized_payload.items()
    }
).T

print(beta_diag)


## (8) Turnover & Transaction Costs

日次ポートフォリオ収益は

$$
r^{gross}_t=\sum_i w_{i,t}r_{i,t}
$$

turnover は

$$
TO_t=\sum_i |w_{i,t}-w_{i,t-1}|
$$

片道コスト係数を

$$
c=(\text{fee\_bps}+\text{slippage\_bps})\times 10^{-4}
$$

とすると、実装は日次控除で

$$
r^{net}_t=r^{gross}_t-c\,TO_t
$$

です（$cost_t = c\,TO_t$, $cum\_cost_t = \sum_{\tau\le t} cost_\tau$）。

EQWも同様の式で評価し、beta推定用因子では `fee=slippage=0` を使います。


In [None]:
# 9) Backtest (gross/net, turnover, cost)

# Backtest each neutral mode with identical fee/slippage settings.
results_by_mode = {}
for mode in CONFIG["neutral_modes"]:
    bt = backtest_from_daily_weights(
        daily_weights=neutralized_payload[mode]["daily_weights"],
        returns=returns,
        fee_bps=CONFIG["fee_bps"],
        slippage_bps=CONFIG["slippage_bps"],
    )
    # Realized rolling beta of net returns against the costless EQW factor;
    # stored for the rolling-beta figure.
    rb = rolling_beta(bt["net_ret"], eqw_factor_ret, lookback=CONFIG["beta_lookback"])

    results_by_mode[mode] = {
        "bt": bt,
        "rolling_beta": rb,
        "beta_before": neutralized_payload[mode]["beta_before"],
        "beta_after": neutralized_payload[mode]["beta_after"],
        "lambda_used": neutralized_payload[mode]["lambda_used"],
    }

# Benchmark EQW (same cost setup as strategy for gross/net comparison)
eqw_bt = compute_eqw_backtest(
    close=close,
    returns=returns,
    fee_bps=CONFIG["fee_bps"],
    slippage_bps=CONFIG["slippage_bps"],
)

print("backtest done for modes:", list(results_by_mode.keys()))
print("eqw final net nav:", float(eqw_bt["net_nav"].iloc[-1]))


## (9) Outputs & Evaluation

`split_date` で train/test を分割し、modeごとに以下を計算します。

- CAGR: $\left(\prod_t (1+r_t)\right)^{252/T}-1$
- 年率ボラ: $\sigma(r_t)\sqrt{252}$
- Sharpe: $\mathrm{CAGR}/\mathrm{AnnVol}$
- MDD: $\min_t\left(\frac{NAV_t}{\max_{\tau\le t}NAV_\tau}-1\right)$
- 平均turnover, 総コスト, rolling beta（対EQW）

比較は `none / A_weight_adjust / B_eqw_hedge` を同一条件で行い、主に以下で確認します。

- 表: `outputs/tables/final_method_train_test_metrics.csv`
- 表: `outputs/tables/final_method_mode_summary.csv`
- 図: `outputs/figures/final_method_nav_none_A_B_vs_eqw.png`
- 図: `outputs/figures/final_method_rolling_beta_none_A_B.png`
- 図: `outputs/figures/final_method_turnover_none_A_B.png`
- 図: `outputs/figures/final_method_cumulative_cost_none_A_B.png`


In [None]:
# 10) Evaluate (train/test)

# Per-mode metrics for the all/train/test periods split at CONFIG["split_date"].
metrics_train_test = summarize_train_test(
    results_by_mode=results_by_mode,
    eqw_factor_ret=eqw_factor_ret,
    split_date=CONFIG["split_date"],
    beta_lookback=CONFIG["beta_lookback"],
)

# Full-period summary, one row per mode.
mode_summary_all = summarize_mode_all(
    results_by_mode=results_by_mode,
    eqw_factor_ret=eqw_factor_ret,
    beta_lookback=CONFIG["beta_lookback"],
)

# Single-row benchmark table using the same column names as mode_summary_all
# (with "benchmark" in place of "neutral_mode").
eqw_summary = pd.DataFrame(
    {
        "benchmark": ["EQW"],
        "ann_return_net": [annualized_return(eqw_bt["net_ret"])],
        "ann_return_gross": [annualized_return(eqw_bt["gross_ret"])],
        "sharpe_net": [sharpe_ratio(eqw_bt["net_ret"])],
        "mdd": [max_drawdown(eqw_bt["net_nav"])],
        "avg_turnover": [float(eqw_bt["turnover"].mean())],
        "total_cost": [float(eqw_bt["cost"].sum())],
        # Hard-coded rather than estimated: the benchmark is the factor itself.
        "avg_beta": [1.0],
        "avg_leverage": [float(eqw_bt["leverage"].mean())],
        "final_nav_net": [float(eqw_bt["net_nav"].iloc[-1])],
    }
)

# Output locations; these path variables are reused by the plot/log cell.
metrics_train_test_path = OUT_TABLE_DIR / "final_method_train_test_metrics.csv"
mode_summary_path = OUT_TABLE_DIR / "final_method_mode_summary.csv"
eqw_summary_path = OUT_TABLE_DIR / "final_method_eqw_summary.csv"

metrics_train_test.to_csv(metrics_train_test_path, index=False)
mode_summary_all.to_csv(mode_summary_path, index=False)
eqw_summary.to_csv(eqw_summary_path, index=False)

print("saved:", metrics_train_test_path)
print("saved:", mode_summary_path)
print("saved:", eqw_summary_path)

print("\nmode summary (all period):")
print(mode_summary_all)


## 12) Weights Export & Visualization

### ウェイト定義
本 notebook の日次ETFウェイトを $w_{i,t}$ とする。テーマ $\theta$ の集合を $\mathcal{I}_\theta$ とすると、

$$
W^{\text{theme}}_{\theta,t}=\sum_{i\in\mathcal{I}_\theta} w_{i,t}
$$

でテーマウェイトを定義する。

### テーマ集計方法
- `weights_etf_<mode>.csv`: ETF日次ウェイト時系列
- `weights_theme_<mode>.csv`: 上式で集計したテーマ日次ウェイト時系列

### ローンチ日による有効集合の変化
ETFローンチ日前は欠損で扱うため、時点 $t$ で有効なETF集合 $\mathcal{I}(t)$ は時間変化する。したがってテーマ有効集合も時変であり、初期期間ほど投資可能テーマは限定される。


In [None]:
# 12) Weights Export & Visualization


def aggregate_theme_weights_from_etf(etf_weights: pd.DataFrame, theme_to_tickers: dict) -> pd.DataFrame:
    """Sum ETF weights into per-theme weights.

    Args:
        etf_weights: Frame with one column per ETF ticker.
        theme_to_tickers: Mapping of theme name to its list of tickers.

    Returns:
        Frame sharing ``etf_weights``' index, with one column per theme in
        mapping order. A theme with none of its tickers present stays at 0.0.
    """
    aggregated = pd.DataFrame(
        0.0,
        index=etf_weights.index,
        columns=list(theme_to_tickers.keys()),
    )
    for theme_name, members in theme_to_tickers.items():
        present = [ticker for ticker in members if ticker in etf_weights.columns]
        if not present:
            # No member ETF has a weight column: keep the zero placeholder.
            continue
        aggregated[theme_name] = etf_weights[present].sum(axis=1)
    return aggregated


# Collected while exporting: file paths per mode, plus the in-memory frames.
weights_export_log = []
theme_weights_by_mode = {}
etf_weights_by_mode = {}

for mode in CONFIG["neutral_modes"]:
    etf_daily = neutralized_payload[mode]["daily_weights"].copy()
    etf_weights_by_mode[mode] = etf_daily

    etf_path = OUT_TABLE_DIR / f"weights_etf_{mode}.csv"
    etf_daily.to_csv(etf_path)

    theme_daily = aggregate_theme_weights_from_etf(etf_daily, THEME_TO_TICKERS)
    theme_weights_by_mode[mode] = theme_daily

    theme_path = OUT_TABLE_DIR / f"weights_theme_{mode}.csv"
    theme_daily.to_csv(theme_path)

    weights_export_log.append({"mode": mode, "etf_path": str(etf_path), "theme_path": str(theme_path)})

    # Heatmap uses monthly average weights for readability.
    # A MonthEnd offset object is used instead of the "ME" string alias so the
    # cell also runs on pandas releases that predate that alias.
    theme_monthly = theme_daily.resample(pd.offsets.MonthEnd()).mean()
    mat = theme_monthly.T.values

    fig, ax = plt.subplots(figsize=(12, 6))
    # Colour scale tops out at the largest observed weight, but never below 0.25.
    vmax = float(np.nanmax(mat)) if np.isfinite(np.nanmax(mat)) else 0.25
    vmax = max(vmax, 0.25)
    im = ax.imshow(mat, aspect="auto", cmap="YlGnBu", vmin=0.0, vmax=vmax)

    y_idx = np.arange(len(theme_monthly.columns))
    ax.set_yticks(y_idx)
    ax.set_yticklabels(theme_monthly.columns, fontsize=8)

    if len(theme_monthly.index) > 1:
        # At most ten evenly spaced month labels on the x axis.
        x_idx = np.linspace(0, len(theme_monthly.index) - 1, num=min(10, len(theme_monthly.index)), dtype=int)
        x_lbl = [theme_monthly.index[i].strftime("%Y-%m") for i in x_idx]
        ax.set_xticks(x_idx)
        ax.set_xticklabels(x_lbl, rotation=45, ha="right")

    ax.set_title(f"Theme Weights Heatmap ({mode})")
    ax.set_xlabel("Month")
    ax.set_ylabel("Theme")
    fig.colorbar(im, ax=ax, fraction=0.02, pad=0.01, label="weight")

    heatmap_path = OUT_FIG_DIR / f"theme_weights_heatmap_{mode}.png"
    fig.savefig(heatmap_path, dpi=150, bbox_inches="tight")
    plt.show()

# Top-K timeline at rebalance dates (mode-independent selection rule).
# Name both index levels explicitly: reset_index() only produces the
# "level_0"/"level_1" placeholders when the axes are unnamed, so relying on
# them breaks as soon as theme_weights carries an index or columns name.
selected_long = (
    theme_weights.stack()
    .rename_axis(["date", "theme"])
    .rename("weight")
    .reset_index()
)
selected_long = selected_long[selected_long["weight"] > 0].copy()

fig, ax = plt.subplots(figsize=(12, 5))
if not selected_long.empty:
    themes = sorted(selected_long["theme"].unique().tolist())
    theme_to_y = {theme: idx for idx, theme in enumerate(themes)}

    # One horizontal lane per theme; a dot marks each rebalance date on which
    # the theme carried a positive weight.
    for theme in themes:
        dsub = selected_long[selected_long["theme"] == theme]
        ax.scatter(dsub["date"], [theme_to_y[theme]] * len(dsub), s=16, label=theme)

    ax.set_yticks(list(theme_to_y.values()))
    ax.set_yticklabels(list(theme_to_y.keys()), fontsize=8)

ax.set_title("Top-K Theme Timeline (Rebalance Dates)")
ax.set_xlabel("date")
ax.set_ylabel("selected theme")
ax.grid(True, alpha=0.3)

timeline_path = OUT_FIG_DIR / "theme_weights_topk_timeline.png"
fig.savefig(timeline_path, dpi=150, bbox_inches="tight")
plt.show()

weights_export_df = pd.DataFrame(weights_export_log)
weights_export_df.to_csv(OUT_TABLE_DIR / "weights_export_log.csv", index=False)

print("saved weight tables/figures")
print(weights_export_df)
print("timeline:", timeline_path)


## 13) FF Attribution（Ken French Data Library）

この節では Fama-French 因子を自動取得し、戦略月次リターンを事後分析する。

### 月次化
日次リターン $r_d$ を月次へ変換:

$$
r_m = \prod_{d\in m}(1+r_d)-1
$$

### excess return
戦略 net 月次リターンを $r^{str}_m$、無リスクを $RF_m$ として

$$
y_m = r^{str}_m - RF_m
$$

を回帰対象とする。追加で active return proxy として

$$
r^{active}_m = r^{str}_m - r^{EQW}_m
$$

も分析する（こちらは自己資金型差分として RF 控除しない）。

### 回帰モデル

- FF3:
$$
y = \alpha + b_M (MKT-RF) + b_S SMB + b_H HML + \varepsilon
$$

- FF5:
$$
y = \alpha + b_M (MKT-RF) + b_S SMB + b_H HML + b_R RMW + b_C CMA + \varepsilon
$$

- FF5+MOM（MOM取得時のみ）:
$$
y = \alpha + b_M (MKT-RF) + b_S SMB + b_H HML + b_R RMW + b_C CMA + b_{Mo} MOM + \varepsilon
$$


In [None]:
# 13-a) Download and parse FF factors from Ken French Data Library

import io
import re
import zipfile
import urllib.request
from urllib.error import HTTPError, URLError

# Local cache directory for the downloaded factor tables (created if missing).
FF_DATA_DIR = ROOT / "data"
FF_DATA_DIR.mkdir(parents=True, exist_ok=True)


# Download specification per factor dataset:
#   output        - CSV path where the parsed monthly table is cached
#   urls          - candidate archive URLs, tried in listed order by fetch_ff_dataset
#   expected_cols - column names assigned, in order, to the value fields of each row
FF_SPECS = {
    "ff3": {
        "output": FF_DATA_DIR / "ff_factors_ff3_monthly.csv",
        "urls": [
            "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_CSV.zip",
            "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_TXT.zip",
        ],
        "expected_cols": ["MKT_RF", "SMB", "HML", "RF"],
    },
    "ff5": {
        "output": FF_DATA_DIR / "ff_factors_ff5_monthly.csv",
        "urls": [
            "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_5_Factors_2x3_CSV.zip",
            "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_5_Factors_2x3_TXT.zip",
        ],
        "expected_cols": ["MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"],
    },
    "mom": {
        "output": FF_DATA_DIR / "ff_mom_monthly.csv",
        "urls": [
            "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Momentum_Factor_CSV.zip",
            "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Momentum_Factor_TXT.zip",
        ],
        "expected_cols": ["MOM"],
    },
}


def _extract_text_from_zip(content: bytes) -> str:
    """Return the decoded text of the first data file inside a zip archive.

    The first member ending in ``.csv`` or ``.txt`` (case-insensitive) is
    preferred; if there is none, the first member of any kind is used.

    Args:
        content: Raw bytes of the zip archive.

    Returns:
        The member decoded as UTF-8, or as latin1 when it is not valid UTF-8.

    Raises:
        ValueError: If the archive has no members.
        zipfile.BadZipFile: If ``content`` is not a zip archive.
    """
    with zipfile.ZipFile(io.BytesIO(content)) as zf:
        names = zf.namelist()
        if not names:
            raise ValueError("zip contains no files")
        preferred = [n for n in names if n.lower().endswith((".csv", ".txt"))]
        raw = zf.read((preferred or names)[0])

    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        # latin1 maps every byte value to a code point, so this decode cannot
        # fail and no further fallback is needed.
        return raw.decode("latin1")


def _parse_monthly_ff_table(raw_text: str, expected_cols: list[str]) -> pd.DataFrame:
    """Parse the first monthly (YYYYMM) block of a Ken French factor file.

    Both the comma-separated layout and the whitespace-delimited layout are
    accepted, so the ``_TXT.zip`` fallback archives parse as well. Parsing
    stops at the first line after the block that is not a monthly row.

    Args:
        raw_text: Full text of the factor file.
        expected_cols: Names assigned, in order, to the value fields that
            follow the YYYYMM field. Extra trailing fields are ignored.

    Returns:
        Frame indexed by month-end date with ``expected_cols`` as columns.
        Values are divided by 100; entries at or below -90 become NaN.

    Raises:
        ValueError: If no monthly row with enough fields is found.
    """
    n_values = len(expected_cols)
    # A monthly row starts with a six-digit YYYYMM token that is followed by
    # either a comma (CSV layout) or whitespace (TXT layout).
    row_start = re.compile(r"^\s*\d{6}(?=\s*,|\s)")

    rows = []
    in_block = False

    for line in raw_text.replace("\r", "").split("\n"):
        if row_start.match(line) is None:
            if in_block:
                # Stop at annual section or first non-monthly line after entering block.
                break
            continue

        in_block = True
        if "," in line:
            # Keep empty CSV fields in place so the columns do not shift.
            parts = [p.strip() for p in line.split(",")]
        else:
            parts = line.split()
        if len(parts) < 1 + n_values:
            continue
        rows.append(parts[: 1 + n_values])

    if not rows:
        raise ValueError("monthly YYYYMM block not found")

    df = pd.DataFrame(rows, columns=["YYYYMM"] + expected_cols)
    df["YYYYMM"] = pd.to_numeric(df["YYYYMM"], errors="coerce").astype("Int64")
    df = df.dropna(subset=["YYYYMM"]).copy()

    for col in expected_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")
        # Ken French missing code often appears near -99.99.
        df.loc[df[col] <= -90, col] = np.nan
        # Source values are in percent; convert to decimal returns.
        df[col] = df[col] / 100.0

    # Stamp each row with the last calendar day of its month.
    df["date"] = pd.to_datetime(df["YYYYMM"].astype(int).astype(str), format="%Y%m") + pd.offsets.MonthEnd(0)
    df = df.set_index("date")[expected_cols].sort_index()
    return df


def fetch_ff_dataset(spec_key: str, force_download: bool = False):
    """Load one factor dataset from the local cache or download it.

    Args:
        spec_key: Key into ``FF_SPECS``.
        force_download: When True, ignore an existing cached CSV.

    Returns:
        Tuple ``(df, status, source, error)``:
        ``(frame, "cached", cache_path, "")`` when the cache is used,
        ``(frame, "downloaded", url, "")`` after a successful download, or
        ``(None, "failed", "", last_error)`` when every URL failed.

    Raises:
        KeyError: If ``spec_key`` is not in ``FF_SPECS``.
    """
    spec = FF_SPECS[spec_key]
    out_path = spec["output"]

    if out_path.exists() and not force_download:
        df = pd.read_csv(out_path, index_col=0, parse_dates=True)
        df.index = pd.to_datetime(df.index)
        return df, "cached", str(out_path), ""

    last_error = ""
    for url in spec["urls"]:
        try:
            with urllib.request.urlopen(url, timeout=45) as resp:
                content = resp.read()
            raw_text = _extract_text_from_zip(content)
            df = _parse_monthly_ff_table(raw_text, spec["expected_cols"])
        except (OSError, zipfile.BadZipFile, ValueError) as exc:
            # OSError covers HTTPError, URLError and TimeoutError, and also
            # socket read timeouts on Python < 3.10 and connection resets,
            # so a flaky mirror falls through to the next URL instead of
            # aborting the notebook.
            last_error = f"{type(exc).__name__}: {exc}"
            print(f"[WARN] {spec_key} failed from {url} -> {last_error}")
            continue

        # Written outside the try block so a cache-write problem is raised
        # as-is rather than reported as a download failure.
        df.to_csv(out_path)
        return df, "downloaded", url, ""

    return None, "failed", "", last_error


# Fetch every factor dataset and record how each one was obtained.
ff_results = {}
ff_status_rows = []
# Optional config switch; a missing key means "reuse cached files".
force_ff = bool(CONFIG.get("force_ff_download", False))

for key in ["ff3", "ff5", "mom"]:
    df, status, source, err = fetch_ff_dataset(key, force_download=force_ff)
    # df is None when the fetch failed; later cells check for that.
    ff_results[key] = df
    ff_status_rows.append(
        {
            "dataset": key,
            "status": status,
            "source_or_file": source,
            "error": err,
            "n_rows": (0 if df is None else len(df)),
        }
    )

# Persist the per-dataset status table next to the other output tables.
ff_status_df = pd.DataFrame(ff_status_rows)
ff_status_path = OUT_TABLE_DIR / "ff_download_status.csv"
ff_status_df.to_csv(ff_status_path, index=False)

print(ff_status_df)
print("saved:", ff_status_path)


In [None]:
# 13-b) Monthly returns, FF regressions, and output figures


def daily_to_monthly_simple(ret: pd.Series) -> pd.Series:
    """Compound daily simple returns into month-end simple returns.

    Args:
        ret: Daily simple returns indexed by a DatetimeIndex.

    Returns:
        Series labelled by calendar month end holding
        ``prod(1 + r_d) - 1`` over each month.
    """
    # A MonthEnd offset object is passed instead of the "ME" string alias so
    # the function also works on pandas releases that predate that alias.
    return (1.0 + ret).resample(pd.offsets.MonthEnd()).prod() - 1.0


def fit_ols_with_tstats(y: pd.Series, X: pd.DataFrame, x_cols: list[str]):
    """Fit an OLS regression with intercept and classical t-statistics.

    ``y`` and ``X[x_cols]`` are aligned on their index and rows with any
    missing value are dropped before fitting.

    Args:
        y: Dependent variable.
        X: Frame containing the regressors.
        x_cols: Names of the regressor columns to use, in output order.

    Returns:
        ``None`` when the aligned sample has no residual degrees of freedom
        (``n <= len(x_cols) + 1``). Otherwise a dict with ``nobs``, ``r2``,
        ``alpha_monthly``, ``alpha_annualized`` (``(1 + alpha) ** 12 - 1``),
        ``t_alpha`` and, per regressor, ``beta_<name>`` and ``t_<name>``.
        A t-statistic is NaN when its standard error is not above 1e-12.
    """
    merged = pd.concat([y.rename("y"), X[x_cols]], axis=1).dropna()
    n = len(merged)
    k = len(x_cols)
    if n <= (k + 1):
        return None

    yv = merged["y"].to_numpy(dtype=float)
    xv = merged[x_cols].to_numpy(dtype=float)
    # Leading column of ones is the intercept.
    Xmat = np.column_stack([np.ones(n), xv])

    coef, _, _, _ = np.linalg.lstsq(Xmat, yv, rcond=None)
    resid = yv - Xmat @ coef

    sse = float((resid ** 2).sum())
    sst = float(((yv - yv.mean()) ** 2).sum())
    # R^2 is undefined for a (numerically) constant dependent variable.
    r2 = (1.0 - sse / sst) if sst > 1e-12 else np.nan

    # The early return above guarantees at least one degree of freedom.
    dof = n - (k + 1)
    sigma2 = sse / dof
    # pinv keeps the covariance computable when regressors are collinear.
    xtx_inv = np.linalg.pinv(Xmat.T @ Xmat)
    se = np.sqrt(np.diag(sigma2 * xtx_inv))
    # Divide only where the standard error is usable; elsewhere stay NaN.
    # This avoids the divide-by-zero warnings of an unconditional coef / se.
    tvals = np.full_like(coef, np.nan)
    np.divide(coef, se, out=tvals, where=se > 1e-12)

    res = {
        "nobs": int(n),
        "r2": float(r2),
        "alpha_monthly": float(coef[0]),
        "alpha_annualized": float((1.0 + coef[0]) ** 12 - 1.0),
        "t_alpha": float(tvals[0]) if np.isfinite(tvals[0]) else np.nan,
    }

    for j, name in enumerate(x_cols, start=1):
        res[f"beta_{name}"] = float(coef[j])
        res[f"t_{name}"] = float(tvals[j]) if np.isfinite(tvals[j]) else np.nan

    return res


# Strategy monthly series (net)
strategy_monthly = {
    mode: daily_to_monthly_simple(results_by_mode[mode]["bt"]["net_ret"])
    for mode in CONFIG["neutral_modes"]
}
eqw_monthly = daily_to_monthly_simple(eqw_bt["net_ret"])

# Active return proxy: strategy monthly return minus EQW monthly return.
active_monthly = {
    f"{mode}_minus_EQW": strategy_monthly[mode] - eqw_monthly
    for mode in CONFIG["neutral_modes"]
}

# Each entry is None when the corresponding download/cache read failed.
ff3 = ff_results.get("ff3")
ff5 = ff_results.get("ff5")
mom = ff_results.get("mom")

# Register only the models whose factor table is present with all required columns.
factor_models = {}
if ff3 is not None and {"MKT_RF", "SMB", "HML", "RF"}.issubset(ff3.columns):
    factor_models["FF3"] = {
        "data": ff3,
        "x_cols": ["MKT_RF", "SMB", "HML"],
        "has_rf": True,
    }

if ff5 is not None and {"MKT_RF", "SMB", "HML", "RMW", "CMA", "RF"}.issubset(ff5.columns):
    factor_models["FF5"] = {
        "data": ff5,
        "x_cols": ["MKT_RF", "SMB", "HML", "RMW", "CMA"],
        "has_rf": True,
    }

if ff5 is not None and mom is not None:
    # Inner join keeps only the months available in both tables.
    ff5_mom = ff5.join(mom[["MOM"]], how="inner")
    if {"MKT_RF", "SMB", "HML", "RMW", "CMA", "MOM", "RF"}.issubset(ff5_mom.columns):
        factor_models["FF5_MOM"] = {
            "data": ff5_mom,
            "x_cols": ["MKT_RF", "SMB", "HML", "RMW", "CMA", "MOM"],
            "has_rf": True,
        }

# One regression result dict per (series, model) pair.
rows = []

for mode in CONFIG["neutral_modes"]:
    series = strategy_monthly[mode]
    for model_name, model_spec in factor_models.items():
        fac = model_spec["data"]
        x_cols = model_spec["x_cols"]

        # Every model registered above sets has_rf=True, so the raw-return
        # branch is only a safeguard for a factor table without an RF column.
        if model_spec["has_rf"] and "RF" in fac.columns:
            y = series - fac["RF"]
            return_type = "strategy_excess"
        else:
            y = series.copy()
            return_type = "strategy_raw"

        fit = fit_ols_with_tstats(y=y, X=fac, x_cols=x_cols)
        # None means too few overlapping observations for this model.
        if fit is None:
            continue
        fit.update(
            {
                "strategy_id": mode,
                "series_kind": "strategy_net",
                "model": model_name,
                "return_type": return_type,
            }
        )
        rows.append(fit)

for key, series in active_monthly.items():
    for model_name, model_spec in factor_models.items():
        fac = model_spec["data"]
        x_cols = model_spec["x_cols"]

        # Active return proxy is treated as self-financing spread (no RF subtraction).
        y = series.copy()
        fit = fit_ols_with_tstats(y=y, X=fac, x_cols=x_cols)
        if fit is None:
            continue
        fit.update(
            {
                "strategy_id": key,
                "series_kind": "active_minus_eqw",
                "model": model_name,
                "return_type": "active_raw",
            }
        )
        rows.append(fit)

# Models with different regressors produce different beta_/t_ columns; cells
# that do not apply to a row are left as NaN by the DataFrame constructor.
ff_attr_df = pd.DataFrame(rows)
ff_attr_path = OUT_TABLE_DIR / "ff_attribution.csv"
ff_attr_df.to_csv(ff_attr_path, index=False)

print("factor models:", list(factor_models.keys()))
print("saved:", ff_attr_path)
print(ff_attr_df.head(12))

# Figure 1: alpha comparison bar
alpha_fig_path = OUT_FIG_DIR / "ff_alpha_bar.png"
fig, ax = plt.subplots(figsize=(12, 5))

# With no regression rows the figure is still saved, just without bars.
if not ff_attr_df.empty:
    alpha_plot = ff_attr_df.copy()
    alpha_plot["label"] = alpha_plot["strategy_id"] + "|" + alpha_plot["model"]
    alpha_plot = alpha_plot.sort_values("alpha_annualized", ascending=False)

    # Alpha is stored as a decimal; plot it in percent per annum.
    ax.bar(alpha_plot["label"], alpha_plot["alpha_annualized"] * 100.0)
    ax.set_xticks(np.arange(len(alpha_plot)))
    ax.set_xticklabels(alpha_plot["label"], rotation=70, ha="right", fontsize=8)
    ax.axhline(0.0, color="black", linestyle="--", linewidth=1.0)

ax.set_title("FF Attribution: Annualized Alpha")
ax.set_ylabel("alpha (% p.a.)")
ax.grid(True, alpha=0.3)
fig.savefig(alpha_fig_path, dpi=150, bbox_inches="tight")
plt.show()

# Figure 2: beta heatmap (priority: FF5_MOM > FF5 > FF3)
beta_fig_path = OUT_FIG_DIR / "ff_betas_heatmap.png"
priority = ["FF5_MOM", "FF5", "FF3"]
# First model in priority order that was actually registered, else None.
model_for_heatmap = next((m for m in priority if m in factor_models), None)

fig, ax = plt.subplots(figsize=(10, 4.8))
if model_for_heatmap is not None and not ff_attr_df.empty:
    factor_cols = factor_models[model_for_heatmap]["x_cols"]
    # Only the strategy regressions are shown, not the active-return ones.
    sub = ff_attr_df[
        (ff_attr_df["model"] == model_for_heatmap)
        & (ff_attr_df["series_kind"] == "strategy_net")
    ].copy()

    if not sub.empty:
        mat_df = sub.set_index("strategy_id")[[f"beta_{c}" for c in factor_cols]]
        mat = mat_df.values
        # Symmetric colour scale around zero, at least +/-0.2 wide.
        vmax = np.nanmax(np.abs(mat)) if np.isfinite(np.nanmax(np.abs(mat))) else 1.0
        vmax = max(vmax, 0.2)

        im = ax.imshow(mat, aspect="auto", cmap="coolwarm", vmin=-vmax, vmax=vmax)
        ax.set_yticks(np.arange(len(mat_df.index)))
        ax.set_yticklabels(mat_df.index)
        ax.set_xticks(np.arange(len(factor_cols)))
        ax.set_xticklabels(factor_cols)
        ax.set_title(f"FF Betas Heatmap ({model_for_heatmap}, strategy_net)")
        fig.colorbar(im, ax=ax, fraction=0.025, pad=0.01, label="beta")

ax.grid(False)
fig.savefig(beta_fig_path, dpi=150, bbox_inches="tight")
plt.show()

print("saved:", alpha_fig_path)
print("saved:", beta_fig_path)


In [None]:
# 11) Plot & Save

# Output figure paths. Note that beta_fig_path is also assigned in the FF
# attribution cell; this cell rebinds it to the rolling-beta figure.
nav_fig_path = OUT_FIG_DIR / "final_method_nav_none_A_B_vs_eqw.png"
beta_fig_path = OUT_FIG_DIR / "final_method_rolling_beta_none_A_B.png"
turnover_fig_path = OUT_FIG_DIR / "final_method_turnover_none_A_B.png"
cost_fig_path = OUT_FIG_DIR / "final_method_cumulative_cost_none_A_B.png"

# NAV
# One net-NAV line per neutral mode plus the EQW benchmark (dashed black).
fig, ax = plt.subplots(figsize=(11, 4.5))
for mode in CONFIG["neutral_modes"]:
    nav = results_by_mode[mode]["bt"]["net_nav"]
    ax.plot(nav.index, nav.values, label=f"strategy:{mode}")
ax.plot(eqw_bt.index, eqw_bt["net_nav"], linestyle="--", color="black", label="benchmark:EQW")
ax.set_title("Final Method NAV (net)")
ax.set_ylabel("NAV")
ax.grid(True, alpha=0.3)
ax.legend(loc="best")
fig.savefig(nav_fig_path, dpi=150, bbox_inches="tight")
plt.show()

# Rolling beta
# Uses the series stored by the backtest cell; the dashed line marks the target.
fig, ax = plt.subplots(figsize=(11, 4))
for mode in CONFIG["neutral_modes"]:
    rb = results_by_mode[mode]["rolling_beta"]
    ax.plot(rb.index, rb.values, label=mode)
ax.axhline(CONFIG["beta_target"], linestyle="--", color="black", linewidth=1.0, label="beta_target")
ax.set_title("Rolling Beta vs EQW Factor")
ax.set_ylabel("beta")
ax.grid(True, alpha=0.3)
ax.legend(loc="best")
fig.savefig(beta_fig_path, dpi=150, bbox_inches="tight")
plt.show()

# Turnover
fig, ax = plt.subplots(figsize=(11, 4))
for mode in CONFIG["neutral_modes"]:
    t = results_by_mode[mode]["bt"]["turnover"]
    ax.plot(t.index, t.values, label=mode)
ax.set_title("Daily Turnover")
ax.set_ylabel("turnover")
ax.grid(True, alpha=0.3)
ax.legend(loc="best")
fig.savefig(turnover_fig_path, dpi=150, bbox_inches="tight")
plt.show()

# Cumulative cost
fig, ax = plt.subplots(figsize=(11, 4))
for mode in CONFIG["neutral_modes"]:
    c = results_by_mode[mode]["bt"]["cum_cost"]
    ax.plot(c.index, c.values, label=mode)
ax.set_title("Cumulative Transaction Cost")
ax.set_ylabel("cum cost")
ax.grid(True, alpha=0.3)
ax.legend(loc="best")
fig.savefig(cost_fig_path, dpi=150, bbox_inches="tight")
plt.show()

# Manifest of the main tables and figures, one path per line. The table path
# variables come from the evaluation cell, so that cell must run first.
log_path = OUT_LOG_DIR / "final_method_full_log.txt"
log_path.write_text("\n".join([
    str(metrics_train_test_path),
    str(mode_summary_path),
    str(eqw_summary_path),
    str(nav_fig_path),
    str(beta_fig_path),
    str(turnover_fig_path),
    str(cost_fig_path),
]))

print("saved figures:")
print(nav_fig_path)
print(beta_fig_path)
print(turnover_fig_path)
print(cost_fig_path)
print("saved log:", log_path)


## (10) Interpretation Notes（実務解釈）

- 超過収益の源泉は、テーマ間の相対的なモメンタム加速を月次で取りに行くローテーション効果です。
- beta-neutral（A/B）は、$r^{EQW}$ に共通する成分を削る方向に働き、テーマ選抜の純粋成分を観測しやすくします。
- `none` は共通成分も保持するため、上昇局面ではリターン水準が高くなりやすい一方、EQW感応度も残ります。
- 失敗しやすい条件は、(i) コスト上昇、(ii) turnover急増、(iii) テーマ間相関上昇（分散低下）です。
- **最終採択モードは `none`**（`docs/notes/70_final_spec.md` 準拠）です。A/B は検証比較用オーバーレイとして扱います。
