In [10]:
#Cell 1
# -*- coding: utf-8 -*-
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, label_binarize
import numpy as np
import torch.backends.cudnn as cudnn
from datetime import datetime
import matplotlib.pyplot as plt  # NEW: for line charts

# Global settings / reproducibility
# Let cuDNN benchmark and pick convolution algorithms for the input shapes it sees.
# NOTE(review): benchmark mode may choose different kernels between runs, which can work
# against the seeding below -- confirm whether bit-exact reproducibility is required.
cudnn.benchmark = True
# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()
# One seed shared by NumPy and PyTorch (CPU generator and every CUDA device).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Root path to UCRArchive_2018
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
ROOT = r"D:\2025暑期科研\UCRArchive_2018\UCRArchive_2018"

In [11]:
#Cell 2
def clean_and_pad_timeseries(raw_2d, min_len=8, cap_len=None, pad_value=0.0,
                             per_sample_standardize=True, fixed_len=None):
    """
    Strip NaNs from every row, optionally z-score it, then clip/pad all rows to one length.

    All NaN entries of a row are removed (wherever they occur, not only at the tail).
    Rows left with fewer than ``min_len`` values are dropped.

    Output length, in order of precedence:
      1) ``fixed_len`` when given (forced),
      2) ``min(longest kept row, cap_len)`` when ``cap_len`` is given,
      3) the longest kept row otherwise.

    Args:
        raw_2d: 2-D array, one series per row, NaN marking missing values.
        min_len: minimum number of non-NaN values a row needs to be kept.
        cap_len: optional upper bound on the output length.
        pad_value: value appended to rows shorter than the output length.
        per_sample_standardize: z-score each kept row with its own mean/std
            (a non-positive std is replaced by 1.0).
        fixed_len: optional forced output length.

    Returns:
        (X, keep_idx, real_lens): float32 array of shape [kept, target_len], int64 indices
        of the kept rows in ``raw_2d``, and int64 NaN-free lengths of those rows
        (measured before clipping/padding).

    Raises:
        ValueError: if no row survives the ``min_len`` filter.
    """
    import numpy as np

    n_rows, _ = raw_2d.shape
    kept_series, kept_rows = [], []

    for row_idx in range(n_rows):
        series = raw_2d[row_idx]
        series = series[~np.isnan(series)]
        if series.shape[0] < min_len:
            continue
        if per_sample_standardize:
            center = series.mean()
            spread = series.std()
            # Constant rows have zero std; divide by 1 so they become all zeros.
            series = (series - center) / (spread if spread > 0 else 1.0)
        kept_series.append(series)
        kept_rows.append(row_idx)

    if not kept_series:
        raise ValueError("All samples filtered out. Lower min_len if needed.")

    lengths = [series.shape[0] for series in kept_series]
    longest = max(lengths)
    if fixed_len is not None:
        target_len = int(fixed_len)
    elif cap_len is not None:
        target_len = min(longest, cap_len)
    else:
        target_len = longest

    def _fit_to_target(series):
        # Long enough rows are truncated from the right; short rows are right-padded.
        missing = target_len - series.shape[0]
        if missing <= 0:
            return series[:target_len]
        return np.pad(series, (0, missing), constant_values=pad_value)

    X = np.stack([_fit_to_target(series) for series in kept_series], axis=0).astype("float32")
    return X, np.array(kept_rows, dtype=np.int64), np.array(lengths, dtype=np.int64)

In [12]:
#Cell 3
class TwoTowerTransformer(nn.Module):
    """
    Two-tower Transformer classifier.

      - Tower 1: series tokens [B, T, input_dim1] -> linear embed + ReLU -> encoder
      - Tower 2: Aout vector [B, F] -> linear embed + ReLU -> one token -> encoder
      - Both token sets are concatenated along the sequence axis, passed through a final
        encoder, flattened, and mapped by a linear layer to ``num_classes`` logits.

    The three hidden widths must be equal and divisible by ``num_heads``.
    ``seq_len2`` is accepted for interface compatibility but the Aout tower always
    contributes exactly one token, so ``self.seq_len2`` is fixed to 1.
    """
    def __init__(self, input_dim1, input_dim2,
                 hidden_dim1, hidden_dim2, hidden_dim3,
                 num_heads, num_layers, num_classes,
                 seq_len1, seq_len2):
        super().__init__()
        widths = (hidden_dim1, hidden_dim2, hidden_dim3)
        assert hidden_dim1 == hidden_dim2 == hidden_dim3, "hidden dims must be equal in this version."
        assert all(width % num_heads == 0 for width in widths), "hidden_dim must be divisible by num_heads"

        def build_encoder(width):
            # One stack of ``num_layers`` batch-first encoder layers of the given width.
            block = nn.TransformerEncoderLayer(d_model=width, nhead=num_heads, batch_first=True)
            return nn.TransformerEncoder(block, num_layers=num_layers)

        self.embedding1 = nn.Linear(input_dim1, hidden_dim1)
        self.embedding2 = nn.Linear(input_dim2, hidden_dim2)
        self.relu = nn.ReLU()

        self.transformer1 = build_encoder(hidden_dim1)
        self.transformer2 = build_encoder(hidden_dim2)
        self.final_transformer = build_encoder(hidden_dim3)

        self.dropout = nn.Dropout(0.3)
        self.seq_len1 = seq_len1
        self.seq_len2 = 1  # the Aout tower is always a single token
        # The head sees every token flattened, so its width depends on the sequence length.
        self.fc = nn.Linear(hidden_dim3 * (seq_len1 + self.seq_len2), num_classes)

    def forward(self, x1, x2):
        """
        Args:
            x1: series, [B, T, input_dim1]; a 2-D [B, T] input gets a trailing channel axis.
            x2: Aout features, [B, F] or [B, 1, F].

        Returns:
            Logits of shape [B, num_classes].
        """
        series = x1.unsqueeze(-1) if x1.dim() == 2 else x1
        series_tokens = self.transformer1(self.relu(self.embedding1(series)))

        if x2.dim() == 3:
            assert x2.size(1) == 1, "Expect x2 with L2=1 if 3D"
            x2 = x2.squeeze(1)
        aout_token = self.relu(self.embedding2(x2)).unsqueeze(1)
        aout_token = self.transformer2(aout_token)

        fused = self.final_transformer(torch.cat((series_tokens, aout_token), dim=1))
        flat = self.dropout(fused.reshape(fused.size(0), -1))
        return self.fc(flat)


class VisitDataset(Dataset):
    """
    In-memory dataset of aligned (series, Aout features, label) triples.

    All three inputs are cast to float32 once at construction time; indexing returns the
    ``idx``-th entry of each array as a tuple.
    """
    def __init__(self, visit_x, aout_x, y):
        self.visit_x = visit_x.astype("float32")
        self.aout_x = aout_x.astype("float32")
        self.y = y.astype("float32")

    def __len__(self):
        # The label array defines the number of samples.
        return len(self.y)

    def __getitem__(self, idx):
        sample = (self.visit_x[idx], self.aout_x[idx], self.y[idx])
        return sample

In [13]:
#Cell 4
# ===== Backbones: Informer / FEDformer (Encoder-only Classifiers) =====
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class LearnedPositionalEncoding(nn.Module):
    """
    Learnable additive position table followed by dropout.

    The table has shape [1, seq_len, d_model] and is initialised from a truncated normal
    (std 0.02). In ``forward`` only the first ``x.size(1)`` positions are added to ``x``.
    """
    def __init__(self, seq_len: int, d_model: int, dropout: float = 0.0):
        super().__init__()
        table = torch.zeros(1, seq_len, d_model)
        nn.init.trunc_normal_(table, std=0.02)
        self.pe = nn.Parameter(table)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: [B, L, D]; slice the table to the actual sequence length before adding.
        positions = self.pe[:, :x.size(1), :]
        return self.dropout(x + positions)

class TokenEmbedding(nn.Module):
    """Per-time-step linear projection from ``input_dim`` channels to ``d_model`` features."""
    def __init__(self, input_dim: int, d_model: int):
        super().__init__()
        self.proj = nn.Linear(input_dim, d_model)

    def forward(self, x):
        # x: [B, L, C] -> [B, L, d_model]; the linear layer acts on the last axis only.
        embedded = self.proj(x)
        return embedded

class InformerEncoderLayer(nn.Module):
    """
    Post-norm encoder layer: multi-head self-attention, then a two-layer feed-forward
    network, each wrapped in dropout + residual connection + LayerNorm.

    ``activation`` selects GELU when it equals 'gelu' (case-insensitive); any other value
    selects ReLU.
    """
    def __init__(self, d_model=256, n_heads=4, d_ff=512, dropout=0.1, attn_dropout=0.1, activation='gelu'):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=attn_dropout, batch_first=True)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        use_gelu = activation.lower() == 'gelu'
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU() if use_gelu else nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, attn_mask=None, key_padding_mask=None):
        # x: [B, L, D]. Attention weights are not needed, only the attended values.
        attended = self.self_attn(
            x, x, x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )[0]
        hidden = self.norm1(x + self.dropout1(attended))
        return self.norm2(hidden + self.dropout2(self.ff(hidden)))

class DistilConv1D(nn.Module):
    """
    Sequence-shortening block: stride-2 Conv1d (kernel 3, padding 1) over the time axis,
    followed by BatchNorm1d, ELU and dropout. The channel count is unchanged.
    """
    def __init__(self, d_model, dropout=0.0):
        super().__init__()
        self.conv = nn.Conv1d(d_model, d_model, kernel_size=3, stride=2, padding=1)
        self.norm = nn.BatchNorm1d(d_model)
        self.act = nn.ELU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # x: [B, L, D]. Conv1d/BatchNorm1d expect channels first, so swap to [B, D, L],
        # run the pipeline, then swap back.
        channels_first = x.transpose(1, 2)
        shortened = self.drop(self.act(self.norm(self.conv(channels_first))))
        return shortened.transpose(1, 2)

class InformerEncoder(nn.Module):
    """
    Stack of ``e_layers`` InformerEncoderLayer blocks.

    When ``distil`` is true and there is more than one layer, a DistilConv1D block is
    applied after every layer except the last; otherwise ``distil_layers`` is None and the
    layers are applied back to back.
    """
    def __init__(self, d_model=256, n_heads=4, e_layers=2, d_ff=512, dropout=0.1, distil=True, attn_dropout=0.1, activation='gelu'):
        super().__init__()
        attention_blocks = [
            InformerEncoderLayer(d_model, n_heads, d_ff, dropout, attn_dropout, activation)
            for _ in range(e_layers)
        ]
        self.layers = nn.ModuleList(attention_blocks)

        if distil and e_layers > 1:
            self.distil_layers = nn.ModuleList(
                [DistilConv1D(d_model, dropout=dropout) for _ in range(e_layers - 1)]
            )
        else:
            self.distil_layers = None

    def forward(self, x):
        # x: [B, L, D]
        last_depth = len(self.layers) - 1
        for depth, layer in enumerate(self.layers):
            x = layer(x)
            # Distillation sits between consecutive layers, never after the final one.
            if self.distil_layers is not None and depth < last_depth:
                x = self.distil_layers[depth](x)
        return x

# ====== Frequency-domain block (FourierAttentionLike) & FEDEncoderLayer ======

class FourierAttentionLike(nn.Module):
    """
    Frequency-domain mixing block used as an encoder sub-layer.

    Steps:
      - rFFT along the time axis.
      - Rank frequencies by energy averaged over the batch and channel axes and keep the
        top ``k`` (``k = max(1, int(n_freq * k_ratio))``); all other frequencies are zeroed.
      - Multiply the kept frequencies by one learnable complex weight
        ``scale * exp(j * phase)`` shared by all channels.
      - Inverse rFFT back to the original sequence length.

    Because the ranking uses a batch average, the set of kept frequencies depends on which
    samples are in the batch.
    """
    def __init__(self, d_model: int, k_ratio: float = 0.25):
        super().__init__()
        self.k_ratio = k_ratio
        # Learnable magnitude and phase, shared across channels.
        self.scale = nn.Parameter(torch.ones(1))    # magnitude scaling
        self.phase = nn.Parameter(torch.zeros(1))   # phase offset in radians
        # Placeholders where input/output projections could be plugged in.
        self.proj_in = nn.Identity()
        self.proj_out = nn.Identity()

    def forward(self, x):
        # x: [B, L, D]
        _, seq_len, _ = x.shape
        x = self.proj_in(x)

        # rFFT over time -> [B, n_freq, D] with n_freq = L // 2 + 1.
        spectrum = torch.fft.rfft(x, dim=1)
        n_freq = spectrum.size(1)

        # Squared magnitude averaged over batch and channels -> one energy per frequency.
        power = (spectrum.real ** 2 + spectrum.imag ** 2).mean(dim=(0, 2))
        n_keep = max(1, int(n_freq * self.k_ratio))
        kept = torch.topk(power, k=n_keep, largest=True, sorted=False).indices

        # 0/1 mask over frequencies, shaped to broadcast against [B, n_freq, D].
        keep_mask = torch.zeros(n_freq, device=spectrum.device, dtype=spectrum.dtype)
        keep_mask[kept] = 1.0
        keep_mask = keep_mask.view(1, n_freq, 1)

        # Scalar complex weight scale * (cos(phase) + j * sin(phase)).
        weight = self.scale * torch.complex(torch.cos(self.phase), torch.sin(self.phase))
        spectrum = spectrum * keep_mask * weight

        # Back to the time domain at the original length.
        return self.proj_out(torch.fft.irfft(spectrum, n=seq_len, dim=1))


class FEDEncoderLayer(nn.Module):
    """
    Encoder layer that mixes tokens with a FourierAttentionLike block instead of
    self-attention, followed by a two-layer feed-forward network. Both sub-layers use
    dropout + residual connection + LayerNorm (post-norm).

    ``activation`` selects GELU when it equals 'gelu' (case-insensitive), ReLU otherwise.
    """
    def __init__(self, d_model=256, d_ff=512, dropout=0.1, activation='gelu', k_ratio=0.25):
        super().__init__()
        # Frequency-domain mixing sub-layer.
        self.fourier = FourierAttentionLike(d_model=d_model, k_ratio=k_ratio)

        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        use_gelu = activation.lower() == 'gelu'
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU() if use_gelu else nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Frequency mixing with residual connection.
        mixed = self.norm1(x + self.dropout1(self.fourier(x)))
        # Feed-forward network with residual connection.
        return self.norm2(mixed + self.dropout2(self.ff(mixed)))

class FEDformerEncoder(nn.Module):
    """Sequential stack of ``e_layers`` FEDEncoderLayer blocks; the sequence length is kept."""
    def __init__(self, d_model=256, e_layers=2, d_ff=512, dropout=0.1, activation='gelu', k_ratio=0.25):
        super().__init__()
        blocks = []
        for _ in range(e_layers):
            blocks.append(FEDEncoderLayer(d_model, d_ff, dropout, activation, k_ratio))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        # Feed the output of each layer into the next one.
        hidden = x
        for layer in self.layers:
            hidden = layer(hidden)
        return hidden

class TimePoolClassifierHead(nn.Module):
    """
    Pool a token sequence [B, L, D] over time, apply dropout, and project to class logits.

    ``pool`` must be one of:
      - "mean": average over the time axis,
      - "max":  element-wise maximum over the time axis,
      - "cls":  take the token at position 0.

    NOTE(review): with pool="cls" a ``cls_token`` parameter is created and initialised, but
    ``forward`` never reads it -- position 0 of the incoming sequence is used as is.
    """
    def __init__(self, d_model: int, num_classes: int, pool: str = "mean", dropout: float = 0.1):
        super().__init__()
        assert pool in ("mean","max","cls")
        self.pool = pool
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, num_classes)
        if pool == "cls":
            self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
            nn.init.trunc_normal_(self.cls_token, std=0.02)
        else:
            self.cls_token = None

    def forward(self, x):
        # x: [B, L, D] -> pooled: [B, D] -> logits: [B, num_classes]
        if self.pool == "mean":
            pooled = x.mean(dim=1)
        elif self.pool == "max":
            pooled = x.max(dim=1).values
        else:
            pooled = x[:, 0, :]
        return self.fc(self.dropout(pooled))

class InformerEncoderClassifier(nn.Module):
    """
    Encoder-only Informer classifier with the same call signature as TwoTowerTransformer.

    Pipeline: TokenEmbedding -> LearnedPositionalEncoding -> InformerEncoder ->
    TimePoolClassifierHead. ``forward(x1, x2)`` ignores ``x2``; ``x1`` is [B, L, input_dim]
    (a 2-D [B, L] input gets a trailing channel axis).
    """
    def __init__(self, input_dim:int, num_classes:int, seq_len:int,
                 d_model=256, n_heads=4, e_layers=2, d_ff=512,
                 dropout=0.1, attn_dropout=0.1, distil=True, activation='gelu', pool="mean"):
        super().__init__()
        self.token = TokenEmbedding(input_dim, d_model)
        self.pos = LearnedPositionalEncoding(seq_len, d_model, dropout)
        self.encoder = InformerEncoder(
            d_model, n_heads, e_layers, d_ff, dropout, distil, attn_dropout, activation
        )
        self.head = TimePoolClassifierHead(d_model, num_classes, pool, dropout)

    def forward(self, x1, x2=None):
        # x2 is accepted only so this model is interchangeable with the two-tower one.
        series = x1.unsqueeze(-1) if x1.dim() == 2 else x1
        encoded = self.encoder(self.pos(self.token(series)))
        return self.head(encoded)

class FEDformerEncoderClassifier(nn.Module):
    """
    Encoder-only FEDformer-style classifier with the same call signature as
    TwoTowerTransformer.

    Pipeline: TokenEmbedding -> LearnedPositionalEncoding -> FEDformerEncoder ->
    TimePoolClassifierHead. ``forward(x1, x2)`` ignores ``x2``; ``x1`` is [B, L, input_dim]
    (a 2-D [B, L] input gets a trailing channel axis).
    """
    def __init__(self, input_dim:int, num_classes:int, seq_len:int,
                 d_model=256, e_layers=2, d_ff=512, dropout=0.1, activation='gelu', k_ratio=0.25, pool="mean"):
        super().__init__()
        self.token = TokenEmbedding(input_dim, d_model)
        self.pos = LearnedPositionalEncoding(seq_len, d_model, dropout)
        self.encoder = FEDformerEncoder(d_model, e_layers, d_ff, dropout, activation, k_ratio)
        self.head = TimePoolClassifierHead(d_model, num_classes, pool, dropout)

    def forward(self, x1, x2=None):
        # x2 is accepted only so this model is interchangeable with the two-tower one.
        series = x1.unsqueeze(-1) if x1.dim() == 2 else x1
        encoded = self.encoder(self.pos(self.token(series)))
        return self.head(encoded)
    
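# === TS2Vec helpers: self-supervised pre-training + linear probe ===
# Dependency: copy ts2vec.py, models/*.py and utils/*.py from the official TS2Vec repository into the project directory (or an importable sub-directory).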
from ts2vec import TS2Vec
import torch.nn as nn
import torch.nn.functional as F

def ts2vec_linear_probe(
    visit_tr, visit_te,             # np arrays: [N, L, 1] (left-tower series)
    Y_tr, Y_te,                     # binarized labels: [N, C]; a single column for two classes
    device,
    batch_size=8,
    pretrain_iters=None,            # forwarded to TS2Vec.fit as n_iters
    pretrain_epochs=None,           # forwarded to TS2Vec.fit as n_epochs
    encoder_hidden=64,
    encoder_depth=10,
    proj_dim=320,
    lr=1e-4,                        # shared by TS2Vec pre-training and the linear head
    head_epochs=100,                # maximum number of supervised epochs for the head
    patience=15                     # early stopping applies to the head only
):
    """
    TS2Vec self-supervised pre-training followed by a linear probe.

    1) Fit a TS2Vec encoder on the training series only.
    2) Encode train and test series with ``encoding_window='full_series'``.
    3) Train one ``nn.Linear`` head on the frozen train embeddings with BCE-with-logits,
       Adam, cosine annealing and early stopping on the mean training loss.
    4) Evaluate once on the test embeddings.

    Evaluation distinguishes two label layouts:
      - one label column (two classes): sigmoid probability thresholded at 0.5 for ACC,
        probability of the positive class for AUC;
      - several columns (one-hot): argmax for ACC, per-column AUC on sigmoid probabilities.
    Taking argmax over a single column would always yield class 0 for both prediction and
    target and therefore an accuracy of 1.0, which is why the cases are separated.

    Args:
        visit_tr, visit_te: float arrays of shape [N, L, 1].
        Y_tr, Y_te: label arrays of shape [N, C] (C == 1 for two classes).
        device: torch device (or device string) used for TS2Vec and the head.
        batch_size, lr: used for both pre-training and head training.
        pretrain_iters, pretrain_epochs: passed through to ``TS2Vec.fit``.
        encoder_hidden, encoder_depth, proj_dim: TS2Vec encoder sizes; ``proj_dim`` is
            also the input width of the linear head.
        head_epochs: maximum number of head epochs.
        patience: number of non-improving epochs tolerated before stopping.

    Returns:
        (test_auc, test_acc, n_test) as (float, float, int).

    Raises:
        ValueError: propagated from ``roc_auc_score`` when AUC is undefined for the test
            labels (for example when a label column contains a single class).
    """
    import numpy as np
    from sklearn.metrics import roc_auc_score, accuracy_score

    assert visit_tr.ndim == 3 and visit_tr.shape[-1] == 1
    assert visit_te.ndim == 3 and visit_te.shape[-1] == 1

    # --- 1) Self-supervised pre-training (train split only) ---
    ts2 = TS2Vec(
        input_dims=1,
        output_dims=proj_dim,
        hidden_dims=encoder_hidden,
        depth=encoder_depth,
        device=device,
        lr=lr,
        batch_size=batch_size,
        max_train_length=None,  # series already share one length; no extra windowing
        temporal_unit=0
    )
    ts2.fit(
        train_data=visit_tr,
        n_iters=pretrain_iters,
        n_epochs=pretrain_epochs,
        verbose=False
    )

    # --- 2) One embedding per series ---
    # Expected shape [N, proj_dim]; the head below is sized accordingly.
    z_tr = ts2.encode(visit_tr, encoding_window='full_series')
    z_te = ts2.encode(visit_te, encoding_window='full_series')

    # --- 3) Linear probe on the frozen embeddings ---
    num_classes = Y_tr.shape[1]
    z_tr_t = torch.from_numpy(z_tr).to(device=device, dtype=torch.float32)
    z_te_t = torch.from_numpy(z_te).to(device=device, dtype=torch.float32)
    y_tr_t = torch.from_numpy(Y_tr).to(device=device, dtype=torch.float32)
    y_te_t = torch.from_numpy(Y_te).to(device=device, dtype=torch.float32)

    head = nn.Linear(proj_dim, num_classes).to(device)
    optim_head = torch.optim.Adam(head.parameters(), lr=lr)
    sched_head = torch.optim.lr_scheduler.CosineAnnealingLR(optim_head, T_max=50, eta_min=0.0)
    criterion = nn.BCEWithLogitsLoss()

    def iter_minibatch(X, Y, bs):
        # Shuffled mini-batches taken directly from tensors already on the device.
        n = X.size(0)
        idx = torch.randperm(n, device=X.device)
        for i in range(0, n, bs):
            j = idx[i:i+bs]
            yield X[j], Y[j]

    class _ES:
        """Early stopping on a loss: stop after ``patience`` epochs without improvement."""
        def __init__(self, patience=15, delta=0.0):
            self.pat = patience
            self.delta = delta
            self.best = None
            self.count = 0
            self.stop = False

        def step(self, loss):
            if self.best is None:
                self.best = loss
                return False
            if loss > self.best - self.delta:
                self.count += 1
                if self.count >= self.pat:
                    self.stop = True
            else:
                self.best = loss
                self.count = 0
            return self.stop

    es = _ES(patience=patience)

    # Supervised phase: early stopping watches the mean training loss only.
    for _ in range(head_epochs):
        head.train()
        tot = 0.0
        steps = 0
        for xb, yb in iter_minibatch(z_tr_t, y_tr_t, batch_size):
            optim_head.zero_grad()
            loss = criterion(head(xb), yb)
            loss.backward()
            optim_head.step()
            tot += loss.item()
            steps += 1
        sched_head.step()
        if es.step(tot / max(1, steps)):
            break

    # --- 4) Single final evaluation on the test split ---
    head.eval()
    with torch.no_grad():
        logits = head(z_te_t).detach().cpu().numpy()   # [N, C]
    y_true = y_te_t.detach().cpu().numpy()             # [N, C]
    probs = 1.0 / (1.0 + np.exp(-logits))              # column-wise sigmoid

    if num_classes == 1:
        # Two classes encoded in one column: threshold the positive-class probability.
        y_true_flat = y_true.ravel().astype(int)
        probs_pos = probs.ravel()
        test_acc = accuracy_score(y_true_flat, (probs_pos >= 0.5).astype(int))
        test_auc = roc_auc_score(y_true_flat, probs_pos)
    else:
        # One-hot labels: argmax for accuracy, per-column AUC for the score.
        test_acc = accuracy_score(np.argmax(y_true, axis=1), np.argmax(probs, axis=1))
        test_auc = roc_auc_score(y_true, probs, multi_class='ovr')

    return float(test_auc), float(test_acc), int(y_true.shape[0])

In [14]:
# %% 
# Cell 5 -- corrected ts2vec_linear_probe (binary-classification ACC is no longer always 1); running this cell replaces the definition from Cell 4
from ts2vec import TS2Vec
import torch, torch.nn as nn, torch.optim as optim
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score

def ts2vec_linear_probe(
    visit_tr, visit_te,            # np: [N, L, 1]
    Y_tr, Y_te,                    # one-hot; shape [N, 1] for binary classification
    device='cuda:0',
    batch_size=8,
    pretrain_iters=None,
    pretrain_epochs=None,
    encoder_hidden=64,
    encoder_depth=10,
    proj_dim=320,
    lr=1e-4,
    head_epochs=100,
    patience=15
):
    """
    TS2Vec pre-training on the train split followed by a linear probe.

    The encoder is fitted on ``visit_tr`` only, both splits are encoded with
    ``encoding_window='full_series'``, and a single linear layer is trained on the train
    embeddings with BCE-with-logits and Adam. Training stops early once the mean epoch
    loss has failed to improve ``patience`` times since the last improvement.

    Evaluation handles a single label column (binary: sigmoid + 0.5 threshold) separately
    from several columns (one-hot: column-wise sigmoid, argmax for accuracy, OVR AUC).

    Returns:
        (auc, acc, n_test) as (float, float, int).
    """
    # 1) Self-supervised pre-training on the train split only.
    encoder = TS2Vec(
        input_dims=1,
        output_dims=proj_dim,
        hidden_dims=encoder_hidden,
        depth=encoder_depth,
        device=device,
        lr=lr,
        batch_size=batch_size,
        max_train_length=None,
        temporal_unit=0
    )
    encoder.fit(
        train_data=visit_tr,
        n_iters=pretrain_iters,
        n_epochs=pretrain_epochs,
        verbose=False
    )

    # 2) One embedding per series (expected [N, D]).
    emb_tr = encoder.encode(visit_tr, encoding_window='full_series')
    emb_te = encoder.encode(visit_te, encoding_window='full_series')

    def as_device_f32(array):
        # NumPy array -> float32 tensor on the target device.
        return torch.from_numpy(array).to(torch.float32).to(device)

    X_tr, X_te = as_device_f32(emb_tr), as_device_f32(emb_te)
    y_tr, y_te = as_device_f32(Y_tr), as_device_f32(Y_te)

    # 3) Linear head sized from the data: embedding width -> number of label columns.
    head = nn.Linear(X_tr.shape[1], y_tr.shape[1]).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(head.parameters(), lr=lr)

    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(X_tr, y_tr),
        batch_size=batch_size, shuffle=True,
    )

    best_loss = None     # lowest mean epoch loss seen so far
    stale_epochs = 0     # epochs without improvement since best_loss was set
    for _ in range(head_epochs):
        head.train()
        running = 0.0
        for xb, yb in loader:
            optimizer.zero_grad()
            batch_loss = criterion(head(xb), yb)
            batch_loss.backward()
            optimizer.step()
            running += batch_loss.item()
        epoch_loss = running / max(1, len(loader))

        # Early stopping on the training loss (the first epoch only sets the baseline).
        if best_loss is None:
            best_loss = epoch_loss
        elif epoch_loss > best_loss:
            stale_epochs += 1
            if stale_epochs >= patience:
                break
        else:
            best_loss = epoch_loss
            stale_epochs = 0

    # 4) Evaluation: binary and multi-class are scored differently.
    head.eval()
    with torch.no_grad():
        logits = head(X_te).detach().cpu().numpy()   # [N, C]
    targets = y_te.detach().cpu().numpy()            # [N, C]
    probs = 1.0 / (1.0 + np.exp(-logits))            # column-wise sigmoid

    if logits.shape[1] == 1:
        # Binary: positive-class probability with a 0.5 threshold.
        probs_pos = probs.ravel()
        y_true = targets.ravel().astype(int)
        y_pred = (probs_pos >= 0.5).astype(int)
        acc = accuracy_score(y_true, y_pred)
        auc = roc_auc_score(y_true, probs_pos)
    else:
        # Multi-class (OVR): argmax over the per-column probabilities.
        acc = accuracy_score(targets.argmax(axis=1), probs.argmax(axis=1))
        auc = roc_auc_score(targets, probs, multi_class='ovr')

    return float(auc), float(acc), int(y_te.shape[0])



In [15]:
#Cell 6
def run_one_dataset(dataset_dir, dataset_name,
                    device='cuda:0',
                    batch_size=8, num_epochs=100, lr=1e-4,
                    cap_len=None, patience=15,
                    verbose=False, plot_curves=True,
                    backbone='twotower'):
    """
    Train one backbone on a single dataset folder and evaluate it once on the test split.

    Expected files inside ``dataset_dir``:
      ``{dataset_name}_TRAIN_cleaned.tsv``, ``{dataset_name}_TEST_cleaned.tsv``
      (column 0 = label, remaining columns = series values) and
      ``{dataset_name}_Aout_train_k2.csv``, ``{dataset_name}_Aout_test_k2.csv``
      (column 0 is skipped, remaining columns = Aout features).

    Pipeline: clean/standardize/pad the series to the train-derived length, standardize the
    Aout features with statistics fitted on train, binarize labels, then either run the
    TS2Vec linear probe or train a supervised model (BCE-with-logits, Adam, cosine
    annealing, early stopping on the training loss). Training loss is logged to
    ``<dataset_dir>/_twotower_logs/log_<backbone>.txt`` and optionally plotted.

    Args:
        dataset_dir: folder holding the four files.
        dataset_name: file-name prefix of the dataset.
        device: requested torch device; falls back to CPU when CUDA is unavailable.
        batch_size, num_epochs, lr: training hyper-parameters.
        cap_len: optional upper bound on the common series length (None = longest
            training series).
        patience: early-stopping patience in epochs.
        verbose: when True, also print the per-epoch training loss.
        plot_curves: when True, save and show the training-loss curve.
        backbone: 'twotower', 'informer', 'fed'/'fedformer', or 'ts2vec'/'ts2vec_lp'.

    Returns:
        ``None`` if any input file is missing, otherwise ``(test_auc, test_acc, n_test)``.

    Raises:
        ValueError: for an unknown ``backbone``.
    """
    import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
    import torch, torch.nn as nn, torch.optim as optim
    from torch.utils.data import DataLoader
    from torch.optim.lr_scheduler import CosineAnnealingLR
    from sklearn.preprocessing import StandardScaler, label_binarize
    from sklearn.metrics import accuracy_score, roc_auc_score

    # ---------- Paths ----------
    tsv_train_path = os.path.join(dataset_dir, f"{dataset_name}_TRAIN_cleaned.tsv")
    tsv_test_path  = os.path.join(dataset_dir, f"{dataset_name}_TEST_cleaned.tsv")
    aout_train_csv = os.path.join(dataset_dir, f"{dataset_name}_Aout_train_k2.csv")
    aout_test_csv  = os.path.join(dataset_dir, f"{dataset_name}_Aout_test_k2.csv")
    if not all(os.path.exists(p) for p in [tsv_train_path, tsv_test_path, aout_train_csv, aout_test_csv]):
        print(f"⚠ Skip {dataset_name}, missing files"); return None

    # ---------- Load ----------
    tsv_tr = pd.read_csv(tsv_train_path, sep="\t", header=None)
    tsv_te = pd.read_csv(tsv_test_path,  sep="\t", header=None)
    csv_tr = pd.read_csv(aout_train_csv, header=None)
    csv_te = pd.read_csv(aout_test_csv,  header=None)

    y_tr_raw = tsv_tr.iloc[:,0].values
    y_te_raw = tsv_te.iloc[:,0].values
    visit_tr_raw = tsv_tr.iloc[:,1:].values.astype("float32")
    visit_te_raw = tsv_te.iloc[:,1:].values.astype("float32")
    aout_tr_raw_all  = csv_tr.iloc[:,1:].values.astype("float32")
    aout_te_raw_all  = csv_te.iloc[:,1:].values.astype("float32")

    # ---------- Clean train; its output length becomes the reference length ----------
    # A single pass is enough: re-cleaning train with fixed_len equal to the length this
    # call produces would return the same array.
    visit_tr_clean, keep_tr, _ = clean_and_pad_timeseries(
        visit_tr_raw, min_len=8, cap_len=cap_len, pad_value=0.0,
        per_sample_standardize=True, fixed_len=None
    )
    train_seq_len = visit_tr_clean.shape[1]
    y_tr = y_tr_raw[keep_tr]
    # assumes Aout rows are in the same order as the TSV rows -- TODO confirm
    aout_tr_raw = aout_tr_raw_all[keep_tr]

    # ---------- Clean test to exactly the train length ----------
    visit_te_clean, keep_te, _ = clean_and_pad_timeseries(
        visit_te_raw, min_len=8, cap_len=None, pad_value=0.0,
        per_sample_standardize=True, fixed_len=train_seq_len
    )
    y_te = y_te_raw[keep_te]
    aout_te_raw = aout_te_raw_all[keep_te]

    # ---------- Right-tower standardization (fit on train, apply to test) ----------
    scaler = StandardScaler().fit(aout_tr_raw)
    aout_tr = scaler.transform(aout_tr_raw).astype("float32")
    aout_te = scaler.transform(aout_te_raw).astype("float32")

    # ---------- Binarized labels (a single column when there are two classes) ----------
    classes = sorted(np.unique(y_tr))
    Y_tr = label_binarize(y_tr, classes=classes).astype("float32")
    Y_te = label_binarize(y_te, classes=classes).astype("float32")

    # ---------- Left-tower input shape [B, L, 1] ----------
    visit_tr = visit_tr_clean[:, :, None]
    visit_te = visit_te_clean[:, :, None]

    # ===== TS2Vec branch: self-supervised pre-training + linear probe =====
    if backbone.lower() in ('ts2vec', 'ts2vec_lp'):
        device_t = torch.device(device if torch.cuda.is_available() else 'cpu')
        test_auc, test_acc, test_n = ts2vec_linear_probe(
            visit_tr=visit_tr, visit_te=visit_te,
            Y_tr=Y_tr, Y_te=Y_te,
            device=device_t,
            batch_size=batch_size,
            pretrain_iters=None,
            pretrain_epochs=None,
            encoder_hidden=64,
            encoder_depth=10,
            proj_dim=320,
            lr=lr,                     # same learning rate as the supervised backbones
            head_epochs=num_epochs,    # same epoch budget as the supervised backbones
            patience=patience
        )
        print(f"[{dataset_name} | ts2vec] TEST AUC={test_auc:.4f}, TEST ACC={test_acc:.4f}, n_samples={test_n}")
        return test_auc, test_acc, test_n

    # ===== Supervised backbones (TwoTower / Informer / FEDformer) =====
    seq_len1 = visit_tr.shape[1]
    input_dim1 = visit_tr.shape[2]
    input_dim2 = aout_tr.shape[1]   # used by TwoTower only
    num_classes = Y_tr.shape[1]

    device_torch = torch.device(device if torch.cuda.is_available() else 'cpu')
    on_cuda = device_torch.type == 'cuda'

    # Build the model
    if backbone.lower() == 'twotower':
        hidden_dim1 = hidden_dim2 = hidden_dim3 = 16
        num_heads = 2; num_layers = 2; seq_len2 = 1
        model = TwoTowerTransformer(
            input_dim1, input_dim2,
            hidden_dim1, hidden_dim2, hidden_dim3,
            num_heads, num_layers,
            num_classes,
            seq_len1, seq_len2
        ).to(device_torch)

    elif backbone.lower() == 'informer':
        model = InformerEncoderClassifier(
            input_dim=1, num_classes=num_classes, seq_len=seq_len1,
            d_model=256, n_heads=4, e_layers=2, d_ff=512,
            dropout=0.1, attn_dropout=0.1, distil=True, activation='gelu', pool='mean'
        ).to(device_torch)

    elif backbone.lower() in ('fed', 'fedformer'):
        model = FEDformerEncoderClassifier(
            input_dim=1, num_classes=num_classes, seq_len=seq_len1,
            d_model=256, e_layers=2, d_ff=512,
            dropout=0.1, activation='gelu', k_ratio=0.25, pool='mean'
        ).to(device_torch)

    else:
        raise ValueError(f"Unknown backbone: {backbone}")

    # Multi-GPU
    # NOTE(review): device_ids is fixed to [0, 1] regardless of ``device`` -- confirm this
    # is intended when ``device`` is not 'cuda:0'.
    if torch.cuda.device_count() > 1 and str(device_torch).startswith('cuda'):
        model = nn.DataParallel(model, device_ids=[0,1])
    model = model.to(device_torch)

    # DataLoaders; pinned host memory is only requested when training on CUDA.
    train_loader = DataLoader(VisitDataset(visit_tr, aout_tr, Y_tr),
                              batch_size=batch_size, shuffle=True,  pin_memory=on_cuda, num_workers=0)
    test_loader  = DataLoader(VisitDataset(visit_te, aout_te, Y_te),
                              batch_size=batch_size, shuffle=False, pin_memory=on_cuda, num_workers=0)

    # Loss / optimizer / schedule
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = CosineAnnealingLR(optimizer, T_max=50, eta_min=0)

    class EarlyStopping:
        """Stop after ``patience`` epochs whose training loss did not beat the best one."""
        def __init__(self, patience=15, delta=0.0):
            self.patience = patience; self.delta = delta
            self.counter = 0; self.best = None; self.stop = False
        def step(self, train_loss):
            if self.best is None: self.best = train_loss; return False
            if train_loss > self.best - self.delta:
                self.counter += 1
                if self.counter >= self.patience: self.stop = True
            else:
                self.best = train_loss; self.counter = 0
            return self.stop
    es = EarlyStopping(patience=patience)

    # Training loop (only the training loss is tracked)
    log_dir = os.path.join(dataset_dir, "_twotower_logs"); os.makedirs(log_dir, exist_ok=True)
    train_loss_hist = []

    # The context manager closes the log even if an epoch raises.
    with open(os.path.join(log_dir, f"log_{backbone}.txt"), "w", encoding="utf-8") as log_file:
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0.0
            for x1, x2, y in train_loader:
                x1 = x1.to(torch.float32).to(device_torch)
                x2 = x2.to(torch.float32).to(device_torch)
                y  = y.to(torch.float32).to(device_torch)
                optimizer.zero_grad()
                o = model(x1, x2)   # single-tower models ignore x2
                l = criterion(o, y)
                l.backward(); optimizer.step()
                total_loss += l.item()
            scheduler.step()
            avg_train_loss = total_loss / max(1, len(train_loader))
            train_loss_hist.append(avg_train_loss)
            log_file.write(f"Epoch [{epoch+1}/{num_epochs}] Train Loss: {avg_train_loss:.4f}\n")
            if verbose:
                print(f"[{dataset_name} | {backbone}] Epoch [{epoch+1}/{num_epochs}] Train Loss: {avg_train_loss:.4f}")
            if es.step(avg_train_loss): break

    # Training-loss curve
    if plot_curves and len(train_loss_hist) > 0:
        fig, ax = plt.subplots(figsize=(6,4))
        ax.plot(range(1, len(train_loss_hist)+1), train_loss_hist, label="Train Loss")
        ax.set_xlabel("Epoch"); ax.set_ylabel("Loss"); ax.grid(True); ax.legend()
        ax.set_title(f"{dataset_name} - {backbone} - Train Loss")
        plt.tight_layout()
        plt.savefig(os.path.join(log_dir, f"{dataset_name}_{backbone}_train_loss.png"), dpi=150)
        plt.show()
        # Release the figure so figures do not pile up when many datasets are processed.
        plt.close(fig)

    # --- Single final evaluation on the test split ---
    model.eval()
    logits_list, labels_list = [], []
    with torch.no_grad():
        for x1, x2, y in test_loader:
            x1 = x1.to(torch.float32).to(device_torch)
            x2 = x2.to(torch.float32).to(device_torch)
            o  = model(x1, x2)  # logits
            logits_list.append(o.detach().cpu().numpy())
            labels_list.append(y.numpy())

    logits = np.concatenate(logits_list, axis=0)   # [N, C] or [N, 1]
    labels = np.concatenate(labels_list, axis=0)   # [N, C] or [N, 1]
    num_classes = labels.shape[1]

    if num_classes == 1:
        # ===== Binary: sigmoid probability of the positive class, 0.5 threshold =====
        probs_pos = 1.0 / (1.0 + np.exp(-logits.ravel()))
        y_true    = labels.ravel().astype(int)
        y_pred    = (probs_pos >= 0.5).astype(int)
        test_acc  = accuracy_score(y_true, y_pred)
        test_auc  = roc_auc_score(y_true, probs_pos)
    else:
        # ===== Multi-class (one-vs-rest): column-wise sigmoid, argmax for accuracy =====
        probs = 1.0 / (1.0 + np.exp(-logits))
        y_true_idx = labels.argmax(axis=1)
        y_pred_idx = probs.argmax(axis=1)
        test_acc    = accuracy_score(y_true_idx, y_pred_idx)
        test_auc    = roc_auc_score(labels, probs, multi_class='ovr')

    test_n = labels.shape[0]
    print(f"[{dataset_name} | {backbone}] TEST AUC={test_auc:.4f}, TEST ACC={test_acc:.4f}, n_samples={test_n}")
    return test_auc, test_acc, test_n



In [16]:
#Cell 7
def discover_datasets(root):
    """
    List the dataset folders under ``root`` that are complete.

    A sub-folder ``<name>`` qualifies when it contains all four files
    ``<name>_TRAIN_cleaned.tsv``, ``<name>_TEST_cleaned.tsv``,
    ``<name>_Aout_train_k2.csv`` and ``<name>_Aout_test_k2.csv``.

    Returns:
        Folder names in sorted order; entries that are not directories are ignored.
    """
    def is_complete(name):
        folder = os.path.join(root, name)
        if not os.path.isdir(folder):
            return False
        required = (
            f"{name}_TRAIN_cleaned.tsv",
            f"{name}_TEST_cleaned.tsv",
            f"{name}_Aout_train_k2.csv",
            f"{name}_Aout_test_k2.csv",
        )
        return all(os.path.exists(os.path.join(folder, filename)) for filename in required)

    return [name for name in sorted(os.listdir(root)) if is_complete(name)]

In [17]:
#Cell 8
def run_all_datasets(root, device='cuda:0',
                     batch_size=8, num_epochs=100, lr=1e-4,
                     cap_len=None, verbose=False, plot_curves=True,
                     patience=15, backbone='twotower', datasets=None):
    """
    Run `run_one_dataset` on the datasets under `root` and summarise the test metrics.

    Datasets are located with `discover_datasets(root)`. Each one is handed to
    `run_one_dataset`; a `None` result is skipped. The collected
    (AUC, ACC, n_samples) values are tabulated, the simple and sample-weighted
    means are printed, and the table is written as CSV and TXT into
    `<root>/_twotower_logs` (that folder name is used for every backbone), with
    the lower-cased backbone and a timestamp in the file names.

    Args:
        root: directory that contains one sub-folder per dataset.
        device, batch_size, num_epochs, lr, cap_len, verbose, plot_curves,
        patience, backbone: forwarded unchanged to `run_one_dataset`.
        datasets: optional dataset name, or iterable of names, restricting the
            run to that subset of the discovered datasets. Requested names that
            were not discovered are reported and skipped. `None` (default)
            runs everything that was discovered.

    Returns:
        None when no dataset produced a result; otherwise the tuple
        (df, (mean_auc, mean_acc), (w_auc, w_acc)), where df has the columns
        "dataset", "test_auc", "test_acc", "n_samples".
    """
    import os, pandas as pd
    from datetime import datetime

    dataset_names = discover_datasets(root)
    print(f"Found {len(dataset_names)} datasets:", dataset_names)

    if datasets is not None:
        # A bare string would otherwise be iterated character by character.
        requested = [datasets] if isinstance(datasets, str) else list(datasets)
        requested_set = set(requested)
        not_found = [d for d in requested if d not in dataset_names]
        if not_found:
            print(f"[WARN] Requested datasets not discovered under root, skipped: {not_found}")
        # Keep the discovery (sorted) order rather than the caller's order.
        dataset_names = [d for d in dataset_names if d in requested_set]
        print(f"Running {len(dataset_names)} selected datasets:", dataset_names)

    rows = []
    for name in dataset_names:
        out = run_one_dataset(
            dataset_dir=os.path.join(root, name),
            dataset_name=name,
            device=device,
            batch_size=batch_size, num_epochs=num_epochs, lr=lr,
            cap_len=cap_len, patience=patience,
            verbose=verbose, plot_curves=plot_curves,
            backbone=backbone   # forward the backbone choice
        )
        if out is None:
            continue
        # Only the first three items are consumed, so a return value with extra
        # trailing items (another cell of this notebook handles a 4-item form)
        # no longer raises on unpacking.
        test_auc, test_acc, test_n = out[:3]
        rows.append((name, test_auc, test_acc, test_n))

    if not rows:
        print("No dataset finished successfully.")
        return None

    df = pd.DataFrame(rows, columns=["dataset", "test_auc", "test_acc", "n_samples"])
    mean_auc = df["test_auc"].mean()
    mean_acc = df["test_acc"].mean()
    # Sample-weighted means: each dataset counts in proportion to its n_samples.
    total_samples = df["n_samples"].sum()
    w_auc = (df["test_auc"] * df["n_samples"]).sum() / total_samples
    w_acc = (df["test_acc"] * df["n_samples"]).sum() / total_samples

    # Rendered once and reused for both the console and the TXT report.
    table_text = df.sort_values("dataset").to_string(index=False)

    print("\n========== Summary (OFFICIAL TEST) ==========")
    print(table_text)
    print(f"\nSimple mean: AUC = {mean_auc:.4f}, ACC = {mean_acc:.4f}")
    print(f"Weighted (by samples): AUC = {w_auc:.4f}, ACC = {w_acc:.4f}")

    summary_dir = os.path.join(root, "_twotower_logs")
    os.makedirs(summary_dir, exist_ok=True)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    tag = backbone.lower()
    df.to_csv(os.path.join(summary_dir, f"summary_TEST_{tag}_{ts}.csv"), index=False, encoding="utf-8")
    with open(os.path.join(summary_dir, f"summary_TEST_{tag}_{ts}.txt"), "w", encoding="utf-8") as f:
        f.write(table_text)
        f.write(f"\n\nSimple mean: AUC={mean_auc:.6f}, ACC={mean_acc:.6f}\n")
        f.write(f"Weighted (by samples): AUC={w_auc:.6f}, ACC={w_acc:.6f}\n")

    return df, (mean_auc, mean_acc), (w_auc, w_acc)

In [18]:
# Cell X: run ONLY the 6 selected datasets with TS2Vec
# Calls run_one_dataset on each selected dataset, then prints a summary table
# with simple and sample-weighted mean AUC/ACC. Nothing is written to disk here.
import os
import pandas as pd

SELECTED = ["ECG200", "GunPoint", "ArrowHead", "Beef", "Coffee", "ECG5000"]

# Fallback in case the cell that defines ROOT has not been run yet;
# change it to your actual path as needed.
if "ROOT" not in globals():
    ROOT = r"D:\2025暑期科研\UCRArchive_2018\UCRArchive_2018"

if not os.path.isdir(ROOT):
    raise FileNotFoundError(f"ROOT 路径不存在：{ROOT}")

# Filter: keep only the datasets whose sub-directory actually exists under ROOT.
available, missing = [], []
for ds in SELECTED:
    # The conditional expression picks which list receives this name.
    (available if os.path.isdir(os.path.join(ROOT, ds)) else missing).append(ds)
if missing:
    print(f"[WARN] 以下数据集未在 ROOT 下找到，将被跳过：{missing}")
print(f"将运行 {len(available)} 个数据集（TS2Vec）：{available}")

# One (dataset, auc, acc, n_samples) tuple per dataset that returned a result.
rows = []
for ds in available:
    out = run_one_dataset(
        dataset_dir=os.path.join(ROOT, ds),
        dataset_name=ds,
        device='cuda:0',
        batch_size=8,
        num_epochs=100,     # epochs for linear-head training
        lr=1e-4,
        cap_len=None,
        verbose=False,
        plot_curves=False,  # keep False if the TS2Vec linear head does not need curves plotted
        patience=15,
        backbone='ts2vec',  # key setting: use the TS2Vec branch
        # If you do not want to force a full run, you can add: force_full_epochs=False, collect_test_curve=False
    )
    # NOTE(review): a None result is reported below as "required files missing";
    # confirm against run_one_dataset that this is the only cause of None.
    if out is None:
        print(f"[WARN] {ds} 缺少必要文件，已跳过。")
        continue

    # Accept either a 3- or a 4-item return (per the original note, return_epoch_losses=True
    # adds a trailing test_loss_hist); the fourth item is discarded.
    if isinstance(out, (list, tuple)) and len(out) == 4:
        auc, acc, n, _ = out
    else:
        auc, acc, n = out

    rows.append((ds, float(auc), float(acc), int(n)))
    print(f"[{ds} | TS2Vec] AUC={auc:.4f} | ACC={acc:.4f} | n={n}")

# Summary
if rows:
    df = pd.DataFrame(rows, columns=["dataset","test_auc","test_acc","n_samples"]).sort_values("dataset")
    mean_auc = df["test_auc"].mean();   mean_acc = df["test_acc"].mean()
    # Sample-weighted means: each dataset's metric is weighted by its n_samples.
    w_auc = (df["test_auc"] * df["n_samples"]).sum() / df["n_samples"].sum()
    w_acc = (df["test_acc"] * df["n_samples"]).sum() / df["n_samples"].sum()

    print("\n========== Summary (OFFICIAL TEST, TS2Vec) ==========")
    print(df.to_string(index=False))
    print(f"\nSimple mean: AUC = {mean_auc:.4f}, ACC = {mean_acc:.4f}")
    print(f"Weighted (by samples): AUC = {w_auc:.4f}, ACC = {w_acc:.4f}")
else:
    print("No dataset finished successfully.")



将运行 6 个数据集（TS2Vec）：['ECG200', 'GunPoint', 'ArrowHead', 'Beef', 'Coffee', 'ECG5000']
[ECG200 | ts2vec] TEST AUC=0.7530, TEST ACC=0.6400, n_samples=100
[ECG200 | TS2Vec] AUC=0.7530 | ACC=0.6400 | n=100
[GunPoint | ts2vec] TEST AUC=0.8270, TEST ACC=0.4933, n_samples=150
[GunPoint | TS2Vec] AUC=0.8270 | ACC=0.4933 | n=150
[ArrowHead | ts2vec] TEST AUC=0.6465, TEST ACC=0.3943, n_samples=175
[ArrowHead | TS2Vec] AUC=0.6465 | ACC=0.3943 | n=175
[Beef | ts2vec] TEST AUC=0.6125, TEST ACC=0.2333, n_samples=30
[Beef | TS2Vec] AUC=0.6125 | ACC=0.2333 | n=30
[Coffee | ts2vec] TEST AUC=0.8667, TEST ACC=0.6786, n_samples=28
[Coffee | TS2Vec] AUC=0.8667 | ACC=0.6786 | n=28
[ECG5000 | ts2vec] TEST AUC=0.8037, TEST ACC=0.8922, n_samples=4500
[ECG5000 | TS2Vec] AUC=0.8037 | ACC=0.8922 | n=4500

  dataset  test_auc  test_acc  n_samples
ArrowHead  0.646518  0.394286        175
     Beef  0.612500  0.233333         30
   Coffee  0.866667  0.678571         28
   ECG200  0.753038  0.640000        100
  ECG500

In [None]:
#Cell 9
# Run TS2Vec only (self-supervised pre-training + linear probe) on every discovered dataset.
# The result is bound to `_` so the returned tuple is not echoed as the cell's output.
_ = run_all_datasets(
    root=ROOT,
    device='cuda:0',
    batch_size=8,
    num_epochs=100,   # used as the linear-head training epochs; self-supervised iterations keep their default
    lr=1e-4,          # the same lr can be used for a fair comparison
    cap_len=None,
    verbose=False,
    plot_curves=True,
    patience=15,
    backbone='ts2vec'   # <- key setting
)

Found 125 datasets: ['ACSF1', 'Adiac', 'AllGestureWiimoteX', 'AllGestureWiimoteY', 'AllGestureWiimoteZ', 'ArrowHead', 'BME', 'Beef', 'BeetleFly', 'BirdChicken', 'CBF', 'Car', 'Chinatown', 'ChlorineConcentration', 'CinCECGTorso', 'Coffee', 'CricketX', 'CricketY', 'CricketZ', 'Crop', 'DiatomSizeReduction', 'DistalPhalanxOutlineAgeGroup', 'DistalPhalanxOutlineCorrect', 'DistalPhalanxTW', 'DodgerLoopDay', 'DodgerLoopGame', 'DodgerLoopWeekend', 'ECG200', 'ECG5000', 'ECGFiveDays', 'EOGHorizontalSignal', 'EOGVerticalSignal', 'Earthquakes', 'EthanolLevel', 'FaceAll', 'FaceFour', 'FacesUCR', 'FiftyWords', 'Fish', 'FordA', 'FordB', 'FreezerRegularTrain', 'FreezerSmallTrain', 'Fungi', 'GestureMidAirD1', 'GestureMidAirD2', 'GestureMidAirD3', 'GesturePebbleZ1', 'GesturePebbleZ2', 'GunPoint', 'GunPointAgeSpan', 'GunPointMaleVersusFemale', 'GunPointOldVersusYoung', 'Ham', 'HandOutlines', 'Haptics', 'Herring', 'HouseTwenty', 'InlineSkate', 'InsectEPGRegularTrain', 'InsectEPGSmallTrain', 'InsectWingbea