In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# ==========================================
# 1. 数据集定义 (处理变长数据)
# ==========================================
class KoopmanDataset(Dataset):
    """Map-style dataset pairing variable-length sequences with integer labels."""

    def __init__(self, data_list, labels):
        """Keep references to the samples; nothing is copied or converted here.

        Args:
            data_list: Indexable collection of 1-D numeric sequences
                (e.g. ``np.ndarray``); individual lengths may differ.
            labels: Indexable collection of integer class labels aligned
                with ``data_list``.
        """
        self.labels = labels
        self.data = data_list

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` as a float32 tensor and an int64 tensor."""
        raw_sequence = self.data[idx]
        raw_label = self.labels[idx]
        return (
            torch.tensor(raw_sequence, dtype=torch.float32),
            torch.tensor(raw_label, dtype=torch.long),
        )

def load_data_from_txt(file_path):
    """Parse a comma-separated text file into sequences and labels.

    Every line is expected to look like ``v1,v2,...,vn,label``: the last
    field is the integer class label, all preceding fields are floats.
    Lines with fewer than two fields (e.g. blank lines) are skipped, and
    sequences with five or fewer values are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(sequences, labels)`` tuple where ``sequences`` is a list of
        1-D ``np.ndarray`` and ``labels`` is the matching list of ints.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a number.
    """
    kept_sequences, kept_labels = [], []

    print(f"正在加载数据: {file_path} ...")
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            # Everything before the final comma is data; the tail is the label.
            *value_fields, label_field = raw_line.strip().split(',')
            if not value_fields:
                continue  # blank line or a line without any comma

            target = int(label_field)
            values = list(map(float, value_fields))

            # Very short sequences are discarded (optional safety filter).
            if len(values) <= 5:
                continue
            kept_sequences.append(np.array(values))
            kept_labels.append(target)

    print(f"总样本数: {len(kept_sequences)}")
    return kept_sequences, kept_labels

# --- 关键：Collate Function (用于自动填充/Padding) ---
def collate_fn(batch):
    """Assemble a list of ``(sequence, label)`` pairs into one padded batch.

    Sequences are right-padded with zeros up to the longest sequence in the
    batch and given a trailing feature axis of size one.

    Args:
        batch: Iterable of ``(sequence_tensor, label_tensor)`` pairs as
            produced by the dataset.

    Returns:
        ``(padded, labels, lengths)`` where ``padded`` has shape
        ``(batch, max_len, 1)``, ``labels`` is the stacked label tensor and
        ``lengths`` holds each sequence's length before padding.
    """
    seq_list, label_list = map(list, zip(*batch))

    # Original (unpadded) lengths, useful for recurrent models.
    lengths = torch.tensor(list(map(len, seq_list)))

    # (batch, max_len) after zero padding, then (batch, max_len, 1).
    padded = pad_sequence(seq_list, batch_first=True, padding_value=0).unsqueeze(-1)

    return padded, torch.stack(label_list), lengths

# ==========================================
# 2. 模型定义 (CNN 和 LSTM)
# ==========================================

# --- 选项 A: 1D-CNN (推荐首选，速度快) ---
class KoopmanCNN(nn.Module):
    """Three-stage 1-D CNN classifier for padded, variable-length sequences.

    Two Conv-BN-ReLU-MaxPool stages are followed by a Conv-BN-ReLU stage with
    global average pooling, so the classifier head always receives 64
    features regardless of the sequence length.

    Args:
        input_channels: Number of features per time step (size of the last
            input axis). Defaults to 1.
        num_classes: Number of output logits. Defaults to 2.
    """

    def __init__(self, input_channels=1, num_classes=2):
        super().__init__()

        # Honour ``input_channels`` instead of hard-coding a single channel.
        self.layer1 = nn.Sequential(
            nn.Conv1d(in_channels=input_channels, out_channels=16, kernel_size=5, padding=2),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )

        self.layer2 = nn.Sequential(
            nn.Conv1d(16, 32, kernel_size=5, padding=2),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(2)
        )

        self.layer3 = nn.Sequential(
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            # Global average pooling: collapses any remaining length to 1.
            nn.AdaptiveAvgPool1d(1)
        )

        self.fc = nn.Linear(64, num_classes)

    def forward(self, x, lengths=None):
        """Compute class logits.

        Args:
            x: Tensor of shape ``(batch, length, input_channels)``. The two
                stride-2 max-pool stages need ``length >= 4``.
            lengths: Accepted for call-compatibility with recurrent models;
                not used here, so padded positions are treated as ordinary
                input.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        # Conv1d expects (batch, channels, length).
        x = x.permute(0, 2, 1)

        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)  # (batch, 64, 1)

        out = out.flatten(1)    # (batch, 64)
        return self.fc(out)

# --- 选项 B: LSTM (适合强时序依赖) ---
class KoopmanLSTM(nn.Module):
    """Stacked LSTM classifier that uses the top layer's final hidden state.

    Args:
        input_size: Number of features per time step. Defaults to 1.
        hidden_size: Hidden state width of every LSTM layer. Defaults to 64.
        num_layers: Number of stacked LSTM layers. Defaults to 2.
        num_classes: Number of output logits. Defaults to 2.
    """

    def __init__(self, input_size=1, hidden_size=64, num_layers=2, num_classes=2):
        super().__init__()

        # nn.LSTM applies dropout only between stacked layers and warns when
        # a non-zero value is requested for a single-layer network.
        inter_layer_dropout = 0.2 if num_layers > 1 else 0.0

        # batch_first=True -> input is (batch, length, input_size)
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths=None):
        """Compute class logits.

        Args:
            x: Padded tensor of shape ``(batch, length, input_size)``.
            lengths: Optional per-sample lengths before padding (tensor or
                sequence of ints, each in ``1..length``). When given, the
                batch is packed so the final hidden state is taken at each
                sample's last real step rather than after the padding. When
                omitted, the whole padded tensor is processed as-is.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        if lengths is not None:
            # pack_padded_sequence requires the lengths to live on the CPU.
            cpu_lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            x = nn.utils.rnn.pack_padded_sequence(
                x, cpu_lengths, batch_first=True, enforce_sorted=False
            )

        # h_n holds the last hidden state of every layer; [-1] is the top one.
        _, (h_n, _) = self.lstm(x)
        return self.fc(h_n[-1])

# ==========================================
# 3. 训练流程
# ==========================================

def train_model():
    """Train ``KoopmanCNN`` on the sequence file and save per-epoch metrics.

    Loads the data, makes a 70/30 train/test split, trains with Adam and
    cross-entropy, evaluates on the test split after every epoch, prints one
    progress line per epoch and finally stores the metric history as a
    dictionary in ``log_cnn_koopman.npy``. Returns ``None``; if no sequences
    were loaded the function returns immediately.
    """
    # --- hyper-parameters -------------------------------------------------
    data_path = r'./data/data_koopman_sequence.txt'
    batch_size = 16
    num_epochs = 500
    learning_rate = 1e-4
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # --- 1. load and split the data ---------------------------------------
    sequences, labels = load_data_from_txt(data_path)
    if not sequences:
        return

    X_train, X_test, y_train, y_test = train_test_split(
        sequences, labels, test_size=0.3, random_state=42
    )
    train_loader = DataLoader(
        KoopmanDataset(X_train, y_train),
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
    test_loader = DataLoader(
        KoopmanDataset(X_test, y_test),
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )

    # --- 2. model, loss, optimizer ----------------------------------------
    model = KoopmanCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # --- metric log, filled once per epoch --------------------------------
    total_params = sum(p.numel() for p in model.parameters())
    log_data = {
        'name': 'CNN (Koopman Seq)',  # model name
        'params': total_params,
        'loss': [],
        'train_acc': [],
        'test_acc': [],
        'epochs': [],
    }
    print(f"模型参数量: {total_params:,}")

    print("开始训练 KoopmanCNN...")

    for epoch_number in range(1, num_epochs + 1):
        # ---- training pass ----
        model.train()
        loss_sum = 0.0
        train_hits = 0
        train_seen = 0

        for inputs, targets, lens in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs, lens)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            predicted = outputs.detach().argmax(dim=1)
            train_seen += targets.size(0)
            train_hits += (predicted == targets).sum().item()

        epoch_acc = 100 * train_hits / train_seen
        avg_loss = loss_sum / len(train_loader)

        # ---- evaluation pass ----
        model.eval()
        test_hits = 0
        test_seen = 0
        with torch.no_grad():
            for inputs, targets, lens in test_loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                predicted = model(inputs, lens).argmax(dim=1)
                test_seen += targets.size(0)
                test_hits += (predicted == targets).sum().item()
        test_acc = 100 * test_hits / test_seen

        print(f"Epoch [{epoch_number}/{num_epochs}] | Loss: {avg_loss:.4f} | Train: {epoch_acc:.2f}% | Test: {test_acc:.2f}%")

        # ---- record this epoch ----
        log_data['loss'].append(avg_loss)
        log_data['train_acc'].append(epoch_acc)
        log_data['test_acc'].append(test_acc)
        log_data['epochs'].append(epoch_number)

    # --- persist the log as a .npy file (the dict is stored as an object) --
    save_path = 'log_cnn_koopman.npy'
    np.save(save_path, log_data)
    print(f"\n>>> 训练日志已保存至: {save_path}")

# Start training only when this code runs as the main module, not on import.
if __name__ == '__main__':
    train_model()

正在加载数据: ./data/data_koopman_sequence.txt ...
总样本数: 4763
模型参数量: 9,250
开始训练 KoopmanCNN...
Epoch [1/500] | Loss: 0.4744 | Train: 94.48% | Test: 92.65%
Epoch [2/500] | Loss: 0.3135 | Train: 95.20% | Test: 92.09%
Epoch [3/500] | Loss: 0.2231 | Train: 95.23% | Test: 92.44%
Epoch [4/500] | Loss: 0.1827 | Train: 95.38% | Test: 92.09%
Epoch [5/500] | Loss: 0.1668 | Train: 95.95% | Test: 93.28%
Epoch [6/500] | Loss: 0.1644 | Train: 96.13% | Test: 94.19%
Epoch [7/500] | Loss: 0.1530 | Train: 96.22% | Test: 94.47%
Epoch [8/500] | Loss: 0.1468 | Train: 96.28% | Test: 93.77%
Epoch [9/500] | Loss: 0.1426 | Train: 96.46% | Test: 94.33%
Epoch [10/500] | Loss: 0.1385 | Train: 96.64% | Test: 94.26%
Epoch [11/500] | Loss: 0.1392 | Train: 96.40% | Test: 94.40%
Epoch [12/500] | Loss: 0.1336 | Train: 96.64% | Test: 92.65%
Epoch [13/500] | Loss: 0.1350 | Train: 96.70% | Test: 94.26%
Epoch [14/500] | Loss: 0.1306 | Train: 96.73% | Test: 94.19%
Epoch [15/500] | Loss: 0.1288 | Train: 96.79% | Test: 95.24%
Epoch 

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib
matplotlib.use('Agg') # 服务器端绘图
import matplotlib.pyplot as plt
import os

# ================= Configuration =================
# Shared settings read by the training and visualization pipelines below.
CONFIG = {
    # Comma-separated text file; the last field of each line is the label.
    'data_path': r'./data/data_koopman_sequence.txt',
    'batch_size': 16,
    'epochs': 300,
    # Learning rate passed to the Adam optimizer.
    'lr': 1e-4,
    # Use the GPU when one is available, otherwise fall back to the CPU.
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    # Used as random_state for the train/test split.
    'seed': 42,
    'save_fig_path': 'Koopman_CNN_Feature_Evolution.pdf' # output file name of the saved figure
}
# =======================================

# ==============================================================================
# 0. 基础组件 (Dataset, Collate, Model)
# ==============================================================================

class KoopmanDataset(Dataset):
    """Dataset of variable-length numeric sequences with integer class labels."""

    def __init__(self, data_list, labels):
        """Store references to the sequences and their aligned labels."""
        self.labels = labels
        self.data = data_list

    def __len__(self):
        """Return how many sequences the dataset holds."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(float32 sequence tensor, int64 label tensor)``."""
        sequence_tensor = torch.tensor(self.data[idx], dtype=torch.float32)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return sequence_tensor, label_tensor

def load_data_from_txt(file_path):
    """Read variable-length sequences and labels from a comma-separated file.

    Every line is expected to look like ``v1,v2,...,vn,label``: the last
    field is the integer class label, all preceding fields are floats.
    Lines with fewer than two fields are skipped and sequences with five or
    fewer values are dropped.

    Loading is best-effort: if the file cannot be read or any field fails to
    parse, the error is printed and two empty lists are returned (partial
    results are discarded). Only I/O and parsing errors are handled this
    way; anything else propagates so programming mistakes are not hidden.

    Args:
        file_path: Path of the text file to read.

    Returns:
        ``(sequences, labels)`` — a list of 1-D ``np.ndarray`` and the
        matching list of ints, or ``([], [])`` on failure.
    """
    sequences = []
    labels = []
    print(f"Loading data from: {file_path} ...")
    try:
        with open(file_path, 'r') as f:
            for line in f:
                parts = line.strip().split(',')
                if len(parts) < 2:
                    continue
                label = int(parts[-1])
                seq_data = [float(x) for x in parts[:-1]]
                if len(seq_data) > 5:
                    sequences.append(np.array(seq_data))
                    labels.append(label)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: non-numeric field
        # (this also covers UnicodeDecodeError, a ValueError subclass).
        print(f"Error loading data: {e}")
        return [], []

    print(f"Loaded {len(sequences)} samples.")
    return sequences, labels

def collate_fn(batch):
    """Zero-pad a batch of ``(sequence, label)`` pairs to a common length.

    Returns:
        ``(padded, labels, lengths)``: ``padded`` has shape
        ``(batch, max_len, 1)``, ``labels`` is the stacked label tensor and
        ``lengths`` contains each sequence's length before padding.
    """
    seq_list, label_list = map(list, zip(*batch))
    original_lengths = torch.tensor(list(map(len, seq_list)))
    # (batch, max_len) -> (batch, max_len, 1)
    padded = pad_sequence(seq_list, batch_first=True, padding_value=0).unsqueeze(-1)
    return padded, torch.stack(label_list), original_lengths

class KoopmanCNN(nn.Module):
    """Three-stage 1-D CNN classifier with a feature-extraction hook.

    Two Conv-BN-ReLU-MaxPool stages are followed by a Conv-BN-ReLU stage with
    global average pooling, so the classifier head always receives 64
    features regardless of the sequence length.

    Args:
        input_channels: Number of features per time step (size of the last
            input axis). Defaults to 1.
        num_classes: Number of output logits. Defaults to 2.
    """

    def __init__(self, input_channels=1, num_classes=2):
        super().__init__()

        # Layer 1 — honours ``input_channels`` instead of hard-coding 1.
        self.layer1 = nn.Sequential(
            nn.Conv1d(input_channels, 16, kernel_size=5, padding=2),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.MaxPool1d(2)
        )

        # Layer 2 (tap point for the intermediate features)
        self.layer2 = nn.Sequential(
            nn.Conv1d(16, 32, kernel_size=5, padding=2),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(2)
        )

        # Layer 3 (tap point for the final features)
        self.layer3 = nn.Sequential(
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1)  # (batch, 64, 1)
        )

        self.fc = nn.Linear(64, num_classes)

    def forward(self, x, lengths=None):
        """Compute class logits.

        Args:
            x: Tensor of shape ``(batch, length, input_channels)``. The two
                stride-2 max-pool stages need ``length >= 4``.
            lengths: Accepted for call-compatibility; not used, so padded
                positions are treated as ordinary input.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        x = x.permute(0, 2, 1)  # (batch, channels, length) for Conv1d
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = out.flatten(1)
        return self.fc(out)

    def extract_features(self, x):
        """Return pooled activations for visualization.

        Args:
            x: Tensor of shape ``(batch, length, input_channels)``.

        Returns:
            ``(feat_inter, feat_final)``: the layer-2 output averaged over
            the length axis, shape ``(batch, 32)``, and the layer-3 output
            flattened to ``(batch, 64)`` (the input of the classifier head).
        """
        x = x.permute(0, 2, 1)

        out1 = self.layer1(x)

        # Intermediate features: global average over the time axis.
        out2 = self.layer2(out1)
        feat_inter = out2.mean(dim=2)

        # Final latent features: layer 3 already pools the length down to 1.
        out3 = self.layer3(out2)
        feat_final = out3.flatten(1)

        return feat_inter, feat_final

# ==============================================================================
# PART 1: 训练流程 (Training Pipeline)
# ==============================================================================

def train_pipeline():
    """Train ``KoopmanCNN`` with the settings in ``CONFIG``.

    Loads the sequence file, makes a 70/30 train/test split, and trains with
    Adam and cross-entropy. Progress is printed for the first epoch and every
    tenth epoch thereafter. The test split is not evaluated here; it is
    handed back for the visualization stage.

    Returns:
        ``(model, vis_data)`` where ``vis_data`` is a dict with the raw test
        sequences (``'X_test_seq'``), the test labels as an array
        (``'y_test'``) and the test ``DataLoader`` (``'test_loader'``).
        Returns ``(None, None)`` when no sequences could be loaded.
    """
    print("\n>>> [Part 1] Starting Training Pipeline...")

    # 1. Load the data
    sequences, labels = load_data_from_txt(CONFIG['data_path'])
    if not sequences:
        return None, None

    X_train, X_test, y_train, y_test = train_test_split(
        sequences, labels, test_size=0.3, random_state=CONFIG['seed']
    )

    train_loader = DataLoader(
        KoopmanDataset(X_train, y_train),
        batch_size=CONFIG['batch_size'],
        shuffle=True,
        collate_fn=collate_fn,
    )
    test_loader = DataLoader(
        KoopmanDataset(X_test, y_test),
        batch_size=CONFIG['batch_size'],
        shuffle=False,
        collate_fn=collate_fn,
    )

    # 2. Build the model, loss and optimizer
    device = CONFIG['device']
    model = KoopmanCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=CONFIG['lr'])

    # Parameter count
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Model: {model.__class__.__name__} | Total Params: {total_params:,}")

    # 3. Training loop
    print("Training started...")
    num_epochs = CONFIG['epochs']
    for epoch_number in range(1, num_epochs + 1):
        model.train()
        loss_sum = 0.0
        hits = 0
        seen = 0

        for inputs, targets, _ in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            predicted = outputs.detach().argmax(dim=1)
            seen += targets.size(0)
            hits += (predicted == targets).sum().item()

        train_acc = 100 * hits / seen

        # Brief progress report: first epoch and every tenth epoch.
        if epoch_number == 1 or epoch_number % 10 == 0:
            print(f"Epoch [{epoch_number}/{num_epochs}] Loss: {loss_sum/len(train_loader):.4f} Train Acc: {train_acc:.2f}%")

    print(">>> Training Finished.")

    # Bundle the model with the test data needed by the visualization stage.
    vis_data = {
        'X_test_seq': X_test,        # raw list, used for the input-space panel
        'y_test': np.array(y_test),
        'test_loader': test_loader,  # used to extract model features
    }
    return model, vis_data

# ==============================================================================
# PART 2: 可视化流程 (Visualization Pipeline)
# ==============================================================================

def visualize_pipeline(model, vis_data):
    """Plot t-SNE embeddings of the input, intermediate and final features.

    Three panels are drawn side by side: the zero-padded raw sequences, the
    pooled layer-2 activations and the layer-3 activations returned by
    ``model.extract_features``. Each panel is annotated with the silhouette
    score of the *un-embedded* features against the labels, and the figure
    is written to ``CONFIG['save_fig_path']``.

    Args:
        model: Trained network exposing ``extract_features(inputs)`` that
            returns ``(intermediate, final)`` feature tensors.
        vis_data: Dict with ``'X_test_seq'`` (list of 1-D sequences),
            ``'y_test'`` (label array; only labels 0 and 1 are drawn) and
            ``'test_loader'`` (unshuffled loader over the same samples, so
            feature rows line up with ``'y_test'``).
    """
    print("\n>>> [Part 2] Starting Feature Visualization...")
    model.eval()

    y_test = vis_data['y_test']

    # -------------------------------------------------------
    # 1. Feature extraction
    # -------------------------------------------------------

    # (a) Input space: pad the variable-length sequences into one matrix so
    # t-SNE can consume them.
    print("   -> Processing Input Space features...")
    X_test_seq = vis_data['X_test_seq']

    # Simple down-sampling so very long sequences do not stall t-SNE.
    # NOTE(review): the stride is decided from the first sequence only —
    # confirm this is intended for variable-length data.
    downsample_rate = 10 if len(X_test_seq[0]) > 500 else 1

    seqs_ds = [s[::downsample_rate] for s in X_test_seq]
    max_len = max(len(s) for s in seqs_ds)

    # Zero-padded matrix of shape (N, max_len)
    X_input_mat = np.zeros((len(seqs_ds), max_len))
    for i, s in enumerate(seqs_ds):
        X_input_mat[i, :len(s)] = s

    # (b) & (c) Model features
    print("   -> Extracting CNN Intermediate & Final features...")
    feats_inter_list = []
    feats_final_list = []

    with torch.no_grad():
        for inputs, _, _ in vis_data['test_loader']:
            inputs = inputs.to(CONFIG['device'])
            f_inter, f_final = model.extract_features(inputs)

            feats_inter_list.append(f_inter.cpu().numpy())
            feats_final_list.append(f_final.cpu().numpy())

    X_inter_mat = np.concatenate(feats_inter_list, axis=0)
    X_final_mat = np.concatenate(feats_final_list, axis=0)

    # -------------------------------------------------------
    # 2. Dimensionality reduction and plotting
    # -------------------------------------------------------
    data_map = [
        ('Koopman Input Space\n(Raw Sequences)', X_input_mat),
        ('CNN Intermediate Features\n(Layer 2 Output)', X_inter_mat),
        ('CNN Final Latent Space\n(Layer 3 Output)', X_final_mat)
    ]

    # Plot style
    plt.style.use('seaborn-v0_8-paper')
    plt.rcParams.update({
        "font.family": "serif",
        "font.serif": ["Times New Roman"],
        "font.size": 12,
        "axes.labelsize": 14,
        "legend.fontsize": 12,
        "figure.titlesize": 16
    })

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    colors = ['#1f77b4', '#ff7f0e']  # blue (Normal), orange (Disruption)
    class_names = ['Normal', 'Disruption']

    print("   -> Running t-SNE and plotting...")

    for i, (title, data) in enumerate(data_map):
        ax = axes[i]

        # t-SNE requires perplexity < n_samples, so cap it for small sets.
        perp = min(50, len(data) - 1)
        tsne = TSNE(
            n_components=2,
            perplexity=perp,
            early_exaggeration=20,  # larger value pushes clusters further apart
            learning_rate='auto',
            init='pca',             # PCA init keeps more of the global structure
            max_iter=1000,
            random_state=42
        )
        emb = tsne.fit_transform(data)

        # Silhouette score on the original features (not on the embedding).
        try:
            score = silhouette_score(data, y_test)
        except ValueError as err:
            # Raised e.g. when only one class is present; report it rather
            # than hiding it, and fall back to 0 for the annotation.
            print(f"   -> Silhouette score unavailable for panel {i}: {err}")
            score = 0.0

        # Scatter plot, one colour per class
        for lbl_idx, color in enumerate(colors):
            mask = (y_test == lbl_idx)
            ax.scatter(
                emb[mask, 0], emb[mask, 1],
                c=color,
                label=class_names[lbl_idx],
                alpha=0.75,
                s=30,
                edgecolors='w',  # thin white outline around each marker
                linewidth=0.3
            )

        ax.set_title(f"({chr(97+i)}) {title}", fontweight='bold')
        ax.set_xticks([])
        ax.set_yticks([])

        # Score annotation box
        ax.text(0.05, 0.92, f'S-Score: {score:.3f}', transform=ax.transAxes,
                bbox=dict(facecolor='white', alpha=0.9, edgecolor='gray', boxstyle='round'))

    # Shared legend below the panels
    handles, _ = axes[0].get_legend_handles_labels()
    fig.legend(handles, class_names, loc='lower center', ncol=2, bbox_to_anchor=(0.5, 0.0), frameon=True)

    fig.tight_layout()
    fig.subplots_adjust(bottom=0.15)  # leave room for the legend
    fig.savefig(CONFIG['save_fig_path'], dpi=300)
    plt.close(fig)  # release the figure so repeated calls do not accumulate
    print(f">>> Figure saved to {CONFIG['save_fig_path']}")

# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == '__main__':
    # 1. Run training
    trained_model, vis_data = train_pipeline()
    
    # 2. Run the visualization only if training returned a model
    if trained_model is not None:
        visualize_pipeline(trained_model, vis_data)


>>> [Part 1] Starting Training Pipeline...
Loading data from: ./data/data_koopman_sequence.txt ...
Loaded 4763 samples.
Model: KoopmanCNN | Total Params: 9,250
Training started...
Epoch [1/300] Loss: 0.6844 Train Acc: 53.15%
Epoch [10/300] Loss: 0.1412 Train Acc: 96.37%
Epoch [20/300] Loss: 0.1298 Train Acc: 96.64%
Epoch [30/300] Loss: 0.1088 Train Acc: 97.36%
Epoch [40/300] Loss: 0.1103 Train Acc: 97.18%
Epoch [50/300] Loss: 0.1026 Train Acc: 97.57%
Epoch [60/300] Loss: 0.0959 Train Acc: 97.60%
Epoch [70/300] Loss: 0.1055 Train Acc: 97.18%
Epoch [80/300] Loss: 0.1018 Train Acc: 97.45%
Epoch [90/300] Loss: 0.0918 Train Acc: 97.78%
Epoch [100/300] Loss: 0.0892 Train Acc: 97.78%
Epoch [110/300] Loss: 0.0934 Train Acc: 97.48%
Epoch [120/300] Loss: 0.0905 Train Acc: 97.84%
Epoch [130/300] Loss: 0.0885 Train Acc: 97.84%
Epoch [140/300] Loss: 0.0832 Train Acc: 97.93%
Epoch [150/300] Loss: 0.0881 Train Acc: 97.81%
Epoch [160/300] Loss: 0.0864 Train Acc: 97.96%
Epoch [170/300] Loss: 0.0886 Tr