# Notice
训练之前请确保，特征模型训练后得到的 **`feature_model_1dcnn.pth`** 和 **`scaler_50hz_torch.gz`** 这两个文件存在

# 导入依赖库，定义参数

In [1]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict
import re
import io
import joblib
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from typing import List



# --- Configuration ---
DATASET_PATH = 'MobiFall_Dataset'
TARGET_SAMPLING_RATE_HZ = 50.0  # Target sampling rate in Hz
TARGET_SAMPLING_PERIOD = f"{int(1000 / TARGET_SAMPLING_RATE_HZ)}ms"
SEQUENCE_LENGTH = int(TARGET_SAMPLING_RATE_HZ * 4) # 200 samples for 4 seconds at 50Hz
STEP = int(TARGET_SAMPLING_RATE_HZ * 1)          # 50 samples for 1 second step at 50Hz

SENSOR_CODES = ["acc", "gyro", "ori"]
EXPECTED_COLUMNS = {
    "acc": ["acc_x", "acc_y", "acc_z"],
    "gyro": ["gyro_x", "gyro_y", "gyro_z"],
    "ori": ["ori_azimuth", "ori_pitch", "ori_roll"]
}
ALL_FEATURE_COLUMNS = [
    "acc_x", "acc_y", "acc_z", "acc_smv",
    "gyro_x", "gyro_y", "gyro_z", "gyro_smv",
    "ori_azimuth", "ori_pitch", "ori_roll"
]

# 第 1 步：处理原始数据

In [2]:
def load_and_resample_sensor_file(filepath, sensor_code):
    """加载单个传感器文件，转换时间戳并进行重采样。"""
    try:
        with open(filepath, 'r') as f:
            lines = f.readlines()

        # 初始化一个变量作为“标记未找到”的标志
        data_start_line_index = -1

        # 遍历文件中的每一行
        for i, line in enumerate(lines):
            # 检查当前行是否是"@DATA"标记
            if line.strip().upper() == "@DATA":
                # 如果是，则记录下一行的行号并跳出循环
                data_start_line_index = i + 1
                break

        # 检查标记是否被找到
        if data_start_line_index == -1 or data_start_line_index >= len(lines):
            return None

        # 将数据行拼接成单个字符串
        data_string = "".join(lines[data_start_line_index:])

        # 检查字符串是否为空
        if not data_string.strip():
            return None

        # 使用pandas处理数据
        df = pd.read_csv(io.StringIO(data_string), header=None, usecols=[0, 1, 2, 3])
        
        # 检查生成的数据表是否为空
        if df.empty:
            return None

        # 为数据列进行命名
        df.columns = ['timestamp_ns'] + EXPECTED_COLUMNS[sensor_code]

        # 将ns时间戳转换为标准的日期时间格式
        df['timestamp'] = pd.to_datetime(df['timestamp_ns'], unit='ns')

        # 将新的日期时间设置为索引，并删除旧的时间戳列
        df = df.set_index('timestamp').drop(columns=['timestamp_ns'])

        # 按时间索引进行排序
        df = df.sort_index()

        # 将采样时间不均匀的传感器数据，强制转换为频率统一（每20毫秒一个点）的规整数据流，并填补其中的所有空白
        df_resampled = df.resample(TARGET_SAMPLING_PERIOD).mean().interpolate(method='linear', limit_direction='both')

        # 检查当前处理的传感器是否为加速度计 ('acc')
        if sensor_code == 'acc':
            # 安全性检查 - 确认三轴数据都存在
            if all(col in df_resampled.columns for col in ['acc_x', 'acc_y', 'acc_z']):
                # 计算信号幅值向量 (SMV)
                df_resampled['acc_smv'] = np.sqrt(
                    df_resampled['acc_x']**2 + df_resampled['acc_y']**2 + df_resampled['acc_z']**2
                )

        # 如果不是加速度计，则检查是否为陀螺仪 ('gyro')
        elif sensor_code == 'gyro':
            # 对陀螺仪数据执行相同的操作
            if all(col in df_resampled.columns for col in ['gyro_x', 'gyro_y', 'gyro_z']):
                df_resampled['gyro_smv'] = np.sqrt(
                    df_resampled['gyro_x']**2 + df_resampled['gyro_y']**2 + df_resampled['gyro_z']**2
                )

        return df_resampled

    except (pd.errors.EmptyDataError, ValueError):
        return None
    except Exception as e:
        print(f"Error processing file {filepath}: {e}. Skipping.")
        return None

def load_data_from_structured_folders(dataset_root_path):
    """遍历数据集文件夹，处理、对齐并组合每个试验的传感器数据。"""
    print(f"Scanning for data in: {dataset_root_path}")
    if not os.path.isdir(dataset_root_path):
        print(f"ERROR: Dataset root path '{dataset_root_path}' not found.")
        return [], []

    # 存放每一次活动试验（trial）所对应的各个传感器文件的路径（数据文件的位置）
    trial_sensor_files_map = defaultdict(lambda: defaultdict(str))

    # 存放每一次活动试验的元数据（这些数据代表什么，即标签信息）
    trial_metadata_map = {}
    
    # 遍历数据集的每一个文件夹
    for dirpath, _, filenames in os.walk(dataset_root_path):
        # 解析文件夹路径，以确定活动类别和具体活动
        relative_path = os.path.relpath(dirpath, dataset_root_path)
        path_parts = relative_path.split(os.sep)
        # 确保只处理包含实际数据文件的特定层级文件夹
        if len(path_parts) != 3: continue

        # 遍历这些特定文件夹中的每一个文件
        for filename in filenames:
            # 确保只处理.txt文件
            if not filename.endswith(".txt"): continue
            
            # 解析文件名，通过下划线分割以获取各个部分
            fname_parts = filename.replace('.txt', '').split('_')
            # 过滤掉不符合预期格式的文件名
            if len(fname_parts) != 4: continue
            
            # 从文件名部分中提取所需信息
            _, sensor_code, _, trial_no_str = fname_parts
            # 将传感器代码转为小写以保持一致性
            sensor_code = sensor_code.lower()
            # 确保是已知的传感器类型 ('acc', 'gyro', 'ori')
            if sensor_code not in SENSOR_CODES: continue

            # 尝试从路径和文件名中提取并转换所有元数据
            try:
                # 从文件夹路径的第一部分提取受试者ID
                subject_match = re.fullmatch(r'sub(\d+)', path_parts[0], re.IGNORECASE)
                if not subject_match: continue
                subject_id = int(subject_match.group(1))
                
                # 从文件夹路径的第二和第三部分获取类别和活动代码
                category = path_parts[1].upper()
                activity_code = path_parts[2].upper()
                # 将试验编号从字符串转换为整数
                trial_no = int(trial_no_str)
                # 构建完整的文件路径
                filepath = os.path.join(dirpath, filename)
                
                # 创建一个唯一的键来标识这次试验 (受试者, 活动, 试验编号)
                trial_key = (subject_id, activity_code, trial_no)
                # 在映射表中存储该传感器文件的路径
                trial_sensor_files_map[trial_key][sensor_code] = filepath
                # 如果是第一次遇到这个试验，则记录其元数据（类别和活动代码）
                if trial_key not in trial_metadata_map:
                    trial_metadata_map[trial_key] = {"category": category, "activity_code": activity_code}
            except (AttributeError, ValueError):
                # 如果在提取或转换过程中出现任何错误，则跳过该文件
                continue

    # 初始化两个列表，用于存放最终处理好的数据和对应的标签
    processed_trials_data, labels = [], []
    print(f"\nProcessing and combining {len(trial_sensor_files_map)} unique trials...")
    
    # 遍历前面组织好的每一次活动试验（trial）
    for trial_key, sensor_files in trial_sensor_files_map.items():
        # 确保该次试验包含了 acc, gyro, ori 全部三种传感器文件，否则跳过
        if not all(s_code in sensor_files for s_code in SENSOR_CODES): continue

        # 使用字典推导式，为每种传感器加载并重采样数据
        resampled_dfs = {s_code: load_and_resample_sensor_file(sensor_files[s_code], s_code) for s_code in SENSOR_CODES}
        # 如果任何一个文件加载或处理失败（返回了None或空表），则跳过这次试验
        if any(df is None or df.empty for df in resampled_dfs.values()): continue

        try:
            # --- 时间对齐关键步骤 ---
            # 找到三个传感器数据中最晚的开始时间
            common_start = max(df.index.min() for df in resampled_dfs.values())
            # 找到三个传感器数据中最早的结束时间
            common_end = min(df.index.max() for df in resampled_dfs.values())
            # 如果没有重叠的时间窗口，则跳过
            if common_start >= common_end: continue

            # 将三个数据表都裁剪到共同的时间范围内
            aligned_dfs = [resampled_dfs[s_code][common_start:common_end].reset_index(drop=True) for s_code in SENSOR_CODES]
            # 确保对齐后的数据表长度一致且不为空，否则跳过
            if not all(len(df) > 0 and len(df) == len(aligned_dfs[0]) for df in aligned_dfs): continue
            
            # --- 数据合并 ---
            # 按列（axis=1）将三个对齐后的数据表拼接成一个宽表
            combined_df = pd.concat(aligned_dfs, axis=1)
            
            # 再次检查并确保列名正确
            if len(combined_df.columns) == len(ALL_FEATURE_COLUMNS):
                 combined_df.columns = ALL_FEATURE_COLUMNS
            else:
                 continue # 如果列数不匹配则跳过

            # 如果合并后的数据长度不足一个序列窗口（4秒），则跳过
            if len(combined_df) < SEQUENCE_LENGTH: continue
            
            # --- 数据和标签存储 ---
            # 将处理好的数据（转换为Numpy数组）存入列表
            processed_trials_data.append(combined_df.values)
            # 根据元数据判断该试验是"FALLS"还是"ADL"，并存入标签（1代表跌倒，0代表非跌倒）
            labels.append(1 if trial_metadata_map[trial_key]["category"] == "FALLS" else 0)
            
        except Exception:
            # 捕获任何在对齐和合并过程中可能出现的意外错误，并跳过该试验
            continue

    print(f"Successfully processed and combined sensor data for {len(processed_trials_data)} trials.")
    # 返回包含所有处理好的试验数据和标签的列表
    return processed_trials_data, labels

# def create_sequences(data_list, label_list, seq_length, step):
#     """使用滑动窗口从试验数据创建序列。"""
#     # 初始化用于存放最终序列和对应标签的列表
#     X, y = [], []
#     # 遍历每一次活动试验的数据
#     for i, trial_data in enumerate(data_list):
#         trial_label = label_list[i]
#         # 在单次试验数据上，按指定的步长（step）移动窗口
#         for j in range(0, len(trial_data) - seq_length + 1, step):
#             # 截取一个固定长度（seq_length）的片段作为序列
#             X.append(trial_data[j:(j + seq_length)])
#             # 为这个序列分配对应的标签
#             y.append(trial_label)
            
#     if not X: return np.array([]), np.array([])
#     # 将列表转换为Numpy数组后返回
#     return np.array(X), np.array(y)


trial_arrays, trial_labels = load_data_from_structured_folders(DATASET_PATH)
# x, y = create_sequences(trial_arrays, trial_labels, SEQUENCE_LENGTH, STEP)
# print(f"The shape of X: {x.shape}, The shape of y: {y.shape}")


Scanning for data in: MobiFall_Dataset

Processing and combining 627 unique trials...
Successfully processed and combined sensor data for 627 trials.


In [3]:
def create_continuous_stream(data_list: list, label_list: list) -> (np.ndarray, np.ndarray):
    """
    将所有试验数据拼接成一个连续的、无采样的原始数据流，并生成对应的标签流。

    参数:
    - data_list (list): 一个列表，其中每个元素是一个试验的Numpy数组 (形状为 [n_samples, 11])。
    - label_list (list): 一个列表，包含每个试验对应的标签 (0 或 1)。

    返回:
    - continuous_data (np.ndarray): 拼接后的连续数据流，形状为 [总时间点数, 11]。
    - continuous_labels (np.ndarray): 对应的连续标签流，形状为 [总时间点数,]。
    """
    if not data_list:
        return np.array([]), np.array([])

    # 为每个试验的每个时间点生成对应的标签
    # 例如，如果一个试验有491个时间点，标签是1，我们就生成一个包含491个1的数组
    all_trial_labels_expanded = []
    for i, trial_data in enumerate(data_list):
        num_timesteps = trial_data.shape[0]  # 获取该试验的时间点数量
        trial_label = label_list[i]
        # 使用 np.full 创建一个长度为 num_timesteps，值全为 trial_label 的数组
        labels_for_this_trial = np.full(num_timesteps, trial_label, dtype=np.int32)
        all_trial_labels_expanded.append(labels_for_this_trial)

    # 使用 np.concatenate 将所有试验数据数组沿第一个轴（时间轴）拼接起来
    continuous_data = np.concatenate(data_list, axis=0)
    
    # 同样地，拼接所有扩展后的标签数组
    continuous_labels = np.concatenate(all_trial_labels_expanded, axis=0)

    return continuous_data, continuous_labels

continuous_data, continuous_labels = create_continuous_stream(trial_arrays, trial_labels)
print(f"Continuous data shape: {continuous_data.shape}")


Continuous data shape: (593473, 11)


## 定义模型

In [4]:
class FeatureModel1DCNN(nn.Module):
    def __init__(self, input_channels=11, num_classes=1):
        super(FeatureModel1DCNN, self).__init__()
        
        # 特征提取器: 包含一系列的卷积和池化层
        self.feature_extractor = nn.Sequential(
            # Block 1
            nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.MaxPool1d(kernel_size=2, stride=2), # Length: 200 -> 100
            
            # Block 2
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.MaxPool1d(kernel_size=2, stride=2), # Length: 100 -> 50

            # Block 3
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.MaxPool1d(kernel_size=2, stride=2)  # Length: 50 -> 25
        )
        
        # 分类器: 将提取的特征映射到最终的输出
        # 输入维度需要计算: 256 (channels) * 25 (length)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 25, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """
        标准的前向传播，用于训练和评估
        x 的输入形状: (batch_size, sequence_length, num_features) -> (N, 200, 11)
        """
        # Conv1d 需要 (N, C, L) 格式, 所以我们需要重排维度
        x = x.permute(0, 2, 1) # -> (N, 11, 200)
        
        features = self.feature_extractor(x)
        output = self.classifier(features)
        
        # 因为使用 BCEWithLogitsLoss, 所以不需要在这里加 sigmoid
        return output

    def extract_features(self, x):
        """
        仅用于提取中间特征的函数
        x 的输入形状: (batch_size, sequence_length, num_features) -> (N, 200, 11)
        """
        # 同样需要重排维度
        x = x.permute(0, 2, 1) # -> (N, 11, 200)
        
        # 只通过特征提取器
        features = self.feature_extractor(x)
        
        # 输出形状将是 (N, 256, 25)
        return features

# 第 2 步：生成连续的特征流和标签流

In [None]:
# --- 1. 配置参数 ---
# 确保这些参数与模型训练和模拟逻辑一致
WINDOW_SECONDS = 4

WINDOW_SIZE = int(TARGET_SAMPLING_RATE_HZ * WINDOW_SECONDS) # 窗口大小 (200个点)
STEP_SIZE = 25 # 步长 (25个点)

MODEL_PATH = "feature_model_1dcnn.pth"
SCALER_PATH = "scaler_50hz_torch.gz"

# --- 2. 加载模型和标准化器 ---
print("正在加载模型和标准化器...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 加载模型
model = FeatureModel1DCNN(input_channels=11, num_classes=1).to(device)
if os.path.exists(MODEL_PATH):
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    print(f"模型已从 {MODEL_PATH} 加载")
else:
    print(f"警告: 在 {MODEL_PATH} 未找到模型文件。将使用随机初始化的模型。")
model.eval() # 设置为评估模式

# 加载标准化器
if os.path.exists(SCALER_PATH):
    scaler = joblib.load(SCALER_PATH)
    print(f"标准化器已从 {SCALER_PATH} 加载")
else:
    # 抛出错误并停止执行
    raise FileNotFoundError(f"ERROR: Standard scaler file not found at '{SCALER_PATH}'. Cannot proceed without it.")


# --- 3. 批量处理数据并提取特征 ---
print("\n开始批量提取特征...")
all_features_list = []
all_labels_list = []

# `trial_arrays` 和 `trial_labels` 变量是从上一个数据加载单元格中获得的
# 遍历每一次试验的数据
for i in range(len(continuous_data) - WINDOW_SIZE + 1):
    trial_label = continuous_labels[i]

    window = continuous_data[i : i + WINDOW_SIZE]
    
    # 2. 预处理窗口数据 (标准化 -> 转换为Tensor)
    scaled_window = scaler.transform(window)
    window_tensor = torch.tensor(scaled_window, dtype=torch.float32).unsqueeze(0).to(device)
    
    # 3. 从模型中提取特征
    with torch.no_grad(): # 关闭梯度计算以加速
        features = model.extract_features(window_tensor)
    
    # 4. 将特征扁平化并添加到列表中
    flattened_features = features.cpu().numpy().flatten()
    all_features_list.append(flattened_features)
    
    # 5. 将该窗口对应的标签添加到列表中
    all_labels_list.append(trial_label)

print(f"处理完成！共处理了 {len(trial_arrays)} 次试验，生成了 {len(all_features_list)} 个特征向量。")

# --- 4. 保存最终的数据集 ---
if all_features_list:
    # 将列表转换为Numpy数组
    final_features = np.array(all_features_list)
    final_labels = np.array(all_labels_list)

    # 保存数组到.npy文件
    np.save("all_features.npy", final_features)
    np.save("all_labels.npy", final_labels)

    print(f"\n数据集已成功保存:")
    print(f"  - 特征文件: all_features.npy, 形状: {final_features.shape}")
    print(f"  - 标签文件: all_labels.npy, 形状: {final_labels.shape}")
else:
    print("\n未能生成任何特征，未创建文件。")

正在加载模型和标准化器...
模型已从 feature_model_1dcnn.pth 加载
标准化器已从 scaler_50hz_torch.gz 加载

开始批量提取特征...


KeyboardInterrupt: 

# 第 3 步：生成场景

In [6]:
def create_training_scenarios(all_features, continuous_raw_data, continuous_labels,
                              window_size, history_len, availability_ratio):
    """
    基于连续的LFS和HFS，生成用于训练保真模型的多样化场景。

    :param all_features: 低保真特征流 (LFS), (N, feature_dim)
    :param continuous_raw_data: 连续的原始数据流 (HFS), (M, 11)
    :param continuous_labels: 连续的标签流, (M,)
    :param window_size: 生成单个特征所需的原始数据窗口大小 (200)
    :param history_len: 保真模型LSTM所需的历史特征序列长度
    :param availability_ratio: 原始数据在任一时间点可用的概率
    """
    print("--- Starting Scenario Generator ---")
    
    # 1. 对齐数据流
    print("Step 1: Aligning Low-Fidelity and High-Fidelity streams...")
    offset = window_size - 1
    aligned_hfs = continuous_raw_data[offset:]
    aligned_labels = continuous_labels[offset:]
    
    # 确保LFS和对齐后的HFS/Labels长度一致
    num_features = len(all_features)
    aligned_hfs = aligned_hfs[:num_features]
    aligned_labels = aligned_labels[:num_features]
    
    print(f"Streams aligned. Resulting length: {num_features} points.")
    
    # 2. 生成场景
    print("Step 2: Generating scenarios with Window-Level Sparsity...")
    scenarios = []
    
    # 从第一个可以构成完整历史序列的点开始遍历
    for t in range(history_len - 1, num_features):
        # 提取历史特征序列
        feature_sequence = all_features[t - history_len + 1 : t + 1]
        
        # 模拟采样决策
        raw_data_for_t = None
        if random.random() < availability_ratio:
            # 如果“采样”成功，则获取当前时间点对应的原始数据窗口
            start_index = t - (window_size - 1)
            end_index = t + 1
            raw_data_for_t = aligned_hfs[start_index : end_index]
            
            # 安全检查，确保切片长度正确
            if raw_data_for_t.shape[0] != window_size:
                continue # 如果窗口不完整（通常在数据流末尾），则跳过此样本
        
        # 提取标签
        label = aligned_labels[t]

        # 存入场景列表
        scenarios.append((feature_sequence, raw_data_for_t, label))
        
    print(f"Generated {len(scenarios)} scenarios.")
    print(f"Example: {len([s for s in scenarios if s[1] is not None])} scenarios have raw data available.")
    return scenarios


# PyTorch Dataset
class FidelityDataset(Dataset):
    def __init__(self, scenarios):
        self.scenarios = scenarios
        print(f"FidelityDataset created with {len(self.scenarios)} samples.")

    def __len__(self):
        return len(self.scenarios)

    def __getitem__(self, idx):
        feature_seq, raw_data, label = self.scenarios[idx]

        feature_seq_tensor = torch.tensor(feature_seq, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.float32)

        if raw_data is not None:
            raw_data_tensor = torch.tensor(raw_data, dtype=torch.float32)
        else:
            # 当原始数据不存在时，用一个全零的Tensor作为占位符
            # 这就是对“完全缺失”的窗口级“全零”填补
            raw_data_tensor = torch.zeros((200, 11), dtype=torch.float32)
            
        return feature_seq_tensor, raw_data_tensor, label_tensor



# --- 1. 配置参数 ---
HISTORY_LEN = 60      # 保真模型的历史窗口长度 (30秒)
WINDOW_SIZE = 200     # 特征模型用的原始数据窗口长度 (4秒)
AVAILABILITY_RATIO = 0.3 # 原始数据可用率 (稀疏度)

# --- 2. 加载基础数据流 ---
print("Loading foundational data streams...")

try:
    all_features = np.load("all_features.npy")
    all_labels = np.load("all_labels.npy") # 这里的标签是对应每个特征的

except FileNotFoundError:
    print("Error: `all_features.npy` or `all_labels.npy` not found.")
    print("Please generate these files first or use the dummy data block.")
    exit()

# --- 3. 调用场景生成器 ---
all_scenarios = create_training_scenarios(
    all_features=all_features,
    continuous_raw_data=continuous_data,
    continuous_labels=continuous_labels,
    window_size=WINDOW_SIZE,
    history_len=HISTORY_LEN,
    availability_ratio=AVAILABILITY_RATIO
)

# --- 4. 准备训练、验证集 ---
print("\nSplitting data into training and validation sets...")

# 从场景中解包，以便进行分层抽样
feature_sequences, raw_data_list, labels = zip(*all_scenarios)

# 划分训练集和验证集 (80/20)
X_seq_train, X_seq_val, X_raw_train, X_raw_val, y_train, y_val = train_test_split(
    feature_sequences, raw_data_list, labels, 
    test_size=0.2, random_state=42, stratify=labels
)

# 重新打包成场景元组
train_scenarios = list(zip(X_seq_train, X_raw_train, y_train))
val_scenarios = list(zip(X_seq_val, X_raw_val, y_val))

# 创建Dataset和DataLoader
train_dataset = FidelityDataset(train_scenarios)
val_dataset = FidelityDataset(val_scenarios)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

print("\n--- DataLoader Sanity Check ---")
# 从dataloader中取出一个批次的数据来检查
batch_feature_seq, batch_raw_data, batch_labels = next(iter(train_loader))

print(f"Feature sequence batch shape: {batch_feature_seq.shape}")
print(f"Raw data batch shape: {batch_raw_data.shape}")
print(f"Labels batch shape: {batch_labels.shape}")

print("\nDataset construction complete. The `train_loader` and `val_loader` are ready for training.")

Loading foundational data streams...
--- Starting Scenario Generator ---
Step 1: Aligning Low-Fidelity and High-Fidelity streams...
Streams aligned. Resulting length: 593274 points.
Step 2: Generating scenarios with Window-Level Sparsity...
Generated 593172 scenarios.
Example: 177811 scenarios have raw data available.

Splitting data into training and validation sets...
FidelityDataset created with 474537 samples.
FidelityDataset created with 118635 samples.

--- DataLoader Sanity Check ---
Feature sequence batch shape: torch.Size([64, 60, 6400])
Raw data batch shape: torch.Size([64, 200, 11])
Labels batch shape: torch.Size([64])

Dataset construction complete. The `train_loader` and `val_loader` are ready for training.


## 模型定义

In [7]:
def create_raw_data_cnn():
    """创建一个用于处理原始传感器数据的1D-CNN模块。"""
    raw_data_processor = nn.Sequential(
        nn.Conv1d(in_channels=11, out_channels=64, kernel_size=3, padding='same'), nn.ReLU(), nn.BatchNorm1d(64),
        nn.MaxPool1d(kernel_size=2, stride=2),
        nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding='same'), nn.ReLU(), nn.BatchNorm1d(128),
        nn.MaxPool1d(kernel_size=2, stride=2),
        nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding='same'), nn.ReLU(), nn.BatchNorm1d(256),
        nn.MaxPool1d(kernel_size=2, stride=2),
        nn.Flatten()
    )
    return raw_data_processor

class FidelityModelGated(nn.Module):
    def __init__(self, feature_dim, lstm_hidden_dim, raw_cnn_output_dim, num_classes=1):
        super(FidelityModelGated, self).__init__()
        self.lstm_hidden_dim = lstm_hidden_dim
        self.raw_cnn_output_dim = raw_cnn_output_dim
        self.feature_lstm = nn.LSTM(
            input_size=feature_dim, hidden_size=lstm_hidden_dim,
            num_layers=2, batch_first=True, dropout=0.5
        )
        self.raw_data_cnn = create_raw_data_cnn()
        self.gating_layer = nn.Sequential(
            nn.Linear(lstm_hidden_dim + self.raw_cnn_output_dim, lstm_hidden_dim), nn.ReLU(),
            nn.Linear(lstm_hidden_dim, 1), nn.Sigmoid()
        )
        self.raw_transform = nn.Linear(self.raw_cnn_output_dim, self.lstm_hidden_dim)
        self.classifier = nn.Sequential(
            nn.Linear(lstm_hidden_dim, 64), nn.ReLU(),
            nn.Dropout(0.5), nn.Linear(64, num_classes)
        )

    def forward(self, feature_sequence, raw_data=None):
        lstm_outputs, (h_n, c_n) = self.feature_lstm(feature_sequence)
        lstm_last_output = lstm_outputs[:, -1, :]
        if raw_data is not None and raw_data.nelement() > 0: # 确保传入的不是空tensor
            raw_data = raw_data.permute(0, 2, 1)
            v_raw = self.raw_data_cnn(raw_data)
        else:
            v_raw = torch.zeros(feature_sequence.size(0), self.raw_cnn_output_dim, device=feature_sequence.device)

        combined_for_gate = torch.cat((lstm_last_output, v_raw), dim=1)
        gate = self.gating_layer(combined_for_gate)
        transformed_v_raw = self.raw_transform(v_raw)
        fused_vector = lstm_last_output + gate * torch.tanh(transformed_v_raw)
        logits = self.classifier(fused_vector)
        state_feature = h_n[-1, :, :].squeeze(0)
        return logits, state_feature, gate

# 第 4 步：训练与评估函数模块

In [None]:
def train_one_epoch(model, dataloader, criterion, optimizer, device, noise_level=0.0):
    model.train()
    running_loss = 0.0
    for feature_seq, raw_data, labels in dataloader:
        feature_seq = feature_seq.to(device)
        raw_data = raw_data.to(device)
        labels = labels.to(device).unsqueeze(1) # 形状匹配BCEWithLogitsLoss

        optimizer.zero_grad()
        
        # 注意：只关心训练时的logits输出
        logits, _, _ = model(feature_seq, raw_data)
        
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * feature_seq.size(0)
    
    return running_loss / len(dataloader.dataset)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    all_preds, all_labels = [], []
    
    with torch.no_grad():
        for feature_seq, raw_data, labels in dataloader:
            feature_seq = feature_seq.to(device)
            raw_data = raw_data.to(device)
            labels = labels.to(device).unsqueeze(1)

            logits, _, _ = model(feature_seq, raw_data)
            loss = criterion(logits, labels)
            total_loss += loss.item() * feature_seq.size(0)
            
            preds = torch.sigmoid(logits) > 0.5
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader.dataset)
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', zero_division=0)
    
    return {
        "loss": avg_loss,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# 第 6 步：训练

In [None]:
# --- 1. 配置参数 ---
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
HISTORY_SEQ_LEN = 60  # 30秒历史
BATCH_SIZE = 64
LEARNING_RATE = 0.0005
EPOCHS = 10
MODEL_SAVE_PATH = "fidelity_model_best.pth"
NOISE_LEVEL = 0.25


# --- 4. 初始化模型并开始训练 ---
print("\nInitializing model for training...")
fidelity_model = FidelityModelGated(
    feature_dim=6400,
    lstm_hidden_dim=256,
    raw_cnn_output_dim=6400
).to(DEVICE)

# 使用BCEWithLogitsLoss，它内置了sigmoid，更稳定
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(fidelity_model.parameters(), lr=LEARNING_RATE)

best_val_f1 = 0.0
best_val_loss = float('inf')
print("--- Starting Training ---")
for epoch in range(EPOCHS):
    train_loss = train_one_epoch(fidelity_model, train_loader, criterion, optimizer, DEVICE, noise_level=NOISE_LEVEL)
    val_metrics = evaluate(fidelity_model, val_loader, criterion, DEVICE)
    
    print(
        f"Epoch {epoch+1}/{EPOCHS} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {val_metrics['loss']:.4f} | "
        f"Val Accuracy: {val_metrics['accuracy']:.4f} | "
        f"Val F1: {val_metrics['f1']:.4f}"
    )
    
    # 保存表现最好的模型（以F1分数为标准）
    if val_metrics['f1'] > best_val_f1 or val_metrics['loss'] < best_val_loss:
        best_val_f1 = val_metrics['f1']
        best_val_loss = val_metrics['loss']
        torch.save(fidelity_model.state_dict(), MODEL_SAVE_PATH)
        print(f"  -> New best model saved to {MODEL_SAVE_PATH} (F1: {best_val_f1:.4f})")

print("--- Training Finished ---")


Initializing model for training...
--- Starting Training ---
