# Notice
训练之前，请确保特征模型训练后得到的 **`feature_model_1dcnn.pth`** 和 **`scaler_50hz_torch.gz`** 这两个文件已存在于当前工作目录中。

# 导入依赖库，定义参数

In [1]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict
import re
import io
import joblib
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset



# --- Configuration ---
# Root folder of the MobiFall dataset (relative to the working directory).
DATASET_PATH = 'MobiFall_Dataset'

# Every sensor stream is resampled to 50 data points per second.
TARGET_SAMPLING_RATE_HZ = 50.0
# Sample period as a pandas offset string: 1000 ms / 50 Hz -> "20ms".
TARGET_SAMPLING_PERIOD = "{}ms".format(int(1000 / TARGET_SAMPLING_RATE_HZ))
# One window spans 4 seconds, i.e. 200 samples at 50 Hz.
SEQUENCE_LENGTH = int(4 * TARGET_SAMPLING_RATE_HZ)

# Sensors that every trial must provide, in the order their columns are combined.
SENSOR_CODES = ["acc", "gyro", "ori"]

# Raw value columns found in each sensor file (after the timestamp column).
EXPECTED_COLUMNS = dict(
    acc=["acc_x", "acc_y", "acc_z"],
    gyro=["gyro_x", "gyro_y", "gyro_z"],
    ori=["ori_azimuth", "ori_pitch", "ori_roll"],
)

# Final per-sample feature layout: raw axes plus the signal-magnitude column
# appended to the accelerometer and gyroscope groups.
ALL_FEATURE_COLUMNS = [
    *EXPECTED_COLUMNS["acc"], "acc_smv",
    *EXPECTED_COLUMNS["gyro"], "gyro_smv",
    *EXPECTED_COLUMNS["ori"],
]

# 第 1 步：处理原始数据

In [None]:
def load_and_resample_sensor_file(filepath, sensor_code):
    """Load one sensor file, index it by time and resample it to the target rate.

    The file is expected to contain an "@DATA" marker line followed by
    comma-separated rows whose first four columns are a nanosecond timestamp
    and three sensor values.

    Args:
        filepath: Path of the sensor text file.
        sensor_code: Key into EXPECTED_COLUMNS ('acc', 'gyro' or 'ori').

    Returns:
        A DataFrame resampled at TARGET_SAMPLING_PERIOD with gaps linearly
        interpolated (plus an '<sensor>_smv' magnitude column for 'acc' and
        'gyro'), or None when the file has no usable data or cannot be parsed.
    """
    try:
        with open(filepath, 'r') as handle:
            raw_lines = handle.readlines()

        # Position of the first line after the "@DATA" marker; -1 means "no marker".
        first_data_row = next(
            (pos + 1 for pos, text in enumerate(raw_lines)
             if text.strip().upper() == "@DATA"),
            -1,
        )
        if first_data_row == -1 or first_data_row >= len(raw_lines):
            return None

        # Everything after the marker is handed to pandas as one CSV payload.
        payload = "".join(raw_lines[first_data_row:])
        if not payload.strip():
            return None

        frame = pd.read_csv(io.StringIO(payload), header=None, usecols=[0, 1, 2, 3])
        if frame.empty:
            return None

        frame.columns = ['timestamp_ns'] + EXPECTED_COLUMNS[sensor_code]

        # Turn the nanosecond counter into a sorted DatetimeIndex.
        frame['timestamp'] = pd.to_datetime(frame['timestamp_ns'], unit='ns')
        frame = frame.set_index('timestamp').drop(columns=['timestamp_ns']).sort_index()

        # Average the irregular samples into fixed-period bins, then fill the
        # empty bins (including leading/trailing ones) by linear interpolation.
        binned = frame.resample(TARGET_SAMPLING_PERIOD).mean()
        df_resampled = binned.interpolate(method='linear', limit_direction='both')

        # Accelerometer and gyroscope get a signal-magnitude-vector column;
        # the orientation sensor does not.
        if sensor_code in ('acc', 'gyro'):
            axis_cols = [f"{sensor_code}_{axis}" for axis in ('x', 'y', 'z')]
            if all(col in df_resampled.columns for col in axis_cols):
                col_x, col_y, col_z = axis_cols
                df_resampled[f"{sensor_code}_smv"] = np.sqrt(
                    df_resampled[col_x]**2 + df_resampled[col_y]**2 + df_resampled[col_z]**2
                )

        return df_resampled

    except (pd.errors.EmptyDataError, ValueError):
        # Unparseable or empty payloads are treated as "no data" without noise.
        return None
    except Exception as err:
        print(f"Error processing file {filepath}: {err}. Skipping.")
        return None

def load_data_from_structured_folders(dataset_root_path):
    """Walk the dataset tree and build one combined feature array per trial.

    Only directories exactly three levels below the root are considered; the
    three path components are read as subject folder ("sub<N>"), category and
    activity code. Files must be named with four underscore-separated parts
    ending in ".txt", where the second part is the sensor code and the fourth
    the trial number.

    The directory walk is sorted so that the order of the returned trials is
    the same on every run and platform. This matters because later cells cache
    features derived from the concatenated trials on disk.

    Args:
        dataset_root_path: Root directory of the dataset.

    Returns:
        (processed_trials_data, labels): a list of NumPy arrays, one per trial
        with columns in ALL_FEATURE_COLUMNS order, and a parallel list of
        labels (1 when the trial's category folder is "FALLS", otherwise 0).
        Both lists are empty when the root directory does not exist.
    """
    print(f"Scanning for data in: {dataset_root_path}")
    if not os.path.isdir(dataset_root_path):
        print(f"ERROR: Dataset root path '{dataset_root_path}' not found.")
        return [], []

    # trial key -> {sensor code -> file path}
    trial_sensor_files_map = defaultdict(dict)
    # trial key -> {"category": ..., "activity_code": ...}
    trial_metadata_map = {}

    for dirpath, dirnames, filenames in os.walk(dataset_root_path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirnames.sort()

        relative_path = os.path.relpath(dirpath, dataset_root_path)
        path_parts = relative_path.split(os.sep)
        # Data files live exactly three levels deep: subject/category/activity.
        if len(path_parts) != 3:
            continue

        subject_match = re.fullmatch(r'sub(\d+)', path_parts[0], re.IGNORECASE)
        if not subject_match:
            continue
        subject_id = int(subject_match.group(1))
        category = path_parts[1].upper()
        activity_code = path_parts[2].upper()

        for filename in sorted(filenames):
            if not filename.endswith(".txt"):
                continue

            fname_parts = filename.replace('.txt', '').split('_')
            if len(fname_parts) != 4:
                continue

            _, sensor_code, _, trial_no_str = fname_parts
            sensor_code = sensor_code.lower()
            if sensor_code not in SENSOR_CODES:
                continue

            try:
                trial_no = int(trial_no_str)
            except ValueError:
                # A non-numeric trial number means the file does not follow
                # the expected naming scheme; ignore it.
                continue

            # A trial is identified by (subject, activity, trial number).
            trial_key = (subject_id, activity_code, trial_no)
            trial_sensor_files_map[trial_key][sensor_code] = os.path.join(dirpath, filename)
            trial_metadata_map.setdefault(
                trial_key, {"category": category, "activity_code": activity_code}
            )

    processed_trials_data, labels = [], []
    print(f"\nProcessing and combining {len(trial_sensor_files_map)} unique trials...")

    for trial_key, sensor_files in trial_sensor_files_map.items():
        # A trial is only usable when all three sensors were recorded.
        if not all(s_code in sensor_files for s_code in SENSOR_CODES):
            continue

        resampled_dfs = {
            s_code: load_and_resample_sensor_file(sensor_files[s_code], s_code)
            for s_code in SENSOR_CODES
        }
        if any(df is None or df.empty for df in resampled_dfs.values()):
            continue

        try:
            # --- Time alignment ---
            # Keep only the interval covered by all three sensors: from the
            # latest start to the earliest end.
            common_start = max(df.index.min() for df in resampled_dfs.values())
            common_end = min(df.index.max() for df in resampled_dfs.values())
            if common_start >= common_end:
                continue

            aligned_dfs = [
                resampled_dfs[s_code][common_start:common_end].reset_index(drop=True)
                for s_code in SENSOR_CODES
            ]
            # After cropping, all frames must be non-empty and equally long.
            if not all(len(df) > 0 and len(df) == len(aligned_dfs[0]) for df in aligned_dfs):
                continue

            # --- Combination ---
            # Concatenate side by side into one wide table.
            combined_df = pd.concat(aligned_dfs, axis=1)
            if len(combined_df.columns) != len(ALL_FEATURE_COLUMNS):
                continue
            combined_df.columns = ALL_FEATURE_COLUMNS

            # Trials shorter than one window cannot yield any sequence.
            if len(combined_df) < SEQUENCE_LENGTH:
                continue

            # --- Storage ---
            processed_trials_data.append(combined_df.values)
            labels.append(1 if trial_metadata_map[trial_key]["category"] == "FALLS" else 0)

        except Exception as exc:
            # Best effort: one bad trial must not abort the whole scan, but
            # report it so that dropped data does not go unnoticed.
            print(f"Skipping trial {trial_key}: {exc}")
            continue

    print(f"Successfully processed and combined sensor data for {len(processed_trials_data)} trials.")
    return processed_trials_data, labels

# def create_sequences(data_list, label_list, seq_length, step):
#     """使用滑动窗口从试验数据创建序列。"""
#     # 初始化用于存放最终序列和对应标签的列表
#     X, y = [], []
#     # 遍历每一次活动试验的数据
#     for i, trial_data in enumerate(data_list):
#         trial_label = label_list[i]
#         # 在单次试验数据上，按指定的步长（step）移动窗口
#         for j in range(0, len(trial_data) - seq_length + 1, step):
#             # 截取一个固定长度（seq_length）的片段作为序列
#             X.append(trial_data[j:(j + seq_length)])
#             # 为这个序列分配对应的标签
#             y.append(trial_label)
            
#     if not X: return np.array([]), np.array([])
#     # 将列表转换为Numpy数组后返回
#     return np.array(X), np.array(y)


# Load every usable trial: `trial_arrays` holds one feature array per trial and
# `trial_labels` the matching 0/1 labels (1 = category folder "FALLS").
trial_arrays, trial_labels = load_data_from_structured_folders(DATASET_PATH)
# x, y = create_sequences(trial_arrays, trial_labels, SEQUENCE_LENGTH, STEP)
# print(f"The shape of X: {x.shape}, The shape of y: {y.shape}")


## 创建连续的数据流

In [None]:
def create_continuous_stream(data_list: list, label_list: list) -> (np.ndarray, np.ndarray):
    """
    Concatenate all trials into one continuous stream with a per-sample label.

    Args:
        data_list (list): One NumPy array per trial; arrays are joined along
            their first axis.
        label_list (list): The label of each trial, indexed like `data_list`.

    Returns:
        continuous_data (np.ndarray): All trials stacked along the first axis.
        continuous_labels (np.ndarray): int32 array with one entry per row of
            `continuous_data`, holding the label of the trial that row came from.
        Two empty arrays are returned when `data_list` is empty.
    """
    if not data_list:
        return np.array([]), np.array([])

    # Repeat each trial's label once per time step of that trial, e.g. a
    # 491-step trial labelled 1 contributes an array of 491 ones.
    per_sample_labels = [
        np.full(segment.shape[0], label_list[position], dtype=np.int32)
        for position, segment in enumerate(data_list)
    ]

    stream = np.concatenate(data_list, axis=0)
    stream_labels = np.concatenate(per_sample_labels, axis=0)
    return stream, stream_labels

# Join all trials into a single sample stream with one label per sample.
continuous_data, continuous_labels = create_continuous_stream(trial_arrays, trial_labels)
print(f"Continuous data shape: {continuous_data.shape}")


# 截断数据流，仅保留十分之一（测试）

In [None]:
# Keep only the first tenth of both streams (quick-test mode).
# NOTE: re-running this cell truncates the already truncated streams again.
continuous_data = continuous_data[:len(continuous_data)//10]  # truncate the data stream (testing only)
continuous_labels = continuous_labels[:len(continuous_labels)//10]
print(f"Truncated continuous data shape: {continuous_data.shape}")

## 定义 1D-CNN 模型

In [None]:
class FeatureModel1DCNN(nn.Module):
    """1D-CNN over sensor windows: a convolutional feature extractor plus a classifier head.

    Args:
        input_channels: Number of feature columns per time step.
        num_classes: Size of the output layer (raw logits, no sigmoid).
        sequence_length: Number of time steps per input window. The first
            linear layer is sized from it, so the default of 200 reproduces
            the original 256 * 25 layout.

    Raises:
        ValueError: If `sequence_length` is too short to survive the three
            pooling stages.
    """

    def __init__(self, input_channels=11, num_classes=1, sequence_length=200):
        super(FeatureModel1DCNN, self).__init__()

        # Feature extractor: three conv -> ReLU -> batch-norm -> max-pool blocks.
        self.feature_extractor = nn.Sequential(
            # Block 1
            nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.MaxPool1d(kernel_size=2, stride=2),  # length halves, e.g. 200 -> 100

            # Block 2
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.MaxPool1d(kernel_size=2, stride=2),  # e.g. 100 -> 50

            # Block 3
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.MaxPool1d(kernel_size=2, stride=2)   # e.g. 50 -> 25
        )

        # Each pooling stage halves the length (rounding down), so three of
        # them leave floor(sequence_length / 8) time steps.
        pooled_length = sequence_length // 8
        if pooled_length < 1:
            raise ValueError(
                f"sequence_length must be at least 8, got {sequence_length}"
            )

        # Classifier: flattens the (256, pooled_length) feature map and maps
        # it to the output logits.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * pooled_length, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """
        Full forward pass used for training and evaluation.

        x has shape (batch_size, sequence_length, num_features), e.g. (N, 200, 11).
        Returns raw logits of shape (batch_size, num_classes).
        """
        # Conv1d expects (N, C, L), so move the feature axis in front of time.
        x = x.permute(0, 2, 1)

        features = self.feature_extractor(x)
        output = self.classifier(features)

        # No sigmoid here: the output is meant for a logits-based loss.
        return output

    def extract_features(self, x):
        """
        Run only the convolutional feature extractor.

        x has shape (batch_size, sequence_length, num_features), e.g. (N, 200, 11).
        Returns a tensor of shape (N, 256, sequence_length // 8), e.g. (N, 256, 25).
        """
        x = x.permute(0, 2, 1)
        features = self.feature_extractor(x)
        return features

# 第 2 步：生成连续的特征流和标签流

In [None]:
# --- 1. Configuration ---
# These values must stay consistent with how the feature model was trained.
WINDOW_SECONDS = 4

WINDOW_SIZE = SEQUENCE_LENGTH
STEP_SIZE = 25 # intended stride: 25 samples, i.e. one feature vector every 0.5 s
# NOTE(review): STEP_SIZE is not applied by the extraction loop below, which
# advances one sample at a time. Confirm the intended stride before changing
# it, because code that pairs features with raw windows by index relies on it.

MODEL_PATH = "feature_model_1dcnn.pth"
SCALER_PATH = "scaler_50hz_torch.gz"

# Skip this cell's work when both output files already exist.
if os.path.exists("all_features.npy") and os.path.exists("all_labels.npy"):
    print("all_features.npy 和 all_labels.npy 已存在，跳过特征提取步骤。")

else:
    print("all_features.npy 和 all_labels.npy 不存在，开始特征提取步骤。")
    # --- 2. Load the model and the scaler ---
    print("正在加载模型和标准化器...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Model: falls back to random weights (with a warning) when no checkpoint exists.
    model = FeatureModel1DCNN(input_channels=11, num_classes=1).to(device)
    if os.path.exists(MODEL_PATH):
        model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
        print(f"模型已从 {MODEL_PATH} 加载")
    else:
        print(f"警告: 在 {MODEL_PATH} 未找到模型文件。将使用随机初始化的模型。")
    model.eval() # evaluation mode

    # Scaler: mandatory, so a missing file aborts the cell.
    if os.path.exists(SCALER_PATH):
        scaler = joblib.load(SCALER_PATH)
        print(f"标准化器已从 {SCALER_PATH} 加载")
    else:
        raise FileNotFoundError(f"ERROR: Standard scaler file not found at '{SCALER_PATH}'. Cannot proceed without it.")


    # --- 3. Slide a window over the stream and extract features ---
    print("\n开始批量提取特征...")
    all_features_list = []
    all_labels_list = []

    # `continuous_data` / `continuous_labels` come from the earlier stream-building cell.
    for i in range(len(continuous_data) - WINDOW_SIZE + 1):
        # The window is labelled with the label of its first sample.
        trial_label = continuous_labels[i]

        window = continuous_data[i : i + WINDOW_SIZE]

        # Standardise the window and add a batch dimension of 1.
        scaled_window = scaler.transform(window)
        window_tensor = torch.tensor(scaled_window, dtype=torch.float32).unsqueeze(0).to(device)

        # Gradients are not needed for feature extraction.
        with torch.no_grad():
            features = model.extract_features(window_tensor)

        # Flatten the feature map into a single vector.
        flattened_features = features.cpu().numpy().flatten()
        all_features_list.append(flattened_features)

        all_labels_list.append(trial_label)

    print(f"处理完成！共处理了 {len(trial_arrays)} 次试验，生成了 {len(all_features_list)} 个特征向量。")

    # --- 4. Save the resulting dataset ---
    if all_features_list:
        final_features = np.array(all_features_list)
        final_labels = np.array(all_labels_list)

        np.save("all_features.npy", final_features)
        np.save("all_labels.npy", final_labels)

        print(f"\n数据集已成功保存:")
        print(f"  - 特征文件: all_features.npy, 形状: {final_features.shape}")
        print(f"  - 标签文件: all_labels.npy, 形状: {final_labels.shape}")

        # These arrays only exist on this branch, so they are released here;
        # deleting them unconditionally would raise NameError when no window
        # was produced.
        del final_features
        del final_labels
    else:
        print("\n未能生成任何特征，未创建文件。")

    # Release memory held by the extraction objects.
    del model
    del scaler
    del all_features_list
    del all_labels_list
    

## 加载 npy 文件

In [2]:
# --- 1. Configuration ---
HISTORY_LEN = 60    # feature vectors per history sequence (described as 30 s at 0.5 s spacing)
BATCH_SIZE = 32
AVAILABILITY_RATIO = 0.3

# --- 2. Load the precomputed feature stream ---
print("Loading foundational data streams...")
# TODO(user): load your real data here.
try:
    all_features = np.load("all_features.npy")
except FileNotFoundError:
    print("Error: `all_features.npy` not found.")
    # `exit()` is the interactive helper from the `site` module and is not
    # meant for program code; raising SystemExit stops execution explicitly
    # and reports a non-zero status.
    raise SystemExit(1)

Loading foundational data streams...


# 第 3 步：生成场景

## 生成器

In [None]:
class ContextualFidelityDataset_OnTheFly(Dataset):
    """
    Memory-efficient dataset that builds each training sample when it is requested.

    Sample `idx` consists of:
      * the `history_len` consecutive feature vectors ending at position
        `idx + history_len - 1` of `all_features`;
      * for each of those positions, either the raw window
        `continuous_raw_data[p : p + window_size]` (kept with probability
        `availability_ratio`, drawn from the global `random` module on every
        access) or an all-zero window of the same shape;
      * the label aligned with the last position.

    Feature position `p` is paired with the label at `p + window_size - 1` of
    `continuous_labels`.
    """

    def __init__(self, all_features, continuous_raw_data, continuous_labels,
                 window_size, history_len, availability_ratio):

        super().__init__()
        print("--- Initializing On-the-Fly Dataset ---")

        # 1. Store parameters.
        self.window_size = window_size
        self.history_len = history_len
        self.availability_ratio = availability_ratio

        # 2. Align the streams (done once).
        print("Aligning streams...")
        offset = self.window_size - 1

        self.aligned_lfs = all_features
        self.aligned_labels = continuous_labels[offset : offset + len(all_features)]

        # Only the raw stream itself is stored; windows are sliced on demand.
        self.continuous_raw_data = continuous_raw_data
        # Per-time-step shape of the raw stream (e.g. (11,)), used to size the
        # zero placeholders so they always stack with real windows.
        self._raw_sample_shape = tuple(np.shape(continuous_raw_data)[1:])

        self.num_features = len(self.aligned_lfs)
        print(f"Initialization complete. Total possible scenarios: {self.__len__()}")

    def __len__(self):
        # Number of full histories; never negative, so len() stays valid even
        # when there are fewer features than `history_len`.
        return max(0, self.num_features - self.history_len + 1)

    def __getitem__(self, idx):
        """Return (feature_sequence, imputed_raw_sequence, label) as float32 tensors.

        Raises:
            IndexError: If `idx` is outside the dataset (negative indices count
                from the end).
        """
        total = len(self)
        if idx < 0:
            idx += total
        if not 0 <= idx < total:
            raise IndexError(f"sample index out of range for dataset of size {total}")

        # 1. Last position of this sample's history in the aligned stream.
        t = idx + self.history_len - 1
        first = t - self.history_len + 1

        # 2. Low-fidelity history: consecutive feature vectors.
        feature_sequence = self.aligned_lfs[first : t + 1]

        # 3. Label of the final position.
        label = self.aligned_labels[t]

        # 4. High-fidelity history: real raw window or zero placeholder per step.
        hfs_history_list = []
        for position in range(first, t + 1):
            if random.random() < self.availability_ratio:
                raw_window = self.continuous_raw_data[position : position + self.window_size]
                hfs_history_list.append(raw_window)
            else:
                hfs_history_list.append(
                    np.zeros((self.window_size, *self._raw_sample_shape))
                )

        # 5. Stack into (history_len, window_size, ...).
        imputed_raw_sequence = np.stack(hfs_history_list, axis=0)

        # 6. Convert to tensors.
        return (
            torch.tensor(feature_sequence, dtype=torch.float32),
            torch.tensor(imputed_raw_sequence, dtype=torch.float32),
            torch.tensor(label, dtype=torch.float32)
        )



# # --- 3. 初始化“即时生成”数据集 ---
# full_dataset = ContextualFidelityDataset_OnTheFly(
#     all_features=all_features,
#     continuous_raw_data=continuous_data,
#     continuous_labels=continuous_labels,
#     window_size=WINDOW_SIZE,
#     history_len=HISTORY_LEN,
#     availability_ratio=AVAILABILITY_RATIO
# )

# # --- 4. 划分训练、验证集 (通过划分索引) ---
# print("\nSplitting data into training and validation sets by indices...")
# dataset_size = len(full_dataset)
# indices = list(range(dataset_size))
# labels_for_stratify = full_dataset.aligned_labels[HISTORY_LEN - 1:] # 获取用于分层的标签

# train_indices, val_indices = train_test_split(
#     indices, test_size=0.2, random_state=42, stratify=labels_for_stratify
# )

# # 使用 PyTorch 的 Subset 来创建基于索引的子数据集
# train_dataset = Subset(full_dataset, train_indices)
# val_dataset = Subset(full_dataset, val_indices)

# print(f"Training set size: {len(train_dataset)}")
# print(f"Validation set size: {len(val_dataset)}")

# # 创建DataLoader
# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=8)
# val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8)

# --- 5. 健全性检查 ---
# print("\n--- DataLoader Sanity Check ---")
# batch_feature_seq, batch_raw_seq, batch_labels = next(iter(train_loader))

# print(f"Feature sequence batch shape: {batch_feature_seq.shape}")
# print(f"Raw data sequence batch shape: {batch_raw_seq.shape}")
# print(f"Labels batch shape: {batch_labels.shape}")

# print("\nMemory-efficient dataset pipeline is ready for training.")

## 保存到本地

In [None]:
import math

def generate_and_save_scenarios_as_chunks(
    all_features, continuous_raw_data, continuous_labels,
    window_size, history_len, availability_ratio,
    chunk_size_gb=1.0, output_dir="precomputed_dataset", num_epochs_to_generate=1):
    """Pre-generate training samples and save them to disk in chunk files.

    For every epoch index a folder `<output_dir>/epoch_<i>` is created and
    filled with `chunk_<k>.pt` files. Each file is a list of
    (feature_sequence, imputed_raw_sequence, label) float32 tensor tuples
    written with `torch.save`. Every epoch draws its own random availability
    mask from the global `random` module.

    Args:
        all_features: Feature stream; position p corresponds to the raw window
            `continuous_raw_data[p : p + window_size]`.
        continuous_raw_data: Raw sample stream.
        continuous_labels: Per-sample labels; feature position p uses the
            label at `p + window_size - 1`.
        window_size: Number of raw samples per window.
        history_len: Number of consecutive feature positions per sample.
        availability_ratio: Probability that a raw window is kept instead of
            being replaced by an all-zero window.
        chunk_size_gb: Approximate target size of one chunk file.
        output_dir: Directory that receives the epoch folders.
        num_epochs_to_generate: Number of independently randomised copies.
    """
    print("--- Starting Chunked Dataset Generation ---")

    # Align labels with the feature stream.
    offset = window_size - 1
    aligned_labels = continuous_labels[offset : offset + len(all_features)]
    num_features = len(all_features)

    # Per-step sizes are read from the inputs instead of being hard-coded, so
    # the size estimate and the zero placeholders match the actual data.
    feature_dim = int(np.prod(np.shape(all_features)[1:]))
    raw_sample_shape = tuple(np.shape(continuous_raw_data)[1:])
    raw_dim = int(np.prod(raw_sample_shape))

    # Approximate float32 size of one sample: raw history + feature history.
    sample_size_bytes = (history_len * window_size * raw_dim * 4) + (history_len * feature_dim * 4)
    sample_size_mb = sample_size_bytes / (1024**2)

    # Samples per chunk; at least one so a tiny chunk budget still makes progress.
    chunk_size_mb = chunk_size_gb * 1024
    if sample_size_mb > 0:
        samples_per_chunk = max(1, math.floor(chunk_size_mb / sample_size_mb))
    else:
        samples_per_chunk = 1
    print(f"Each sample is ~{sample_size_mb:.2f} MB. Each chunk will contain {samples_per_chunk} samples.")

    # One independent, freshly randomised dataset per epoch.
    for epoch_idx in range(num_epochs_to_generate):
        epoch_dir = os.path.join(output_dir, f"epoch_{epoch_idx}")
        os.makedirs(epoch_dir, exist_ok=True)
        print(f"\nGenerating data for Epoch {epoch_idx}...")

        scenarios_chunk = []
        chunk_idx = 0

        # `t` is the last feature position of each sample's history.
        for t in range(history_len - 1, num_features):
            first = t - history_len + 1
            feature_sequence = all_features[first : t + 1]

            hfs_history_list = []
            for position in range(first, t + 1):
                if random.random() < availability_ratio:
                    raw_window = continuous_raw_data[position : position + window_size]
                    hfs_history_list.append(raw_window)
                else:
                    # Zero placeholder with the same shape as a real window.
                    hfs_history_list.append(np.zeros((window_size, *raw_sample_shape)))

            imputed_raw_sequence = np.stack(hfs_history_list, axis=0)
            label = aligned_labels[t]

            scenarios_chunk.append((
                torch.tensor(feature_sequence, dtype=torch.float32),
                torch.tensor(imputed_raw_sequence, dtype=torch.float32),
                torch.tensor(label, dtype=torch.float32)
            ))

            # Flush a full chunk to disk and start the next one.
            if len(scenarios_chunk) >= samples_per_chunk:
                chunk_path = os.path.join(epoch_dir, f"chunk_{chunk_idx}.pt")
                print(f"  Saving {len(scenarios_chunk)} samples to {chunk_path}...")
                torch.save(scenarios_chunk, chunk_path)

                scenarios_chunk = []
                chunk_idx += 1

        # Write the last, partially filled chunk.
        if scenarios_chunk:
            chunk_path = os.path.join(epoch_dir, f"chunk_{chunk_idx}.pt")
            print(f"  Saving final {len(scenarios_chunk)} samples to {chunk_path}...")
            torch.save(scenarios_chunk, chunk_path)

    print("\nDataset generation complete.")

## 执行生成并保存到本地

In [9]:
all_features = np.load("all_features.npy")
num_features = len(all_features)

# Number of independently randomised epoch datasets to pre-generate.
NUM_PRECOMPUTED_EPOCHS = 1
# Output location (drive D:).
BASE_DATA_DIR = "D:/MobiFall_Precomputed"
os.makedirs(BASE_DATA_DIR, exist_ok=True)

# --- Run the generator ---
# History length and availability ratio come from the shared configuration
# constants so this cell cannot drift from the rest of the pipeline.
generate_and_save_scenarios_as_chunks(
    all_features, continuous_data, continuous_labels,
    window_size=WINDOW_SIZE, history_len=HISTORY_LEN, availability_ratio=AVAILABILITY_RATIO,
    chunk_size_gb=1.0,
    num_epochs_to_generate=NUM_PRECOMPUTED_EPOCHS,
    output_dir=BASE_DATA_DIR
)

NameError: name 'generate_and_save_scenarios_as_chunks' is not defined

## 读取数据块的 Dataset 类

In [3]:
class PrecomputedChunkDataset(Dataset):
    """Dataset over the `chunk_<k>.pt` files of one pre-generated epoch folder.

    Chunk files are ordered by the integer in their name. During construction
    every chunk is loaded once to count its samples; afterwards only the most
    recently requested chunk is kept in memory.
    """

    def __init__(self, epoch_dir):
        print(f"Initializing dataset from precomputed chunks in {epoch_dir}...")
        self.epoch_dir = epoch_dir

        def chunk_number(file_name):
            # "chunk_12.pt" -> 12
            return int(file_name.split('_')[1].split('.')[0])

        pt_names = [name for name in os.listdir(epoch_dir) if name.endswith('.pt')]
        pt_names.sort(key=chunk_number)
        self.chunk_paths = [os.path.join(epoch_dir, name) for name in pt_names]

        # Global sample index -> (chunk index, index inside that chunk).
        # Counting requires loading each chunk file once.
        self.index_map = []
        self.chunk_sizes = []
        for chunk_no, chunk_path in enumerate(self.chunk_paths):
            sample_count = len(torch.load(chunk_path))
            self.chunk_sizes.append(sample_count)
            self.index_map.extend((chunk_no, pos) for pos in range(sample_count))

        self._len = sum(self.chunk_sizes)

        # Single-entry cache of the last chunk read, to avoid re-reading the
        # same file for consecutive requests.
        self.last_loaded_chunk_idx = -1
        self.cached_chunk = None
        print(f"Found {len(self.chunk_paths)} chunks, total {self._len} samples.")

    def __len__(self):
        return self._len

    def __getitem__(self, idx):
        wanted_chunk, position = self.index_map[idx]

        # Load from disk only when the request falls outside the cached chunk.
        if self.last_loaded_chunk_idx != wanted_chunk:
            self.cached_chunk = torch.load(self.chunk_paths[wanted_chunk])
            self.last_loaded_chunk_idx = wanted_chunk

        return self.cached_chunk[position]


## 时间分布CNN
它的作用是将一个作用于单个样本的模块（如CNN）应用到一个序列中的每一个元素上

In [4]:
class TimeDistributed(nn.Module):
    """Apply a per-sample module to every time step of a 4-D sequence batch.

    The input (B, T, L, F) is reordered to (B, T, F, L), folded to
    (B * T, F, L), passed through the wrapped module, and the result is
    unfolded back to (B, T, output_features). In this file the input is
    described as (batch_size, 60, 200, 11).
    """

    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        n_batch = x.size(0)
        n_steps = x.size(1)

        # Swap the last two axes so each step is laid out channels-first,
        # e.g. (B, 60, 200, 11) -> (B, 60, 11, 200).
        channels_first = x.permute(0, 1, 3, 2)

        # Fold batch and time into one axis: (B * T, F, L).
        folded = channels_first.reshape(
            n_batch * n_steps, channels_first.size(2), channels_first.size(3)
        )

        # The wrapped module sees an ordinary batch of B * T samples.
        per_step_output = self.module(folded)

        # Unfold back to (B, T, output_features).
        return per_step_output.view(n_batch, n_steps, per_step_output.size(-1))

## 交叉注意力模块


In [5]:
class CrossAttention(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    Queries are projected from `query_dim`, keys and values from `key_dim`,
    all into `hidden_dim`. Scores are scaled by hidden_dim ** -0.5.
    """

    def __init__(self, query_dim, key_dim, hidden_dim):
        super().__init__()
        self.query_layer = nn.Linear(query_dim, hidden_dim)
        self.key_layer = nn.Linear(key_dim, hidden_dim)
        self.value_layer = nn.Linear(key_dim, hidden_dim)
        self.scale = hidden_dim ** -0.5

    def forward(self, query, key, value):
        """Attend from `query` over `key`/`value`.

        query: (Batch, SeqLen, query_dim); key and value: (Batch, SeqLen, key_dim).
        Returns a context tensor of shape (Batch, SeqLen, hidden_dim).
        """
        projected_q = self.query_layer(query)
        projected_k = self.key_layer(key)
        projected_v = self.value_layer(value)

        # Similarity of every query step with every key step, scaled, then
        # normalised over the key axis.
        scores = (projected_q @ projected_k.transpose(-2, -1)) * self.scale
        weights = scores.softmax(dim=-1)

        # Weighted sum of the projected values.
        return weights @ projected_v

# 模型定义

In [6]:
def create_raw_data_cnn():
    """Build the 1D-CNN that encodes one raw sensor window into a flat vector.

    Three identical stages (Conv1d k=3 'same' -> ReLU -> BatchNorm1d ->
    MaxPool1d 2/2) widen the channels 11 -> 64 -> 128 -> 256, followed by a
    Flatten layer.
    """
    channel_plan = [11, 64, 128, 256]

    layers = []
    for n_in, n_out in zip(channel_plan[:-1], channel_plan[1:]):
        layers += [
            nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(n_out),
            nn.MaxPool1d(kernel_size=2, stride=2),
        ]
    layers.append(nn.Flatten())

    return nn.Sequential(*layers)


class ContextualFidelityModel(nn.Module):
    """Two-stream model fusing a feature history with a sparse raw-window history.

    * The raw-window stream is encoded step by step with the CNN from
      `create_raw_data_cnn`, wrapped in `TimeDistributed`.
    * The feature stream is encoded by a 2-layer LSTM.
    * Cross-attention lets the LSTM states query the encoded raw windows.
    * The LSTM states and attention context are concatenated, run through a
      1-layer LSTM, and the last time step is classified.

    Args:
        feature_dim: Size of each feature vector in the feature stream.
        lstm_hidden_dim: Hidden size of both LSTMs and of the attention space.
        raw_cnn_output_dim: Size of the flattened CNN output per raw window.
        num_classes: Number of output logits.
    """

    def __init__(self, feature_dim, lstm_hidden_dim, raw_cnn_output_dim, num_classes=1):
        super().__init__()

        # --- Branch 1: high-fidelity raw-window encoder ---
        self.hfs_processor = TimeDistributed(create_raw_data_cnn())

        # --- Branch 2: low-fidelity feature-stream encoder ---
        self.lfs_processor = nn.LSTM(
            input_size=feature_dim,
            hidden_size=lstm_hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.5,
        )

        # --- Fusion: queries from the LSTM, keys/values from the CNN ---
        self.cross_attention = CrossAttention(
            query_dim=lstm_hidden_dim,
            key_dim=raw_cnn_output_dim,
            hidden_dim=lstm_hidden_dim,
        )

        # --- Post-fusion LSTM over [LSTM state, attention context] ---
        self.post_fusion_processor = nn.LSTM(
            input_size=2 * lstm_hidden_dim,
            hidden_size=lstm_hidden_dim,
            num_layers=1,
            batch_first=True,
        )

        # --- Classification head ---
        self.classifier = nn.Sequential(
            nn.Linear(lstm_hidden_dim, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        )

    def forward(self, feature_sequence, imputed_raw_sequence):
        """Return (logits, state_feature).

        feature_sequence: (B, T, feature_dim), e.g. (B, 60, 6400).
        imputed_raw_sequence: (B, T, window, channels), e.g. (B, 60, 200, 11).
        logits: (B, num_classes); state_feature: the final hidden state of the
        post-fusion LSTM with its layer axis squeezed, (B, lstm_hidden_dim).
        """
        # 1. Encode both streams.
        lfs_states, _ = self.lfs_processor(feature_sequence)      # (B, T, hidden)
        hfs_embeddings = self.hfs_processor(imputed_raw_sequence)  # (B, T, raw_cnn_output_dim)

        # 2. Feature-stream states query the encoded raw windows.
        context = self.cross_attention(
            query=lfs_states,
            key=hfs_embeddings,
            value=hfs_embeddings,
        )                                                          # (B, T, hidden)

        # 3. Concatenate per time step.
        fused = torch.cat((lfs_states, context), dim=-1)           # (B, T, 2 * hidden)

        # 4. Post-fusion processing; classify from the last time step.
        fused_states, (final_hidden, _) = self.post_fusion_processor(fused)
        logits = self.classifier(fused_states[:, -1, :])

        # The single-layer LSTM's final hidden state doubles as a state feature.
        state_feature = final_hidden.squeeze(0)

        return logits, state_feature

## 训练与评估函数模块

In [7]:
def train_one_epoch(model, dataloader, criterion, optimizer, device, noise_level=0.0):
    """Train `model` for one pass over `dataloader` and return the mean loss.

    Each batch must unpack into (feature_seq, imputed_raw_seq, labels); the
    model is called as `model(feature_seq, imputed_raw_seq)` and must return
    `(logits, extra)`. Labels get a trailing axis added before the loss.

    `noise_level` is accepted for interface compatibility but is not used by
    this function.

    Returns:
        Sum of per-batch losses weighted by batch size, divided by
        `len(dataloader.dataset)`.
    """
    model.train()
    loss_sum = 0.0

    for batch in dataloader:
        feature_seq, imputed_raw_seq, labels = batch

        features_dev = feature_seq.to(device)
        raw_dev = imputed_raw_seq.to(device)
        # (B,) -> (B, 1) so the targets match the logits' shape.
        targets = labels.to(device).unsqueeze(1)

        optimizer.zero_grad()

        # Only the logits are needed here; the second output is ignored.
        logits, _ = model(features_dev, raw_dev)

        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so the final value is a per-sample mean.
        loss_sum += batch_loss.item() * features_dev.size(0)

    return loss_sum / len(dataloader.dataset)

def evaluate(model, dataloader, criterion, device):
    """Evaluate `model` on `dataloader` and return loss and binary metrics.

    Each batch must unpack into (feature_seq, imputed_raw_seq, labels); the
    model is called as `model(feature_seq, imputed_raw_seq)` and must return
    `(logits, extra)`. A prediction is positive when sigmoid(logit) > 0.5.

    Returns:
        Dict with keys "loss" (per-sample mean), "accuracy", "precision",
        "recall" and "f1" (binary averaging, zero_division=0).
    """
    model.eval()
    total_loss = 0.0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for feature_seq, imputed_raw_seq, labels in dataloader:
            feature_seq = feature_seq.to(device)
            imputed_raw_seq = imputed_raw_seq.to(device)
            # (B,) -> (B, 1) so the targets match the logits for the loss.
            labels_for_loss = labels.to(device).unsqueeze(1)

            # Only the logits are needed; the second output is ignored.
            logits, _ = model(feature_seq, imputed_raw_seq)

            loss = criterion(logits, labels_for_loss)
            total_loss += loss.item() * feature_seq.size(0)

            # Flatten to 1-D integer predictions so the metric functions get
            # plain per-sample targets instead of a list of shape-(1,) bool
            # arrays next to scalar labels.
            preds = (torch.sigmoid(logits) > 0.5).reshape(-1).int()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.reshape(-1).cpu().numpy())

    avg_loss = total_loss / len(dataloader.dataset)
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary', zero_division=0)

    return {
        "loss": avg_loss,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# 第 6 步：训练

In [None]:
# --- 1. Configuration ---
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
HISTORY_SEQ_LEN = 60  # 30 seconds of history
# BATCH_SIZE is defined in an earlier cell.
LEARNING_RATE = 0.0005
EPOCHS = 10
MODEL_SAVE_PATH = "fidelity_model_best.pth"
NOISE_LEVEL = 0.25


# --- 2. Model, loss and optimizer ---
print("\nInitializing model for training...")
fidelity_model = ContextualFidelityModel(
    feature_dim=6400,
    lstm_hidden_dim=256,
    raw_cnn_output_dim=6400
).to(DEVICE)

# BCEWithLogitsLoss applies the sigmoid internally, which is numerically more stable.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(fidelity_model.parameters(), lr=LEARNING_RATE)

# Best validation results so far; used to decide when to checkpoint.
best_val_f1 = 0.0
best_val_loss = float('inf')



# --- 3. Training loop ---
for epoch in range(EPOCHS):
    # Cycle through the pre-generated epoch folders.
    epoch_to_load = epoch % NUM_PRECOMPUTED_EPOCHS
    epoch_data_dir = os.path.join(BASE_DATA_DIR, f"epoch_{epoch_to_load}")

    print(f"\n===== Epoch {epoch+1}/{EPOCHS} =====")
    print(f"Loading data from precomputed version: epoch_{epoch_to_load}")

    # a. Open the chunked dataset for this epoch.
    full_dataset = PrecomputedChunkDataset(epoch_data_dir)

    # b. Split sample indices into train / validation.
    # The split is a plain random split (no stratification); the fixed seed
    # yields the same index split every epoch for a given dataset size.
    dataset_size = len(full_dataset)
    indices = list(range(dataset_size))
    train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=42)

    # c. Index-based subsets.
    train_dataset = Subset(full_dataset, train_indices)
    val_dataset = Subset(full_dataset, val_indices)

    print(f"Training set size: {len(train_dataset)}, Validation set size: {len(val_dataset)}")

    # d. Data loaders: pinned memory only when CUDA is available, four
    # worker processes for loading.
    use_pinned_memory = torch.cuda.is_available()
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=4,
        pin_memory=use_pinned_memory
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=4,
        pin_memory=use_pinned_memory
    )

    # e. Train and evaluate.
    # NOTE(review): NOISE_LEVEL is 0.25 but the literal 0.1 is passed here;
    # confirm which value is intended.
    print("Starting training for this epoch...")
    train_loss = train_one_epoch(fidelity_model, train_loader, criterion, optimizer, DEVICE, noise_level=0.1)
    val_metrics = evaluate(fidelity_model, val_loader, criterion, DEVICE)

    print(f"Epoch {epoch+1} results: Train Loss: {train_loss:.4f}, Val F1: {val_metrics['f1']:.4f}")

    # f. Checkpoint the best model: higher validation F1 wins, ties are
    # broken by lower validation loss (so the first epoch always saves).
    improved_f1 = val_metrics['f1'] > best_val_f1
    same_f1_lower_loss = (
        val_metrics['f1'] == best_val_f1 and val_metrics['loss'] < best_val_loss
    )
    if improved_f1 or same_f1_lower_loss:
        best_val_f1 = val_metrics['f1']
        best_val_loss = val_metrics['loss']
        torch.save(fidelity_model.state_dict(), MODEL_SAVE_PATH)
        print(f"New best model saved to {MODEL_SAVE_PATH} (Val F1: {best_val_f1:.4f}, Val Loss: {best_val_loss:.4f})")

# Printed once, after the last epoch.
print("\n--- Training Finished ---")


Initializing model for training...

===== Epoch 1/10 =====
Loading data from precomputed version: epoch_0
Initializing dataset from precomputed chunks in D:/MobiFall_Precomputed\epoch_0...
