# 数据获取与处理

### 导入库与全局配置

In [1]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict
import re
import io
import joblib

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# --- Configuration ---
# Root folder that is scanned for sensor files.
DATASET_PATH = 'MobiFall_Dataset'
TARGET_SAMPLING_RATE_HZ = 50.0  # Target sampling rate in Hz
# Resampling period as a pandas offset string ("20ms" at 50 Hz).
TARGET_SAMPLING_PERIOD = f"{int(1000 / TARGET_SAMPLING_RATE_HZ)}ms"
WINDOW_SECOND = 2  # Window duration in seconds
WINDOW_SIZE = int(TARGET_SAMPLING_RATE_HZ * WINDOW_SECOND)  # 100 samples for 2 seconds at 50Hz
STEP_SECONDS = 1 # 1-second step
STEP = int(TARGET_SAMPLING_RATE_HZ * STEP_SECONDS)          # 50 samples for 1 second step at 50Hz


# Sensor codes recognised in file names; also the order in which the
# per-sensor tables are concatenated.
SENSOR_CODES = ["acc", "gyro", "ori"]
# Names given to the three value columns of each sensor file.
EXPECTED_COLUMNS = {
    "acc": ["acc_x", "acc_y", "acc_z"],
    "gyro": ["gyro_x", "gyro_y", "gyro_z"],
    "ori": ["ori_azimuth", "ori_pitch", "ori_roll"]
}
# Column names of the combined per-trial table: the raw axes of each sensor
# plus the derived signal-magnitude (SMV) columns for acc and gyro.
ALL_FEATURE_COLUMNS = [
    "acc_x", "acc_y", "acc_z", "acc_smv",
    "gyro_x", "gyro_y", "gyro_z", "gyro_smv",
    "ori_azimuth", "ori_pitch", "ori_roll"
]

### 数据加载与预处理函数


In [2]:
def load_and_resample_sensor_file(filepath, sensor_code):
    """Load one sensor file and resample it onto a regular time grid.

    Everything after the first ``@DATA`` marker line is parsed as header-less
    CSV; the first four columns are taken as a nanosecond timestamp followed
    by the three sensor axes named in ``EXPECTED_COLUMNS[sensor_code]``.

    Args:
        filepath: Path of the sensor text file.
        sensor_code: Key into ``EXPECTED_COLUMNS`` ("acc", "gyro" or "ori").

    Returns:
        A DataFrame indexed by timestamp with one row per
        ``TARGET_SAMPLING_PERIOD`` (bin means, gaps filled by linear
        interpolation in both directions). For "acc" and "gyro" an extra
        ``acc_smv`` / ``gyro_smv`` magnitude column is added. ``None`` is
        returned when the file has no usable data or cannot be processed.
    """
    try:
        with open(filepath, 'r') as handle:
            raw_lines = handle.readlines()

        # Index of the first row after the "@DATA" marker (None if absent).
        first_data_row = next(
            (pos + 1 for pos, text in enumerate(raw_lines)
             if text.strip().upper() == "@DATA"),
            None,
        )
        if first_data_row is None or first_data_row >= len(raw_lines):
            return None

        payload = "".join(raw_lines[first_data_row:])
        if not payload.strip():
            return None

        frame = pd.read_csv(io.StringIO(payload), header=None, usecols=[0, 1, 2, 3])
        if frame.empty:
            return None

        frame.columns = ['timestamp_ns'] + EXPECTED_COLUMNS[sensor_code]

        # Index the rows by a real datetime built from the nanosecond stamps,
        # drop the raw stamp column and put the rows in chronological order.
        frame = (
            frame.assign(timestamp=pd.to_datetime(frame['timestamp_ns'], unit='ns'))
            .set_index('timestamp')
            .drop(columns=['timestamp_ns'])
            .sort_index()
        )

        # Average the samples falling into each period, then fill empty
        # periods by linear interpolation (both directions).
        binned = frame.resample(TARGET_SAMPLING_PERIOD).mean()
        regular = binned.interpolate(method='linear', limit_direction='both')

        # Only the accelerometer and gyroscope get a magnitude column.
        smv_axes = {
            'acc': ('acc_x', 'acc_y', 'acc_z'),
            'gyro': ('gyro_x', 'gyro_y', 'gyro_z'),
        }.get(sensor_code)
        if smv_axes is not None and all(axis in regular.columns for axis in smv_axes):
            x_axis, y_axis, z_axis = smv_axes
            regular[f'{sensor_code}_smv'] = np.sqrt(
                regular[x_axis]**2 + regular[y_axis]**2 + regular[z_axis]**2
            )

        return regular

    except (pd.errors.EmptyDataError, ValueError):
        return None
    except Exception as e:
        print(f"Error processing file {filepath}: {e}. Skipping.")
        return None

def load_data_from_structured_folders(dataset_root_path):
    """Scan the dataset tree and build one aligned feature matrix per trial.

    Only directories exactly three levels below ``dataset_root_path`` are
    read; their path is interpreted as ``sub<N>/<category>/<activity>``.
    Files must end in ``.txt`` and have four underscore-separated name parts,
    of which the second is the sensor code and the fourth the trial number.

    A trial is kept only if all sensors in ``SENSOR_CODES`` are present, load
    successfully, overlap in time, line up to equal length and span at least
    ``WINDOW_SIZE`` samples. Trials that fail during alignment are reported
    and skipped.

    Args:
        dataset_root_path: Root directory of the dataset.

    Returns:
        ``(processed_trials_data, labels)``: a list of 2-D arrays whose
        columns follow ``ALL_FEATURE_COLUMNS`` and a parallel list of labels
        (1 when the category folder is "FALLS", otherwise 0). Both lists are
        empty when the root directory does not exist.
    """
    print(f"Scanning for data in: {dataset_root_path}")
    if not os.path.isdir(dataset_root_path):
        print(f"ERROR: Dataset root path '{dataset_root_path}' not found.")
        return [], []

    # trial key -> {sensor code -> file path}
    trial_sensor_files_map = defaultdict(dict)
    # trial key -> {"category": ..., "activity_code": ...}
    trial_metadata_map = {}

    for dirpath, _, filenames in os.walk(dataset_root_path):
        path_parts = os.path.relpath(dirpath, dataset_root_path).split(os.sep)
        # Data files live exactly three levels below the root.
        if len(path_parts) != 3:
            continue

        # Directory-level metadata is the same for every file in the folder,
        # so it is parsed once per directory rather than once per file.
        subject_match = re.fullmatch(r'sub(\d+)', path_parts[0], re.IGNORECASE)
        if not subject_match:
            continue
        subject_id = int(subject_match.group(1))
        category = path_parts[1].upper()
        activity_code = path_parts[2].upper()

        for filename in filenames:
            if not filename.endswith(".txt"):
                continue

            fname_parts = filename.replace('.txt', '').split('_')
            if len(fname_parts) != 4:
                continue

            _, sensor_code, _, trial_no_str = fname_parts
            sensor_code = sensor_code.lower()
            if sensor_code not in SENSOR_CODES:
                continue

            try:
                trial_no = int(trial_no_str)
            except ValueError:
                # Not a numeric trial number: not a data file we understand.
                continue

            # A trial is identified by (subject, activity, trial number).
            trial_key = (subject_id, activity_code, trial_no)
            trial_sensor_files_map[trial_key][sensor_code] = os.path.join(dirpath, filename)
            if trial_key not in trial_metadata_map:
                trial_metadata_map[trial_key] = {"category": category, "activity_code": activity_code}

    processed_trials_data, labels = [], []
    print(f"\nProcessing and combining {len(trial_sensor_files_map)} unique trials...")

    # os.walk order depends on the filesystem; iterating in sorted key order
    # makes the returned trial order (and anything seeded downstream)
    # reproducible across machines.
    for trial_key in sorted(trial_sensor_files_map):
        sensor_files = trial_sensor_files_map[trial_key]

        # Every sensor must be present for the trial to be usable.
        if not all(s_code in sensor_files for s_code in SENSOR_CODES):
            continue

        resampled_dfs = {
            s_code: load_and_resample_sensor_file(sensor_files[s_code], s_code)
            for s_code in SENSOR_CODES
        }
        if any(df is None or df.empty for df in resampled_dfs.values()):
            continue

        try:
            # --- Time alignment ---
            # Keep only the interval covered by all sensors: from the latest
            # start to the earliest end.
            common_start = max(df.index.min() for df in resampled_dfs.values())
            common_end = min(df.index.max() for df in resampled_dfs.values())
            if common_start >= common_end:
                continue

            aligned_dfs = [
                resampled_dfs[s_code].loc[common_start:common_end].reset_index(drop=True)
                for s_code in SENSOR_CODES
            ]
            # After cropping, all tables must be non-empty and equally long.
            if not all(len(df) > 0 and len(df) == len(aligned_dfs[0]) for df in aligned_dfs):
                continue

            # --- Merge ---
            # Column-wise concatenation in SENSOR_CODES order.
            combined_df = pd.concat(aligned_dfs, axis=1)
            if len(combined_df.columns) != len(ALL_FEATURE_COLUMNS):
                continue
            combined_df.columns = ALL_FEATURE_COLUMNS

            # Too short to yield even a single window of WINDOW_SIZE samples.
            if len(combined_df) < WINDOW_SIZE:
                continue

            # --- Store data and label ---
            processed_trials_data.append(combined_df.values)
            labels.append(1 if trial_metadata_map[trial_key]["category"] == "FALLS" else 0)

        except Exception as exc:
            # Best effort: one broken trial must not abort the whole scan,
            # but it should not disappear without a trace either.
            print(f"Error combining trial {trial_key}: {exc}. Skipping.")
            continue

    print(f"Successfully processed and combined sensor data for {len(processed_trials_data)} trials.")
    return processed_trials_data, labels

def create_sequences(data_list, label_list, seq_length, step):
    """Cut every trial into fixed-length sliding windows.

    Args:
        data_list: Sequence of per-trial arrays (time along the first axis).
        label_list: Label of each trial, indexed like ``data_list``.
        seq_length: Number of samples in each window.
        step: Offset between the starts of consecutive windows.

    Returns:
        ``(X, y)`` as NumPy arrays: the stacked windows and one label per
        window (the label of the trial it came from). Two empty arrays are
        returned when no window could be cut.
    """
    windows, window_labels = [], []
    for trial_idx, samples in enumerate(data_list):
        label = label_list[trial_idx]
        # Start offsets of all windows that fit completely inside the trial.
        starts = range(0, len(samples) - seq_length + 1, step)
        windows.extend(samples[begin:begin + seq_length] for begin in starts)
        window_labels.extend([label] * len(starts))

    if len(windows) == 0:
        return np.array([]), np.array([])
    return np.array(windows), np.array(window_labels)

### 加载和创建序列


In [3]:
# Build the windowed dataset once and cache it as .npy files for later runs.
# NOTE: the cache is looked up by file name only. Delete the .npy files after
# changing WINDOW_SIZE, STEP or the preprocessing, otherwise stale windows are
# loaded.
SensorDataSequences, SensorLabelSequences = np.array([]), np.array([])

if os.path.exists('SensorDataSequences.npy') and os.path.exists('SensorLabelSequences.npy'):
    print("Found existing npy files. Loading...")
    SensorDataSequences = np.load('SensorDataSequences.npy')
    print(f"Loaded dataset shape: X={SensorDataSequences.shape}")
    SensorLabelSequences = np.load('SensorLabelSequences.npy')
    print(f"Loaded dataset shape: y={SensorLabelSequences.shape}")
else:
    trial_arrays, trial_labels = load_data_from_structured_folders(DATASET_PATH)
    SensorDataSequences, SensorLabelSequences = create_sequences(trial_arrays, trial_labels, WINDOW_SIZE, STEP)
    print(f"The shape of the final dataset is: X={SensorDataSequences.shape}, y={SensorLabelSequences.shape}")
    if SensorDataSequences.size > 0:
        np.save('SensorDataSequences.npy', SensorDataSequences)
        np.save('SensorLabelSequences.npy', SensorLabelSequences)
        print("Saved processed dataset to npy files.")
    else:
        # Caching an empty result would make every later run load the empty
        # arrays instead of rescanning the dataset.
        print("WARNING: no sequences were produced; nothing was saved.")

Scanning for data in: MobiFall_Dataset

Processing and combining 627 unique trials...
Successfully processed and combined sensor data for 627 trials.
The shape of the final dataset is: X=(10745, 100, 11), y=(10745,)
Saved processed dataset to npy files.


# 特征模型

### 训练数据集准备

In [4]:
# PyTorch Dataset 类

class FallDetectionDataset(Dataset):
    """Map-style dataset pairing each sequence with its label.

    Items are converted to ``float32`` tensors when they are accessed.
    """

    def __init__(self, sequences, labels):
        """Store the indexable sequences and their parallel labels."""
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` at ``idx`` as float32 tensors."""
        window_tensor = torch.tensor(self.sequences[idx], dtype=torch.float32)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.float32)
        return window_tensor, label_tensor

# 2. Split into training and test sets (75% / 25%, stratified on the labels,
#    fixed seed).
# NOTE(review): the split is done on individual windows. If windows of the same
# trial overlap (step smaller than window length), parts of one trial can land
# in both sets - confirm whether a per-trial split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    SensorDataSequences, SensorLabelSequences, test_size=0.25, random_state=42, stratify=SensorLabelSequences
)
print(f"Train set: {X_train.shape}; Test set: {X_test.shape}")


# 3. Standardize the features
# The scaler works on 2-D input, so the windows are flattened to
# (rows, features), scaled, and reshaped back. It is fitted on the training
# data only and then applied unchanged to the test data.
scaler = StandardScaler()
X_train_reshaped = X_train.reshape(-1, X_train.shape[2])
X_train_scaled_reshaped = scaler.fit_transform(X_train_reshaped)
X_train = X_train_scaled_reshaped.reshape(X_train.shape)

X_test_reshaped = X_test.reshape(-1, X_test.shape[2])
X_test_scaled_reshaped = scaler.transform(X_test_reshaped)
X_test = X_test_scaled_reshaped.reshape(X_test.shape)

# Save the fitted scaler
scaler_save_path = "scaler_50hz_torch.gz"
joblib.dump(scaler, scaler_save_path)
print(f"Data scaled and scaler saved to {scaler_save_path}")


# 4. Create the Dataset and DataLoader objects
train_dataset = FallDetectionDataset(X_train, y_train)
test_dataset = FallDetectionDataset(X_test, y_test)

# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

Train set: (8058, 100, 11); Test set: (2687, 100, 11)
Data scaled and scaler saved to scaler_50hz_torch.gz


### PyTorch 模型定义

In [5]:
class FeatureModel1DCNN(nn.Module):
    """1-D CNN with a convolutional feature extractor and a dense classifier.

    Args:
        input_channels: Number of feature channels per time step.
        num_classes: Width of the output layer (raw logits, no sigmoid).
        sequence_length: Number of time steps per input window. It fixes the
            input width of the classifier, so it must match the data the
            model is used with.
    """

    def __init__(self, input_channels=11, num_classes=1, sequence_length=200):
        super().__init__()

        # Feature extractor: three Conv -> ReLU -> BatchNorm -> MaxPool blocks.
        self.feature_extractor = nn.Sequential(
            # Block 1
            nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.MaxPool1d(kernel_size=2, stride=2),  # Length: L -> L/2

            # Block 2
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.MaxPool1d(kernel_size=2, stride=2),  # Length: L/2 -> L/4

            # Block 3
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding='same'),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.MaxPool1d(kernel_size=2, stride=2),  # Length: L/4 -> L/8
        )

        # --- Infer the classifier input size with a dummy forward pass ---
        # The pass runs in eval mode on purpose: in train mode the BatchNorm
        # layers would update their running statistics (and
        # num_batches_tracked) from the all-zero dummy input, even under
        # no_grad, leaving a freshly built model with polluted buffers.
        was_training = self.feature_extractor.training
        self.feature_extractor.eval()
        with torch.no_grad():
            dummy_input = torch.zeros(1, input_channels, sequence_length)
            # Batch size is 1, so numel() is the flattened size per sample.
            flattened_size = self.feature_extractor(dummy_input).numel()
        self.feature_extractor.train(was_training)

        # Classifier: maps the flattened features to the output logits.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened_size, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """Run the full model.

        Args:
            x: Tensor of shape (batch_size, sequence_length, num_features).

        Returns:
            Raw logits of shape (batch_size, num_classes). No sigmoid is
            applied, so the output suits ``BCEWithLogitsLoss``.
        """
        return self.classifier(self.extract_features(x))

    def extract_features(self, x):
        """Return the output of the convolutional feature extractor only.

        Args:
            x: Tensor of shape (batch_size, sequence_length, num_features).

        Returns:
            Tensor of shape (batch_size, 256, L/8), L being the input length
            (integer division at each pooling step).
        """
        # Conv1d expects (N, C, L), so move the feature axis in front of time.
        x = x.permute(0, 2, 1)
        return self.feature_extractor(x)

### 训练模型

In [6]:
# Select the device: GPU when available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Hyperparameters
INPUT_CHANNELS = 11
NUM_CLASSES = 1 # binary classification (a single logit)
LEARNING_RATE = 0.001
# NOTE(review): BATCH_SIZE is not referenced anywhere in this cell - confirm
# whether the DataLoaders are meant to use it.
BATCH_SIZE = 64
EPOCHS = 20 # adjust as needed

# Instantiate the model, loss function and optimizer
# sequence_length is passed explicitly so the classifier input size is derived
# from WINDOW_SIZE rather than from the constructor default.
model = FeatureModel1DCNN(
    input_channels=INPUT_CHANNELS, 
    num_classes=NUM_CLASSES, 
    sequence_length=WINDOW_SIZE
).to(device)
# BCEWithLogitsLoss applies the sigmoid internally, which is more stable
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- 3. Training loop ---
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    for sequences, labels in train_loader:
        sequences, labels = sequences.to(device), labels.to(device)
        
        # Reset the gradients
        optimizer.zero_grad()
        
        # Forward pass; labels get a trailing axis to match the model output
        outputs = model(sequences)
        loss = criterion(outputs, labels.unsqueeze(1))
        
        # Backward pass and optimizer step
        loss.backward()
        optimizer.step()
        
        # Weight the batch loss by the batch size so the epoch value below is
        # a per-sample average.
        running_loss += loss.item() * sequences.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {epoch_loss:.4f}")

print("Finished Training")

Using device: cuda
Epoch 1/20, Loss: 0.1310
Epoch 2/20, Loss: 0.0752
Epoch 3/20, Loss: 0.0563
Epoch 4/20, Loss: 0.0731
Epoch 5/20, Loss: 0.0478
Epoch 6/20, Loss: 0.0425
Epoch 7/20, Loss: 0.0416
Epoch 8/20, Loss: 0.0418
Epoch 9/20, Loss: 0.0384
Epoch 10/20, Loss: 0.0364
Epoch 11/20, Loss: 0.0309
Epoch 12/20, Loss: 0.0247
Epoch 13/20, Loss: 0.0264
Epoch 14/20, Loss: 0.0438
Epoch 15/20, Loss: 0.0293
Epoch 16/20, Loss: 0.0413
Epoch 17/20, Loss: 0.0317
Epoch 18/20, Loss: 0.0280
Epoch 19/20, Loss: 0.0180
Epoch 20/20, Loss: 0.0204
Finished Training


### 评估模型

In [7]:
# Evaluate on the test loader: eval mode, no gradient tracking.
model.eval()
all_labels = []
all_preds = []
with torch.no_grad():
    for sequences, labels in test_loader:
        sequences, labels = sequences.to(device), labels.to(device)
        outputs = model(sequences)
        
        # Turn the logits into probabilities, then threshold at 0.5 to get the
        # predicted class (False/True)
        preds = torch.sigmoid(outputs) > 0.5
        
        # Collect labels and predictions of every batch on the CPU
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

# Metrics over the collected test predictions ('binary' averaging).
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

print(f"\nTest Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-Score: {f1:.4f}")


Test Accuracy: 0.9929
Test Precision: 0.9895
Test Recall: 0.9775
Test F1-Score: 0.9835


### 提取并保存特征


In [8]:
# Extract the intermediate features for every batch of the test loader.
model.eval()
all_features = []
with torch.no_grad():
    for sequences, _ in test_loader:
        sequences = sequences.to(device)
        
        # Use the dedicated method that stops after the feature extractor
        features = model.extract_features(sequences)
        
        # Move the features back to the CPU and collect them
        all_features.append(features.cpu().numpy())

# Concatenate the per-batch features into one large numpy array
# Note: the last batch may be smaller than the others; vstack joins along the
# first axis, so that is fine
features_array = np.vstack(all_features)
print(f"Shape of the extracted features array: {features_array.shape}")

# Save the features to a file (currently disabled)
# features_save_path = "extracted_features.npy"
# np.save(features_save_path, features_array)
# print(f"Features saved to {features_save_path}")

# Save the model weights (state_dict only)
model_save_path = "feature_model_1dcnn.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model state saved to {model_save_path}")

Shape of the extracted features array: (2687, 256, 12)
Model state saved to feature_model_1dcnn.pth
