# 注意
请确保 SensorDataSequences.npy 和 SensorLabelSequences.npy 均存在：数据的维度应为 (x, 200, 11)，标签的第一维长度必须与数据的序列数 x 一致（例如 (x,)）。
生成部分可以参见 fidelity_model_train_new.ipynb

In [4]:
import torch
import torch.nn as nn
import numpy as np
import os
import glob

# Model hyperparameters. They are passed to FeatureExtractor below, which
# forwards them to Encoder; the original notes state that the model definition
# must match the training code exactly.
INPUT_DIM = 11  # features per time step (last axis of the (x, 200, 11) data)
HIDDEN_DIM = 64  # LSTM hidden size; also the length of each extracted feature vector
N_LAYERS = 2  # number of stacked LSTM layers
MODEL_PATH = 'autoregression_feature_extractor_model.pt'  # checkpoint read with torch.load
DROP_OUT = 0.1  # dropout value handed to nn.LSTM

In [5]:
# 注意：这里的模型定义需要和训练时的代码保持完全一致

class Encoder(nn.Module):
    """Batch-first multi-layer LSTM that returns only its final states.

    The attribute name ``lstm`` determines the state-dict keys
    (``lstm.weight_ih_l0`` and so on), so it has to stay as it is for the
    trained weights to load.
    """

    def __init__(self, input_dim, hidden_dim, n_layers, dropout):
        """Build the LSTM.

        Args:
            input_dim: Number of features per time step.
            hidden_dim: Size of the LSTM hidden state.
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout value passed straight to ``nn.LSTM``.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout,
        )

    def forward(self, x):
        """Run ``x`` through the LSTM and return ``(hidden, cell)``.

        The per-step outputs are discarded; only the final hidden and cell
        states, each shaped ``(n_layers, batch_size, hidden_dim)``, are
        returned.
        """
        _, final_states = self.lstm(x)
        hidden, cell = final_states
        return hidden, cell
        
# --- 核心：特征提取器类 ---

class FeatureExtractor:
    """Turn sensor sequences into feature vectors using a trained encoder.

    Only the encoder part of a saved Seq2Seq checkpoint is loaded; the final
    hidden state of its last LSTM layer is used as the feature vector.
    """

    def __init__(self, model_path, input_dim, hidden_dim, n_layers, dropout=0.0, seq_len=200):
        """Build the encoder and load its weights from ``model_path``.

        Args:
            model_path: Path of the saved state dict of the full model. Keys
                belonging to the encoder must be prefixed with ``encoder.``.
            input_dim: Number of features per time step.
            hidden_dim: LSTM hidden size (length of the extracted feature).
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout value forwarded to the encoder's LSTM.
            seq_len: Number of time steps every input sequence must have.
                Defaults to 200, the window length used by this notebook.

        Raises:
            RuntimeError: If the checkpoint holds no ``encoder.`` keys, or if
                the filtered weights do not fit the encoder
                (raised by ``load_state_dict``).
        """
        # Remembered so extract_feature validates against the real model
        # configuration instead of hard-coded numbers.
        self.input_dim = input_dim
        self.seq_len = seq_len

        # Prefer the GPU when one is available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"FeatureExtractor is using device: {self.device}")

        # 1. Instantiate the encoder that will receive the weights.
        self.encoder = Encoder(input_dim, hidden_dim, n_layers, dropout).to(self.device)

        # 2. Load the state dict of the complete trained model.
        # NOTE(review): torch.load may unpickle arbitrary objects; only load
        # checkpoints that come from a trusted source.
        full_state_dict = torch.load(model_path, map_location=self.device)

        # 3. Keep only the encoder weights and strip the "encoder." prefix,
        #    e.g. 'encoder.lstm.weight_ih_l0' -> 'lstm.weight_ih_l0'.
        prefix = 'encoder.'
        encoder_state_dict = {
            key[len(prefix):]: value
            for key, value in full_state_dict.items()
            if key.startswith(prefix)
        }
        if not encoder_state_dict:
            # Fail with a clear message instead of the long "missing keys"
            # error load_state_dict would produce for an empty dict.
            raise RuntimeError(
                f"No keys starting with '{prefix}' found in checkpoint {model_path}"
            )

        # 4. Load the filtered weights into the encoder.
        self.encoder.load_state_dict(encoder_state_dict)

        print(f"Successfully loaded encoder weights from {model_path}")

        # 5. Evaluation mode (disables dropout).
        self.encoder.eval()

    def extract_feature(self, sequence_data):
        """Extract the feature vector of a single sequence.

        Args:
            sequence_data (np.ndarray): Sensor data of shape
                ``(seq_len, input_dim)`` as configured in the constructor
                (``(200, 11)`` with this notebook's settings).

        Returns:
            np.ndarray: Feature vector of shape ``(hidden_dim,)``.

        Raises:
            ValueError: If the input is not a numpy array of the expected shape.
        """
        # --- Input validation ---
        expected_shape = (self.seq_len, self.input_dim)
        if not isinstance(sequence_data, np.ndarray) or sequence_data.shape != expected_shape:
            raise ValueError(f"Input data must be a numpy array of shape {expected_shape}")

        # --- Feature extraction ---
        with torch.no_grad():  # no gradients are needed for inference
            # 1. Numpy array -> float32 tensor on the model's device.
            input_tensor = torch.tensor(sequence_data, dtype=torch.float32).to(self.device)

            # 2. Add the batch axis: the batch-first LSTM expects
            #    (batch_size, seq_len, input_dim).
            input_tensor = input_tensor.unsqueeze(0)

            # 3. Forward pass; hidden_state is (n_layers, batch_size, hidden_dim).
            hidden_state, _ = self.encoder(input_tensor)

            # 4. The last layer's hidden state is the feature: (1, hidden_dim).
            feature_vector_tensor = hidden_state[-1, :, :]

            # 5. Drop the batch axis and convert back to numpy.
            feature_vector_tensor = feature_vector_tensor.squeeze(0)
            return feature_vector_tensor.cpu().numpy()

In [6]:
# Paths of the complete data set and its labels
FULL_DATA_PATH = 'SensorDataSequences.npy'
FULL_LABEL_PATH = 'SensorLabelSequences.npy'

# Number of sequences written to each output file
BATCH_SIZE = 256

# Directory that receives the per-batch output files
OUTPUT_DIR = 'extracted_features'

# Create the output directory if it does not exist yet
# (exist_ok guards against it appearing between the check and the call).
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Created output directory: {OUTPUT_DIR}")

# --- 2. Initialise the feature extractor ---
print("\n--- Initializing Feature Extractor ---")
extractor = FeatureExtractor(
    model_path=MODEL_PATH,
    input_dim=INPUT_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout=DROP_OUT
)

# --- 3. Load the data and extract the features one sequence at a time ---
print("\n--- Loading data and extracting features ---")
all_sequences = np.load(FULL_DATA_PATH)
all_labels = np.load(FULL_LABEL_PATH)

# Data and labels must describe the same number of sequences. This is an
# explicit raise rather than an assert so the check survives `python -O`.
num_sequences = all_sequences.shape[0]
if num_sequences != all_labels.shape[0]:
    raise ValueError(
        f"数据和标签的数量不匹配! 数据有 {num_sequences} 个, 标签有 {all_labels.shape[0]} 个。"
    )

print(f"Loaded data with shape: {all_sequences.shape}")
print(f"Loaded labels with shape: {all_labels.shape}")

all_features = []
for i, sequence in enumerate(all_sequences):
    feature = extractor.extract_feature(sequence)
    all_features.append(feature)

    # Progress report every 500 sequences and after the last one
    if (i + 1) % 500 == 0 or (i + 1) == num_sequences:
        print(f"Processed {i + 1}/{num_sequences} sequences...")

# Stack the per-sequence vectors into one array
all_features_np = np.array(all_features)
print(f"\nFeature extraction complete. Final features array shape: {all_features_np.shape}")

# --- 4. Save features and labels in batches ---
print(f"\n--- Saving features and labels into batches of size {BATCH_SIZE} ---")
num_batches = (num_sequences + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

for i in range(num_batches):
    start_idx = i * BATCH_SIZE
    end_idx = min((i + 1) * BATCH_SIZE, num_sequences)

    # Features and labels are sliced with the same bounds so rows stay aligned
    batch_features = all_features_np[start_idx:end_idx]
    batch_labels = all_labels[start_idx:end_idx]

    # Save the feature batch
    output_feature_filename = os.path.join(OUTPUT_DIR, f'features_batch_{i}.npy')
    np.save(output_feature_filename, batch_features)

    # Save the matching label batch
    output_label_filename = os.path.join(OUTPUT_DIR, f'labels_batch_{i}.npy')
    np.save(output_label_filename, batch_labels)

    print(f"Saved batch {i+1}/{num_batches}: features {batch_features.shape}, labels {batch_labels.shape}")


--- Initializing Feature Extractor ---
FeatureExtractor is using device: cuda
Successfully loaded encoder weights from autoregression_feature_extractor_model.pt

--- Loading data and extracting features ---
Loaded data with shape: (9491, 200, 11)
Loaded labels with shape: (9491,)
Processed 500/9491 sequences...
Processed 1000/9491 sequences...
Processed 1500/9491 sequences...
Processed 2000/9491 sequences...
Processed 2500/9491 sequences...
Processed 3000/9491 sequences...
Processed 3500/9491 sequences...
Processed 4000/9491 sequences...
Processed 4500/9491 sequences...
Processed 5000/9491 sequences...
Processed 5500/9491 sequences...
Processed 6000/9491 sequences...
Processed 6500/9491 sequences...
Processed 7000/9491 sequences...
Processed 7500/9491 sequences...
Processed 8000/9491 sequences...
Processed 8500/9491 sequences...
Processed 9000/9491 sequences...
Processed 9491/9491 sequences...

Feature extraction complete. Final features array shape: (9491, 64)

--- Saving features an

In [7]:
# Directory holding the per-batch files
BATCH_DIR = 'extracted_features'
# File name of the merged features
OUTPUT_FEATURES_FILE = 'all_features.npy'
# File name of the merged labels
OUTPUT_LABELS_FILE = 'all_labels.npy'


def _batch_suffix(path):
    """Return the text after the last underscore of the file stem (the batch index)."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return stem.rsplit('_', 1)[-1]


def _batch_sort_key(path):
    """Sort key: numeric batch index first; non-numeric suffixes go last, by name."""
    suffix = _batch_suffix(path)
    if suffix.isdecimal():
        return (0, int(suffix), '')
    return (1, 0, suffix)


# Collect the batch files ordered by their numeric index. A plain string sort
# would place 'features_batch_10.npy' before 'features_batch_2.npy' and the
# merged rows would no longer follow the order of the source data.
feature_files = sorted(
    glob.glob(os.path.join(BATCH_DIR, 'features_batch_*.npy')),
    key=_batch_sort_key,
)
label_files = sorted(
    glob.glob(os.path.join(BATCH_DIR, 'labels_batch_*.npy')),
    key=_batch_sort_key,
)

if not feature_files or not label_files:
    print(f"错误：在文件夹 '{BATCH_DIR}' 中找不到任何分批文件。")
    # Raised directly so the abort does not depend on the interactive
    # `exit` helper being available.
    raise SystemExit(1)

print(f"找到 {len(feature_files)} 个特征文件和 {len(label_files)} 个标签文件。")

# Every feature batch needs the label batch with the same index; otherwise
# the merged rows would be misaligned.
feature_ids = [_batch_suffix(f) for f in feature_files]
label_ids = [_batch_suffix(f) for f in label_files]
if feature_ids != label_ids:
    raise ValueError(
        f"Feature and label batch files in '{BATCH_DIR}' do not match: "
        f"features {feature_ids}, labels {label_ids}"
    )

# Load and concatenate all features
all_features = [np.load(f) for f in feature_files]
consolidated_features = np.concatenate(all_features, axis=0)

# Load and concatenate all labels
all_labels = [np.load(f) for f in label_files]
consolidated_labels = np.concatenate(all_labels, axis=0)

# Save the merged arrays
np.save(OUTPUT_FEATURES_FILE, consolidated_features)
np.save(OUTPUT_LABELS_FILE, consolidated_labels)

print("\n合并完成！")
print(f"  - 特征文件已保存: {OUTPUT_FEATURES_FILE}, 形状: {consolidated_features.shape}")
print(f"  - 标签文件已保存: {OUTPUT_LABELS_FILE}, 形状: {consolidated_labels.shape}")

找到 38 个特征文件和 38 个标签文件。

合并完成！
  - 特征文件已保存: all_features.npy, 形状: (9491, 64)
  - 标签文件已保存: all_labels.npy, 形状: (9491,)
