# CNN和RNN

## CNN示例

### 卷积操作

In [None]:
import torch
import torch.nn as nn

# Define a conv layer: 1 input channel, 16 output channels, 3x3 kernel, stride 1, padding 1
conv_layer = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)

# Input: one single-channel 90x90 image, shape (1, 1, 90, 90)
input_image = torch.randn(1, 1, 90, 90)

# Forward pass
output = conv_layer(input_image)

# Output shape: (1, 16, 90, 90) -- 16 feature maps of 90x90 (padding=1 keeps the size for a 3x3 kernel)
print(f"示例1(3x3卷积核，填充为1，步长为1)：{output.shape}")

# Example 2: effect of the stride -> floor((90 + 2*1 - 3) / 2) + 1 = 45, i.e. (1, 16, 45, 45)
conv_layer = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1)
output = conv_layer(input_image)
print(f"示例2(3x3卷积核，填充为1，步长为2)：{output.shape}")

# Example 3: effect of the kernel size -> 90 + 2*1 - 5 + 1 = 88, i.e. (1, 16, 88, 88)
conv_layer = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=1)
output = conv_layer(input_image)
print(f"示例3(5x5卷积核，填充为1，步长为1)：{output.shape}")

# Example 4: kernel size and stride combined -> floor((90 + 2*1 - 5) / 2) + 1 = 44, i.e. (1, 16, 44, 44)
conv_layer = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=2, padding=1)
output = conv_layer(input_image)
print(f"示例4(5x5卷积核，填充为1，步长为2)：{output.shape}")

### 池化操作示例

In [None]:
import torch
import torch.nn as nn

# Define 2x2 max-pooling and average-pooling layers, both with stride 2
max_pool_layer = nn.MaxPool2d(kernel_size=2, stride=2)
avg_pool_layer = nn.AvgPool2d(kernel_size=2, stride=2)

# Input: one 8x8 feature map with 4 channels, shape (1, 4, 8, 8)
input_feature = torch.randn(1, 4, 8, 8)

# Forward pass
max_pool_output = max_pool_layer(input_feature)
avg_pool_output = avg_pool_layer(input_feature)

# Output shape: (1, 4, 4, 4) -- channel count unchanged, spatial size halved (8 -> 4)
print(f"示例1(2x2最大化池化核，填充为0，步长为2)：{max_pool_output.shape}")
print(f"最大池化后的特征图：\n{max_pool_output}")
print(f"\n示例2(2x2平均池化核，填充为0，步长为2)：{avg_pool_output.shape}")
print(f"平均池化后的特征图：\n{avg_pool_output}")

### 可视化卷积核和卷积操作

In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# Seed the RNG for reproducibility
torch.manual_seed(42)

# Load the MNIST training set as tensors
transform = transforms.ToTensor()
mnist_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
data_loader = torch.utils.data.DataLoader(dataset=mnist_dataset, batch_size=1, shuffle=True)

# Grab one sample image from the shuffled loader
image, label = next(iter(data_loader))  # shape: (1, 1, 28, 28)

# Conv layer: 1 input channel, 4 output channels, 3x3 kernel (weights are randomly initialised)
conv_layer = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, stride=1, padding=1)

# Forward pass; no gradients are needed for visualisation
with torch.no_grad():
    feature_maps = conv_layer(image)  # shape: (1, 4, 28, 28)

# Kernel weights of the conv layer
weights = conv_layer.weight  # shape: (4, 1, 3, 3)

# Visualise the kernel weights (row 1) and the feature maps (row 2) in a 2x4 grid
plt.figure(figsize=(12, 6))

# Row 1: the four 3x3 kernels
for i in range(4):
    plt.subplot(2, 4, i + 1)
    plt.title(f'Kernel {i+1}')
    plt.imshow(weights[i, 0].detach().numpy(), cmap='gray')
    plt.axis('off')

# Row 2: the four 28x28 feature maps
for i in range(4):
    plt.subplot(2, 4, i + 5)
    plt.title(f'Feature Map {i+1}')
    plt.imshow(feature_maps[0, i].detach().numpy(), cmap='gray')
    plt.axis('off')

# tight_layout() only acts on the current figure, so lay out the grid
# before the next plt.figure() call makes a different figure current.
plt.tight_layout()

# Show the input image in its own figure
plt.figure(figsize=(4, 4))
plt.title('Input Image')
plt.imshow(image[0, 0].detach().numpy(), cmap='gray')
plt.axis('off')

plt.tight_layout()
plt.show()

### 可视化卷积核和卷积操作示例

**手动设置卷积核：**
- 设置了4个特定卷积核：
  - 核1：垂直边缘检测（Sobel算子）。
  - 核2：水平边缘检测（Sobel算子）。
  - 核3：拉普拉斯算子，增强边缘和细节。
  - 核4：均值模糊，平滑图像。
- 这些核使特征图更直观，易于理解卷积效果（如突出边缘或模糊区域）。

**ReLU激活：**
- 在卷积后应用nn.ReLU()，将负值置为0，为特征图引入非线性，使灰度可视化更清晰（负响应被截断为0，归一化后显示为黑色，正响应更加突出）。

**特征图归一化：**
- 对每个特征图进行归一化到[0, 1]，避免Matplotlib自动归一化导致的不一致显示

**预期输出**

- 第1个子图：原始MNIST图像（固定为第一张，可能是数字“5”），标题显示其标签。
- 第2-5个子图：4个3×3卷积核的灰度图：
  - 核1：垂直边缘检测，突出纵向边缘。
  - 核2：水平边缘检测，突出横向边缘。
  - 核3：拉普拉斯算子，突出细节和边缘。
  - 核4：均值模糊，显示平滑效果。
- 第7-10个子图：4个28×28特征图，对应各卷积核的输出（2×5网格中的第6个位置留空）：
  - 特征图经过ReLU和归一化，白色表示高激活，黑色表示低激活（或零）。
  - 垂直/水平边缘核会突出图像中的边缘，模糊核会平滑图像，拉普拉斯核会增强细节。

In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# Seed the RNG for reproducibility
torch.manual_seed(42)

# Load the MNIST training set
transform = transforms.ToTensor()  # convert to a tensor scaled to [0, 1]
mnist_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)

# Always take the first training image (index 0) so the figure is reproducible
image, label = mnist_dataset[0]  # shape: (1, 28, 28)
image = image.unsqueeze(0)  # add the batch dimension -> (1, 1, 28, 28)

# Conv layer: 1 input channel, 4 output channels, 3x3 kernel, stride 1, padding 1
conv_layer = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, stride=1, padding=1)

# Overwrite the kernel weights by hand so each channel extracts a known feature
with torch.no_grad():
    conv_layer.weight[0] = torch.tensor([[[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]]])  # vertical edges (Sobel)
    conv_layer.weight[1] = torch.tensor([[[1., 2., 1.], [0., 0., 0.], [-1., -2., -1.]]])  # horizontal edges (Sobel)
    conv_layer.weight[2] = torch.tensor([[[0., 1., 0.], [1., -4., 1.], [0., 1., 0.]]])  # Laplacian
    conv_layer.weight[3] = torch.tensor([[[1., 1., 1.], [1., 1., 1.], [1., 1., 1.]]]) / 9.0  # mean blur
    conv_layer.bias.fill_(0)  # zero bias so the output is the pure filter response

# ReLU activation
relu = nn.ReLU()

# Forward pass; no gradients are needed for visualisation
with torch.no_grad():
    feature_maps = relu(conv_layer(image))  # shape: (1, 4, 28, 28)

# Kernel weights of the conv layer
weights = conv_layer.weight  # shape: (4, 1, 3, 3)

# Visualisation: everything goes into a single 2x5 grid
plt.figure(figsize=(15, 6))

# Slot 1: the input image
plt.subplot(2, 5, 1)
plt.title(f'Input Image (Label: {label})')
plt.imshow(image[0, 0].detach().numpy(), cmap='gray')
plt.axis('off')

# Slots 2-5: the four 3x3 kernels
for i in range(4):
    plt.subplot(2, 5, i + 2)
    plt.title(f'Kernel {i+1}')
    plt.imshow(weights[i, 0].detach().numpy(), cmap='gray')
    plt.axis('off')

# Slots 7-10: the four 28x28 feature maps (slot 6 stays empty so each map sits below its kernel)
for i in range(4):
    plt.subplot(2, 5, i + 7)
    plt.title(f'Feature Map {i+1}')
    feature_map = feature_maps[0, i].detach().numpy()
    # Min-max normalise to [0, 1]; the epsilon avoids division by zero on a constant map
    feature_map = (feature_map - feature_map.min()) / (feature_map.max() - feature_map.min() + 1e-8)
    plt.imshow(feature_map, cmap='gray')
    plt.axis('off')

plt.tight_layout()
plt.show()

### 池化操作示例

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# 1. Load the MNIST training set
transform = transforms.ToTensor()
mnist_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
data_loader = torch.utils.data.DataLoader(mnist_dataset, batch_size=1, shuffle=True)

# 2. Grab one image
image, label = next(iter(data_loader))
image = image.squeeze().numpy()  # shape goes from [1, 1, 28, 28] to [28, 28]

# 3. Apply max pooling and average pooling
image_tensor = torch.tensor(image).unsqueeze(0).unsqueeze(0)  # back to shape [1, 1, 28, 28]

# Max pooling
max_pool = torch.nn.MaxPool2d(kernel_size=2, stride=2)
max_pooled_image = max_pool(image_tensor).squeeze().numpy()  # shape [14, 14]

# Average pooling
avg_pool = torch.nn.AvgPool2d(kernel_size=2, stride=2)
avg_pooled_image = avg_pool(image_tensor).squeeze().numpy()  # shape [14, 14]

# 4. Visualise the three images side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Original image
axes[0].imshow(image, cmap='gray')
axes[0].set_title(f'Original Image (Label: {label.item()})')
axes[0].axis('off')

# Max-pooled image
axes[1].imshow(max_pooled_image, cmap='gray')
axes[1].set_title('Max Pooled Image (2x2)')
axes[1].axis('off')

# Average-pooled image
axes[2].imshow(avg_pooled_image, cmap='gray')
axes[2].set_title('Average Pooled Image (2x2)')
axes[2].axis('off')

plt.tight_layout()
plt.show()

### 单层卷积与多层连续卷积对比

用于对比：
- 一卷一池（OneConv-OnePool）
- 三卷一池（ThreeConv-OnePool）

In [None]:
# 第一步：导入依赖库
import torch
import torch.nn as nn
import torchvision.transforms as T
import torchvision.models as models
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import requests
from io import BytesIO

# Step 2 (alternative, disabled): read an image from the web
# NOTE(review): this URL looks like a Wikipedia page rather than a direct image file -- confirm before enabling
#url = "https://zh.wikipedia.org/zh-cn/%E7%8C%AB#/media/File:Cat_November_2010-1a.jpg"
#response = requests.get(url)
#img = Image.open(BytesIO(response.content)).convert("RGB")

# Step 2: read a local image and force 3 RGB channels
img = Image.open("./computational_graph.png").convert("RGB")

# Resize to 64x64 and convert to a tensor
transform = T.Compose([
    T.Resize((64, 64)),
    T.ToTensor(),
])
input_tensor = transform(img).unsqueeze(0)  # Shape: (1, 3, 64, 64)

# Step 3: define the two CNN structures to compare
# Structure A: one convolution followed by one pooling layer
class OneConvOnePool(nn.Module):
    """Conv2d(3 -> 16, 3x3, padding 1) -> ReLU -> MaxPool2d(2)."""

    def __init__(self):
        super().__init__()
        conv = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        stages = [conv, nn.ReLU(), nn.MaxPool2d(2)]
        self.features = nn.Sequential(*stages)

    def forward(self, x):
        """Return the pooled feature maps produced by ``self.features``."""
        feature_maps = self.features(x)
        return feature_maps

# Structure B: three stacked convolutions followed by one pooling layer
class ThreeConvOnePool(nn.Module):
    """Three Conv2d(3x3, padding 1) + ReLU pairs, then a single MaxPool2d(2)."""

    def __init__(self):
        super().__init__()
        layers = []
        in_channels = 3
        for _ in range(3):
            layers.append(nn.Conv2d(in_channels, 16, kernel_size=3, padding=1))
            layers.append(nn.ReLU())
            in_channels = 16  # every conv after the first maps 16 -> 16 channels
        layers.append(nn.MaxPool2d(2))
        self.features = nn.Sequential(*layers)

    def forward(self, x):
        """Return the pooled feature maps produced by ``self.features``."""
        feature_maps = self.features(x)
        return feature_maps

# Step 4: run both (untrained, randomly initialised) models on the same input
model_one = OneConvOnePool()
model_three = ThreeConvOnePool()

# No gradients are needed; the outputs are only visualised
with torch.no_grad():
    feat_one = model_one(input_tensor)
    feat_three = model_three(input_tensor)

# Step 5: plotting helper
def plot_feature_maps(features, title, num_channels=6):
    """Plot the first channels of sample 0 side by side in one row.

    Args:
        features: tensor indexed as ``features[0, i]`` to obtain the 2-D map of
            channel ``i`` of the first sample.
        title: text used as the figure's super-title.
        num_channels: maximum number of channels to draw. It is clamped to the
            number of channels actually present, so asking for more than
            ``features.shape[1]`` no longer raises an IndexError.
    """
    shown = min(num_channels, features.shape[1])
    plt.figure(figsize=(12, 4))
    for i in range(shown):
        plt.subplot(1, shown, i + 1)
        plt.imshow(features[0, i].cpu().numpy(), cmap='viridis')
        plt.axis("off")
    plt.suptitle(title)
    plt.show()

# Step 6: visualise the first 6 feature maps of each model
plot_feature_maps(feat_one, "OneConvOnePool：前6个特征图")
plot_feature_maps(feat_three, "ThreeConvOnePool：前6个特征图")

### 展平层示例

#### torch.flatten()

In [None]:
import torch

# Pretend a batch of 4 images went through conv + pooling and produced
# 64 feature maps of size 7x7 per image.
feature_map = torch.randn(4, 64, 7, 7)
print(f"原始特征图形状: {feature_map.shape}")

# Flatten everything from dimension 1 onwards; the batch dimension is kept.
flattened_feature = feature_map.flatten(start_dim=1)
print(f"展平后特征形状: {flattened_feature.shape}")
# Each sample now has 64 * 7 * 7 = 3136 features.
print(f"每个样本的特征总数: {flattened_feature.shape[1]}")

#### nn.Flatten

In [None]:
import torch
import torch.nn as nn

# Simulate the feature maps produced by a conv layer
feature_map = torch.randn(2, 128, 4, 4) # batch of 2, 128 channels, 4x4 feature maps
print(f"原始特征图形状: {feature_map.shape}")

# Create a Flatten module instance
flatten_layer = nn.Flatten()

# Pass the feature maps through the flatten layer -> (2, 128 * 4 * 4) = (2, 2048)
flattened_feature_nn = flatten_layer(feature_map)
print(f"展平后特征形状: {flattened_feature_nn.shape}")
print(f"每个样本的特征总数: {flattened_feature_nn.shape[1]}")

# A minimal CNN that uses nn.Flatten between the conv block and the classifier
class SimpleCNN(nn.Module):
    """Conv -> ReLU -> MaxPool -> Flatten -> Linear, sized for 3x32x32 inputs."""

    def __init__(self):
        super().__init__()
        feature_layers = [
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.conv_block = nn.Sequential(*feature_layers)

        # Collapses the multi-dimensional conv/pool output into one vector per sample
        self.flatten = nn.Flatten()

        # For a 32x32 input the conv block yields 16 channels of 16x16,
        # so the flattened size is 16 * 16 * 16 = 4096; 10 output classes.
        self.fc_layer = nn.Linear(16 * 16 * 16, 10)

    def forward(self, x):
        """Return class scores; prints the intermediate shapes along the way."""
        pooled = self.conv_block(x)
        print(f"卷积池化后的中间形状: {pooled.shape}")
        flat = self.flatten(pooled)
        print(f"展平后的形状: {flat.shape}")
        return self.fc_layer(flat)

# Instantiate the model and push one dummy image through it
model = SimpleCNN()
dummy_input = torch.randn(1, 3, 32, 32) # batch of 1, 3 channels, 32x32 image
output = model(dummy_input)
print(f"最终输出形状: {output.shape}")

### CNN正向传播和反向传播

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ========= Model definition =========
class SimpleCNN(nn.Module):
    """Two conv/pool stages and two fully connected layers for 1x28x28 inputs.

    ``forward`` returns raw logits of shape (batch, 10); no softmax is applied.
    """

    def __init__(self):
        super(SimpleCNN, self).__init__()
        # Input: (1, 28, 28)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, padding=1)   # -> (8, 28, 28)
        self.pool = nn.MaxPool2d(2, 2)                                                   # -> (8, 14, 14)
        self.conv2 = nn.Conv2d(8, 16, 3, padding=1)                                       # -> (16, 14, 14)
        # After the second pooling: (16, 7, 7)
        self.fc1 = nn.Linear(16 * 7 * 7, 64)
        self.fc2 = nn.Linear(64, 10)  # 10-class output

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))   # Conv1 + ReLU + Pool
        x = self.pool(F.relu(self.conv2(x)))   # Conv2 + ReLU + Pool
        # Flatten per sample. Unlike view(-1, 16 * 7 * 7), this never folds
        # extra features into the batch dimension: a mis-sized input now fails
        # loudly in fc1 instead of silently changing the batch size.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))                # fully connected layer 1
        x = self.fc2(x)                        # output layer (logits, no softmax)
        return x

# ========= Model, loss function and inputs =========
model = SimpleCNN()
criterion = nn.CrossEntropyLoss()

# Fake one batch: batch_size=4, single-channel 28x28 images
inputs = torch.randn(4, 1, 28, 28)
labels = torch.tensor([1, 0, 3, 2])  # fake target labels (valid range 0-9)

# ========= Forward pass =========
outputs = model(inputs)
loss = criterion(outputs, labels)

# ========= Backward pass =========
loss.backward()

# ========= Print the gradient of every parameter =========
print("=== 每层的权重梯度 ===")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name}: grad shape = {param.grad.shape}")
        print(param.grad)  # append .norm() or .mean() for a shorter printout
        print('-' * 50)

### 简单示例

对比使用展平层（nn.Flatten）和全局平均池化层（nn.AdaptiveAvgPool2d((1, 1))）将特征图转换为全连接层输入时的不同效果（输出形状与参数量）。

In [None]:
import torch
import torch.nn as nn

class SimpleCNN(nn.Module):
    """Two conv/pool stages followed by Flatten and a fully connected classifier.

    The first Linear layer is sized for 64 * 8 * 8 features, i.e. for
    3-channel 32x32 inputs.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Feature extractor; the shapes below assume an [N, 3, 32, 32] input
        stage_one = [
            nn.Conv2d(3, 32, kernel_size=3, padding=1),    # -> [N, 32, 32, 32]
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),         # -> [N, 32, 16, 16]
        ]
        stage_two = [
            nn.Conv2d(32, 64, kernel_size=3, padding=1),   # -> [N, 64, 16, 16]
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),         # -> [N, 64, 8, 8]
        ]
        self.features = nn.Sequential(*stage_one, *stage_two)

        # Flatten [N, 64, 8, 8] into [N, 4096]
        self.flatten = nn.Flatten()

        # Classifier head: 4096 flattened features -> 128 -> num_classes scores
        flat_features = 64 * 8 * 8
        self.classifier = nn.Sequential(
            nn.Linear(flat_features, 128),
            nn.ReLU(),
            nn.Dropout(0.5),             # regularisation against overfitting
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return class scores; prints the shape before and after flattening."""
        feature_maps = self.features(x)
        print(f"形状在展平前: {feature_maps.shape}")

        flat = self.flatten(feature_maps)
        print(f"形状在展平后: {flat.shape}")

        return self.classifier(flat)

# Create the model
model = SimpleCNN(num_classes=10)

# Dummy input images (batch of 4, 3 channels, 32x32 pixels)
dummy_input = torch.randn(4, 3, 32, 32)

# Forward pass
output = model(dummy_input)
print(f"最终输出形状: {output.shape}") # expected: torch.Size([4, 10])

# Count the trainable parameters (the 4096 -> 128 Linear layer dominates)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"模型总参数量: {total_params}")

# Replacing Flatten with global average pooling shrinks the classifier input
# from 64 * 8 * 8 features to 64, which greatly reduces the parameter count.
class SimpleCNN_GAP(nn.Module):
    """Same feature extractor as SimpleCNN, but with global average pooling."""

    def __init__(self, num_classes=10):
        super().__init__()
        stage_one = [
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        stage_two = [
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.features = nn.Sequential(*stage_one, *stage_two)

        # Averages every feature map down to one value: output is [N, 64, 1, 1]
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))

        # The classifier now takes 64 features (one per channel)
        pooled_features = 64
        self.classifier = nn.Sequential(
            nn.Linear(pooled_features, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return class scores; prints the shape around the global pooling."""
        feature_maps = self.features(x)
        print(f"\n形状在全局池化前: {feature_maps.shape}")

        pooled = self.global_avg_pool(feature_maps)
        print(f"形状在全局池化后: {pooled.shape}")

        # Drop the trailing 1x1 spatial dimensions: [N, C, 1, 1] -> [N, C]
        flat = pooled.flatten(1)
        print(f"最终输入全连接层形状: {flat.shape}")

        return self.classifier(flat)

# Run the GAP variant on the same dummy input and count its trainable parameters
model_gap = SimpleCNN_GAP(num_classes=10)
output_gap = model_gap(dummy_input)
print(f"最终输出形状 (GAP): {output_gap.shape}")
total_params_gap = sum(p.numel() for p in model_gap.parameters() if p.requires_grad)
print(f"模型总参数量 (使用GAP): {total_params_gap}") # far fewer than the Flatten version

### LeNet-5示例

LeNet-5是由Yann LeCun等人于1998年提出的经典卷积神经网络（CNN）架构，最初设计用于手写数字识别（如MNIST数据集）。它包含卷积层、池化层、展平层和全连接层，结构简单但奠定了现代CNN的基础。

**架构介绍**

LeNet-5 针对MNIST数据集（28×28灰度图像，10类数字）设计，结构如下：

- C1：卷积层，6个5×5卷积核，输出 [batch, 6, 28, 28]。
- S2：平均池化层，2×2窗口，步幅2，输出 [batch, 6, 14, 14]。
- C3：卷积层，16个5×5卷积核，输出 [batch, 16, 10, 10]。
- S4：平均池化层，2×2窗口，步幅2，输出 [batch, 16, 5, 5]。
- C5：卷积层（或视为全连接层），120个5×5卷积核，输出 [batch, 120, 1, 1]。
- F6：全连接层，120→84，输出 [batch, 84]。
- Output：全连接层，84→10，输出 [batch, 10]（10类概率）。

**注意：**

- 原始LeNet-5使用Sigmoid/Tanh激活函数，现代实现通常用ReLU以加速收敛。
- 原始C5层可视为全连接层（因输入5×5，卷积后为1×1），现代实现可能直接用展平层+全连接层。
- 输出层通常接Softmax（结合交叉熵损失）。

以下示例提供了基于PyTorch的LeNet-5模型代码，包括模型定义、MNIST数据集加载、训练和测试流程，并结合之前的上下文（如MNIST示例）进行说明。代码将训练模型并输出训练/测试准确率及可视化结果。

**模型结构**

模型结构说明：
- C1 (conv1)：nn.Conv2d(1, 6, kernel_size=5, padding=2)，输入 [batch, 1, 28, 28]，输出 [batch, 6, 28, 28]（padding=2 保持尺寸）。
- S2 (pool1)：nn.AvgPool2d(kernel_size=2, stride=2)，输出 [batch, 6, 14, 14]。
- C3 (conv2)：nn.Conv2d(6, 16, kernel_size=5)，输出 [batch, 16, 10, 10]（无填充，尺寸减小）。
- S4 (pool2)：nn.AvgPool2d(kernel_size=2, stride=2)，输出 [batch, 16, 5, 5]。
- Flatten：nn.Flatten()，将 [batch, 16, 5, 5] 展平为 [batch, 16*5*5] = [batch, 400]。
- F5 (fc1)：nn.Linear(400, 120)，输出 [batch, 120]。
- F6 (fc2)：nn.Linear(120, 84)，输出 [batch, 84]。
- Output (fc3)：nn.Linear(84, 10)，输出 [batch, 10]（logits）。

**数据加载**

MNIST数据集：
- 训练集：60000张28×28灰度图像。
- 测试集：10000张。
- 使用 transforms.ToTensor()将图像归一化到 [0, 1]。
- 批次大小：64，适合GPU/CPU训练。

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# Seed the RNG and use the GPU when one is available
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# LeNet-5 model definition
class LeNet5(nn.Module):
    """LeNet-5 style network for 1x28x28 inputs with ReLU activations.

    ``forward`` returns ``(logits, conv2_out)``: the class logits of shape
    [batch, 10] and the activations of the second convolution, kept for
    visualisation.
    """

    def __init__(self):
        super(LeNet5, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=2)  # C1: [batch, 1, 28, 28] -> [batch, 6, 28, 28]
        self.relu = nn.ReLU()
        self.pool1 = nn.AvgPool2d(kernel_size=2, stride=2)  # S2: [batch, 6, 28, 28] -> [batch, 6, 14, 14]
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1)  # C3: [batch, 6, 14, 14] -> [batch, 16, 10, 10]
        self.pool2 = nn.AvgPool2d(kernel_size=2, stride=2)  # S4: [batch, 16, 10, 10] -> [batch, 16, 5, 5]
        self.flatten = nn.Flatten()  # Flatten: [batch, 16, 5, 5] -> [batch, 16*5*5]
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # C5/F5: [batch, 400] -> [batch, 120]
        self.fc2 = nn.Linear(120, 84)  # F6: [batch, 120] -> [batch, 84]
        self.fc3 = nn.Linear(84, 10)  # Output: [batch, 84] -> [batch, 10]

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.pool1(x)
        x = self.relu(self.conv2(x))
        # Capture the feature maps AFTER the second convolution (+ReLU); the
        # previous code stored the pool1 output, i.e. the input of conv2.
        conv2_out = x  # [batch, 16, 10, 10]
        x = self.pool2(x)
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)  # logits; CrossEntropyLoss applies log-softmax itself
        return x, conv2_out

# Load the MNIST train and test splits
transform = transforms.ToTensor()
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

# Model, loss function and optimiser
model = LeNet5().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 5
train_losses = []
train_accuracies = []
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs, _ = model(images)  # the second return value (feature maps) is unused here
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)  # index of the largest logit = predicted class
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    # Epoch loss is the mean of the per-batch mean losses
    epoch_loss = running_loss / len(train_loader)
    epoch_acc = 100 * correct / total
    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_acc)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")

# Evaluate on the test split
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs, _ = model(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
test_acc = 100 * correct / total
print(f"Test Accuracy: {test_acc:.2f}%")

# Visualise the conv1 kernels and the intermediate feature maps returned by the model
model.eval()
with torch.no_grad():
    image, label = train_dataset[0]
    image = image.unsqueeze(0).to(device)  # add the batch dimension
    _, conv2_out = model(image)

# Figure with a 2x4 grid
plt.figure(figsize=(15, 6))
# Slot 1: input image
plt.subplot(2, 4, 1)
plt.title(f'Input Image (Label: {label})')
plt.imshow(image[0, 0].cpu().numpy(), cmap='gray')
plt.axis('off')

# Slots 2-4: kernels of the first conv layer
weights = model.conv1.weight.cpu().detach().numpy()  # [6, 1, 5, 5]
for i in range(min(3, weights.shape[0])):  # show the first 3 kernels
    plt.subplot(2, 4, i + 2)
    plt.title(f'Conv1 Kernel {i+1}')
    plt.imshow(weights[i, 0], cmap='gray')
    plt.axis('off')

# Slots 5-8: feature maps returned as the model's second output
feature_maps = conv2_out[0].cpu().detach().numpy()  # maps of sample 0: (channels, H, W)
for i in range(min(4, feature_maps.shape[0])):  # show the first 4 feature maps
    plt.subplot(2, 4, i + 5)
    plt.title(f'Conv2 Feature {i+1}')
    feature_map = feature_maps[i]
    # Min-max normalise to [0, 1]; the epsilon avoids division by zero on a constant map
    feature_map = (feature_map - feature_map.min()) / (feature_map.max() - feature_map.min() + 1e-8)
    plt.imshow(feature_map, cmap='gray')
    plt.axis('off')

plt.tight_layout()
plt.show()

# Plot the training loss and accuracy curves
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label='Train Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.title('Training Accuracy')
plt.legend()
plt.tight_layout()
plt.show()

## 词嵌入示例

### 随机初始化词嵌入矩阵

需要事先安装nltk和datasets包（string和collections属于Python标准库，无需安装）。

In [None]:
import torch
import torch.nn as nn
from datasets import load_dataset
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import string

# Download the NLTK tokenizer data
# NOTE(review): recent NLTK releases may need the 'punkt_tab' resource for word_tokenize -- confirm for your version
nltk.download('punkt')

# Load the IMDB dataset
dataset = load_dataset('imdb')

# Text preprocessing
def preprocess_text(text):
    """Lower-case ``text``, strip ASCII punctuation, squeeze whitespace and tokenize.

    Returns whatever ``word_tokenize`` produces for the cleaned string.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)
    normalized = ' '.join(cleaned.split())  # collapse runs of whitespace into single spaces
    return word_tokenize(normalized)

# Vocabulary construction
def build_vocab(texts, min_freq=5):
    """Map every token that occurs at least ``min_freq`` times to a dense id.

    Ids 0 and 1 are reserved for ``<PAD>`` and ``<UNK>``. The remaining ids are
    assigned contiguously (2, 3, ...), so every id is smaller than
    ``len(vocab)`` and can index an embedding matrix with ``len(vocab)`` rows.
    The previous version enumerated tokens before filtering by frequency,
    which left gaps and produced ids beyond ``len(vocab)``.

    Args:
        texts: iterable of raw strings, each passed through ``preprocess_text``.
        min_freq: minimum number of occurrences for a token to be kept.

    Returns:
        dict mapping token -> integer id.
    """
    token_counts = Counter(token for text in texts for token in preprocess_text(text))
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for token, count in token_counts.items():
        if count >= min_freq and token not in vocab:
            vocab[token] = len(vocab)  # next free contiguous id
    return vocab

# Build the vocabulary from the training texts
train_texts = [item['text'] for item in dataset['train']]
vocab = build_vocab(train_texts, min_freq=5)

# Create the embedding matrix (random initialisation)
vocab_size = len(vocab)
embedding_dim = 100
embedding_matrix = torch.randn(vocab_size, embedding_dim)  # random initialisation
embedding_matrix[0] = torch.zeros(embedding_dim)  # <PAD> (id 0) is the zero vector

# Load the matrix into an embedding layer
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)  # trainable

# Example input
input_ids = torch.tensor([2, 3, 4], dtype=torch.long)  # three example row ids; the words they stand for depend on the built vocab
embeddings = embedding_layer(input_ids)
print("词嵌入矩阵形状:", embedding_matrix.shape)  # [vocab_size, embedding_dim]
print("嵌入向量形状:", embeddings.shape)  # [seq_len, embedding_dim]

### 使用预训练 GloVe 嵌入

In [None]:
from torchtext.vocab import GloVe
import torch
import torch.nn as nn
from datasets import load_dataset
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import string

# Download the NLTK tokenizer data
# NOTE(review): recent NLTK releases may need the 'punkt_tab' resource for word_tokenize -- confirm for your version
nltk.download('punkt')

# Load the IMDB dataset
dataset = load_dataset('imdb')

# Text preprocessing
def preprocess_text(text):
    """Return the tokens of ``text`` after lower-casing and punctuation removal.

    ASCII punctuation is deleted, whitespace runs are collapsed to single
    spaces, and the cleaned string is handed to ``word_tokenize``.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    without_punctuation = lowered.translate(drop_punctuation)
    words = without_punctuation.split()
    return word_tokenize(' '.join(words))

# Vocabulary construction
def build_vocab(texts, min_freq=5):
    """Map every token that occurs at least ``min_freq`` times to a dense id.

    Ids 0 and 1 are reserved for ``<PAD>`` and ``<UNK>``. The remaining ids are
    assigned contiguously (2, 3, ...), so every id is smaller than
    ``len(vocab)`` and can index an embedding matrix with ``len(vocab)`` rows.
    The previous version enumerated tokens before filtering by frequency,
    which left gaps and produced ids beyond ``len(vocab)``.

    Args:
        texts: iterable of raw strings, each passed through ``preprocess_text``.
        min_freq: minimum number of occurrences for a token to be kept.

    Returns:
        dict mapping token -> integer id.
    """
    token_counts = Counter(token for text in texts for token in preprocess_text(text))
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for token, count in token_counts.items():
        if count >= min_freq and token not in vocab:
            vocab[token] = len(vocab)  # next free contiguous id
    return vocab

# Build the vocabulary from the training texts
train_texts = [item['text'] for item in dataset['train']]
vocab = build_vocab(train_texts, min_freq=5)

# Load the 300-dimensional GloVe 6B vectors
glove = GloVe(name='6B', dim=300)

# Create the embedding matrix (all rows start as zeros)
vocab_size = len(vocab)
embedding_dim = 300
embedding_matrix = torch.zeros(vocab_size, embedding_dim)
for word, idx in vocab.items():
    if word == '<PAD>':
        # Keep the padding row as the zero vector; '<PAD>' is not a GloVe
        # word, so it would otherwise be overwritten with random noise below.
        continue
    if word in glove.stoi:
        embedding_matrix[idx] = glove.vectors[glove.stoi[word]]
    else:
        embedding_matrix[idx] = torch.randn(embedding_dim)  # random vector for out-of-vocabulary words

# Load the matrix into an embedding layer
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)  # frozen (not trained)

# Example input
input_ids = torch.tensor([2, 3, 4], dtype=torch.long)  # three example row ids; the words they stand for depend on the built vocab
embeddings = embedding_layer(input_ids)
print("词嵌入矩阵形状:", embedding_matrix.shape)  # [vocab_size, embedding_dim]
print("嵌入向量形状:", embeddings.shape)  # [seq_len, embedding_dim]

### 使用BERT嵌入

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example text
text = "The quick brown fox"
encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
input_ids = encoding['input_ids']  # [1, seq_len]

# Run the model without gradients and take the final-layer hidden states as embeddings
with torch.no_grad():
    outputs = model(input_ids)
    embeddings = outputs.last_hidden_state  # [1, seq_len, 768]

print("嵌入向量形状:", embeddings.shape)  # [1, seq_len, 768]

## RNN示例

### RNN的前向传播

**RNN前向传播原理**
对于单层单向RNN，在时间步 $ t $，隐藏状态 $ h_t $ 的计算公式为：
$$h_t = \tanh(x_t W_{ih}^{T} + b_{ih} + h_{t-1} W_{hh}^{T} + b_{hh})$$

- $ x_t $：当前时间步的输入，形状为 (batch_size, input_size)。
- $ h_{t-1} $：上一时间步的隐藏状态，形状为 (batch_size, hidden_size)。
- $ W_{ih} $：输入到隐藏层的权重矩阵，形状为 (hidden_size, input_size)。
- $ W_{hh} $：隐藏层到隐藏层的权重矩阵，形状为 (hidden_size, hidden_size)。
- $ b_{ih}, b_{hh} $：偏置向量，形状为 (hidden_size,)。
- $ \tanh $：激活函数。

**前向传播需要：**

- 对每个时间步 $ t $，根据输入 $ x_t $ 和上一隐藏状态 $ h_{t-1} $ 计算 $ h_t $。
- 保存所有时间步的隐藏状态 $ h_t $ 作为输出序列。
- 返回最终隐藏状态 $ h_n $ 和输出序列。

以下是用PyTorch手动实现RNN前向传播的代码示例，假设输入序列长度为3，批量大小为2，输入维度为4，隐藏状态维度为5。

In [None]:
import torch
import torch.nn as nn

# Seed the RNG so the results are reproducible
torch.manual_seed(42)

# Sizes
batch_size = 2
seq_len = 3
input_size = 4
hidden_size = 5

# Input data
input = torch.randn(batch_size, seq_len, input_size)  # shape: (2, 3, 4)
h0 = torch.zeros(batch_size, hidden_size)            # initial hidden state: (2, 5)

# Weights and biases (random, standing in for the RNN parameters)
W_ih = torch.randn(hidden_size, input_size)  # shape: (5, 4)
W_hh = torch.randn(hidden_size, hidden_size) # shape: (5, 5)
b_ih = torch.randn(hidden_size)             # shape: (5,)
b_hh = torch.randn(hidden_size)             # shape: (5,)

# Hand-written forward pass of a single-layer tanh RNN
def manual_rnn_forward(input, h0, W_ih, W_hh, b_ih, b_hh):
    """Unroll a tanh RNN over the time dimension of a batch-first input.

    Args:
        input: (batch_size, seq_len, input_size) tensor.
        h0: (batch_size, hidden_size) initial hidden state.
        W_ih: (hidden_size, input_size) input-to-hidden weights.
        W_hh: (hidden_size, hidden_size) hidden-to-hidden weights.
        b_ih, b_hh: (hidden_size,) bias vectors.

    Returns:
        (output, h_n): ``output`` stacks the hidden state of every time step,
        shape (batch_size, seq_len, hidden_size); ``h_n`` is the hidden state
        after the last step, shape (batch_size, hidden_size).
    """
    hidden = h0
    per_step_states = []
    W_ih_T, W_hh_T = W_ih.t(), W_hh.t()

    # Walk over the sequence one time step at a time
    for x_t in input.unbind(dim=1):  # each x_t: (batch_size, input_size)
        # h_t = tanh(x_t W_ih^T + b_ih + h_{t-1} W_hh^T + b_hh)
        pre_activation = x_t @ W_ih_T + b_ih + hidden @ W_hh_T + b_hh
        hidden = torch.tanh(pre_activation)
        per_step_states.append(hidden)

    # Stack along a new time axis -> (batch_size, seq_len, hidden_size)
    output = torch.stack(per_step_states, dim=1)
    return output, hidden

# Run the hand-written forward pass
manual_output, manual_h_n = manual_rnn_forward(input, h0, W_ih, W_hh, b_ih, b_hh)

# Print the shapes
print("手动RNN输出形状:", manual_output.shape)  # (2, 3, 5)
print("手动RNN最终隐藏状态形状:", manual_h_n.shape)  # (2, 5)

#### 用RNN的自动传播机制进行验证

In [None]:
# Define the PyTorch RNN layer (relies on the sizes and tensors defined in the previous cell)
rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True, nonlinearity='tanh')

# Copy in the same weights and biases used by the manual implementation
with torch.no_grad():
    rnn.weight_ih_l0.copy_(W_ih)
    rnn.weight_hh_l0.copy_(W_hh)
    rnn.bias_ih_l0.copy_(b_ih)
    rnn.bias_hh_l0.copy_(b_hh)

# Run the PyTorch RNN forward pass
pytorch_output, pytorch_h_n = rnn(input, h0.unsqueeze(0))  # h0 must be (1, batch_size, hidden_size)

# Print the shapes
print("PyTorch RNN输出形状:", pytorch_output.shape)  # (2, 3, 5)
print("PyTorch RNN最终隐藏状态形状:", pytorch_h_n.shape)  # (1, 2, 5)

# Check that the manual implementation matches PyTorch
print("输出是否一致:", torch.allclose(manual_output, pytorch_output, atol=1e-6))
print("最终隐藏状态是否一致:", torch.allclose(manual_h_n, pytorch_h_n.squeeze(0), atol=1e-6))

### 基于NumPy的RNN示例

带前向传播和反向传播过程。

In [None]:
import numpy as np

# Seed NumPy's RNG for reproducibility
np.random.seed(0)

# Hyperparameters
input_size = 1
hidden_size = 10
output_size = 1
learning_rate = 0.01
epochs = 100

# Initialise the weights with small random values and the biases with zeros
Wxh = np.random.randn(hidden_size, input_size) * 0.1
Whh = np.random.randn(hidden_size, hidden_size) * 0.1
Why = np.random.randn(output_size, hidden_size) * 0.1
bh = np.zeros((hidden_size, 1))
by = np.zeros((output_size, 1))

# Simple synthetic data: predict sin(x) from x over one period
T = 10  # sequence length
x_seq = np.linspace(0, 2*np.pi, T).reshape(T, 1, 1)  # shape: [T, batch, input_size]
y_seq = np.sin(x_seq)                               # shape: [T, batch, output_size]

def tanh(x):
    """Element-wise hyperbolic tangent (thin wrapper around ``np.tanh``)."""
    activated = np.tanh(x)
    return activated

def dtanh(x):
    """Derivative of tanh evaluated at the pre-activation ``x``: 1 - tanh(x)^2."""
    squared_activation = np.square(np.tanh(x))
    return 1 - squared_activation

# Training loop: one full forward + backward pass over the sequence per epoch
for epoch in range(epochs):
    # Reset the hidden state at the start of every pass
    h_prev = np.zeros((hidden_size, 1))
    
    # Per-time-step caches needed by the backward pass
    xs, hs, ys, targets = {}, {}, {}, {}
    hs[-1] = h_prev  # key -1 holds the initial state so hs[t-1] works for t = 0
    loss = 0
    
    # ----------- Forward pass -------------
    for t in range(T):
        x_t = x_seq[t]  # shape: (1, 1)
        y_target = y_seq[t]

        xs[t] = x_t
        hs[t] = tanh(np.dot(Wxh, x_t) + np.dot(Whh, hs[t-1]) + bh)
        ys[t] = np.dot(Why, hs[t]) + by
        targets[t] = y_target

        # Squared-error loss summed over all time steps
        loss += 0.5 * np.sum((ys[t] - y_target) ** 2)

    # ----------- Backward pass (BPTT) -------------
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dWhy = np.zeros_like(Why)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)
    dh_next = np.zeros_like(hs[0])  # gradient flowing in from the following time step

    for t in reversed(range(T)):
        dy = ys[t] - targets[t]  # dL/dy
        dWhy += np.dot(dy, hs[t].T)
        dby += dy

        dh = np.dot(Why.T, dy) + dh_next  # gradient reaching h_t from the output and from step t+1
        dh_raw = dtanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) * dh  # dL/dh_t * tanh'
        dbh += dh_raw
        dWxh += np.dot(dh_raw, xs[t].T)
        dWhh += np.dot(dh_raw, hs[t-1].T)
        dh_next = np.dot(Whh.T, dh_raw)  # pass the gradient on to step t-1

    # ----------- Gradient clipping (optional) -------------
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -1, 1, out=dparam)  # in-place element-wise clip to [-1, 1]

    # ----------- Parameter update (plain gradient descent) -------------
    Wxh -= learning_rate * dWxh
    Whh -= learning_rate * dWhh
    Why -= learning_rate * dWhy
    bh -= learning_rate * dbh
    by -= learning_rate * dby

    # ----------- Report the loss -------------
    if epoch % 10 == 0 or epoch == epochs - 1:
        print(f"Epoch {epoch}: Loss = {loss:.4f}")

### 简单RNN示例

#### 单层单向RNN

In [None]:
import torch
import torch.nn as nn

# Sizes
input_size = 10    # input feature dimension
hidden_size = 20   # hidden state dimension
batch_size = 32    # batch size
seq_len = 50       # sequence length

# Define the RNN layer
rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True, nonlinearity='tanh')

# Input data
input = torch.randn(batch_size, seq_len, input_size)  # (batch_size, seq_len, input_size)
h0 = torch.randn(1, batch_size, hidden_size)          # initial hidden state (num_layers, batch_size, hidden_size)

# Forward pass
output, h_n = rnn(input, h0)

# Print the shapes
print(output.shape)  # (batch_size, seq_len, hidden_size) = (32, 50, 20)
print(h_n.shape)    # (num_layers, batch_size, hidden_size) = (1, 32, 20)

#### 多层双向RNN

In [None]:
# Sizes
input_size = 10
hidden_size = 20
num_layers = 2
batch_size = 32
seq_len = 50

# Define a 2-layer bidirectional RNN
rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, 
             batch_first=True, bidirectional=True, dropout=0.3, nonlinearity='relu')

# Input data
input = torch.randn(batch_size, seq_len, input_size)
h0 = torch.randn(2 * 2, batch_size, hidden_size)  # first dim = num_layers (2) * num_directions (2)

# Forward pass
output, h_n = rnn(input, h0)

# Print the shapes
print(output.shape)  # (batch_size, seq_len, num_directions * hidden_size) = (32, 50, 40)
print(h_n.shape)    # (num_layers * num_directions, batch_size, hidden_size) = (4, 32, 20)

#### 处理变长输入序列

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Simulate a padded batch plus the true length of every sequence.
# (Named `inputs` so the builtin input() is not shadowed.)
inputs = torch.randn(batch_size, seq_len, input_size)
lengths = torch.randint(10, seq_len + 1, (batch_size,))  # true length of each sequence

# Sort longest-first, as required by enforce_sorted=True below.
# `indices` records the permutation, so rows of the results are in sorted order.
lengths, indices = lengths.sort(descending=True)
inputs = inputs[indices]

# Single-layer RNN
rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True)

# Pack the batch so the RNN skips the padded positions
packed_input = pack_padded_sequence(inputs, lengths, batch_first=True, enforce_sorted=True)

# Forward pass
output, h_n = rnn(packed_input)

# Unpack. Without total_length the result is padded only up to the longest
# sequence in the batch, which may be shorter than seq_len; passing it makes
# the shape below hold for every random draw of `lengths`.
output, output_lengths = pad_packed_sequence(output, batch_first=True, total_length=seq_len)

print(output.shape)  # (batch_size, seq_len, hidden_size)
print(h_n.shape)     # (1, batch_size, hidden_size)

#### RNN的权重访问

- RNN层的权重矩阵可以通过 rnn.weight_ih_l[k]（输入到隐藏层的权重）和 rnn.weight_hh_l[k]（隐藏层到隐藏层的权重）访问，其中 k 是层索引。
- 偏置项通过 rnn.bias_ih_l[k] 和 rnn.bias_hh_l[k] 访问。

In [None]:
# Print the shapes of the first layer's parameters, in this order:
#   weight_ih_l0 -> (hidden_size, input_size)
#   weight_hh_l0 -> (hidden_size, hidden_size)
#   bias_ih_l0   -> (hidden_size,)
#   bias_hh_l0   -> (hidden_size,)
for param_name in ("weight_ih_l0", "weight_hh_l0", "bias_ih_l0", "bias_hh_l0"):
    print(getattr(rnn, param_name).shape)

#### 带前向传播和反向传播的示例

单次训练迭代：对一个batch执行一次前向传播、反向传播和参数更新。

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# 定义模型
class RNNClassifier(nn.Module):
    """Sequence classifier: token embedding -> stacked LSTM -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden-state size.
        output_dim: number of values produced per sequence.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1):
        super().__init__()
        # Token id 0 is registered as the padding index of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Recurrent encoder (an LSTM, despite the attribute name `rnn`).
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        # Linear head applied to the final hidden state.
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map token ids [batch_size, seq_len] to outputs [batch_size, output_dim]."""
        token_vectors = self.embedding(text)       # [batch_size, seq_len, embedding_dim]
        _, (final_hidden, _) = self.rnn(token_vectors)
        top_layer_state = final_hidden[-1]         # last layer: [batch_size, hidden_dim]
        return self.fc(top_layer_state)

# Hyperparameters
vocab_size = 1000  # vocabulary size
embedding_dim = 100  # embedding dimension
hidden_dim = 128  # hidden-state dimension
output_dim = 2  # number of classes (e.g. binary classification)
num_layers = 2  # number of recurrent layers
batch_size = 32
seq_len = 10

# Instantiate the model
model = RNNClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers)

# Simulated input data
input_text = torch.randint(0, vocab_size, (batch_size, seq_len))  # random token ids [batch_size, seq_len]
labels = torch.randint(0, output_dim, (batch_size,))  # random class labels [batch_size]

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# One training step: forward pass, loss, backward pass, parameter update
model.train()
optimizer.zero_grad()
output = model(input_text)
loss = criterion(output, labels)
loss.backward()
optimizer.step()

print("Input Shape:", input_text.shape)  # torch.Size([32, 10])
print("Output Shape:", output.shape)      # torch.Size([32, 2])
print("Loss:", loss.item())

### 基于IMDB数据集的RNN模型实现示例

我们将使用PyTorch实现一个简单的RNN模型，完成IMDB电影评论的情感分析任务（二分类：正面或负面）。下面的代码包括数据加载、预处理、模型定义、训练和评估等功能的定义。

**提示**： 评论长度可变，需处理词汇表和填充。

1. 数据预处理：

- 数据集：使用Keras的imdb.load_data加载IMDB数据集，限制词汇表大小为20,000（与代码中的vocab_size=20000一致）。
- 序列处理：将评论截断或填充到固定长度（max_len=500），确保输入形状一致。
- 自定义数据集：IMDBDataset类将数据转换为PyTorch张量，方便DataLoader使用。
- DataLoader：批量加载数据，batch_size=32，打乱训练数据以提高泛化能力。

2. 模型结构：

- 嵌入层（nn.Embedding）：将单词索引转换为128维词嵌入向量。
- RNN层（nn.RNN）：单层RNN，隐藏状态维度为256，使用tanh激活函数，batch_first=True。
- 全连接层（nn.Linear）：将最后一个时间步的隐藏状态映射到1维输出。
- Sigmoid：将输出压缩到[0, 1]，表示正面评论的概率。


3. 训练过程：

- 损失函数：BCELoss（二分类交叉熵），适合二分类任务。
- 优化器：Adam，学习率为0.001。
- 梯度裁剪：通过clip_grad_norm_限制梯度最大范数为1，防止梯度爆炸。
- 训练若干个epoch（train_model的默认值为5，下面的示例实际调用时使用epochs=30）：每次迭代计算损失、反向传播、更新参数。


4. 评估：

- 计算测试集准确率，预测值大于0.5为正面，否则为负面。
- 输出测试集准确率，评估模型性能。

**注意事项**

1. 词典限制：

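- IMDB词典只包含数据集中出现过的单词。按Keras的imdb.load_data默认约定，索引0用于填充、1为序列起始标记、2表示未登录词（OOV），真实单词的索引在imdb.get_word_index()的基础上偏移3（index_from=3）。未登录词无法被模型有效学习，可能影响预测准确性。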
- 测试评论应尽量使用常见英语词汇，避免生僻词。

2. 序列长度：

- 测试评论被填充或截断到 max_len=500，用于确保同训练数据一致。
- 短评论可能填充较多0，长评论可能丢失信息。


3. 模型局限：

- 标准RNN可能因梯度消失问题对长序列表现不佳，建议替换为 nn.LSTM 或 nn.GRU。
- 增加num_layers或dropout可提高性能，但需更多计算资源。

**扩展建议：**

- 使用预训练词嵌入（如GloVe）替换 nn.Embedding。
- 添加注意力机制或双向RNN（bidirectional=True）。
- 增加验证集监控过拟合。


提示：需要事先安装依赖的模块。重新安装，以确保几个模块间版本上的兼容：
```bash
pip install --force-reinstall numpy==2.1.3 tensorflow gensim scipy
```

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.datasets import imdb
import numpy as np
import re
import string

# 设置随机种子以确保可重复性
torch.manual_seed(42)
np.random.seed(42)

# 1. 数据集准备
class IMDBDataset(Dataset):
    """Dataset of token-id sequences truncated or zero-padded to a fixed length.

    Args:
        data: indexable collection of token-id sequences.
        labels: indexable collection of labels, aligned with ``data``.
        max_len: length every returned sequence is cut or padded to.
    """

    def __init__(self, data, labels, max_len=500):
        self.data = data
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` as a long tensor and a float tensor."""
        # Keep at most max_len leading tokens ...
        tokens = self.data[idx][:self.max_len]
        # ... and right-pad shorter sequences with zeros.
        shortfall = self.max_len - len(tokens)
        if shortfall > 0:
            tokens = np.pad(tokens, (0, shortfall), mode='constant')
        features = torch.tensor(tokens, dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        return features, target

# Load the IMDB dataset and its vocabulary
vocab_size = 20000  # vocabulary size limit passed to load_data(num_words=...)
max_len = 500       # maximum sequence length
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
word_index = imdb.get_word_index()  # word -> index mapping
# NOTE(review): per the Keras docs, load_data() shifts word ids by index_from=3
# (0=pad, 1=start, 2=OOV) while get_word_index() returns unshifted ids --
# confirm that code encoding raw text with word_index applies the same shift.

# Wrap both splits in PyTorch datasets
train_dataset = IMDBDataset(x_train, y_train, max_len)
test_dataset = IMDBDataset(x_test, y_test, max_len)

# Batched loaders; only the training split is shuffled
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# 2. 定义RNN模型
class IMDBRNN(nn.Module):
    """Binary sentiment classifier: embedding -> tanh RNN -> linear -> sigmoid.

    The input batches are right-padded with ``pad_idx``. Reading the RNN output
    at the very last time step would therefore return a state that has been
    pushed through all the trailing padding; instead the state at the last
    non-padding token of each sequence is used.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension.
        hidden_size: RNN hidden-state dimension.
        output_size: size of the output (1 for binary classification).
        num_layers: number of stacked RNN layers.
        pad_idx: token id used for padding (0 in this notebook's datasets).
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=1, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_size)  # word embeddings
        self.rnn = nn.RNN(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            nonlinearity='tanh'
        )
        self.fc = nn.Linear(hidden_size, output_size)  # output layer
        self.sigmoid = nn.Sigmoid()  # squashes the output into [0, 1]

    def forward(self, x, h_0=None):
        """Return probabilities of shape [batch_size, output_size].

        Args:
            x: token ids, shape [batch_size, seq_len].
            h_0: optional initial hidden state passed through to ``nn.RNN``.
        """
        embedded = self.embedding(x)              # [batch_size, seq_len, embed_size]
        output, _ = self.rnn(embedded, h_0)       # [batch_size, seq_len, hidden_size]

        # Index of the last non-padding token per row (0 for an all-padding row).
        # Using the largest non-pad position rather than a count keeps this
        # correct even if a pad id appears in the middle of a sequence.
        positions = torch.arange(x.size(1), device=x.device)
        last_step = ((x != self.pad_idx).long() * positions).max(dim=1).values
        batch_index = torch.arange(x.size(0), device=x.device)
        final_state = output[batch_index, last_step]  # [batch_size, hidden_size]

        return self.sigmoid(self.fc(final_state))

# 3. Model hyperparameters
embed_size = 128    # word-embedding dimension
hidden_size = 256   # hidden-state dimension
output_size = 1     # output dimension (binary classification)
num_layers = 1      # number of RNN layers

# Build the model, loss function and optimizer
model = IMDBRNN(vocab_size, embed_size, hidden_size, output_size, num_layers)
criterion = nn.BCELoss()  # binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. 训练模型
def train_model(model, train_loader, criterion, optimizer, epochs=5, device='cpu'):
    """Train ``model`` in place and print the mean batch loss after every epoch.

    Args:
        model: module whose output is compared with labels of shape [batch, 1].
        train_loader: iterable of ``(data, labels)`` batches that supports ``len()``.
        criterion: loss function called as ``criterion(output, labels)``.
        optimizer: optimizer bound to the model's parameters.
        epochs: number of passes over ``train_loader``.
        device: device the model and every batch are moved to.
    """
    model = model.to(device)
    model.train()

    def _run_batch(batch_inputs, batch_targets):
        # One optimisation step; returns the batch loss as a Python float.
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device).view(-1, 1)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        # Clip the global gradient norm to 1 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        return batch_loss.item()

    for epoch in range(epochs):
        running_loss = 0
        for batch_inputs, batch_targets in train_loader:
            running_loss += _run_batch(batch_inputs, batch_targets)
        mean_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {mean_loss:.4f}")

# 5. 评估模型
def evaluate_model(model, test_loader, device='cpu'):
    """Print the accuracy (in percent) of ``model`` over ``test_loader``.

    An output of at least 0.5 counts as the positive class.

    Args:
        model: module producing one probability per sample, shape [batch, 1].
        test_loader: iterable of ``(data, labels)`` batches.
        device: device the model and every batch are moved to.
    """
    model = model.to(device)
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for batch_inputs, batch_targets in test_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device).view(-1, 1)
            predicted = (model(batch_inputs) >= 0.5).float()
            hits += (predicted == batch_targets).sum().item()
            seen += batch_targets.size(0)
    accuracy = hits / seen * 100
    print(f"Test Accuracy: {accuracy:.2f}%")

# 6. 测试功能：处理示例评论
def text_to_sequence(text, word_index, max_len=500, num_words=None,
                     index_from=3, start_index=1, oov_index=2, pad_index=0):
    """Encode raw text the same way ``imdb.load_data`` encodes its reviews.

    ``imdb.get_word_index()`` returns raw frequency ranks, whereas the
    sequences produced by ``imdb.load_data`` (with its default arguments)
    reserve 0 for padding, 1 for the start-of-sequence marker and 2 for
    out-of-vocabulary words, and shift every real word id by ``index_from``.
    Feeding raw ranks to a model trained on ``load_data`` output makes every
    word look like a different word, so the same convention is applied here.

    Args:
        text: raw review text.
        word_index: mapping word -> frequency rank (``imdb.get_word_index()``).
        max_len: length the encoded sequence is truncated or right-padded to.
        num_words: vocabulary limit used for ``load_data``; ids at or above it
            become ``oov_index``. Defaults to the module-level ``vocab_size``.
        index_from: offset added to every word rank.
        start_index: id placed at the beginning of the sequence.
        oov_index: id used for unknown or over-limit words.
        pad_index: id used for right padding.

    Returns:
        Long tensor of shape [1, max_len].
    """
    if num_words is None:
        num_words = vocab_size  # limit the training data was loaded with

    # Lower-case and strip ASCII punctuation. str.translate avoids building a
    # regex character class out of unescaped punctuation characters.
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))

    sequence = [start_index]
    for word in cleaned.split():
        rank = word_index.get(word)
        token = oov_index if rank is None else rank + index_from
        if token >= num_words:
            token = oov_index
        sequence.append(token)

    # Truncate, then right-pad to exactly max_len tokens.
    sequence = sequence[:max_len]
    sequence = sequence + [pad_index] * (max_len - len(sequence))
    return torch.tensor([sequence], dtype=torch.long)  # [1, max_len]

def test_model(model, texts, word_index, device='cpu'):
    """Print the predicted sentiment and probability for each raw text.

    Args:
        model: classifier returning one probability for a [1, max_len] batch.
        texts: iterable of raw review strings.
        word_index: word -> index mapping forwarded to ``text_to_sequence``.
        device: device the model and the encoded text are moved to.
    """
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for text in texts:
            # Encode with the module-level max_len, then predict.
            encoded = text_to_sequence(text, word_index, max_len).to(device)
            prob = model(encoded).item()  # model output has shape [1, 1]
            if prob >= 0.5:
                sentiment = "Positive"
            else:
                sentiment = "Negative"
            print(f"Text: {text}")
            print(f"Sentiment: {sentiment}, Probability: {prob:.4f}\n")

# 7. Run training, evaluation and the sample predictions
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The number of epochs affects model quality: with 5 epochs the first sample
# review below may be misclassified, while 10 epochs may classify it correctly.
# This call trains for 30 epochs.
train_model(model, train_loader, criterion, optimizer, epochs=30, device=device)
evaluate_model(model, test_loader, device=device)

# Sample reviews
test_texts = [
    "I absolutely love this movie, it's fantastic and thrilling!",
    "This film was boring and a complete waste of time.",
    "The plot was okay, but the acting was amazing.",
    "Terrible movie, I hated every minute of it."
]
test_model(model, test_texts, word_index, device=device)

#### 示例结果说明

测试时使用的各示例可能会被错误地分类到不正确的类别中，这通常有几个可能的原因。

1. 词汇表限制（Out-of-Vocabulary, OOV）：示例代码使用了imdb.load_data(num_words=20000)，这意味着它只加载了数据集中最常见的20000个单词。如果“fantastic”或“thrilling”这类词汇不在这个有限的词汇表中，它们就会被替换为未知词标记（Keras默认的oov_char为2，索引0保留给填充），在模型中得不到有效的学习。这会导致模型无法理解这些表达强烈正面情感的词汇，从而无法正确判断句子的情感倾向。

2. 模型训练不足或欠拟合：如果模型只训练了较少的周期（例如5或10个epochs；上面的示例代码使用epochs=30），对于一个大型数据集和复杂的任务来说，这可能不足以让模型充分学习到所有的情感模式。如果模型没有在训练过程中见到足够多的像“fantastic”或“thrilling”这样描述积极情感的词汇及其上下文，它就可能无法正确地将这些词与正面情感关联起来。

3. 数据预处理或编码问题：虽然示例代码中的数据预处理看起来是标准的，但仍有可能存在细微的问题。例如，如果使用了自定义的词汇表或编码方式，并且在测试时没有保持一致，就可能导致模型无法识别这些词。需要特别注意的是，Keras的imdb.load_data默认约定索引0为填充、1为序列起始标记、2为OOV，并且训练数据中的单词索引相对imdb.get_word_index()返回的索引偏移了3（index_from=3）；text_to_sequence在测试时必须遵循同样的约定，否则模型看到的索引会与训练时不一致，从而导致错误的预测。

4. 训练数据中的特殊情况或噪声：在IMDB数据集中，某些词汇可能在不同的上下文中被赋予了相反的情感。例如，在讽刺的评论中，“fantastic”可能会被用于表达负面情感。如果训练数据中存在大量的这类“噪声”，模型可能会产生错误的学习偏差。当然，这在IMDB数据集中并不常见，但也是一个理论上的可能性。

为了解决以上这些类问题，可以尝试：
- 增加训练周期：如果训练轮数较少（例如5或10），可以将epochs的值增加到20或30，让模型有更多时间学习。
- 检查词汇表：确保所有重要的情感词汇都包含在你的词汇表中。你可以通过打印word_index来验证。
- 调整模型参数：增加hidden_size或num_layers，让模型有更强的表达能力，以便更好地理解复杂的语言模式。

## LSTM示例

### LSTM层的语法格式

```python
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size, hidden_size, num_layers=1, bias=True, batch_first=False, dropout=0, bidirectional=False)
```

**LSTM计算公式**

对于单层单向LSTM，每个时间步的计算包括以下步骤：

输入门：
$$i_t = \sigma(W_{ii} \cdot x_t + b_{ii} + W_{hi} \cdot h_{t-1} + b_{hi})$$

遗忘门：
$$f_t = \sigma(W_{if} \cdot x_t + b_{if} + W_{hf} \cdot h_{t-1} + b_{hf})$$

输出门：
$$o_t = \sigma(W_{io} \cdot x_t + b_{io} + W_{ho} \cdot h_{t-1} + b_{ho})$$

候选记忆单元：
$$\tilde{c}_t = \tanh(W_{ic} \cdot x_t + b_{ic} + W_{hc} \cdot h_{t-1} + b_{hc})$$

记忆单元更新：
$$c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c}_t$$

隐藏状态更新：
$$h_t = o_t \odot \tanh(c_t)$$

- $ W_{ii}, W_{if}, W_{io}, W_{ic} $：输入到门的权重矩阵，形状为 (hidden_size, input_size)。
- $ W_{hi}, W_{hf}, W_{ho}, W_{hc} $：隐藏层到门的权重矩阵，形状为 (hidden_size, hidden_size)。
- $ b_{ii}, b_{if}, b_{io}, b_{ic}, b_{hi}, b_{hf}, b_{ho}, b_{hc} $：偏置向量，形状为 (hidden_size,)。
- $ \sigma $：sigmoid激活函数；$ \odot $：逐元素乘法。

### 门控计算过程示例

模拟1个时间步。

In [None]:
import torch
import torch.nn.functional as F

# Hyperparameters
input_size = 4     # input dimension per time step
hidden_size = 3    # hidden-state dimension
output_size = 2    # final output dimension (e.g. number of classes)

# Simulated input x_t and previous states h_{t-1}, c_{t-1}
x_t = torch.randn(input_size)       # (4,)
h_prev = torch.randn(hidden_size)   # (3,)
c_prev = torch.randn(hidden_size)   # (3,)


def init_param():
    """Draw one gate's random parameters as a (W, U, b) triple."""
    input_weight = torch.randn(hidden_size, input_size)
    recurrent_weight = torch.randn(hidden_size, hidden_size)
    bias = torch.randn(hidden_size)
    return input_weight, recurrent_weight, bias


def _pre_activation(W, U, b):
    # Shared affine part of every gate: W x_t + U h_{t-1} + b
    return W @ x_t + U @ h_prev + b


# Gate parameters
W_f, U_f, b_f = init_param()  # forget gate
W_i, U_i, b_i = init_param()  # input gate
W_o, U_o, b_o = init_param()  # output gate
W_c, U_c, b_c = init_param()  # candidate memory

# Output-layer parameters
W_out = torch.randn(output_size, hidden_size)
b_out = torch.randn(output_size)

# ========== Gates ==========
f_t = torch.sigmoid(_pre_activation(W_f, U_f, b_f))   # forget gate
i_t = torch.sigmoid(_pre_activation(W_i, U_i, b_i))   # input gate
o_t = torch.sigmoid(_pre_activation(W_o, U_o, b_o))   # output gate
c_tilde = torch.tanh(_pre_activation(W_c, U_c, b_c))  # candidate memory

# ========== State update ==========
c_t = f_t * c_prev + i_t * c_tilde  # new cell state
h_t = o_t * torch.tanh(c_t)         # new hidden state

# ========== Output layer ==========
y_t = W_out @ h_t + b_out

# Report every intermediate quantity
for label, value in (
    ("忘记门 f_t:", f_t),
    ("输入门 i_t:", i_t),
    ("候选记忆 c~_t:", c_tilde),
    ("单元状态 c_t:", c_t),
    ("输出门 o_t:", o_t),
    ("隐藏状态 h_t:", h_t),
    ("最终输出 y_t:", y_t),
):
    print(label, value)

### 多时间步+batch_size=3的LSTM手动前向传播实现

**我们将模拟出如下环境：**
- 输入序列：长度为 3
- 每个序列样本：输入维度为 4
- 批大小：3
- 隐藏状态维度（和 cell 状态维度）：3
- 输出维度：2

In [None]:
import torch
import torch.nn.functional as F

# Hyperparameters
seq_len = 3
batch_size = 3
input_size = 4
hidden_size = 3
output_size = 2

# Input sequence, time-major: (seq_len, batch_size, input_size)
x = torch.randn(seq_len, batch_size, input_size)

# Initial hidden state h_0 and cell state c_0: (batch_size, hidden_size)
h = torch.randn(batch_size, hidden_size)
c = torch.randn(batch_size, hidden_size)


def init_param():
    """Draw one gate's random parameters as a (W, U, b) triple."""
    input_weight = torch.randn(hidden_size, input_size)       # W: applied to x_t
    recurrent_weight = torch.randn(hidden_size, hidden_size)  # U: applied to h_{t-1}
    bias = torch.randn(1, hidden_size)                        # b: broadcast over the batch
    return input_weight, recurrent_weight, bias


# Gate parameters
W_f, U_f, b_f = init_param()  # forget gate
W_i, U_i, b_i = init_param()  # input gate
W_o, U_o, b_o = init_param()  # output gate
W_c, U_c, b_c = init_param()  # candidate memory

# Output-layer parameters
W_out = torch.randn(output_size, hidden_size)
b_out = torch.randn(1, output_size)

# Per-step outputs, stacked after the loop
step_outputs = []

# Iterate over the time dimension
for t, x_t in enumerate(x):  # x_t: (batch_size, input_size)
    # Gates
    f_t = torch.sigmoid(x_t @ W_f.T + h @ U_f.T + b_f)   # forget gate
    i_t = torch.sigmoid(x_t @ W_i.T + h @ U_i.T + b_i)   # input gate
    o_t = torch.sigmoid(x_t @ W_o.T + h @ U_o.T + b_o)   # output gate
    c_tilde = torch.tanh(x_t @ W_c.T + h @ U_c.T + b_c)  # candidate

    # State update
    c = f_t * c + i_t * c_tilde  # new cell state
    h = o_t * torch.tanh(c)      # new hidden state

    # Output layer
    y_t = h @ W_out.T + b_out    # linear output
    step_outputs.append(y_t)

# Stack along time: (seq_len, batch_size, output_size)
outputs = torch.stack(step_outputs, dim=0)

print("最终输出 outputs.shape:", outputs.shape)  # expected (3, 3, 2)
print("输出结果:\n", outputs)

### LSTM的简单示例

#### 单层单向LSTM

In [None]:
import torch
import torch.nn as nn

# Hyperparameters
input_size = 10    # feature dimension of each time step
hidden_size = 20   # hidden-state dimension
batch_size = 32    # number of sequences per batch
seq_len = 50       # time steps per sequence

# Single-layer, unidirectional LSTM; batch_first=True means tensors are laid
# out as (batch, seq, feature).
lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)

# Random input batch and initial states. The batch is called `inputs`
# (not `input`) so the builtin input() stays usable in later notebook cells.
inputs = torch.randn(batch_size, seq_len, input_size)  # (batch_size, seq_len, input_size)
h0 = torch.randn(1, batch_size, hidden_size)           # initial hidden state (num_layers, batch_size, hidden_size)
c0 = torch.randn(1, batch_size, hidden_size)           # initial cell state (num_layers, batch_size, hidden_size)

# Forward pass
output, (h_n, c_n) = lstm(inputs, (h0, c0))

# Resulting shapes
print(output.shape)  # (batch_size, seq_len, hidden_size) = (32, 50, 20)
print(h_n.shape)     # (num_layers, batch_size, hidden_size) = (1, 32, 20)
print(c_n.shape)     # (num_layers, batch_size, hidden_size) = (1, 32, 20)

#### 多层双向LSTM

In [None]:
# Hyperparameters
input_size = 10
hidden_size = 20
num_layers = 2
num_directions = 2  # bidirectional=True below
batch_size = 32
seq_len = 50

# Two-layer bidirectional LSTM with dropout between the stacked layers
lstm = nn.LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
    bidirectional=True,
    dropout=0.3,
)

# Random input batch (named `inputs` so the builtin input() is not shadowed)
# and initial states with one slice per (layer, direction) pair.
inputs = torch.randn(batch_size, seq_len, input_size)
h0 = torch.randn(num_layers * num_directions, batch_size, hidden_size)
c0 = torch.randn(num_layers * num_directions, batch_size, hidden_size)

# Forward pass
output, (h_n, c_n) = lstm(inputs, (h0, c0))

# Resulting shapes
print(output.shape)  # (batch_size, seq_len, num_directions * hidden_size) = (32, 50, 40)
print(h_n.shape)     # (num_layers * num_directions, batch_size, hidden_size) = (4, 32, 20)
print(c_n.shape)     # (num_layers * num_directions, batch_size, hidden_size) = (4, 32, 20)

#### 处理变长序列

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Simulate a padded batch plus the true length of every sequence.
# (Named `inputs` so the builtin input() is not shadowed.)
inputs = torch.randn(batch_size, seq_len, input_size)
lengths = torch.randint(10, seq_len + 1, (batch_size,))  # true length of each sequence

# Sort longest-first, as required by enforce_sorted=True below.
# `indices` records the permutation, so rows of the results are in sorted order.
lengths, indices = lengths.sort(descending=True)
inputs = inputs[indices]

# Single-layer LSTM
lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)

# Pack the batch so the LSTM skips the padded positions
packed_input = pack_padded_sequence(inputs, lengths, batch_first=True, enforce_sorted=True)

# Forward pass
output, (h_n, c_n) = lstm(packed_input)

# Unpack. Without total_length the result is padded only up to the longest
# sequence in the batch, which may be shorter than seq_len; passing it makes
# the shape below hold for every random draw of `lengths`.
output, output_lengths = pad_packed_sequence(output, batch_first=True, total_length=seq_len)

print(output.shape)  # (batch_size, seq_len, hidden_size)
print(h_n.shape)     # (1, batch_size, hidden_size)
print(c_n.shape)     # (1, batch_size, hidden_size)

#### 权重访问

- LSTM层的权重矩阵可以通过 lstm.weight_ih_l[k]（输入到隐藏层的权重）和 lstm.weight_hh_l[k]（隐藏层到隐藏层的权重）访问，其中 k 是层索引。
- 偏置项通过 lstm.bias_ih_l[k] 和 lstm.bias_hh_l[k] 访问。

In [None]:
# Print the shapes of the first layer's parameters, in this order:
#   weight_ih_l0 -> (4 * hidden_size, input_size); the four stacked blocks are
#                   the input, forget and output gates plus the candidate memory
#   weight_hh_l0 -> (4 * hidden_size, hidden_size)
#   bias_ih_l0   -> (4 * hidden_size,)
#   bias_hh_l0   -> (4 * hidden_size,)
for param_name in ("weight_ih_l0", "weight_hh_l0", "bias_ih_l0", "bias_hh_l0"):
    print(getattr(lstm, param_name).shape)

### 基于IMDB数据集的LSTM模型实现示例

提示：需要事先安装依赖的模块。重新安装，以确保几个模块间版本上的兼容：
```bash
pip install --force-reinstall numpy==2.1.3 tensorflow gensim scipy
```

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.datasets import imdb
import numpy as np
import re
import string

# 设置随机种子以确保可重复性
torch.manual_seed(42)
np.random.seed(42)

# 1. 数据集准备
class IMDBDataset(Dataset):
    """Dataset of token-id sequences truncated or zero-padded to a fixed length.

    Args:
        data: indexable collection of token-id sequences.
        labels: indexable collection of labels, aligned with ``data``.
        max_len: length every returned sequence is cut or padded to.
    """

    def __init__(self, data, labels, max_len=500):
        self.data = data
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` as a long tensor and a float tensor."""
        # Keep at most max_len leading tokens ...
        tokens = self.data[idx][:self.max_len]
        # ... and right-pad shorter sequences with zeros.
        shortfall = self.max_len - len(tokens)
        if shortfall > 0:
            tokens = np.pad(tokens, (0, shortfall), mode='constant')
        features = torch.tensor(tokens, dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        return features, target

# Load the IMDB dataset and its vocabulary
vocab_size = 20000  # vocabulary size limit passed to load_data(num_words=...)
max_len = 500       # maximum sequence length
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
word_index = imdb.get_word_index()  # word -> index mapping
# NOTE(review): per the Keras docs, load_data() shifts word ids by index_from=3
# (0=pad, 1=start, 2=OOV) while get_word_index() returns unshifted ids --
# confirm that code encoding raw text with word_index applies the same shift.

# Wrap both splits in PyTorch datasets
train_dataset = IMDBDataset(x_train, y_train, max_len)
test_dataset = IMDBDataset(x_test, y_test, max_len)

# Batched loaders; only the training split is shuffled
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# 2. 定义LSTM模型
class IMDBLSTM(nn.Module):
    """Binary sentiment classifier: embedding -> LSTM -> linear -> sigmoid.

    The input batches are right-padded with ``pad_idx``. Reading the LSTM
    output at the very last time step would therefore return a state that has
    been pushed through all the trailing padding; instead the state at the
    last non-padding token of each sequence is used.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension.
        hidden_size: LSTM hidden-state dimension.
        output_size: size of the output (1 for binary classification).
        num_layers: number of stacked LSTM layers.
        pad_idx: token id used for padding (0 in this notebook's datasets).
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=1, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_size)  # word embeddings
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, output_size)  # output layer
        self.sigmoid = nn.Sigmoid()  # squashes the output into [0, 1]

    def forward(self, x, h_0=None, c_0=None):
        """Return probabilities of shape [batch_size, output_size].

        Args:
            x: token ids, shape [batch_size, seq_len].
            h_0: optional initial hidden state; zeros when omitted.
            c_0: optional initial cell state; zeros when omitted.
        """
        embedded = self.embedding(x)  # [batch_size, seq_len, embed_size]

        # Missing initial states default to zeros on the input's device/dtype.
        state_shape = (self.lstm.num_layers, x.size(0), self.lstm.hidden_size)
        if h_0 is None:
            h_0 = embedded.new_zeros(state_shape)
        if c_0 is None:
            c_0 = embedded.new_zeros(state_shape)

        output, _ = self.lstm(embedded, (h_0, c_0))  # [batch_size, seq_len, hidden_size]

        # Index of the last non-padding token per row (0 for an all-padding row).
        # Using the largest non-pad position rather than a count keeps this
        # correct even if a pad id appears in the middle of a sequence.
        positions = torch.arange(x.size(1), device=x.device)
        last_step = ((x != self.pad_idx).long() * positions).max(dim=1).values
        batch_index = torch.arange(x.size(0), device=x.device)
        final_state = output[batch_index, last_step]  # [batch_size, hidden_size]

        return self.sigmoid(self.fc(final_state))

# 3. Model hyperparameters
embed_size = 128    # word-embedding dimension
hidden_size = 256   # hidden-state dimension
output_size = 1     # output dimension (binary classification)
num_layers = 1      # number of LSTM layers

# Build the model, loss function and optimizer
model = IMDBLSTM(vocab_size, embed_size, hidden_size, output_size, num_layers)
criterion = nn.BCELoss()  # binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. 训练模型
def train_model(model, train_loader, criterion, optimizer, epochs=5, device='cpu'):
    """Train ``model`` in place and print the mean batch loss after every epoch.

    No gradient clipping is applied in this variant.

    Args:
        model: module whose output is compared with labels of shape [batch, 1].
        train_loader: iterable of ``(data, labels)`` batches that supports ``len()``.
        criterion: loss function called as ``criterion(output, labels)``.
        optimizer: optimizer bound to the model's parameters.
        epochs: number of passes over ``train_loader``.
        device: device the model and every batch are moved to.
    """
    model = model.to(device)
    model.train()

    def _run_batch(batch_inputs, batch_targets):
        # One optimisation step; returns the batch loss as a Python float.
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device).view(-1, 1)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    for epoch in range(epochs):
        running_loss = 0
        for batch_inputs, batch_targets in train_loader:
            running_loss += _run_batch(batch_inputs, batch_targets)
        mean_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {mean_loss:.4f}")

# 5. 评估模型
def evaluate_model(model, test_loader, device='cpu'):
    """Print the accuracy (in percent) of ``model`` over ``test_loader``.

    An output of at least 0.5 counts as the positive class.

    Args:
        model: module producing one probability per sample, shape [batch, 1].
        test_loader: iterable of ``(data, labels)`` batches.
        device: device the model and every batch are moved to.
    """
    model = model.to(device)
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for batch_inputs, batch_targets in test_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device).view(-1, 1)
            predicted = (model(batch_inputs) >= 0.5).float()
            hits += (predicted == batch_targets).sum().item()
            seen += batch_targets.size(0)
    accuracy = hits / seen * 100
    print(f"Test Accuracy: {accuracy:.2f}%")

# 6. 测试功能：处理示例评论
def text_to_sequence(text, word_index, max_len=500, num_words=None,
                     index_from=3, start_index=1, oov_index=2, pad_index=0):
    """Encode raw text the same way ``imdb.load_data`` encodes its reviews.

    ``imdb.get_word_index()`` returns raw frequency ranks, whereas the
    sequences produced by ``imdb.load_data`` (with its default arguments)
    reserve 0 for padding, 1 for the start-of-sequence marker and 2 for
    out-of-vocabulary words, and shift every real word id by ``index_from``.
    Feeding raw ranks to a model trained on ``load_data`` output makes every
    word look like a different word, so the same convention is applied here.

    Args:
        text: raw review text.
        word_index: mapping word -> frequency rank (``imdb.get_word_index()``).
        max_len: length the encoded sequence is truncated or right-padded to.
        num_words: vocabulary limit used for ``load_data``; ids at or above it
            become ``oov_index``. Defaults to the module-level ``vocab_size``.
        index_from: offset added to every word rank.
        start_index: id placed at the beginning of the sequence.
        oov_index: id used for unknown or over-limit words.
        pad_index: id used for right padding.

    Returns:
        Long tensor of shape [1, max_len].
    """
    if num_words is None:
        num_words = vocab_size  # limit the training data was loaded with

    # Lower-case and strip ASCII punctuation. str.translate avoids building a
    # regex character class out of unescaped punctuation characters.
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))

    sequence = [start_index]
    for word in cleaned.split():
        rank = word_index.get(word)
        token = oov_index if rank is None else rank + index_from
        if token >= num_words:
            token = oov_index
        sequence.append(token)

    # Truncate, then right-pad to exactly max_len tokens.
    sequence = sequence[:max_len]
    sequence = sequence + [pad_index] * (max_len - len(sequence))
    return torch.tensor([sequence], dtype=torch.long)  # [1, max_len]

def test_model(model, texts, word_index, device='cpu'):
    """Print the predicted sentiment and probability for each raw text.

    Args:
        model: classifier returning one probability for a [1, max_len] batch.
        texts: iterable of raw review strings.
        word_index: word -> index mapping forwarded to ``text_to_sequence``.
        device: device the model and the encoded text are moved to.
    """
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for text in texts:
            # Encode with the module-level max_len, then predict.
            encoded = text_to_sequence(text, word_index, max_len).to(device)
            prob = model(encoded).item()  # model output has shape [1, 1]
            if prob >= 0.5:
                sentiment = "Positive"
            else:
                sentiment = "Negative"
            print(f"Text: {text}")
            print(f"Sentiment: {sentiment}, Probability: {prob:.4f}\n")

# 7. Run training (10 epochs), evaluation and the sample predictions
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, train_loader, criterion, optimizer, epochs=10, device=device)
evaluate_model(model, test_loader, device=device)

# Sample reviews
test_texts = [
    "I absolutely love this movie, it's fantastic and thrilling!",
    "This film was boring and a complete waste of time.",
    "The plot was okay, but the acting was amazing.",
    "Terrible movie, I hated every minute of it."
]
test_model(model, test_texts, word_index, device=device)

### 扩展上面的LSTM示例

提示：更新相关的包至最新版本。

```bash
pip install --upgrade gensim numpy scipy torch tensorflow
```


**改进说明**

1. 双向LSTM：
- 设置 nn.LSTM(bidirectional=True)，使模型同时从正向和反向处理序列。
- 输出维度变为 hidden_size * 2（正向和反向隐藏状态拼接）。
- 调整全连接层输入维度，适应双向输出。

2. Dropout：
- 在LSTM层设置dropout=0.5，在多层LSTM的隐藏层之间应用Dropout（不影响最后一层）。
- 在全连接层之前（即对LSTM输出的特征）应用nn.Dropout(0.5)，进一步正则化。

3. 超参数调优：
- 隐藏状态维度：hidden_size=512（增大以提高模型容量）。
- 层数：num_layers=2（堆叠两层LSTM，增强表达能力）。
- 训练轮数：epochs=10（增加以确保充分训练）。
- 学习率：测试 lr=0.0005（稍降低以稳定训练）。

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.datasets import imdb
import numpy as np
import re
import string

# 设置随机种子以确保可重复性
torch.manual_seed(42)
np.random.seed(42)

# 1. 数据集准备
class IMDBDataset(Dataset):
    """Dataset of token-id sequences truncated or zero-padded to a fixed length.

    Args:
        data: indexable collection of token-id sequences.
        labels: indexable collection of labels, aligned with ``data``.
        max_len: length every returned sequence is cut or padded to.
    """

    def __init__(self, data, labels, max_len=500):
        self.data = data
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` as a long tensor and a float tensor."""
        # Keep at most max_len leading tokens ...
        tokens = self.data[idx][:self.max_len]
        # ... and right-pad shorter sequences with zeros.
        shortfall = self.max_len - len(tokens)
        if shortfall > 0:
            tokens = np.pad(tokens, (0, shortfall), mode='constant')
        features = torch.tensor(tokens, dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        return features, target

# Load the IMDB dataset and its vocabulary
vocab_size = 20000  # vocabulary size limit passed to load_data(num_words=...)
max_len = 500       # maximum sequence length
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
word_index = imdb.get_word_index()  # word -> index mapping
# NOTE(review): per the Keras docs, load_data() shifts word ids by index_from=3
# (0=pad, 1=start, 2=OOV) while get_word_index() returns unshifted ids --
# confirm that code encoding raw text with word_index applies the same shift.

# Wrap both splits in PyTorch datasets
train_dataset = IMDBDataset(x_train, y_train, max_len)
test_dataset = IMDBDataset(x_test, y_test, max_len)

# Batched loaders; only the training split is shuffled
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# 2. Embedding matrix (randomly initialised here)
embed_size = 300  # embedding dimension
embedding_matrix = torch.randn(vocab_size, embed_size)  # one random row per vocabulary entry

# 3. 定义改进的LSTM模型
class IMDBLSTM(nn.Module):
    """Bidirectional multi-layer LSTM sentiment classifier.

    Pipeline: embedding -> bidirectional LSTM -> dropout -> linear -> sigmoid.

    The input batches are right-padded with ``pad_idx``. Taking
    ``output[:, -1, :]`` would pair a forward state that has run through all
    the padding with a backward state that has only seen the final (padding)
    token. Instead the batch is packed by true length and the classifier reads
    the last layer's final forward and backward hidden states from ``h_n``.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension.
        hidden_size: LSTM hidden-state dimension per direction.
        output_size: size of the output (1 for binary classification).
        num_layers: number of stacked LSTM layers.
        pretrained_weights: optional [vocab_size, embed_size] tensor used to
            initialise the embedding; defaults to the module-level
            ``embedding_matrix``.
        pad_idx: token id used for padding (0 in this notebook's datasets).

    Raises:
        ValueError: if the embedding weights are not [vocab_size, embed_size].
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=2,
                 pretrained_weights=None, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx

        if pretrained_weights is None:
            pretrained_weights = embedding_matrix
        if tuple(pretrained_weights.shape) != (vocab_size, embed_size):
            raise ValueError(
                f"embedding weights have shape {tuple(pretrained_weights.shape)}, "
                f"expected {(vocab_size, embed_size)}"
            )
        # Copy the weights so training does not modify the caller's tensor;
        # freeze=False keeps the embedding trainable (fine-tuning).
        self.embedding = nn.Embedding.from_pretrained(
            pretrained_weights.detach().clone(), freeze=False
        )

        # Bidirectional LSTM; dropout acts between stacked layers only.
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=0.5 if num_layers > 1 else 0
        )
        # The head sees both directions concatenated: hidden_size * 2 features.
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(0.5)  # applied to the features before the head
        self.sigmoid = nn.Sigmoid()  # squashes the output into [0, 1]

    def forward(self, x, h_0=None, c_0=None):
        """Return probabilities of shape [batch_size, output_size].

        Args:
            x: token ids, shape [batch_size, seq_len].
            h_0: optional initial hidden state; zeros when omitted.
            c_0: optional initial cell state; zeros when omitted.
        """
        embedded = self.embedding(x)  # [batch_size, seq_len, embed_size]

        # One state slice per (layer, direction) pair.
        state_shape = (self.lstm.num_layers * 2, x.size(0), self.lstm.hidden_size)
        if h_0 is None:
            h_0 = embedded.new_zeros(state_shape)
        if c_0 is None:
            c_0 = embedded.new_zeros(state_shape)

        # True length = position of the last non-padding token (at least 1, as
        # packing rejects empty sequences). Using the largest non-pad position
        # rather than a count tolerates pad ids inside a sequence.
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != self.pad_idx).long() * positions).max(dim=1).values.clamp(min=1)

        # pack_padded_sequence requires the lengths on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed, (h_0, c_0))

        # h_n[-2] / h_n[-1]: last layer's final forward / backward states.
        features = torch.cat([h_n[-2], h_n[-1]], dim=1)  # [batch_size, hidden_size * 2]
        out = self.fc(self.dropout(features))            # [batch_size, output_size]
        return self.sigmoid(out)

# 4. Model hyperparameters
hidden_size = 512   # larger hidden-state dimension
output_size = 1     # output dimension (binary classification)
num_layers = 2      # two stacked LSTM layers

# Build the model, loss function and optimizer
model = IMDBLSTM(vocab_size, embed_size, hidden_size, output_size, num_layers)
criterion = nn.BCELoss()  # binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.0005)  # lower learning rate for more stable training

# 5. 训练模型
def train_model(model, train_loader, criterion, optimizer, epochs=10, device='cpu'):
    """Train ``model`` in place and print the mean batch loss after every epoch.

    Args:
        model: module whose output is compared with labels of shape [batch, 1].
        train_loader: iterable of ``(data, labels)`` batches that supports ``len()``.
        criterion: loss function called as ``criterion(output, labels)``.
        optimizer: optimizer bound to the model's parameters.
        epochs: number of passes over ``train_loader``.
        device: device the model and every batch are moved to.
    """
    model = model.to(device)
    model.train()

    def _run_batch(batch_inputs, batch_targets):
        # One optimisation step; returns the batch loss as a Python float.
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device).view(-1, 1)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        # Clip the global gradient norm to 1 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        return batch_loss.item()

    for epoch in range(epochs):
        running_loss = 0
        for batch_inputs, batch_targets in train_loader:
            running_loss += _run_batch(batch_inputs, batch_targets)
        mean_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {mean_loss:.4f}")

# 6. 评估模型
def evaluate_model(model, test_loader, device='cpu'):
    """Print the accuracy (in percent) of ``model`` over ``test_loader``.

    An output of at least 0.5 counts as the positive class.

    Args:
        model: module producing one probability per sample, shape [batch, 1].
        test_loader: iterable of ``(data, labels)`` batches.
        device: device the model and every batch are moved to.
    """
    model = model.to(device)
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for batch_inputs, batch_targets in test_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device).view(-1, 1)
            predicted = (model(batch_inputs) >= 0.5).float()
            hits += (predicted == batch_targets).sum().item()
            seen += batch_targets.size(0)
    accuracy = hits / seen * 100
    print(f"Test Accuracy: {accuracy:.2f}%")

# 7. 测试功能：处理示例评论
def text_to_sequence(text, word_index, max_len=500, num_words=None,
                     index_from=3, start_index=1, oov_index=2, pad_index=0):
    """Encode raw text the same way ``imdb.load_data`` encodes its reviews.

    ``imdb.get_word_index()`` returns raw frequency ranks, whereas the
    sequences produced by ``imdb.load_data`` (with its default arguments)
    reserve 0 for padding, 1 for the start-of-sequence marker and 2 for
    out-of-vocabulary words, and shift every real word id by ``index_from``.
    Feeding raw ranks to a model trained on ``load_data`` output makes every
    word look like a different word, so the same convention is applied here.

    Args:
        text: raw review text.
        word_index: mapping word -> frequency rank (``imdb.get_word_index()``).
        max_len: length the encoded sequence is truncated or right-padded to.
        num_words: vocabulary limit used for ``load_data``; ids at or above it
            become ``oov_index``. Defaults to the module-level ``vocab_size``.
        index_from: offset added to every word rank.
        start_index: id placed at the beginning of the sequence.
        oov_index: id used for unknown or over-limit words.
        pad_index: id used for right padding.

    Returns:
        Long tensor of shape [1, max_len].
    """
    if num_words is None:
        num_words = vocab_size  # limit the training data was loaded with

    # Lower-case and strip ASCII punctuation. str.translate avoids building a
    # regex character class out of unescaped punctuation characters.
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))

    sequence = [start_index]
    for word in cleaned.split():
        rank = word_index.get(word)
        token = oov_index if rank is None else rank + index_from
        if token >= num_words:
            token = oov_index
        sequence.append(token)

    # Truncate, then right-pad to exactly max_len tokens.
    sequence = sequence[:max_len]
    sequence = sequence + [pad_index] * (max_len - len(sequence))
    return torch.tensor([sequence], dtype=torch.long)  # [1, max_len]

def test_model(model, texts, word_index, device='cpu'):
    """Print the predicted sentiment and probability for each raw text.

    Args:
        model: classifier returning one probability for a [1, max_len] batch.
        texts: iterable of raw review strings.
        word_index: word -> index mapping forwarded to ``text_to_sequence``.
        device: device the model and the encoded text are moved to.
    """
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for text in texts:
            # Encode with the module-level max_len, then predict.
            encoded = text_to_sequence(text, word_index, max_len).to(device)
            prob = model(encoded).item()  # model output has shape [1, 1]
            if prob >= 0.5:
                sentiment = "Positive"
            else:
                sentiment = "Negative"
            print(f"Text: {text}")
            print(f"Sentiment: {sentiment}, Probability: {prob:.4f}\n")

# 8. Run training (10 epochs), evaluation and the sample predictions
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, train_loader, criterion, optimizer, epochs=10, device=device)
evaluate_model(model, test_loader, device=device)

# Sample reviews
test_texts = [
    "I absolutely love this movie, it's fantastic and thrilling!",
    "This film was boring and a complete waste of time.",
    "The plot was okay, but the acting was amazing.",
    "Terrible movie, I hated every minute of it."
]
test_model(model, test_texts, word_index, device=device)

### RNN BPTT示例

In [None]:
import numpy as np

def tanh(x):
    """Element-wise hyperbolic tangent (thin wrapper around ``np.tanh``)."""
    activation = np.tanh(x)
    return activation

def tanh_deriv(x):
    """Return the derivative of tanh at ``x``: 1 - tanh(x)^2, element-wise."""
    activation = np.tanh(x)
    return 1 - activation ** 2

# Model dimensions and randomly initialised parameters
input_dim, hidden_dim, output_dim = 10, 20, 5
W_xh = np.random.randn(hidden_dim, input_dim)   # input -> hidden weights
W_hh = np.random.randn(hidden_dim, hidden_dim)  # hidden -> hidden weights
W_hy = np.random.randn(output_dim, hidden_dim)  # hidden -> output weights
b_h = np.random.randn(hidden_dim)               # hidden bias
b_y = np.random.randn(output_dim)               # output bias

def rnn_forward(sequence, h_0):
    """Run the RNN over ``sequence`` using the module-level parameters.

    Args:
        sequence: iterable of input vectors x_t.
        h_0: initial hidden state.

    Returns:
        ``(hidden_states, outputs, zs)`` where ``hidden_states`` starts with
        ``h_0`` followed by one state per step, ``outputs`` holds every y_t and
        ``zs`` holds every pre-activation
        z_t = W_hh h_{t-1} + W_xh x_t + b_h.
    """
    hidden_states = [h_0]
    outputs, zs = [], []
    for x_t in sequence:
        previous_state = hidden_states[-1]
        pre_activation = np.dot(W_hh, previous_state) + np.dot(W_xh, x_t) + b_h
        new_state = tanh(pre_activation)
        hidden_states.append(new_state)
        outputs.append(np.dot(W_hy, new_state) + b_y)
        zs.append(pre_activation)
    return hidden_states, outputs, zs

def rnn_backward(sequence, targets, hidden_states, outputs, zs):
    """Backpropagation through time for the RNN defined by the module-level weights.

    The output error at each step is ``outputs[t] - targets[t]``.

    Args:
        sequence: input vectors x_t.
        targets: target vectors, one per step.
        hidden_states: states from ``rnn_forward`` (index 0 is h_0, so the
            state produced at step t is ``hidden_states[t + 1]``).
        outputs: outputs y_t from ``rnn_forward``.
        zs: pre-activations z_t from ``rnn_forward``.

    Returns:
        ``(dW_xh, dW_hh, dW_hy, db_h, db_y)`` accumulated over all steps.
    """
    grad_W_xh = np.zeros_like(W_xh)
    grad_W_hh = np.zeros_like(W_hh)
    grad_W_hy = np.zeros_like(W_hy)
    grad_b_h = np.zeros_like(b_h)
    grad_b_y = np.zeros_like(b_y)
    # Gradient flowing into h_t from step t + 1 (zero after the last step).
    carried = np.zeros_like(hidden_states[0])

    for t in reversed(range(len(sequence))):
        # Error at the output of step t
        output_error = outputs[t] - targets[t]
        # Output-layer gradients
        grad_W_hy += np.outer(output_error, hidden_states[t + 1])
        grad_b_y += output_error
        # Gradient w.r.t. h_t: from the output layer plus from the next step
        grad_h = np.dot(W_hy.T, output_error) + carried
        # Gradient w.r.t. the pre-activation z_t
        grad_z = grad_h * tanh_deriv(zs[t])
        # Recurrent-layer gradients (hidden_states[t] is h_{t-1})
        grad_W_hh += np.outer(grad_z, hidden_states[t])
        grad_W_xh += np.outer(grad_z, sequence[t])
        grad_b_h += grad_z
        # Hand the gradient back to the previous time step
        carried = np.dot(W_hh.T, grad_z)

    return grad_W_xh, grad_W_hh, grad_W_hy, grad_b_h, grad_b_y

# Example: a 3-step random sequence with random targets and a zero initial state
sequence = [np.random.randn(input_dim) for _ in range(3)]
targets = [np.random.randn(output_dim) for _ in range(3)]
h_0 = np.zeros(hidden_dim)
hidden_states, outputs, zs = rnn_forward(sequence, h_0)
grads = rnn_backward(sequence, targets, hidden_states, outputs, zs)
print("Gradients:", grads)

## GRU示例

### 手动实现GRU前向传播的计算过程

In [None]:
import torch
import torch.nn.functional as F

# Input and hidden-state sizes used throughout this cell
input_size = 4
hidden_size = 3
output_size = 2  # size of the final output layer

# Simulate one input vector x_t and the previous hidden state h_{t-1}
x_t = torch.randn(input_size)       # input (4,)
h_prev = torch.randn(hidden_size)   # h_{t-1} (3,)

# Initialise parameters: weights and biases
# Reset-gate parameters
W_r = torch.randn(hidden_size, input_size)
U_r = torch.randn(hidden_size, hidden_size)
b_r = torch.randn(hidden_size)

# Update-gate parameters
W_z = torch.randn(hidden_size, input_size)
U_z = torch.randn(hidden_size, hidden_size)
b_z = torch.randn(hidden_size)

# Candidate-hidden-state parameters
W_h = torch.randn(hidden_size, input_size)
U_h = torch.randn(hidden_size, hidden_size)
b_h = torch.randn(hidden_size)

# Final output-layer parameters
W_out = torch.randn(output_size, hidden_size)
b_out = torch.randn(output_size)

# ========================
# 1. Reset gate r_t
# ========================
r_t = torch.sigmoid(W_r @ x_t + U_r @ h_prev + b_r)
print("重置门 r_t:", r_t)

# ========================
# 2. Update gate z_t
# ========================
z_t = torch.sigmoid(W_z @ x_t + U_z @ h_prev + b_z)
print("更新门 z_t:", z_t)

# ========================
# 3. Candidate hidden state h_t~
#    (the reset gate scales h_prev element-wise before it is mixed in)
# ========================
h_tilde = torch.tanh(W_h @ x_t + U_h @ (r_t * h_prev) + b_h)
print("候选隐藏状态 h~_t:", h_tilde)

# ========================
# 4. Current hidden state h_t
#    (z_t interpolates between the old state and the candidate)
# ========================
h_t = (1 - z_t) * h_prev + z_t * h_tilde
print("最终隐藏状态 h_t:", h_t)

# ========================
# 5. Output layer (e.g. for a classification task)
# ========================
y_t = W_out @ h_t + b_out
print("预测输出 y_t:", y_t)

### 多时间步，支持batch的GRU前向传播

**设置说明：**
- 输入序列长度：3（即3个时间步）
- Batch大小：3
- 每个时间步输入维度：4
- 隐藏状态维度：3
- 输出维度：2（用于预测）

In [None]:
import torch
import torch.nn.functional as F

# Hyper-parameters
seq_len = 3
batch_size = 3
input_size = 4
hidden_size = 3
output_size = 2

# Simulated input sequence: (seq_len, batch_size, input_size)
x = torch.randn(seq_len, batch_size, input_size)

# Initial hidden state h_0: (batch_size, hidden_size).
# `h` is rebound to a new tensor on every time step, so keep a separate
# reference to the initial state in order to report it correctly at the end.
h_0 = torch.randn(batch_size, hidden_size)
h = h_0

# Parameter initialisation helper
def init_param(*shape):
    """Return a tensor of the given shape drawn from a standard normal distribution."""
    return torch.randn(*shape)

# Reset-gate parameters
W_r = init_param(hidden_size, input_size)
U_r = init_param(hidden_size, hidden_size)
b_r = init_param(1, hidden_size)

# Update-gate parameters
W_z = init_param(hidden_size, input_size)
U_z = init_param(hidden_size, hidden_size)
b_z = init_param(1, hidden_size)

# Candidate-hidden-state parameters
W_h = init_param(hidden_size, input_size)
U_h = init_param(hidden_size, hidden_size)
b_h = init_param(1, hidden_size)

# Output-layer parameters
W_out = init_param(output_size, hidden_size)
b_out = init_param(1, output_size)

# Per-time-step outputs
outputs = []

for t in range(seq_len):
    x_t = x[t]  # (batch_size, input_size)

    # Reset gate r_t: (batch_size, hidden_size)
    r_t = torch.sigmoid(x_t @ W_r.T + h @ U_r.T + b_r)

    # Update gate z_t: (batch_size, hidden_size)
    z_t = torch.sigmoid(x_t @ W_z.T + h @ U_z.T + b_z)

    # Candidate hidden state h~_t: (batch_size, hidden_size)
    h_tilde = torch.tanh(x_t @ W_h.T + (r_t * h) @ U_h.T + b_h)

    # Current hidden state h_t: (batch_size, hidden_size)
    h = (1 - z_t) * h + z_t * h_tilde

    # Output layer y_t: (batch_size, output_size)
    y_t = h @ W_out.T + b_out  # Linear output layer
    outputs.append(y_t)

# Stack the per-step outputs: (seq_len, batch_size, output_size)
outputs = torch.stack(outputs, dim=0)

print("初始输入 x_0:\n", x)
# Print the saved initial state; `h` now holds the final hidden state.
print("初始隐藏状态h_0:\n", h_0)
print("最终输出 outputs.shape:", outputs.shape)  # expected (3, 3, 2)
print("输出结果:\n", outputs)

### GRU的简单使用示例

#### 单层单向GRU示例

In [None]:
import torch
import torch.nn as nn

# Hyper-parameters
input_size = 10    # number of input features
hidden_size = 20   # hidden-state size
batch_size = 32    # batch size
seq_len = 50       # sequence length

# Build the GRU layer (batch-first tensor layout)
gru = nn.GRU(input_size, hidden_size, batch_first=True)

# Random input batch and initial hidden state
input = torch.randn((batch_size, seq_len, input_size))  # (batch_size, seq_len, input_size)
h0 = torch.randn((1, batch_size, hidden_size))          # (num_layers, batch_size, hidden_size)

# Forward pass
output, h_n = gru(input, hx=h0)

# Resulting shapes
print(output.shape)  # (batch_size, seq_len, hidden_size) = (32, 50, 20)
print(h_n.shape)    # (num_layers, batch_size, hidden_size) = (1, 32, 20)

#### 多层双向GRU示例

In [None]:
# Hyper-parameters
input_size = 10
hidden_size = 20
num_layers = 2
batch_size = 32
seq_len = 50

# Build a stacked bidirectional GRU (dropout is applied between the layers)
gru = nn.GRU(
    input_size,
    hidden_size,
    num_layers=num_layers,
    batch_first=True,
    dropout=0.3,
    bidirectional=True,
)

# Random input batch; the initial state has one slice per (layer, direction) pair
input = torch.randn((batch_size, seq_len, input_size))
h0 = torch.randn((num_layers * 2, batch_size, hidden_size))  # 2 layers * 2 directions

# Forward pass
output, h_n = gru(input, hx=h0)

# Resulting shapes
print(output.shape)  # (batch_size, seq_len, num_directions * hidden_size) = (32, 50, 40)
print(h_n.shape)    # (num_layers * num_directions, batch_size, hidden_size) = (4, 32, 20)

#### 处理变长输入序列

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Simulate a padded batch of variable-length sequences
input = torch.randn(batch_size, seq_len, input_size)
lengths = torch.randint(10, seq_len + 1, (batch_size,))  # true length of each sequence

# Sort by length, longest first. This is required because enforce_sorted=True is
# used below; note that all results are then in this sorted order.
lengths, indices = lengths.sort(descending=True)
input = input[indices]

# Build the GRU
gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, batch_first=True)

# Pack the padded batch so the GRU skips the padding positions
packed_input = pack_padded_sequence(input, lengths, batch_first=True, enforce_sorted=True)

# Forward pass
output, h_n = gru(packed_input)

# Unpack. total_length pads the result back to seq_len; without it the time
# dimension would be max(lengths), which is shorter than seq_len whenever no
# sampled sequence happens to have the full length.
output, output_lengths = pad_packed_sequence(output, batch_first=True, total_length=seq_len)

print(output.shape)  # (batch_size, seq_len, hidden_size)
print(h_n.shape)    # (1, batch_size, hidden_size)

### GRU模型案例

修改前面LSTM示例使用GRU层实现：使用PyTorch实现一个简单的GRU模型，完成IMDB电影评论的情感分析任务（二分类：正面或负面）。下面的代码包括数据加载、预处理、模型定义、训练和评估等功能的定义。

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.datasets import imdb
import numpy as np
import re
import string

# 设置随机种子以确保可重复性
torch.manual_seed(42)
np.random.seed(42)

# 1. 数据集准备
class IMDBDataset(Dataset):
    """Integer-encoded reviews, each forced to exactly ``max_len`` tokens."""

    def __init__(self, data, labels, max_len=500):
        self.data, self.labels, self.max_len = data, labels, max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` as a long tensor and a float tensor."""
        # Truncate long sequences first, then right-pad short ones with zeros.
        tokens = self.data[idx][:self.max_len]
        shortfall = self.max_len - len(tokens)
        if shortfall > 0:
            tokens = np.pad(tokens, (0, shortfall), mode='constant')
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return torch.tensor(tokens, dtype=torch.long), label

# Load the IMDB dataset and its word index
vocab_size = 20000  # cap on the vocabulary size
max_len = 500       # maximum sequence length
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
word_index = imdb.get_word_index()  # word -> index mapping

# Wrap both splits as PyTorch datasets
train_dataset = IMDBDataset(x_train, y_train, max_len)
test_dataset = IMDBDataset(x_test, y_test, max_len)

# Build the data loaders (only the training split is shuffled)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# 2. 定义GRU模型
# 关键修改点：
# - 类名从IMDBLSTM变为IMDBGRU
# - 将 nn.LSTM替换为nn.GRU
# - forward方法中，GRU的输出元组只有一个隐藏状态h_n，不需要c_n
class IMDBGRU(nn.Module):
    """Embedding -> GRU -> linear -> sigmoid classifier over token-id sequences."""

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # A GRU carries a single hidden state (there is no LSTM-style cell state).
        self.gru = nn.GRU(embed_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, h_0=None):
        """Return one probability per sequence.

        Args:
            x: token ids, [batch_size, seq_len].
            h_0: optional initial hidden state,
                [num_layers, batch_size, hidden_size]; zeros when omitted.

        Returns:
            Tensor of shape [batch_size, output_size] with values in (0, 1).
        """
        embedded = self.embedding(x)  # [batch_size, seq_len, embed_size]

        if h_0 is None:
            state_shape = (self.gru.num_layers, embedded.size(0), self.gru.hidden_size)
            h_0 = torch.zeros(*state_shape).to(embedded.device)

        # The per-step outputs are not needed, only the final hidden state.
        _, h_n = self.gru(embedded, h_0)

        # h_n[-1] is the top layer's final hidden state: [batch_size, hidden_size].
        logits = self.fc(h_n[-1])
        return self.sigmoid(logits)

# 3. Model hyper-parameters
embed_size = 128    # word-embedding dimension
hidden_size = 256   # hidden-state dimension
output_size = 1     # output dimension (binary classification)
num_layers = 1      # number of GRU layers

# Create the model, loss function and optimiser
model = IMDBGRU(vocab_size, embed_size, hidden_size, output_size, num_layers) # <-- changed from the LSTM version
criterion = nn.BCELoss()  # binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. 训练模型
def train_model(model, train_loader, criterion, optimizer, epochs=10, device='cpu'):
    """Train ``model`` in place and print the mean batch loss after every epoch.

    Args:
        model: module mapping a batch of inputs to predictions of shape [batch, 1].
        train_loader: sized iterable yielding ``(inputs, labels)`` batches.
        criterion: loss called as ``criterion(predictions, labels)``.
        optimizer: optimiser already bound to ``model``'s parameters.
        epochs: number of passes over ``train_loader``.
        device: device the model and every batch are moved to.
    """
    model = model.to(device)
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            # Labels arrive as a flat vector; reshape to a column to match the output.
            batch_labels = batch_labels.to(device).view(-1, 1)

            optimizer.zero_grad()
            loss = criterion(model(batch_inputs), batch_labels)
            loss.backward()
            # Clip the gradient norm to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

# 5. 评估模型
def evaluate_model(model, test_loader, device='cpu'):
    """Print the classification accuracy (in percent) of ``model`` on ``test_loader``.

    A prediction counts as positive when the model output is >= 0.5.
    """
    model = model.to(device)
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            batch_labels = batch_labels.to(device).view(-1, 1)
            probabilities = model(batch_inputs.to(device))
            predicted = (probabilities >= 0.5).float()
            hits += (predicted == batch_labels).sum().item()
            seen += batch_labels.size(0)
    accuracy = hits / seen * 100
    print(f"Test Accuracy: {accuracy:.2f}%")

# 6. 测试功能：处理示例评论
def text_to_sequence(text, word_index, max_len=500, num_words=None, index_from=3):
    """Encode raw review text the same way the Keras IMDB training data is encoded.

    ``imdb.get_word_index()`` returns raw frequency ranks, whereas
    ``imdb.load_data()`` (with its defaults) shifts every rank by
    ``index_from=3``, starts each review with the start id 1 and replaces
    out-of-vocabulary words with id 2; id 0 is padding. Feeding unshifted
    ranks to a model trained on ``load_data()`` output maps every word to the
    wrong embedding, so the same convention is applied here.

    Args:
        text: raw review text.
        word_index: mapping word -> frequency rank (``imdb.get_word_index()``).
        max_len: length of the returned sequence (truncated / right-padded).
        num_words: vocabulary cap used for training; defaults to the
            module-level ``vocab_size``.
        index_from: offset added to every rank (Keras default: 3).

    Returns:
        Long tensor of shape [1, max_len].
    """
    limit = vocab_size if num_words is None else num_words
    pad_id, start_id, oov_id = 0, 1, 2

    # Lower-case and strip all ASCII punctuation; str.translate handles every
    # punctuation character, including the backslash.
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))

    sequence = [start_id]
    for word in cleaned.split():
        rank = word_index.get(word)
        token = oov_id if rank is None else rank + index_from
        # Words outside the training vocabulary share the OOV id.
        sequence.append(token if token < limit else oov_id)

    # Truncate, then right-pad with the padding id.
    sequence = sequence[:max_len]
    sequence += [pad_id] * (max_len - len(sequence))
    return torch.tensor([sequence], dtype=torch.long)  # [1, max_len]

def test_model(model, texts, word_index, device='cpu'):
    """Print the predicted sentiment and probability for every text in ``texts``.

    Each text is encoded with ``text_to_sequence`` using the module-level
    ``max_len``; an output >= 0.5 is reported as "Positive".
    """
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for text in texts:
            # Encode the text as a [1, max_len] batch on the target device.
            encoded = text_to_sequence(text, word_index, max_len).to(device)
            # The model returns a [1, 1] tensor; unwrap it to a Python float.
            prob = model(encoded).item()
            if prob >= 0.5:
                sentiment = "Positive"
            else:
                sentiment = "Negative"
            print(f"Text: {text}")
            print(f"Sentiment: {sentiment}, Probability: {prob:.4f}\n")

# 7. Run training, evaluation and the sample predictions
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, train_loader, criterion, optimizer, epochs=10, device=device)
evaluate_model(model, test_loader, device=device)

# Sample reviews
test_texts = [
    "I absolutely love this movie, it's fantastic and thrilling!",
    "This film was boring and a complete waste of time.",
    "The plot was okay, but the acting was amazing.",
    "Terrible movie, I hated every minute of it."
]
test_model(model, test_texts, word_index, device=device)

### 双向GRU的实际使用案例

将上面的示例修改为使用双向GRU实现。

**关键修改点解析**
1. nn.GRU 初始化：
- 在 IMDBGRU 类的 __init__ 方法中，为 nn.GRU 添加 bidirectional=True 参数。这个参数告诉 PyTorch 构建一个双向的 GRU 层。

2. 全连接层输入维度：
- 单向GRU的隐藏状态维度是hidden_size。
- 双向GRU会将前向和后向的隐藏状态拼接起来，因此其隐藏状态维度会变为 2 * hidden_size。
- 因此，需要将全连接层self.fc的输入维度从hidden_size调整为hidden_size * 2。

3. forward 方法：

- 在forward方法中，双向GRU的最后一个隐藏状态h_n 的形状是 [num_layers * num_directions, batch_size, hidden_size]。
- 对于num_layers=1的情况，h_n的形状为 [2, batch_size, hidden_size]。其中：
  - h_n[0, :, :] 是前向（从左到右）序列的最后一个隐藏状态。
  - h_n[1, :, :] 是后向（从右到左）序列的最后一个隐藏状态（即后向 GRU 处理到输入序列第一个位置时的状态）。
- 我们需要将这两个方向的隐藏状态沿着维度 1 拼接起来，形成一个形状为 [batch_size, hidden_size * 2] 的张量，再将其作为全连接层的输入。
- 代码中通过 torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1) 实现了这个拼接操作。h_n 按“层 × 方向”的顺序排列，因此 h_n[-2] 和 h_n[-1] 始终是最后一层的前向和后向隐藏状态；当 num_layers=1 时，它们就是 h_n[0] 和 h_n[1]。

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.datasets import imdb
import numpy as np
import re
import string

# 设置随机种子以确保可重复性
torch.manual_seed(42)
np.random.seed(42)

# 1. 数据集准备
class IMDBDataset(Dataset):
    """Integer-encoded reviews, each forced to exactly ``max_len`` tokens."""

    def __init__(self, data, labels, max_len=500):
        self.data, self.labels, self.max_len = data, labels, max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` as a long tensor and a float tensor."""
        # Truncate long sequences first, then right-pad short ones with zeros.
        tokens = self.data[idx][:self.max_len]
        shortfall = self.max_len - len(tokens)
        if shortfall > 0:
            tokens = np.pad(tokens, (0, shortfall), mode='constant')
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return torch.tensor(tokens, dtype=torch.long), label

# Load the IMDB dataset and its word index
vocab_size = 20000  # cap on the vocabulary size
max_len = 500       # maximum sequence length
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
word_index = imdb.get_word_index()  # word -> index mapping

# Wrap both splits as PyTorch datasets
train_dataset = IMDBDataset(x_train, y_train, max_len)
test_dataset = IMDBDataset(x_test, y_test, max_len)

# Build the data loaders (only the training split is shuffled)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# 2. 定义双向GRU模型
# 修改点：
# - `bidirectional=True` 参数
# - 调整全连接层的输入维度
# - 调整 `forward` 方法中隐藏状态的维度处理
class BidirectionalIMDBGRU(nn.Module):
    """Embedding -> bidirectional GRU -> linear -> sigmoid sequence classifier."""

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(
            embed_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,  # run the sequence in both directions
        )
        # The classifier sees both directions concatenated, hence 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map token ids [batch_size, seq_len] to probabilities [batch_size, output_size]."""
        embedded = self.embedding(x)  # [batch_size, seq_len, embed_size]

        # No initial state is passed, so the GRU starts from zeros.
        # h_n: [num_layers * num_directions, batch_size, hidden_size]
        _, h_n = self.gru(embedded)

        # The last two slices of h_n are the top layer's forward and backward
        # final states; join them along the feature axis:
        # [batch_size, hidden_size * 2].
        final_forward, final_backward = h_n[-2], h_n[-1]
        features = torch.cat([final_forward, final_backward], dim=1)

        return self.sigmoid(self.fc(features))

# 3. Model hyper-parameters
embed_size = 128    # word-embedding dimension
hidden_size = 256   # hidden-state dimension (per direction)
output_size = 1     # output dimension (binary classification)
num_layers = 1      # number of GRU layers

# Create the model, loss function and optimiser
model = BidirectionalIMDBGRU(vocab_size, embed_size, hidden_size, output_size, num_layers) # <-- changed: bidirectional model
criterion = nn.BCELoss()  # binary cross-entropy loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. 训练模型
def train_model(model, train_loader, criterion, optimizer, epochs=10, device='cpu'):
    """Train ``model`` in place and print the mean batch loss after every epoch.

    Args:
        model: module mapping a batch of inputs to predictions of shape [batch, 1].
        train_loader: sized iterable yielding ``(inputs, labels)`` batches.
        criterion: loss called as ``criterion(predictions, labels)``.
        optimizer: optimiser already bound to ``model``'s parameters.
        epochs: number of passes over ``train_loader``.
        device: device the model and every batch are moved to.
    """
    model = model.to(device)
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            # Labels arrive as a flat vector; reshape to a column to match the output.
            batch_labels = batch_labels.to(device).view(-1, 1)

            optimizer.zero_grad()
            loss = criterion(model(batch_inputs), batch_labels)
            loss.backward()
            # Clip the gradient norm to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

# 5. 评估模型
def evaluate_model(model, test_loader, device='cpu'):
    """Print the classification accuracy (in percent) of ``model`` on ``test_loader``.

    A prediction counts as positive when the model output is >= 0.5.
    """
    model = model.to(device)
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            batch_labels = batch_labels.to(device).view(-1, 1)
            probabilities = model(batch_inputs.to(device))
            predicted = (probabilities >= 0.5).float()
            hits += (predicted == batch_labels).sum().item()
            seen += batch_labels.size(0)
    accuracy = hits / seen * 100
    print(f"Test Accuracy: {accuracy:.2f}%")

# 6. 测试功能：处理示例评论
def text_to_sequence(text, word_index, max_len=500, num_words=None, index_from=3):
    """Encode raw review text the same way the Keras IMDB training data is encoded.

    ``imdb.get_word_index()`` returns raw frequency ranks, whereas
    ``imdb.load_data()`` (with its defaults) shifts every rank by
    ``index_from=3``, starts each review with the start id 1 and replaces
    out-of-vocabulary words with id 2; id 0 is padding. Feeding unshifted
    ranks to a model trained on ``load_data()`` output maps every word to the
    wrong embedding, so the same convention is applied here.

    Args:
        text: raw review text.
        word_index: mapping word -> frequency rank (``imdb.get_word_index()``).
        max_len: length of the returned sequence (truncated / right-padded).
        num_words: vocabulary cap used for training; defaults to the
            module-level ``vocab_size``.
        index_from: offset added to every rank (Keras default: 3).

    Returns:
        Long tensor of shape [1, max_len].
    """
    limit = vocab_size if num_words is None else num_words
    pad_id, start_id, oov_id = 0, 1, 2

    # Lower-case and strip all ASCII punctuation; str.translate handles every
    # punctuation character, including the backslash.
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))

    sequence = [start_id]
    for word in cleaned.split():
        rank = word_index.get(word)
        token = oov_id if rank is None else rank + index_from
        # Words outside the training vocabulary share the OOV id.
        sequence.append(token if token < limit else oov_id)

    # Truncate, then right-pad with the padding id.
    sequence = sequence[:max_len]
    sequence += [pad_id] * (max_len - len(sequence))
    return torch.tensor([sequence], dtype=torch.long)  # [1, max_len]

def test_model(model, texts, word_index, device='cpu'):
    """Print the predicted sentiment and probability for every text in ``texts``.

    Each text is encoded with ``text_to_sequence`` using the module-level
    ``max_len``; an output >= 0.5 is reported as "Positive".
    """
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for text in texts:
            # Encode the text as a [1, max_len] batch on the target device.
            encoded = text_to_sequence(text, word_index, max_len).to(device)
            # The model returns a [1, 1] tensor; unwrap it to a Python float.
            prob = model(encoded).item()
            if prob >= 0.5:
                sentiment = "Positive"
            else:
                sentiment = "Negative"
            print(f"Text: {text}")
            print(f"Sentiment: {sentiment}, Probability: {prob:.4f}\n")

# 7. Run training, evaluation and the sample predictions
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_model(model, train_loader, criterion, optimizer, epochs=10, device=device)
evaluate_model(model, test_loader, device=device)

# Sample reviews
test_texts = [
    "I absolutely love this movie, it's fantastic and thrilling!",
    "This film was boring and a complete waste of time.",
    "The plot was okay, but the acting was amazing.",
    "Terrible movie, I hated every minute of it."
]
test_model(model, test_texts, word_index, device=device)

## Seq2Seq示例

### 简单的seq2seq模型

下面是一个使用PyTorch构建Encoder-Decoder模型的简单示例。为了方便演示，我们省略了数据预处理部分，只展示模型的核心结构。

In [None]:
import torch
import torch.nn as nn

# 定义编码器
class Encoder(nn.Module):
    """Embeds a source token sequence and summarises it with a (stacked) LSTM."""

    def __init__(self, input_dim, embed_dim, hidden_dim, n_layers):
        super().__init__()
        # Kept as attributes so Seq2Seq can check them against the decoder.
        self.hidden_dim, self.n_layers = hidden_dim, n_layers
        # Token index -> dense vector.
        self.embedding = nn.Embedding(input_dim, embed_dim)
        # Sequence model over the embedded tokens (sequence-first layout).
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers)

    def forward(self, src):
        """Encode ``src`` ([sequence_length, batch_size]) into final LSTM states.

        Returns:
            ``(hidden, cell)``: the LSTM's final hidden and cell states, which
            the decoder uses as its initial state. The per-step outputs are
            discarded.
        """
        embedded = self.embedding(src)  # [sequence_length, batch_size, embed_dim]
        _, final_state = self.lstm(embedded)
        hidden, cell = final_state
        return hidden, cell

# 定义解码器
class Decoder(nn.Module):
    """Single-step LSTM decoder: consumes one token per batch element and scores the next."""

    def __init__(self, output_dim, embed_dim, hidden_dim, n_layers):
        super().__init__()
        self.output_dim, self.hidden_dim, self.n_layers = output_dim, hidden_dim, n_layers
        # Token index -> dense vector.
        self.embedding = nn.Embedding(output_dim, embed_dim)
        # Recurrent core (sequence-first layout).
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers)
        # Projects the LSTM output onto the target vocabulary.
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            input: current token ids, [batch_size].
            hidden: previous hidden state (initially the encoder's).
            cell: previous cell state (initially the encoder's).

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds the
            vocabulary scores, [batch_size, output_dim].
        """
        # Add a length-1 sequence axis: [batch_size] -> [1, batch_size].
        step_tokens = input.unsqueeze(0)
        step_embedded = self.embedding(step_tokens)

        output, state = self.lstm(step_embedded, (hidden, cell))
        hidden, cell = state

        # output is [1, batch_size, hidden_dim]; drop the sequence axis before projecting.
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

# 定义Seq2Seq模型
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

        # The encoder's final states initialise the decoder, so both must agree
        # on hidden size and number of layers.
        assert encoder.hidden_dim == decoder.hidden_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Number of layers of encoder and decoder must be equal!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` conditioned on ``src``.

        Args:
            src: source token ids, [src_len, batch_size].
            trg: target token ids, [trg_len, batch_size]; row 0 is the start token.
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            Tensor [trg_len, batch_size, output_dim] of vocabulary scores.
            Position 0 is left as zeros because decoding starts at step 1.
        """
        # Imported here because the surrounding cell only imports torch and
        # torch.nn; without it random.random() raises NameError.
        import random

        trg_len, batch_size = trg.shape[0], trg.shape[1]

        # Allocate on the same device as the targets so that writing decoder
        # outputs into it also works when the model runs on a GPU.
        outputs = torch.zeros(trg_len, batch_size, self.decoder.output_dim, device=trg.device)

        # The encoder's final states seed the decoder.
        hidden, cell = self.encoder(src)

        # The first decoder input is the start-of-sequence token <sos>.
        input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            # Teacher forcing: feed the true token; otherwise feed the most
            # likely predicted token.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1

        return outputs

# Instantiate the model
INPUT_DIM = 5000  # source-language vocabulary size
OUTPUT_DIM = 5000 # target-language vocabulary size
EMBED_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2

# Encoder and decoder share HIDDEN_DIM and N_LAYERS, as Seq2Seq asserts.
encoder = Encoder(INPUT_DIM, EMBED_DIM, HIDDEN_DIM, N_LAYERS)
decoder = Decoder(OUTPUT_DIM, EMBED_DIM, HIDDEN_DIM, N_LAYERS)
model = Seq2Seq(encoder, decoder)

### 机器翻译模型示例

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import Counter
from torch.utils.data import Dataset, DataLoader

# Dataset and vocabulary: (Chinese source, English target) sentence pairs
pairs = [
    ("你好", "Hello"),
    ("谢谢", "Thanks"),
    ("我爱你", "I love you"),
    ("早上好", "Good morning"),
    ("晚安", "Good night"),
    ('天气很好', 'The weather is good'),
    ('我喜欢学习编程', 'I like to learn programming'),
    ('他正在看电影', 'He is watching a movie'),
    ('请告诉我你的名字', 'Please tell me your name'),
    ('她是一个学生', 'She is a student'),
    ('我们是朋友', 'We are friends'),
    ('这本书很有趣', 'This book is interesting'),
]

# Special tokens
SOS_token = "<SOS>"
EOS_token = "<EOS>"
PAD_token = "<PAD>"
UNK_token = "<UNK>"  # unknown-word token, used for words not in the vocabulary

# 改进的词汇表构建函数，确保包含所有单词
def build_vocab(sentences, is_target=False):
    """Build token <-> index mappings for a list of sentences.

    Args:
        sentences: iterable of sentence strings.
        is_target: when true, sentences are split on whitespace (English
            words); otherwise they are split into single characters (Chinese).

    Returns:
        ``(vocab, itos)``: ``vocab`` maps token -> index with the four special
        tokens at indices 0-3 and the remaining tokens numbered from 4 in
        first-seen order; ``itos`` is the inverse mapping.
    """
    # Pick the tokenizer once instead of branching inside the loop.
    if is_target:
        tokenize = str.split
    else:
        tokenize = list

    counter = Counter()
    for sent in sentences:
        counter.update(tokenize(sent))

    # Special tokens take the first four indices.
    vocab = {PAD_token: 0, SOS_token: 1, EOS_token: 2, UNK_token: 3}
    for index, word in enumerate(counter, start=4):
        vocab[word] = index

    itos = {index: word for word, index in vocab.items()}
    return vocab, itos

# Split the pairs into source-language and target-language sentences
src_sentences, tgt_sentences = zip(*pairs)
src_vocab, src_itos = build_vocab(src_sentences, is_target=False)  # character-level (Chinese)
tgt_vocab, tgt_itos = build_vocab(tgt_sentences, is_target=True)   # word-level (English)

# 数据集类
class TranslationDataset(Dataset):
    """Sentence pairs encoded as index tensors wrapped in <SOS> ... <EOS>."""

    def __init__(self, pairs, src_vocab, tgt_vocab):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.pairs)

    @staticmethod
    def _encode(tokens, vocab):
        """Map tokens to indices (unknown -> <UNK>) and add <SOS>/<EOS> markers."""
        unk_id = vocab[UNK_token]
        body = [vocab.get(token, unk_id) for token in tokens]
        return [vocab[SOS_token], *body, vocab[EOS_token]]

    def __getitem__(self, idx):
        """Return ``(src_tensor, tgt_tensor)`` for the pair at ``idx``."""
        src, tgt = self.pairs[idx]
        # Chinese source: one token per character; English target: one per word.
        src_indices = self._encode(src, self.src_vocab)
        tgt_indices = self._encode(tgt.split(), self.tgt_vocab)
        return torch.tensor(src_indices), torch.tensor(tgt_indices)

# 模型定义
class Encoder(nn.Module):
    """Batch-first LSTM encoder that also returns its per-step outputs (for attention)."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)

    def forward(self, src):
        """Return ``(outputs, hidden, cell)`` for token ids ``src`` ([batch, src_len])."""
        outputs, final_state = self.lstm(self.embedding(src))
        hidden, cell = final_state
        return outputs, hidden, cell

class BahdanauAttention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, decoder_hidden, encoder_outputs):
        """Attend over ``encoder_outputs`` using ``decoder_hidden`` as the query.

        As called in this file, ``decoder_hidden`` is [batch, hidden] and
        ``encoder_outputs`` is [batch, src_len, hidden].

        Returns:
            ``(context, attn_weights)``: the weighted sum of encoder outputs
            and the weights, softmax-normalised over dimension 1 (the source
            positions).
        """
        query = self.Wa(decoder_hidden.unsqueeze(1))  # broadcast over source positions
        keys = self.Ua(encoder_outputs)
        score = self.Va(torch.tanh(query + keys))
        attn_weights = torch.softmax(score, dim=1)
        # Batched matrix product computes the weighted sum of encoder outputs.
        context = torch.bmm(attn_weights.transpose(1, 2), encoder_outputs)
        return context.squeeze(1), attn_weights.squeeze(2)

class Decoder(nn.Module):
    """One-step attention decoder: the LSTM input is [token embedding ; attention context]."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # The context vector (hidden_size) is concatenated to the embedding.
        self.lstm = nn.LSTM(embed_size + hidden_size, hidden_size, batch_first=True)
        self.attention = BahdanauAttention(hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, tgt, hidden, cell, encoder_outputs):
        """Advance the decoder by one step.

        Args:
            tgt: current token ids; callers in this file pass [batch, 1].
            hidden: previous LSTM hidden state; its last layer is the attention query.
            cell: previous LSTM cell state.
            encoder_outputs: encoder outputs to attend over.

        Returns:
            ``(output, hidden, cell, attn_weights)`` where ``output`` holds the
            vocabulary scores for this step.
        """
        token_embedding = self.embedding(tgt)
        context, attn_weights = self.attention(hidden[-1], encoder_outputs)
        lstm_input = torch.cat((token_embedding, context.unsqueeze(1)), dim=2)
        step_output, state = self.lstm(lstm_input, (hidden, cell))
        hidden, cell = state
        return self.out(step_output.squeeze(1)), hidden, cell, attn_weights

class Seq2Seq(nn.Module):
    """Batch-first encoder-decoder wrapper with per-step teacher forcing."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Return vocabulary scores of shape [batch, tgt_len, tgt_vocab_size].

        Position 0 stays all zeros: decoding starts from ``tgt[:, 0]`` (<SOS>)
        and the first prediction is written at position 1. At every step the
        ground-truth token is fed with probability ``teacher_forcing_ratio``;
        otherwise the model's arg-max prediction is fed.
        """
        batch_size, tgt_len = src.size(0), tgt.size(1)
        vocab_width = self.decoder.out.out_features

        outputs = torch.zeros(batch_size, tgt_len, vocab_width).to(src.device)
        encoder_outputs, hidden, cell = self.encoder(src)

        step_input = tgt[:, 0].unsqueeze(1)  # <SOS>
        for t in range(1, tgt_len):
            output, hidden, cell, _ = self.decoder(step_input, hidden, cell, encoder_outputs)
            outputs[:, t, :] = output
            if random.random() < teacher_forcing_ratio:
                step_input = tgt[:, t].unsqueeze(1)
            else:
                step_input = output.argmax(1).unsqueeze(1)

        return outputs

# Data preparation: the collate function pads every batch to its longest
# source / target sequence using the respective <PAD> index.
dataset = TranslationDataset(pairs, src_vocab, tgt_vocab)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=lambda x: (
    nn.utils.rnn.pad_sequence([item[0] for item in x], batch_first=True, padding_value=src_vocab[PAD_token]),
    nn.utils.rnn.pad_sequence([item[1] for item in x], batch_first=True, padding_value=tgt_vocab[PAD_token])
))

# Model hyper-parameters
embed_size = 128
hidden_size = 256
src_vocab_size = len(src_vocab)
tgt_vocab_size = len(tgt_vocab)

encoder = Encoder(src_vocab_size, embed_size, hidden_size)
decoder = Decoder(tgt_vocab_size, embed_size, hidden_size)
model = Seq2Seq(encoder, decoder)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab[PAD_token])
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练循环
def train(model, dataloader, criterion, optimizer, epochs=20):
    """Train a seq2seq ``model`` in place and print the mean batch loss per epoch.

    The function is self-contained: batches are moved to the device the model's
    parameters live on, and the vocabulary size is read from the model output,
    so it does not depend on module-level ``device`` / ``tgt_vocab_size``.

    Args:
        model: module called as ``model(src, tgt)`` returning scores of shape
            [batch, tgt_len, vocab].
        dataloader: sized iterable yielding ``(src, tgt)`` index tensors.
        criterion: loss called as ``criterion(scores_2d, targets_1d)``.
        optimizer: optimiser already bound to ``model``'s parameters.
        epochs: number of passes over ``dataloader``.
    """
    # Use the model's own device; fall back to CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for src, tgt in dataloader:
            src, tgt = src.to(run_device), tgt.to(run_device)
            optimizer.zero_grad()
            output = model(src, tgt)
            # Position 0 corresponds to <SOS> and is never predicted, so both
            # the scores and the targets are compared from position 1 onwards.
            scores = output[:, 1:, :]
            loss = criterion(scores.reshape(-1, scores.size(-1)), tgt[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}')

# 推理函数
def translate(model, sentence, src_vocab, tgt_vocab, src_itos, tgt_itos, max_len=20):
    """Greedily decode a translation of ``sentence`` and return it as a string.

    The source is tokenised per character; decoding stops at <EOS> or after
    ``max_len`` tokens. Uses the module-level ``device`` and special-token
    constants.
    """
    model.eval()
    with torch.no_grad():
        # Encode the source: <SOS> + characters (unknown -> <UNK>) + <EOS>.
        src_indices = [src_vocab[SOS_token]]
        src_indices += [src_vocab.get(char, src_vocab[UNK_token]) for char in sentence]
        src_indices += [src_vocab[EOS_token]]
        src_tensor = torch.tensor([src_indices], dtype=torch.long).to(device)

        encoder_outputs, hidden, cell = model.encoder(src_tensor)

        # Decoding starts from <SOS> and feeds back the arg-max token each step.
        step_input = torch.tensor([[tgt_vocab[SOS_token]]]).to(device)
        output_words = []
        for _ in range(max_len):
            scores, hidden, cell, _ = model.decoder(step_input, hidden, cell, encoder_outputs)
            pred_token = scores.argmax(1).item()
            if pred_token == tgt_vocab[EOS_token]:
                break
            output_words.append(tgt_itos[pred_token])
            step_input = torch.tensor([[pred_token]]).to(device)

        return ' '.join(output_words)

# Train the model
train(model, dataloader, criterion, optimizer)

# Try the translator on a few sentences (all of them appear in the training pairs)
print("\n===== 测试翻译 =====")
test_sentences = ["你好", "我爱你", "这本书很有趣", "请告诉我你的名字" ]
for sent in test_sentences:
    translation = translate(model, sent, src_vocab, tgt_vocab, src_itos, tgt_itos)
    print(f"Chinese: {sent} -> English: {translation}")

### 示例2：SRE告警处理建议

**代码说明**
- 数据集：使用SRE相关的告警-操作对，模拟真实场景。告警消息和操作建议都按单词分割。
- 模型：与翻译示例相同，使用LSTM+Bahdanau注意力，适合SRE的序列映射任务。
- 预处理：词汇表按单词构建，添加<UNK>处理未见词。
- 训练：小数据集，20个epoch，Adam优化器，Teacher Forcing比率0.5。
- 推理：handle_alert函数将告警消息翻译为操作建议。

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import Counter
from torch.utils.data import Dataset, DataLoader

# Dataset: (alert message, suggested action) pairs
pairs = [
    ("CPU usage > 90%", "Restart service"),
    ("Memory leak detected", "Increase memory allocation"),
    ("Disk space low", "Clean up disk"),
    ("Service A timeout", "Check network connection"),
    ("High latency in API", "Scale up servers"),
    ("Database connection failed", "Restart database"),
]

# Special tokens
SOS_token = "<SOS>"
EOS_token = "<EOS>"
PAD_token = "<PAD>"
UNK_token = "<UNK>"

# 构建词表
def build_vocab(sentences):
    """Build word <-> index mappings from whitespace-tokenised sentences.

    Returns:
        ``(vocab, itos)``: ``vocab`` maps word -> index with the four special
        tokens at indices 0-3 and the remaining words numbered from 4 in
        first-seen order; ``itos`` is the inverse mapping.
    """
    counter = Counter()
    for sent in sentences:
        counter.update(sent.split())  # split on whitespace into words

    # Special tokens take the first four indices.
    vocab = {PAD_token: 0, SOS_token: 1, EOS_token: 2, UNK_token: 3}
    for index, word in enumerate(counter, start=4):
        vocab[word] = index

    itos = {index: word for word, index in vocab.items()}
    return vocab, itos

# Split the pairs into alert messages (source) and actions (target),
# then build one word-level vocabulary for each side.
src_sentences, tgt_sentences = zip(*pairs)
src_vocab, src_itos = build_vocab(src_sentences)
tgt_vocab, tgt_itos = build_vocab(tgt_sentences)

# 数据集类
class AlertDataset(Dataset):
    """Alert/action pairs encoded as word-index tensors wrapped in <SOS> ... <EOS>."""

    def __init__(self, pairs, src_vocab, tgt_vocab):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.pairs)

    @staticmethod
    def _encode(sentence, vocab):
        """Map the words of ``sentence`` to indices (unknown -> <UNK>) and add <SOS>/<EOS>."""
        unk_id = vocab[UNK_token]
        body = [vocab.get(word, unk_id) for word in sentence.split()]
        return [vocab[SOS_token], *body, vocab[EOS_token]]

    def __getitem__(self, idx):
        """Return ``(src_tensor, tgt_tensor)`` for the pair at ``idx``."""
        src, tgt = self.pairs[idx]
        src_indices = self._encode(src, self.src_vocab)
        tgt_indices = self._encode(tgt, self.tgt_vocab)
        return torch.tensor(src_indices), torch.tensor(tgt_indices)

# Model definition (same architecture as the translation example)
class Encoder(nn.Module):
    """Embed source token ids and run them through a batch-first LSTM."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True)

    def forward(self, src):
        """Encode ``src`` (batch-first token ids).

        Returns:
            ``(outputs, hidden, cell)``: the per-step LSTM outputs followed by
            the final hidden and cell states.
        """
        outputs, state = self.lstm(self.embedding(src))
        hidden, cell = state
        return outputs, hidden, cell

class BahdanauAttention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)  # projects the decoder state
        self.Ua = nn.Linear(hidden_size, hidden_size)  # projects the encoder outputs
        self.Va = nn.Linear(hidden_size, 1)  # one scalar score per source position

    def forward(self, decoder_hidden, encoder_outputs):
        """Attend over ``encoder_outputs`` using ``decoder_hidden`` as the query.

        Args:
            decoder_hidden: Tensor of shape ``(batch, hidden)``.
            encoder_outputs: Tensor of shape ``(batch, src_len, hidden)``.

        Returns:
            ``(context, attn_weights)`` with shapes ``(batch, hidden)`` and
            ``(batch, src_len)``; the weights sum to 1 over ``src_len``.
        """
        query = self.Wa(decoder_hidden.unsqueeze(1))
        keys = self.Ua(encoder_outputs)
        score = self.Va(torch.tanh(query + keys))
        attn_weights = torch.softmax(score, dim=1)
        # (batch, 1, src_len) x (batch, src_len, hidden) -> (batch, 1, hidden)
        context = torch.bmm(attn_weights.transpose(1, 2), encoder_outputs)
        return context.squeeze(1), attn_weights.squeeze(2)

class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # The LSTM input is the token embedding concatenated with the context vector.
        self.lstm = nn.LSTM(embed_size + hidden_size, hidden_size, batch_first=True)
        self.attention = BahdanauAttention(hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, tgt, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Args:
            tgt: Token ids for the current step; the concatenation with the
                single-step context implies shape ``(batch, 1)``.
            hidden, cell: Previous LSTM state.
            encoder_outputs: Encoder outputs to attend over.

        Returns:
            ``(logits, hidden, cell, attn_weights)``.
        """
        token_embedding = self.embedding(tgt)
        # The last layer of the previous hidden state is the attention query.
        context, attn_weights = self.attention(hidden[-1], encoder_outputs)
        step_input = torch.cat((token_embedding, context.unsqueeze(1)), dim=2)
        lstm_out, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        logits = self.out(lstm_out.squeeze(1))
        return logits, hidden, cell, attn_weights

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes token by token with teacher forcing."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Return logits of shape ``(batch, tgt_len, vocab)`` for ``tgt``.

        At each step the next decoder input is the ground-truth token with
        probability ``teacher_forcing_ratio``; otherwise it is the decoder's
        own argmax prediction. Position 0 (``<SOS>``) is never predicted, so
        its logits stay zero.
        """
        batch_size, tgt_len = src.size(0), tgt.size(1)
        num_classes = self.decoder.out.out_features

        logits = torch.zeros(batch_size, tgt_len, num_classes).to(src.device)
        encoder_outputs, hidden, cell = self.encoder(src)

        decoder_input = tgt[:, 0].unsqueeze(1)  # <SOS>
        for step in range(1, tgt_len):
            step_logits, hidden, cell, _ = self.decoder(decoder_input, hidden, cell, encoder_outputs)
            logits[:, step, :] = step_logits
            if random.random() < teacher_forcing_ratio:
                next_tokens = tgt[:, step]
            else:
                next_tokens = step_logits.argmax(1)
            decoder_input = next_tokens.unsqueeze(1)

        return logits

# Data preparation
def _collate_pairs(batch):
    """Pad the source and target sequences of a batch to a common length."""
    sources = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    return (
        nn.utils.rnn.pad_sequence(sources, batch_first=True, padding_value=src_vocab[PAD_token]),
        nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=tgt_vocab[PAD_token]),
    )


dataset = AlertDataset(pairs, src_vocab, tgt_vocab)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=_collate_pairs)

# Model hyper-parameters
embed_size, hidden_size = 128, 256
src_vocab_size, tgt_vocab_size = len(src_vocab), len(tgt_vocab)

# Encoder is built before the decoder, as in the original cell
encoder = Encoder(src_vocab_size, embed_size, hidden_size)
decoder = Decoder(tgt_vocab_size, embed_size, hidden_size)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Seq2Seq(encoder, decoder).to(device)
# Padded target positions are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab[PAD_token])
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train(model, dataloader, criterion, optimizer, epochs=20):
    """Train ``model`` and print the mean batch loss after every epoch.

    Batches are moved to the device holding the model's parameters and the
    class dimension is read from the model output, so the function does not
    depend on module-level ``device`` / vocabulary-size globals.

    Args:
        model: Called as ``model(src, tgt)``; must return logits of shape
            ``(batch, tgt_len, num_classes)``.
        dataloader: Iterable of ``(src, tgt)`` token-id batches.
        criterion: Loss called as ``criterion(logits, targets)``.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over ``dataloader``.
    """
    model.train()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for src, tgt in dataloader:
            if model_device is not None:
                src, tgt = src.to(model_device), tgt.to(model_device)
            optimizer.zero_grad()
            output = model(src, tgt)
            # Position 0 (<SOS>) is only fed to the decoder, never predicted.
            logits = output[:, 1:, :].reshape(-1, output.size(-1))
            loss = criterion(logits, tgt[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # An empty dataloader reports NaN instead of raising ZeroDivisionError.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f'Epoch {epoch+1}, Loss: {mean_loss}')

# Inference helper
def handle_alert(model, alert, src_vocab, tgt_vocab, src_itos, tgt_itos, max_len=20):
    """Greedily decode a suggested action for an alert message.

    Input tensors are created on the device that holds the model's
    parameters rather than on a module-level ``device`` global.

    Args:
        model: Seq2Seq model exposing ``encoder`` and ``decoder``.
        alert: Alert text; split on whitespace, unknown words map to ``<UNK>``.
        src_vocab: Source word -> id mapping.
        tgt_vocab: Target word -> id mapping.
        src_itos: Unused; kept for interface compatibility.
        tgt_itos: Target id -> word mapping.
        max_len: Maximum number of decoded tokens.

    Returns:
        The decoded words joined by single spaces (``<EOS>`` excluded).
    """
    model.eval()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        src_indices = [src_vocab[SOS_token]] + [src_vocab.get(w, src_vocab[UNK_token]) for w in alert.split()] + [src_vocab[EOS_token]]
        src_tensor = torch.tensor([src_indices], dtype=torch.long, device=model_device)

        encoder_outputs, hidden, cell = model.encoder(src_tensor)
        decoder_input = torch.tensor([[tgt_vocab[SOS_token]]], dtype=torch.long, device=model_device)
        eos_id = tgt_vocab[EOS_token]
        output_words = []

        for _ in range(max_len):
            logits, hidden, cell, _ = model.decoder(decoder_input, hidden, cell, encoder_outputs)
            pred_token = logits.argmax(1).item()
            if pred_token == eos_id:
                break
            output_words.append(tgt_itos[pred_token])
            # Feed the prediction back in as the next decoder input.
            decoder_input = torch.tensor([[pred_token]], dtype=torch.long, device=model_device)

    return ' '.join(output_words)

# Train the model
train(model, dataloader, criterion, optimizer)

# Try the trained model on a few alerts. Words that do not occur in the
# training alerts (e.g. "usa", "xxx") are encoded as <UNK> by handle_alert.
print("\n===== 测试告警处理 =====")
test_alerts = ["CPU usa > 90% xxx xxx xxx", "Disk space low xxx", "High latency in API"]
for alert in test_alerts:
    suggestion = handle_alert(model, alert, src_vocab, tgt_vocab, src_itos, tgt_itos)
    print(f"Alert: {alert} -> Suggestion: {suggestion}")

### 示例3：日志分析与异常检测

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import Counter
from torch.utils.data import Dataset, DataLoader

# Special tokens reserved at the start of every vocabulary
PAD_token = "<PAD>"
SOS_token = "<SOS>"
EOS_token = "<EOS>"
UNK_token = "<UNK>"

# Simulated log dataset: (log message, label)
# - log message: a single log line, later split on whitespace
# - label: "normal" or "anomalous"
pairs = [
    ("[INFO] System started successfully", "normal"),
    ("[ERROR] Connection timeout occurred", "anomalous"),
    ("[WARN] Disk space low at 90%", "anomalous"),
    ("[INFO] User logged in", "normal"),
    ("[FATAL] Database crash detected", "anomalous"),
    ("[DEBUG] Request processed in 10ms", "normal"),
    ("[ERROR] Invalid credentials provided", "anomalous"),
    ("[INFO] Backup completed", "normal"),
    ("[WARN] High CPU usage 95%", "anomalous"),
    ("[INFO] Service restarted", "normal"),
    ("[ERROR] Network failure", "anomalous"),
    ("[INFO] All systems operational", "normal"),
]

# Build the vocabulary
def build_vocab(sentences, is_label=False):
    """Assign an integer id to every whitespace-separated word in ``sentences``.

    Ids 0-3 are reserved for the PAD/SOS/EOS/UNK tokens; other words get
    consecutive ids from 4 in order of first appearance. ``is_label`` is
    accepted but not used.

    Returns:
        ``(stoi, itos)``: the word -> id mapping and its inverse.
    """
    ordered_words = []
    seen = set()
    for sentence in sentences:
        for word in sentence.split():  # logs and labels are both split on whitespace
            if word not in seen:
                seen.add(word)
                ordered_words.append(word)

    stoi = {PAD_token: 0, SOS_token: 1, EOS_token: 2, UNK_token: 3}
    for position, word in enumerate(ordered_words):
        stoi[word] = position + 4
    itos = {index: word for word, index in stoi.items()}
    return stoi, itos

# Split the (log, label) pairs into parallel tuples and build one vocabulary per side.
src_sentences, tgt_sentences = zip(*pairs)
src_vocab, src_itos = build_vocab(src_sentences)  # log-message vocabulary
tgt_vocab, tgt_itos = build_vocab(tgt_sentences)  # label vocabulary (single-word labels such as "normal", "anomalous")

# Dataset wrapper
class LogDataset(Dataset):
    """Dataset of (log message, label) text pairs served as token-id tensors."""

    def __init__(self, pairs, src_vocab, tgt_vocab):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.pairs)

    @staticmethod
    def _encode(text, vocab):
        """Return ``<SOS>`` + word ids + ``<EOS>``; unknown words map to ``<UNK>``."""
        ids = [vocab[SOS_token]]
        ids.extend(vocab.get(word, vocab[UNK_token]) for word in text.split())
        ids.append(vocab[EOS_token])
        return ids

    def __getitem__(self, idx):
        """Return the ``(log_ids, label_ids)`` tensors for pair ``idx``.

        Labels are split on whitespace like the logs; here each label is a
        single word, but multi-word label sequences are handled the same way.
        """
        log_text, label_text = self.pairs[idx]
        log_ids = self._encode(log_text, self.src_vocab)
        label_ids = self._encode(label_text, self.tgt_vocab)
        return torch.tensor(log_ids), torch.tensor(label_ids)

# Model definition
class Encoder(nn.Module):
    """Embedding layer followed by a batch-first LSTM over the source tokens."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)

    def forward(self, src):
        """Return ``(outputs, hidden, cell)`` for the token-id batch ``src``."""
        token_vectors = self.embedding(src)
        outputs, final_state = self.lstm(token_vectors)
        return (outputs, *final_state)

class BahdanauAttention(nn.Module):
    """Additive attention over the encoder outputs."""

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.Ua = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.Va = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, decoder_hidden, encoder_outputs):
        """Compute the context vector and attention weights.

        Args:
            decoder_hidden: Query tensor of shape ``(batch, hidden)``.
            encoder_outputs: Tensor of shape ``(batch, src_len, hidden)``.

        Returns:
            ``(context, attn_weights)`` with shapes ``(batch, hidden)`` and
            ``(batch, src_len)``.
        """
        # Broadcast the query over every source position before scoring.
        projected_query = self.Wa(decoder_hidden.unsqueeze(1))
        projected_keys = self.Ua(encoder_outputs)
        energy = torch.tanh(projected_query + projected_keys)
        attn_weights = torch.softmax(self.Va(energy), dim=1)
        context = torch.bmm(attn_weights.transpose(1, 2), encoder_outputs)
        return context.squeeze(1), attn_weights.squeeze(2)

class Decoder(nn.Module):
    """One-step attention decoder: embedding + context -> LSTM -> vocabulary logits."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Input width = embedding size + context-vector size.
        self.lstm = nn.LSTM(embed_size + hidden_size, hidden_size, batch_first=True)
        self.attention = BahdanauAttention(hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, tgt, hidden, cell, encoder_outputs):
        """Decode a single step and return ``(logits, hidden, cell, attn_weights)``.

        ``tgt`` holds the current input token ids; the concatenation with the
        single-step context implies shape ``(batch, 1)``.
        """
        context, attn_weights = self.attention(hidden[-1], encoder_outputs)
        lstm_input = torch.cat((self.embedding(tgt), context.unsqueeze(1)), dim=2)
        step_output, new_state = self.lstm(lstm_input, (hidden, cell))
        hidden, cell = new_state
        logits = self.out(step_output.squeeze(1))
        return logits, hidden, cell, attn_weights

class Seq2Seq(nn.Module):
    """Couple an encoder and a step-wise decoder into one trainable module."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Decode ``tgt`` step by step and return the per-position logits.

        The result has shape ``(batch, tgt_len, vocab)``; position 0 stays
        zero because ``<SOS>`` is only used as the first decoder input. One
        random draw per step decides between feeding the ground-truth token
        (teacher forcing) and the decoder's own argmax.
        """
        n_batch = src.size(0)
        n_steps = tgt.size(1)
        n_classes = self.decoder.out.out_features

        all_logits = torch.zeros(n_batch, n_steps, n_classes).to(src.device)
        encoder_outputs, hidden, cell = self.encoder(src)

        current = tgt[:, 0].unsqueeze(1)  # <SOS>
        for t in range(1, n_steps):
            step_logits, hidden, cell, _ = self.decoder(current, hidden, cell, encoder_outputs)
            all_logits[:, t, :] = step_logits
            use_truth = random.random() < teacher_forcing_ratio
            chosen = tgt[:, t] if use_truth else step_logits.argmax(1)
            current = chosen.unsqueeze(1)

        return all_logits

# Data preparation
def _collate_logs(batch):
    """Pad the log and label sequences of a batch to a common length."""
    log_ids = [item[0] for item in batch]
    label_ids = [item[1] for item in batch]
    return (
        nn.utils.rnn.pad_sequence(log_ids, batch_first=True, padding_value=src_vocab[PAD_token]),
        nn.utils.rnn.pad_sequence(label_ids, batch_first=True, padding_value=tgt_vocab[PAD_token]),
    )


dataset = LogDataset(pairs, src_vocab, tgt_vocab)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=_collate_logs)

# Model hyper-parameters
embed_size, hidden_size = 128, 256
src_vocab_size, tgt_vocab_size = len(src_vocab), len(tgt_vocab)

# Encoder is built before the decoder, as in the original cell
encoder = Encoder(src_vocab_size, embed_size, hidden_size)
decoder = Decoder(tgt_vocab_size, embed_size, hidden_size)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Seq2Seq(encoder, decoder).to(device)
# Padded target positions are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab[PAD_token])
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train(model, dataloader, criterion, optimizer, epochs=20):
    """Run ``epochs`` training passes and print the mean batch loss of each.

    The target device is taken from the model's own parameters and the number
    of classes from the model output, so no module-level ``device`` or
    ``tgt_vocab_size`` global is required.

    Args:
        model: Called as ``model(src, tgt)``; must return logits of shape
            ``(batch, tgt_len, num_classes)``.
        dataloader: Iterable of ``(src, tgt)`` token-id batches.
        criterion: Loss called as ``criterion(logits, targets)``.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over ``dataloader``.
    """
    model.train()
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for src, tgt in dataloader:
            if model_device is not None:
                src, tgt = src.to(model_device), tgt.to(model_device)
            optimizer.zero_grad()
            output = model(src, tgt)
            # Skip position 0: <SOS> is an input, not a prediction target.
            flat_logits = output[:, 1:, :].reshape(-1, output.size(-1))
            flat_targets = tgt[:, 1:].reshape(-1)
            loss = criterion(flat_logits, flat_targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Report NaN for an empty dataloader rather than dividing by zero.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f'Epoch {epoch+1}, Loss: {mean_loss}')

# Anomaly detection (inference)
def detect_anomaly(model, log_message, src_vocab, tgt_vocab, src_itos, tgt_itos, max_len=5):
    """Greedily decode a label for a log message and flag anomalies.

    Input tensors are created on the device that holds the model's
    parameters rather than on a module-level ``device`` global.

    Args:
        model: Seq2Seq model exposing ``encoder`` and ``decoder``.
        log_message: Log text; split on whitespace, unknown words map to ``<UNK>``.
        src_vocab: Log word -> id mapping.
        tgt_vocab: Label word -> id mapping.
        src_itos: Unused; kept for interface compatibility.
        tgt_itos: Label id -> word mapping.
        max_len: Maximum number of decoded label tokens.

    Returns:
        ``(label, is_anomalous)``: the decoded label string, and whether it
        contains "anomalous" (case-insensitive).
    """
    model.eval()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        src_indices = [src_vocab[SOS_token]] + [src_vocab.get(w, src_vocab[UNK_token]) for w in log_message.split()] + [src_vocab[EOS_token]]
        src_tensor = torch.tensor([src_indices], dtype=torch.long, device=model_device)

        encoder_outputs, hidden, cell = model.encoder(src_tensor)
        decoder_input = torch.tensor([[tgt_vocab[SOS_token]]], dtype=torch.long, device=model_device)
        eos_id = tgt_vocab[EOS_token]
        output_labels = []

        for _ in range(max_len):
            logits, hidden, cell, _ = model.decoder(decoder_input, hidden, cell, encoder_outputs)
            pred_token = logits.argmax(1).item()
            if pred_token == eos_id:
                break
            output_labels.append(tgt_itos[pred_token])
            # Feed the prediction back in as the next decoder input.
            decoder_input = torch.tensor([[pred_token]], dtype=torch.long, device=model_device)

    label = ' '.join(output_labels)
    is_anomalous = "anomalous" in label.lower()
    return label, is_anomalous

# Train the model
train(model, dataloader, criterion, optimizer)

# Try the trained model on a few logs; the last one is not in the training
# pairs and contains words that are encoded as <UNK>.
print("\n===== 测试异常检测 =====")
test_logs = [
    "[ERROR] Connection timeout occurred",
    "[INFO] System started successfully",
    "[WARN] High CPU usage 95%",
    "[FATAL] Unknown error in module X",
]
for log in test_logs:
    label, is_anomalous = detect_anomaly(model, log, src_vocab, tgt_vocab, src_itos, tgt_itos)
    print(f"Log: {log} -> Predicted Label: {label} (Anomalous: {is_anomalous})")

### 示例4：日志分析与异常检测改进版

- 扩展数据集为多条日志的序列
- 使用自监督学习机制：模型学习重建输入序列本身，并以重建误差作为异常分数

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Special tokens
PAD_token = "<PAD>"
SOS_token = "<SOS>"
EOS_token = "<EOS>"
UNK_token = "<UNK>"
SEQ_SEP = "<SEP>"  # separates consecutive log lines inside one flattened sample

# Simulated log dataset: every sample is a sequence of several log lines
log_sequences = [
    ["[INFO] System started successfully", "[WARN] Disk space low at 90%", "[ERROR] Connection timeout occurred"],
    ["[INFO] User logged in", "[DEBUG] Request processed in 10ms", "[INFO] Backup completed"],
    ["[FATAL] Database crash detected", "[ERROR] Invalid credentials provided"],
    ["[INFO] All systems operational", "[INFO] Service restarted"],
    ["[WARN] High CPU usage 95%", "[ERROR] Network failure"],
    ["[INFO] System started successfully", "[INFO] User logged in"],
    ["[ERROR] Connection timeout occurred", "[WARN] Disk space low at 90%"],
    ["[DEBUG] Request processed in 10ms", "[INFO] Backup completed"],
    ["[FATAL] Database crash detected", "[WARN] High CPU usage 95%"],
    ["[INFO] All systems operational", "[ERROR] Invalid credentials provided"],
    ["[INFO] Service restarted", "[INFO] User logged in"],
    ["[ERROR] Network failure", "[WARN] Disk space low at 90%"],
]

# Build the vocabulary
def build_vocab(sequences):
    """Build a shared vocabulary from sequences of log lines.

    Every whitespace-separated word of every log line is collected, followed
    by the ``<SEP>`` separator token. Ids 0-3 are reserved for the
    PAD/SOS/EOS/UNK tokens; other words get consecutive ids from 4 in order
    of first appearance.

    Returns:
        ``(stoi, itos)``: the word -> id mapping and its inverse.
    """
    words = [word for seq in sequences for log in seq for word in log.split()]
    words.append(SEQ_SEP)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_words = dict.fromkeys(words)

    stoi = {PAD_token: 0, SOS_token: 1, EOS_token: 2, UNK_token: 3}
    stoi.update({word: index for index, word in enumerate(unique_words, start=4)})
    itos = {index: word for word, index in stoi.items()}
    return stoi, itos

# One vocabulary (and its inverse) is shared by encoder and decoder, because the
# reconstruction target is the input sequence itself.
vocab, itos = build_vocab(log_sequences)

# Dataset wrapper: self-supervised, the target sequence equals the input sequence
class LogDataset(Dataset):
    """Serve each log sequence as an identical (input, target) pair of id tensors."""

    def __init__(self, sequences, vocab):
        self.sequences = sequences
        self.vocab = vocab

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Flatten sample ``idx`` into ``<SOS> words <SEP> words ... <EOS>`` ids.

        Returns:
            Two separate tensors holding the same ids (input and target).
        """
        tokens = []
        for position, log in enumerate(self.sequences[idx]):
            if position:  # separator goes between log lines, not after the last
                tokens.append(SEQ_SEP)
            tokens.extend(log.split())

        indices = [self.vocab[SOS_token]]
        indices.extend(self.vocab.get(word, self.vocab[UNK_token]) for word in tokens)
        indices.append(self.vocab[EOS_token])
        return torch.tensor(indices), torch.tensor(indices)

# Model definition
class Encoder(nn.Module):
    """Token embedding followed by a batch-first LSTM."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size=hidden_size, batch_first=True)

    def forward(self, src):
        """Encode the token-id batch ``src``.

        Returns:
            ``(outputs, hidden, cell)``: the per-step outputs followed by the
            final LSTM hidden and cell states.
        """
        embedded = self.embedding(src)
        outputs, (final_hidden, final_cell) = self.lstm(embedded)
        return outputs, final_hidden, final_cell

class BahdanauAttention(nn.Module):
    """Additive attention that weights encoder outputs by a learned score."""

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)  # query projection
        self.Ua = nn.Linear(hidden_size, hidden_size)  # key projection
        self.Va = nn.Linear(hidden_size, 1)  # scalar score per source position

    def forward(self, decoder_hidden, encoder_outputs):
        """Return ``(context, attn_weights)`` for one decoder state.

        ``decoder_hidden`` is ``(batch, hidden)`` and ``encoder_outputs`` is
        ``(batch, src_len, hidden)``; the context is ``(batch, hidden)`` and
        the weights are ``(batch, src_len)``, normalized over ``src_len``.
        """
        combined = self.Wa(decoder_hidden.unsqueeze(1)) + self.Ua(encoder_outputs)
        score = self.Va(torch.tanh(combined))
        attn_weights = torch.softmax(score, dim=1)
        weighted_sum = torch.bmm(attn_weights.transpose(1, 2), encoder_outputs)
        return weighted_sum.squeeze(1), attn_weights.squeeze(2)

class Decoder(nn.Module):
    """Attention decoder that produces vocabulary logits one step at a time."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # The LSTM sees the token embedding and the attention context side by side.
        self.lstm = nn.LSTM(embed_size + hidden_size, hidden_size, batch_first=True)
        self.attention = BahdanauAttention(hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, tgt, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Args:
            tgt: Current input token ids; the concatenation with the
                single-step context implies shape ``(batch, 1)``.
            hidden, cell: Previous LSTM state.
            encoder_outputs: Encoder outputs to attend over.

        Returns:
            ``(logits, hidden, cell, attn_weights)``.
        """
        embedded = self.embedding(tgt)
        query = hidden[-1]  # last layer of the previous hidden state
        context, attn_weights = self.attention(query, encoder_outputs)
        lstm_input = torch.cat((embedded, context.unsqueeze(1)), dim=2)
        lstm_output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        return self.out(lstm_output.squeeze(1)), hidden, cell, attn_weights

class Seq2Seq(nn.Module):
    """Sequence-to-sequence model: encode once, then decode one token per step."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Return logits of shape ``(batch, tgt_len, vocab)``.

        Position 0 is left as zeros (``<SOS>`` is never predicted). After each
        step a random draw below ``teacher_forcing_ratio`` feeds the
        ground-truth token next; otherwise the decoder's argmax is fed.
        """
        batch_size, tgt_len = src.size(0), tgt.size(1)
        vocab_dim = self.decoder.out.out_features

        outputs = torch.zeros(batch_size, tgt_len, vocab_dim).to(src.device)
        encoder_outputs, hidden, cell = self.encoder(src)

        prev_tokens = tgt[:, 0].unsqueeze(1)  # <SOS>
        for t in range(1, tgt_len):
            prediction, hidden, cell, _ = self.decoder(prev_tokens, hidden, cell, encoder_outputs)
            outputs[:, t, :] = prediction
            if random.random() < teacher_forcing_ratio:
                prev_tokens = tgt[:, t].unsqueeze(1)
            else:
                prev_tokens = prediction.argmax(1).unsqueeze(1)

        return outputs

# Data preparation
def _collate_sequences(batch):
    """Pad the input and target sequences of a batch to a common length."""
    inputs = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    return (
        nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=vocab[PAD_token]),
        nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=vocab[PAD_token]),
    )


dataset = LogDataset(log_sequences, vocab)
dataloader = DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    drop_last=True,
    collate_fn=_collate_sequences,
)

# Model hyper-parameters
embed_size, hidden_size = 128, 256
vocab_size = len(vocab)

# Encoder and decoder share the vocabulary size; encoder is built first
encoder = Encoder(vocab_size, embed_size, hidden_size)
decoder = Decoder(vocab_size, embed_size, hidden_size)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Seq2Seq(encoder, decoder).to(device)
# Padded target positions are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=vocab[PAD_token])
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train(model, dataloader, criterion, optimizer, epochs=20):
    """Train the autoencoder and print the mean batch loss after every epoch.

    Batches are moved to the device of the model's parameters and the class
    dimension is taken from the model output, so the function needs no
    module-level ``device`` or ``vocab_size`` global.

    Args:
        model: Called as ``model(src, tgt)``; must return logits of shape
            ``(batch, tgt_len, num_classes)``.
        dataloader: Iterable of ``(src, tgt)`` token-id batches.
        criterion: Loss called as ``criterion(logits, targets)``.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over ``dataloader``.
    """
    model.train()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for src, tgt in dataloader:
            if model_device is not None:
                src, tgt = src.to(model_device), tgt.to(model_device)
            optimizer.zero_grad()
            output = model(src, tgt)
            # The <SOS> position (index 0) is never predicted, so it is excluded.
            num_classes = output.size(-1)
            loss = criterion(output[:, 1:, :].reshape(-1, num_classes), tgt[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # drop_last can leave zero batches; report NaN instead of dividing by zero.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f'Epoch {epoch+1}, Loss: {mean_loss}')

# Anomaly detection based on reconstruction error
def detect_anomaly(model, log_seq, vocab, itos, device, max_len=50, threshold=0.5):
    """Score a sequence of log lines by how poorly the model reconstructs it.

    The log lines are flattened into one token sequence (lines separated by
    ``<SEP>``) and decoded without teacher forcing. The mean token-level
    cross-entropy against the input is the anomaly score. The loss is
    computed locally from the arguments, so the function does not depend on
    module-level ``criterion`` / ``vocab_size`` globals.

    Args:
        model: Seq2Seq model called as ``model(src, tgt, teacher_forcing_ratio=...)``.
        log_seq: Iterable of log-line strings.
        vocab: Word -> id mapping that includes the special tokens.
        itos: Id -> word mapping used to render the reconstruction.
        device: Device on which the input tensors are created.
        max_len: Unused; kept for interface compatibility.
        threshold: Scores strictly above this value are flagged as anomalous.

    Returns:
        ``(reconstructed_seq, error, is_anomalous)``: the reconstruction with
        log lines separated by ``" | "``, the float loss, and the flag.
    """
    model.eval()
    with torch.no_grad():
        # Flatten the log lines, placing <SEP> between (not after) them.
        flattened_seq = []
        for position, log in enumerate(log_seq):
            if position:
                flattened_seq.append(SEQ_SEP)
            flattened_seq.extend(log.split())

        src_indices = [vocab[SOS_token]] + [vocab.get(w, vocab[UNK_token]) for w in flattened_seq] + [vocab[EOS_token]]
        src_tensor = torch.tensor([src_indices], dtype=torch.long, device=device)
        tgt_tensor = src_tensor.clone()  # self-supervised: target == input

        # No teacher forcing, to mimic free-running inference.
        recon_outputs = model(src_tensor, tgt_tensor, teacher_forcing_ratio=0.0)

        # Reconstruction error: mean cross-entropy over the non-<SOS> positions.
        logits = recon_outputs[:, 1:, :].reshape(-1, recon_outputs.size(-1))
        targets = tgt_tensor[:, 1:].reshape(-1)
        error = nn.functional.cross_entropy(logits, targets, ignore_index=vocab[PAD_token]).item()

        # Render the predicted tokens up to the first <EOS> for inspection.
        reconstructed = []
        for pred_token in recon_outputs[0, 1:].argmax(dim=-1).tolist():
            if pred_token == vocab[EOS_token]:
                break
            reconstructed.append(itos[pred_token])

    is_anomalous = error > threshold
    # Tokens are already space-joined, so a bare '|' renders as "a | b"
    # rather than the double-spaced "a  |  b".
    reconstructed_seq = ' '.join(reconstructed).replace(SEQ_SEP, '|')

    return reconstructed_seq, error, is_anomalous

# Train the model
train(model, dataloader, criterion, optimizer)

# Try anomaly detection on three sequences: the first is a training sample, the
# second is a prefix of one, and the third contains words absent from the
# training data (encoded as <UNK>).
print("\n===== 测试异常检测 =====")
test_sequences = [
    ["[INFO] System started successfully", "[WARN] Disk space low at 90%", "[ERROR] Connection timeout occurred"],
    ["[INFO] User logged in", "[DEBUG] Request processed in 10ms"],
    ["[FATAL] Unknown error detected", "[ERROR] System overload"],
]
for seq in test_sequences:
    reconstructed, error, is_anomalous = detect_anomaly(model, seq, vocab, itos, device)
    print(f"Input Sequence: {' | '.join(seq)}\nReconstructed: {reconstructed}\nError: {error:.4f} (Anomalous: {is_anomalous})\n")