In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Transformer-encoder classifier
class SimpleTransformer(nn.Module):
    """Classify a sequence with a stack of Transformer encoder layers.

    The input is projected from ``input_dim`` to ``model_dim``, passed through
    ``num_layers`` encoder layers, and the encoder output at the last sequence
    position is mapped to ``output_dim`` logits.

    NOTE: no positional encoding is added anywhere in this module, so the
    encoder receives no explicit information about element order.

    Args:
        input_dim: Size of the feature vector at each sequence position.
        model_dim: Width used inside the encoder (``d_model``).
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        output_dim: Number of logits produced per sequence.
    """

    def __init__(self, input_dim, model_dim, num_heads, num_layers, output_dim):
        super().__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.model_dim = model_dim

        # Project the raw features from input_dim up to model_dim.
        self.input_fc = nn.Linear(input_dim, model_dim)

        # The template layer is deliberately a local variable rather than an
        # attribute: nn.TransformerEncoder works on its own copies of it, so
        # registering the template as a submodule would only add parameters
        # that never receive gradients (and that show up in parameters() and
        # state_dict()).
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim,
            nhead=num_heads,
            batch_first=True,  # consume (batch, seq, feature) directly
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )

        # Classification head applied to the last position's representation.
        self.fc_out = nn.Linear(model_dim, output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch_size, output_dim)``.

        Args:
            x: Tensor of shape ``(batch_size, seq_length, input_dim)``.
        """
        # (batch_size, seq_length, input_dim) -> (batch_size, seq_length, model_dim)
        x = self.input_fc(x)

        # batch_first=True means no permute to (seq, batch, feature) is needed.
        encoded = self.transformer_encoder(x)

        # Keep only the last sequence position: (batch_size, model_dim).
        last_step = encoded[:, -1, :]

        # Map to logits: (batch_size, output_dim).
        return self.fc_out(last_step)


# Model hyper-parameters
input_dim = 10  # feature width per position (described as a 10-class one-hot size; the synthetic data below is Gaussian noise)
model_dim = 64  # width of every Transformer encoder layer
num_heads = 8   # number of self-attention heads
num_layers = 4  # number of stacked encoder layers
output_dim = 2  # two output logits (binary classification)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Build the model and place it on the selected device
model = SimpleTransformer(input_dim, model_dim, num_heads, num_layers, output_dim).to(device)

# Show the module structure
print(model)

# Sizes of the synthetic data set
batch_size = 32
seq_length = 50  # every input sequence has 50 positions
num_samples = 1000  # number of samples per split

# Random training data: inputs are standard-normal noise, labels are drawn from {0, 1}
x_train = torch.randn(num_samples, seq_length, input_dim).to(device)  # inputs moved to the selected device
y_train = torch.randint(0, 2, (num_samples,)).to(device)  # labels moved to the selected device

# Training data set and loader (reshuffled every epoch)
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Random validation data, generated the same way as the training data
x_val = torch.randn(num_samples, seq_length, input_dim).to(device)  # validation inputs
y_val = torch.randint(0, 2, (num_samples,)).to(device)  # validation labels

# Validation data set and loader (fixed order)
val_dataset = TensorDataset(x_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()  # cross-entropy over the two class logits
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training pass
def train_one_epoch(model, train_loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Module called as ``model(inputs)``; expected to return one row
            of class scores per sample.
        train_loader: Iterable yielding ``(inputs, labels)`` batches. It does
            not need to support ``len()``; batches are counted while iterating.
        optimizer: Optimizer updating the model's parameters.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean of the
        per-batch loss values and ``accuracy`` is the percentage of correctly
        predicted samples. Both are ``0.0`` when the loader yields nothing.
    """
    model.train()  # enable training-mode behaviour (e.g. dropout)
    running_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass and loss.
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Bookkeeping for the epoch statistics.
        running_loss += loss.item()
        num_batches += 1
        predicted = outputs.argmax(dim=1)  # index of the highest score
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Guard the divisions so an empty loader reports zeros instead of raising
    # ZeroDivisionError.
    avg_loss = running_loss / num_batches if num_batches else 0.0
    accuracy = correct / total * 100 if total else 0.0
    return avg_loss, accuracy

# Validation pass
def validate(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without updating parameters.

    Args:
        model: Module called as ``model(inputs)``; expected to return one row
            of class scores per sample.
        val_loader: Iterable yielding ``(inputs, labels)`` batches. It does
            not need to support ``len()``; batches are counted while iterating.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean of the
        per-batch loss values and ``accuracy`` is the percentage of correctly
        predicted samples. Both are ``0.0`` when the loader yields nothing.
    """
    model.eval()  # switch to evaluation-mode behaviour (e.g. no dropout)
    running_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    with torch.no_grad():  # gradients are not needed for evaluation
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            running_loss += criterion(outputs, labels).item()
            num_batches += 1

            predicted = outputs.argmax(dim=1)  # index of the highest score
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard the divisions so an empty loader reports zeros instead of raising
    # ZeroDivisionError.
    avg_loss = running_loss / num_batches if num_batches else 0.0
    accuracy = correct / total * 100 if total else 0.0
    return avg_loss, accuracy

# Alternate one training pass and one validation pass per epoch, then report both.
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_accuracy = train_one_epoch(
        model,
        train_loader,
        optimizer,
        criterion,
        device,
    )
    val_loss, val_accuracy = validate(
        model,
        val_loader,
        criterion,
        device,
    )

    # One print call; the newline separator yields the same three output lines.
    print(
        f"Epoch [{epoch+1}/{num_epochs}]",
        f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%",
        f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%",
        sep="\n",
    )


Using device: cpu
SimpleTransformer(
  (input_fc): Linear(in_features=10, out_features=64, bias=True)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
    )
    (linear1): Linear(in_features=64, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=64, bias=True)
    (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_



Epoch [1/10]
Train Loss: 0.7312, Train Accuracy: 49.60%
Validation Loss: 0.7037, Validation Accuracy: 49.80%
Epoch [2/10]
Train Loss: 0.6977, Train Accuracy: 50.80%
Validation Loss: 0.7031, Validation Accuracy: 50.20%
Epoch [3/10]
Train Loss: 0.6993, Train Accuracy: 48.30%
Validation Loss: 0.6939, Validation Accuracy: 50.20%
Epoch [4/10]
Train Loss: 0.7016, Train Accuracy: 48.40%
Validation Loss: 0.7001, Validation Accuracy: 50.20%
Epoch [5/10]
Train Loss: 0.7064, Train Accuracy: 47.50%
Validation Loss: 0.6975, Validation Accuracy: 49.80%
Epoch [6/10]
Train Loss: 0.6957, Train Accuracy: 49.00%
Validation Loss: 0.6943, Validation Accuracy: 49.80%
Epoch [7/10]
Train Loss: 0.6990, Train Accuracy: 49.60%
Validation Loss: 0.6945, Validation Accuracy: 49.80%
Epoch [8/10]
Train Loss: 0.7011, Train Accuracy: 51.60%
Validation Loss: 0.6932, Validation Accuracy: 50.20%
Epoch [9/10]
Train Loss: 0.7009, Train Accuracy: 49.40%
Validation Loss: 0.6932, Validation Accuracy: 50.20%
Epoch [10/10]
Train