In [1]:
%load_ext autoreload
%autoreload 2

import os
import zipfile

if not os.path.exists("datasets/"):
    with zipfile.ZipFile("Multi30K.zip", "r") as zip_ref:
        zip_ref.extractall()

In [2]:
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# 1. 初始化GPT2 Tokenizer
en_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
de_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# 添加特殊标记
# special_tokens = {"bos_token": "<sos>", "eos_token": "<eos>", "pad_token": "<pad>"}
# en_tokenizer.add_special_tokens(special_tokens)
# de_tokenizer.add_special_tokens(special_tokens)

# 2. 自定义数据集
class Multi30KDataset(Dataset):
    def __init__(self, en_path, de_path, en_tokenizer, de_tokenizer):
        self.en_sentences = self._read_file(en_path)
        self.de_sentences = self._read_file(de_path)
        self.en_tokenizer = en_tokenizer
        self.de_tokenizer = de_tokenizer
        assert len(self.en_sentences) == len(self.de_sentences), "数据不匹配！"

    def _read_file(self, path):
        with open(path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]

    def __len__(self):
        return len(self.en_sentences)

    def __getitem__(self, idx):
        en_encoded = self.en_tokenizer(
            self.en_sentences[idx],
            return_tensors="pt",
            padding=False,
            truncation=True,
            add_special_tokens=True,
        )["input_ids"].squeeze(0)

        de_encoded = self.de_tokenizer(
            self.de_sentences[idx],
            return_tensors="pt",
            padding=False,
            truncation=True,
            add_special_tokens=True,
        )["input_ids"].squeeze(0)

        return en_encoded, de_encoded

# 3. 定义collate_fn
def collate_fn(batch):
    en_batch, de_batch = zip(*batch)
    en_batch = pad_sequence(en_batch, batch_first=True, padding_value=en_tokenizer.pad_token_id)
    de_batch = pad_sequence(de_batch, batch_first=True, padding_value=de_tokenizer.pad_token_id)
    return en_batch, de_batch

# 4. 初始化数据集和数据加载器
en_file_path = 'datasets/train/train.en'
de_file_path = 'datasets/train/train.de'

dataset = Multi30KDataset(en_file_path, de_file_path, en_tokenizer, de_tokenizer)
dataloader = DataLoader(dataset, batch_size=64, num_workers=12, pin_memory=True, shuffle=True, collate_fn=collate_fn)

# 5. 测试数据加载器
for en_batch, de_batch in dataloader:
    print("English batch shape:", en_batch.shape)
    print("German batch shape:", de_batch.shape)
    print("English batch example (tokens):", en_batch[0])
    print("German batch example (tokens):", de_batch[0])
    print("Decoded English:", en_tokenizer.decode(en_batch[0], skip_special_tokens=False))
    print("Decoded German:", de_tokenizer.decode(de_batch[0], skip_special_tokens=False))
    break


English batch shape: torch.Size([64, 31])
German batch shape: torch.Size([64, 44])
English batch example (tokens): tensor([  101, 12136, 17416, 14879, 15228, 10135, 12962, 10135, 15365, 10106,
          169, 12249, 10111, 25520, 11570,   119,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0])
German batch example (tokens): tensor([  101, 12210, 61694, 10237, 91982, 32380, 10129, 15898, 28508, 10106,
        10599, 12158,   118,   112,   182,   112,   118, 22304,   118, 11928,
        12962, 73053, 10329, 10599, 67643,   119,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
Decoded English: [CLS] White male playing guitar on live on stage in a rock and roll band. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Decoded German: [CLS] Ein hellhäutiger Mann spielt in einer Rock -'n'- Rol

In [3]:
from transformer import Transformer

# 1. Transformer 模型参数
vocab_size = len(en_tokenizer)
d_model = 512
num_heads = 8
num_layers = 2
d_ff = 2048
max_seq_len = 100
dropout = 0.1

# 2. 填充值索引
src_pad_idx = en_tokenizer.pad_token_id
tgt_pad_idx = de_tokenizer.pad_token_id

# 3. 初始化 Transformer
transformer = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len, dropout)

# 打印验证
print(f"Transformer initialized.")
print(f"Source padding index: {src_pad_idx}, Target padding index: {tgt_pad_idx}")
print(f"Vocabulary size: {vocab_size}")



Transformer initialized.
Source padding index: 0, Target padding index: 0
Vocabulary size: 119547


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# 1. 定义损失函数和优化器
criterion = nn.CrossEntropyLoss(ignore_index=tgt_pad_idx)  # 忽略填充标记的损失
optimizer = optim.AdamW(transformer.parameters(), lr=0.0001)
num_epochs = 5

# 2. 定义训练函数
def train_epoch(transformer, dataloader, criterion, optimizer, device):
    transformer.train()  # 切换到训练模式
    total_loss = 0

    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        src, tgt = batch
        src, tgt = src.to(device), tgt.to(device)
        
        # 修正后的生成掩码
        tgt_input = tgt[:, :-1]
        tgt_target = tgt[:, 1:]

        # 构造掩码
        src_mask = transformer.make_src_mask(src, src_pad_idx)
        tgt_mask = transformer.make_trg_mask(tgt_input, tgt_pad_idx)  # 修正为 tgt_input

        # 前向传播
        output = transformer(src, tgt_input, src_mask, tgt_mask)

        # 调整输出形状以计算损失
        output = output.reshape(-1, vocab_size)
        tgt_target = tgt_target.reshape(-1)

        # 计算损失
        loss = criterion(output, tgt_target)

        # 反向传播与优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        
        # 在tqdm进度条中显示当前batch的loss
        progress_bar.set_postfix(batch_loss=loss.item())

    return total_loss / len(dataloader)

# 3. 定义训练主循环
def train_model(transformer, dataloader, num_epochs, device, pretrain=None):
    if pretrain:
        transformer.load_state_dict(torch.load(pretrain))
        print(f"Loaded pre-trained model from {pretrain}")
    
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = train_epoch(transformer, dataloader, criterion, optimizer, device)
        print(f"Epoch Loss: {epoch_loss:.4f}")

# 4. 开始训练
transformer = transformer.to(device)
pretrain_path = 'transformer.pth'

In [5]:
train_model(transformer, dataloader, num_epochs, device, pretrain=None)


Epoch 1/5


Training:   0%|          | 0/454 [00:00<?, ?it/s]

Training: 100%|██████████| 454/454 [01:29<00:00,  5.05it/s, batch_loss=3.83]


Epoch Loss: 5.1123
Epoch 2/5


Training: 100%|██████████| 454/454 [01:31<00:00,  4.97it/s, batch_loss=2.68]


Epoch Loss: 3.1154
Epoch 3/5


Training: 100%|██████████| 454/454 [01:31<00:00,  4.94it/s, batch_loss=2.4] 


Epoch Loss: 2.4791
Epoch 4/5


Training: 100%|██████████| 454/454 [01:31<00:00,  4.94it/s, batch_loss=1.78]


Epoch Loss: 2.0926
Epoch 5/5


Training: 100%|██████████| 454/454 [01:31<00:00,  4.96it/s, batch_loss=1.35]

Epoch Loss: 1.8131





In [6]:
torch.save(transformer.state_dict(), "transformer.pth")

In [14]:
import torch

# 修正拼写错误
transformer.load_state_dict(torch.load("transformer.pth", map_location=torch.device('cpu')))
transformer.eval()  # 切换到评估模式

with torch.no_grad():
    for batch in dataloader:
        src, tgt = batch
        src, tgt = src.to(device), tgt.to(device)
        
        # 修正后的生成掩码
        tgt_input = tgt[:, :-1]
        tgt_target = tgt[:, 1:]

        # 构造掩码
        src_mask = transformer.make_src_mask(src, src_pad_idx)
        tgt_mask = transformer.make_trg_mask(tgt_input, tgt_pad_idx)  # 修正为 tgt_input

        # 前向传播
        output = transformer(src, tgt_input, src_mask, tgt_mask)

        # 获取预测结果
        output = output.argmax(dim=-1)

        # 显示原始句子和预测句子
        print("Source Sentence:", en_tokenizer.decode(src[0], skip_special_tokens=True, clear_special_tokens=True))
        print("Target Sentence:", de_tokenizer.decode(tgt[0], skip_special_tokens=False, clear_special_tokens=True))
        print("Predicted Sentence:", de_tokenizer.decode(output[0], skip_special_tokens=False, clear_special_tokens=True))
        break

Source Sentence: A group of children outside a building includes a boy jumping in the grass.
Target Sentence: [CLS] Eine Gruppe Kinder vor einem Gebäude, darunter ein Junge, der im Gras springt. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Predicted Sentence: Eine Gruppe Kinder vor einem Gebäude, der ein Junge spring der im Gras springt. [SEP].eee [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP]


In [8]:
sentence = "A woman is walking on the street."
transformer.eval()  # 切换到评估模式
with torch.no_grad():
    # 将自定义句子进行编码
    src = en_tokenizer.encode(sentence, return_tensors='pt').to(device)
    
    # 构造目标输入和目标掩码
    tgt_input = torch.tensor([[de_tokenizer.bos_token_id]], device=device)  # 目标输入以BOS开始
    src_mask = transformer.make_src_mask(src, src_pad_idx)
    
    # 逐步生成目标序列
    for _ in range(100):  # 假设最大生成长度为100
        tgt_mask = transformer.make_trg_mask(tgt_input, tgt_pad_idx)
        output = transformer(src, tgt_input, src_mask, tgt_mask)
        next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)
        tgt_input = torch.cat([tgt_input, next_token], dim=-1)
        if next_token.item() == de_tokenizer.eos_token_id:
            break
    
    # 获取预测结果
    output_sentence = de_tokenizer.decode(tgt_input[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    
    # 显示原始句子和预测句子
    print("Source Sentence:", sentence)
    print("Predicted Sentence:", output_sentence)


RuntimeError: Could not infer dtype of NoneType

---

观察👀：

1. 预测的句子的首字母总是缺少，我猜测是GPT2分词器的缘故
2. 翻译的句子末尾总会有乱码一样的幻觉，可能也是分词器？

我将使用BERT分词器尝试

使用BERT之后，句子开头的首字母的缺少的情况解决了

但是末尾总会出现乱码，并且使用自定义source sentence的时候总会出现开头的 `##er` 这是GPT2分词器的特征，很奇怪

