In [16]:
%load_ext autoreload
%autoreload 2

import os
import zipfile

if not os.path.exists("datasets/"):
    with zipfile.ZipFile("Multi30K.zip", "r") as zip_ref:
        zip_ref.extractall()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# 1. 初始化GPT2 Tokenizer
en_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
de_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# 添加特殊标记
special_tokens = {"bos_token": "<sos>", "eos_token": "<eos>", "pad_token": "<pad>"}
en_tokenizer.add_special_tokens(special_tokens)
de_tokenizer.add_special_tokens(special_tokens)

# 2. 自定义数据集
class Multi30KDataset(Dataset):
    def __init__(self, en_path, de_path, en_tokenizer, de_tokenizer):
        self.en_sentences = self._read_file(en_path)
        self.de_sentences = self._read_file(de_path)
        self.en_tokenizer = en_tokenizer
        self.de_tokenizer = de_tokenizer
        assert len(self.en_sentences) == len(self.de_sentences), "数据不匹配！"

    def _read_file(self, path):
        with open(path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]

    def __len__(self):
        return len(self.en_sentences)

    def __getitem__(self, idx):
        en_encoded = self.en_tokenizer(
            self.en_sentences[idx],
            return_tensors="pt",
            padding=False,
            truncation=True,
            add_special_tokens=True,
        )["input_ids"].squeeze(0)

        de_encoded = self.de_tokenizer(
            self.de_sentences[idx],
            return_tensors="pt",
            padding=False,
            truncation=True,
            add_special_tokens=True,
        )["input_ids"].squeeze(0)

        return en_encoded, de_encoded

# 3. 定义collate_fn
def collate_fn(batch):
    en_batch, de_batch = zip(*batch)
    en_batch = pad_sequence(en_batch, batch_first=True, padding_value=en_tokenizer.pad_token_id)
    de_batch = pad_sequence(de_batch, batch_first=True, padding_value=de_tokenizer.pad_token_id)
    return en_batch, de_batch

# 4. 初始化数据集和数据加载器
en_file_path = 'datasets/train/train.en'
de_file_path = 'datasets/train/train.de'

dataset = Multi30KDataset(en_file_path, de_file_path, en_tokenizer, de_tokenizer)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

# 5. 测试数据加载器
for en_batch, de_batch in dataloader:
    print("English batch shape:", en_batch.shape)
    print("German batch shape:", de_batch.shape)
    print("English batch example (tokens):", en_batch[0])
    print("German batch example (tokens):", de_batch[0])
    print("Decoded English:", en_tokenizer.decode(en_batch[0]))
    print("Decoded German:", de_tokenizer.decode(de_batch[0]))
    break


English batch shape: torch.Size([64, 30])
German batch shape: torch.Size([64, 32])
English batch example (tokens): tensor([   101,    138,  10817,  56629,    169,  99345,  10135,  10105,  19642,
         10129,    119,    102, 119549, 119549, 119549, 119549, 119549, 119549,
        119549, 119549, 119549, 119549, 119549, 119549, 119549, 119549, 119549,
        119549, 119549, 119549])
German batch example (tokens): tensor([   101,  12210,  15898,    174,  65623,  10329,  10268,  41941,  44271,
         16757,  18599,    119,    102, 119549, 119549, 119549, 119549, 119549,
        119549, 119549, 119549, 119549, 119549, 119549, 119549, 119549, 119549,
        119549, 119549, 119549, 119549, 119549])
Decoded English: [CLS] A man riding a bike on the pier. [SEP] <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Decoded German: [CLS] Ein Mann fährt auf dem Pier Fahrrad. [SEP] <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pa

In [18]:
from transformer import Transformer

# 1. Transformer 模型参数
vocab_size = len(en_tokenizer)
d_model = 512
num_heads = 8
num_layers = 2
d_ff = 2048
max_seq_len = 100
dropout = 0.1

# 2. 填充值索引
src_pad_idx = en_tokenizer.pad_token_id
tgt_pad_idx = de_tokenizer.pad_token_id

# 3. 初始化 Transformer
transformer = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len, dropout)

# 打印验证
print(f"Transformer initialized.")
print(f"Source padding index: {src_pad_idx}, Target padding index: {tgt_pad_idx}")
print(f"Vocabulary size: {vocab_size}")



Transformer initialized.
Source padding index: 119549, Target padding index: 119549
Vocabulary size: 119550


In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# 1. 定义损失函数和优化器
criterion = nn.CrossEntropyLoss(ignore_index=tgt_pad_idx)  # 忽略填充标记的损失
optimizer = optim.AdamW(transformer.parameters(), lr=0.0001)
num_epochs = 30

# 2. 定义训练函数
def train_epoch(transformer, dataloader, criterion, optimizer, device):
    transformer.train()  # 切换到训练模式
    total_loss = 0

    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        src, tgt = batch
        src, tgt = src.to(device), tgt.to(device)
        
        # 修正后的生成掩码
        tgt_input = tgt[:, :-1]
        tgt_target = tgt[:, 1:]

        # 构造掩码
        src_mask = transformer.make_src_mask(src, src_pad_idx)
        tgt_mask = transformer.make_trg_mask(tgt_input, tgt_pad_idx)  # 修正为 tgt_input

        # 前向传播
        output = transformer(src, tgt_input, src_mask, tgt_mask)

        # 调整输出形状以计算损失
        output = output.reshape(-1, vocab_size)
        tgt_target = tgt_target.reshape(-1)

        # 计算损失
        loss = criterion(output, tgt_target)

        # 反向传播与优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        
        # 在tqdm进度条中显示当前batch的loss
        progress_bar.set_postfix(batch_loss=loss.item())

    return total_loss / len(dataloader)

# 3. 定义训练主循环
def train_model(transformer, dataloader, num_epochs, device, pretrain=None):
    if pretrain:
        transformer.load_state_dict(torch.load(pretrain))
        print(f"Loaded pre-trained model from {pretrain}")
    
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = train_epoch(transformer, dataloader, criterion, optimizer, device)
        print(f"Epoch Loss: {epoch_loss:.4f}")

# 4. 开始训练
transformer = transformer.to(device)
pretrain_path = '/path/to/pretrained/model.pth'
train_model(transformer, dataloader, num_epochs, device, pretrain=None)

In [20]:
train_model(transformer, dataloader, num_epochs, device)


Epoch 1/10


Training:   0%|          | 0/454 [00:00<?, ?it/s]

Training: 100%|██████████| 454/454 [02:57<00:00,  2.56it/s, batch_loss=3.77]


Epoch Loss: 5.1243
Epoch 2/10


Training: 100%|██████████| 454/454 [02:57<00:00,  2.56it/s, batch_loss=2.35]


Epoch Loss: 3.1264
Epoch 3/10


Training: 100%|██████████| 454/454 [02:56<00:00,  2.57it/s, batch_loss=2.78]


Epoch Loss: 2.4791
Epoch 4/10


Training: 100%|██████████| 454/454 [02:56<00:00,  2.57it/s, batch_loss=1.49]


Epoch Loss: 2.0872
Epoch 5/10


Training: 100%|██████████| 454/454 [02:56<00:00,  2.57it/s, batch_loss=1.65]


Epoch Loss: 1.8046
Epoch 6/10


Training: 100%|██████████| 454/454 [02:56<00:00,  2.57it/s, batch_loss=1.71]


Epoch Loss: 1.5799
Epoch 7/10


Training: 100%|██████████| 454/454 [02:57<00:00,  2.56it/s, batch_loss=1.67]


Epoch Loss: 1.3928
Epoch 8/10


Training: 100%|██████████| 454/454 [02:57<00:00,  2.56it/s, batch_loss=0.968]


Epoch Loss: 1.2247
Epoch 9/10


Training: 100%|██████████| 454/454 [02:57<00:00,  2.56it/s, batch_loss=0.683]


Epoch Loss: 1.0776
Epoch 10/10


Training:  38%|███▊      | 171/454 [01:06<01:56,  2.42it/s, batch_loss=0.96] 

In [None]:
torch.save(transformer.state_dict(), "transformer.pth")

In [None]:
import torch

# 修正拼写错误
transformer.load_state_dict(torch.load("transformer.pth", map_location=torch.device('cpu')))
transformer.eval()  # 切换到评估模式

with torch.no_grad():
    for batch in dataloader:
        src, tgt = batch
        src, tgt = src.to(device), tgt.to(device)
        
        # 修正后的生成掩码
        tgt_input = tgt[:, :-1]
        tgt_target = tgt[:, 1:]

        # 构造掩码
        src_mask = transformer.make_src_mask(src, src_pad_idx)
        tgt_mask = transformer.make_trg_mask(tgt_input, tgt_pad_idx)  # 修正为 tgt_input

        # 前向传播
        output = transformer(src, tgt_input, src_mask, tgt_mask)

        # 获取预测结果
        output = output.argmax(dim=-1)

        # 显示原始句子和预测句子
        print("Source Sentence:", en_tokenizer.decode(src[0], skip_special_tokens=True))
        print("Target Sentence:", de_tokenizer.decode(tgt[0], skip_special_tokens=True))
        print("Predicted Sentence:", de_tokenizer.decode(output[0], skip_special_tokens=True))
        break

Source Sentence: An arts and crafts class paints a project while the teacher oversees and smiles at them.
Target Sentence: Eine Kunstklasse malt an einem Projekt und wird von einem lächelnden Lehrer betreut.
Predicted Sentence: Ein Familie und undt und einem Klassen und mit von ihnen Klassenächelt Fenster undracht und,eeeeeeeeeeeeeeeeeee


In [None]:
sentence = "fuck you."
transformer.eval()  # 切换到评估模式
with torch.no_grad():
    # 将自定义句子进行编码
    src = en_tokenizer.encode(sentence, return_tensors='pt').to(device)
    
    # 构造目标输入和目标掩码
    tgt_input = torch.tensor([[de_tokenizer.bos_token_id]], device=device)  # 目标输入以BOS开始
    src_mask = transformer.make_src_mask(src, src_pad_idx)
    
    # 逐步生成目标序列
    for _ in range(100):  # 假设最大生成长度为100
        tgt_mask = transformer.make_trg_mask(tgt_input, tgt_pad_idx)
        output = transformer(src, tgt_input, src_mask, tgt_mask)
        next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)
        tgt_input = torch.cat([tgt_input, next_token], dim=-1)
        if next_token.item() == de_tokenizer.eos_token_id:
            break
    
    # 获取预测结果
    output_sentence = de_tokenizer.decode(tgt_input[0], skip_special_tokens=True)
    
    # 显示原始句子和预测句子
    print("Source Sentence:", sentence)
    print("Predicted Sentence:", output_sentence)


Source Sentence: fuck you.
Predicted Sentence: ##ränköpfen auf.. aus...........................


---

观察👀：

1. 预测的句子的首字母总是缺少，我猜测是GPT2分词器的缘故
2. 翻译的句子末尾总会有乱码一样的幻觉，可能也是分词器？

我将使用BERT分词器尝试

使用