In [7]:
%load_ext autoreload
%autoreload 2

import os
import zipfile

# Unpack Multi30K.zip into the current working directory, but only when
# nothing exists at datasets/ yet (so re-running the cell is a no-op).
if not os.path.exists("datasets/"):
    with zipfile.ZipFile("Multi30K.zip", mode="r") as archive:
        archive.extractall()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


---

## Tokenizer and data loading

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformer_tokenizer import Tokenizer

class Multi30KDataset(Dataset):
    """Line-aligned English/German corpus served as pairs of token-id tensors.

    Both text files are read completely at construction time. Item ``i`` is
    built from line ``i`` of each file.
    """

    def __init__(self, en_file, de_file, en_tokenizer, de_tokenizer):
        """Read both corpora and remember the tokenizers.

        Args:
            en_file: Path of the UTF-8 English file.
            de_file: Path of the UTF-8 German file.
            en_tokenizer: Object with a ``tokenize(str)`` method whose result
                can be passed to ``torch.tensor`` (used for English lines).
            de_tokenizer: Same contract, used for German lines.

        Raises:
            AssertionError: If the two files do not have the same line count.
        """
        self.en_tokenizer = en_tokenizer
        self.de_tokenizer = de_tokenizer
        self.en_lines = self._read_lines(en_file)
        self.de_lines = self._read_lines(de_file)
        assert len(self.en_lines) == len(self.de_lines), "English and German files must have the same number of lines."

    @staticmethod
    def _read_lines(path):
        """Return every line of the UTF-8 text file at ``path`` (newlines kept)."""
        with open(path, mode='r', encoding='utf-8') as handle:
            return handle.readlines()

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.en_lines)

    def __getitem__(self, idx):
        """Return ``(en_tensor, de_tensor)`` for the pair at ``idx``.

        Surrounding whitespace (including the trailing newline) is stripped
        from each line before it is handed to the matching tokenizer.
        """
        en_text = self.en_lines[idx].strip()
        de_text = self.de_lines[idx].strip()
        token_ids = (
            self.en_tokenizer.tokenize(en_text),
            self.de_tokenizer.tokenize(de_text),
        )
        return tuple(torch.tensor(ids) for ids in token_ids)
    
def collate_fn(batch):
    """Pad a list of ``(en_tensor, de_tensor)`` pairs into two batch tensors.

    Each language is padded independently to the longest sequence of that
    language in the batch, using 0 as the fill value and a batch-first layout.

    Args:
        batch: Sequence of ``(en_tensor, de_tensor)`` pairs of 1-D tensors.

    Returns:
        Tuple ``(en_batch, de_batch)`` of padded tensors.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    english, german = zip(*batch)
    return tuple(
        pad(sequences, batch_first=True, padding_value=0)
        for sequences in (english, german)
    )

# One tokenizer per language, each created with vocab_size=10000 and fitted
# on the raw lines of the corresponding training file.
en_tokenizer, de_tokenizer = Tokenizer(vocab_size=10000), Tokenizer(vocab_size=10000)

with open("datasets/train/train.en", mode="r", encoding="utf-8") as f:
    en_lines = f.readlines()
en_tokenizer.fit(en_lines)

with open("datasets/train/train.de", mode="r", encoding="utf-8") as f:
    de_lines = f.readlines()
de_tokenizer.fit(de_lines)

# Build the paired dataset from the training files using the fitted tokenizers.
dataset = Multi30KDataset(
    en_file="datasets/train/train.en",
    de_file="datasets/train/train.de",
    en_tokenizer=en_tokenizer,
    de_tokenizer=de_tokenizer
)

# 80% / 10% split for train / validation; the test split takes whatever is
# left so the three sizes always sum to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Seed the split explicitly. Without a fixed generator the membership of the
# three subsets changes on every kernel restart, so a checkpoint reloaded in a
# later session would be evaluated on samples it may have been trained on.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

batch_size = 64
num_workers = 4

# Settings shared by all three loaders; only the training loader shuffles.
_loader_kwargs = dict(batch_size=batch_size, collate_fn=collate_fn, num_workers=num_workers)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)



In [9]:
# Pull a single batch from the training loader
batch = next(iter(train_dataloader))

# Split it into the English (source) and German (target) id tensors
src_batch, tgt_batch = batch

# Take the first sentence pair of the batch (padded token-id sequences)
first_src_sample, first_tgt_sample = src_batch[0], tgt_batch[0]

# Decode both id sequences back into text
first_src_sentence = en_tokenizer.detokenize(first_src_sample.tolist())
first_tgt_sentence = de_tokenizer.detokenize(first_tgt_sample.tolist())

# Show the ids next to the decoded text for each language
print("First English Sentence (Token IDs):", first_src_sample.tolist())
print("First English Sentence (Text):", first_src_sentence)
print("First German Sentence (Token IDs):", first_tgt_sample.tolist())
print("First German Sentence (Text):", first_tgt_sentence)

First English Sentence (Token IDs): [2, 23, 32, 6, 90, 66, 162, 423, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
First English Sentence (Text): <SOS> young girl in pink hat taking pictures. <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
First German Sentence (Token IDs): [2, 5, 177, 25, 9, 7, 6, 328, 106, 9, 68, 447, 4, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
First German Sentence (Text): <SOS> ein junges mädchen, in einem rosafarbenen hut, macht fotos. <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


---

## Model setup

In [10]:
from transformer import Transformer

# 1. Transformer hyperparameters
#    The vocabulary size is taken from the English tokenizer only.
vocab_size = en_tokenizer.get_vocab_size()
d_model, num_heads, num_layers = 512, 8, 2
d_ff, max_seq_len, dropout = 2048, 100, 0.1

# 2. Padding token ids for the source and target languages
src_pad_idx, tgt_pad_idx = en_tokenizer.pad_token_id, de_tokenizer.pad_token_id

# 3. Build the Transformer
transformer = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len, dropout)

# Sanity-check printout
print("Transformer initialized.")
print(f"Source padding index: {src_pad_idx}, Target padding index: {tgt_pad_idx}")
print(f"Vocabulary size: {vocab_size}")



Transformer initialized.
Source padding index: 0, Target padding index: 0
Vocabulary size: 10000


---

## Train

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.auto import tqdm

# Hyperparameters of the model that is trained below (6 layers, dropout 0.3).
vocab_size = en_tokenizer.get_vocab_size()
d_model, num_heads, num_layers = 512, 8, 6
d_ff, max_seq_len, dropout = 2048, 100, 0.3

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

transformer = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len, dropout)
transformer = transformer.to(device)

# 1. Loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tgt_pad_idx)  # target padding positions do not contribute to the loss
optimizer = optim.AdamW(transformer.parameters(), lr=1e-5)

# 2. Training function (plus the loss helper it shares with validation)
def _batch_loss(transformer, src, tgt, criterion, eos_token_id=3):
    """Run one teacher-forced forward pass and return the batch loss.

    The decoder input is ``tgt[:, :-1]`` and the prediction target is
    ``tgt[:, 1:]``. Each sequence is scored only up to and including its first
    EOS token (the whole row when it has none); the per-sequence losses are
    then averaged over the batch.

    Mask construction reads the notebook-level ``src_pad_idx`` and
    ``tgt_pad_idx``.

    Args:
        transformer: Model exposing ``make_src_mask``, ``make_trg_mask`` and a
            forward call ``(src, tgt_input, src_mask, tgt_mask)``.
        src: Source id tensor, already on the model's device.
        tgt: Target id tensor, already on the model's device.
        criterion: Loss callable applied as ``criterion(logits, targets)``
            to one sequence at a time.
        eos_token_id: Id that marks end-of-sequence in ``tgt``.

    Returns:
        Scalar loss tensor (mean of the per-sequence losses).
    """
    tgt_input = tgt[:, :-1]
    tgt_target = tgt[:, 1:]

    src_mask = transformer.make_src_mask(src, src_pad_idx)
    tgt_mask = transformer.make_trg_mask(tgt_input, tgt_pad_idx)
    output = transformer(src, tgt_input, src_mask, tgt_mask)

    # Row/column coordinates of every EOS token in the target batch.
    eos_rows, eos_cols = (tgt_target == eos_token_id).nonzero(as_tuple=True)

    batch_loss = 0
    for i in range(tgt_target.size(0)):
        eos_idx = eos_cols[eos_rows == i]
        # Cut at the first EOS (inclusive); fall back to the full row.
        seq_len = eos_idx[0] + 1 if eos_idx.numel() > 0 else tgt_target.size(1)
        batch_loss += criterion(output[i, :seq_len], tgt_target[i, :seq_len])

    return batch_loss / tgt_target.size(0)


def train_epoch(transformer, dataloader, criterion, optimizer, device):
    """Train the model for one pass over ``dataloader``.

    Args:
        transformer: Model to optimise; switched to training mode.
        dataloader: Iterable of ``(src, tgt)`` id-tensor batches.
        criterion: Loss callable, see ``_batch_loss``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.

    Returns:
        Mean batch loss as a float, or ``nan`` when the loader has no batches
        (previously this raised ``ZeroDivisionError``).
    """
    transformer.train()
    total_loss = 0.0
    progress_bar = tqdm(dataloader, desc="Training", leave=True)
    for src, tgt in progress_bar:
        src, tgt = src.to(device), tgt.to(device)
        loss = _batch_loss(transformer, src, tgt, criterion)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        total_loss += loss_value
        progress_bar.set_postfix(batch_loss=loss_value)

    num_batches = len(dataloader)
    return total_loss / num_batches if num_batches else float("nan")

# 3. Validation function
def validate_epoch(transformer, dataloader, criterion, device):
    """Compute the mean loss over ``dataloader`` without updating the model.

    Args:
        transformer: Model to evaluate; switched to evaluation mode.
        dataloader: Iterable of ``(src, tgt)`` id-tensor batches.
        criterion: Loss callable, see ``_batch_loss``.
        device: Device the batches are moved to.

    Returns:
        Mean batch loss as a float, or ``nan`` when the loader has no batches
        (previously this raised ``ZeroDivisionError``).
    """
    transformer.eval()
    total_loss = 0.0
    progress_bar = tqdm(dataloader, desc="Validation", leave=True)
    with torch.no_grad():
        for src, tgt in progress_bar:
            src, tgt = src.to(device), tgt.to(device)
            loss_value = _batch_loss(transformer, src, tgt, criterion).item()
            total_loss += loss_value
            progress_bar.set_postfix(batch_loss=loss_value)

    num_batches = len(dataloader)
    return total_loss / num_batches if num_batches else float("nan")

# 4. Main training loop
def train_model(transformer, train_dataloader, val_dataloader, num_epochs, device, pretrain=None):
    """Train for ``num_epochs`` epochs and save the best-validation weights.

    Uses the notebook-level ``criterion`` and ``optimizer``; the optimizer
    must have been built over the parameters of ``transformer``.

    Args:
        transformer: Model to train.
        train_dataloader: Loader passed to ``train_epoch``.
        val_dataloader: Loader passed to ``validate_epoch``.
        num_epochs: Number of epochs to run.
        device: Device used for batches and for mapping a loaded checkpoint.
        pretrain: Optional path of a state-dict file to load before training.

    Side effects:
        When at least one epoch improved the validation loss, writes
        ``best_loss_<loss>_in_<epoch>.pth`` in the working directory, named
        after the best loss and the epoch that produced it.
    """
    import copy  # function-scope import keeps this cell self-contained

    if pretrain:
        # map_location lets a checkpoint saved on another device load here.
        transformer.load_state_dict(torch.load(pretrain, map_location=device))
        print(f"Loaded pre-trained model from {pretrain}")

    best_loss = float('inf')
    best_epoch = None
    best_state_dict = None  # stays None if no epoch runs or the loss never improves
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = train_epoch(transformer, train_dataloader, criterion, optimizer, device)
        print(f"Training Loss: {epoch_loss:.4f}")
        val_loss = validate_epoch(transformer, val_dataloader, criterion, device)
        if val_loss < best_loss:
            best_loss = val_loss
            best_epoch = epoch + 1
            # state_dict() returns references to the live parameter tensors,
            # which later optimizer steps overwrite; snapshot them instead.
            best_state_dict = copy.deepcopy(transformer.state_dict())
            print(f"New best validation loss: {best_loss:.4f} in epoch {best_epoch}")
        print(f"Validation Loss: {val_loss:.4f}")

    if best_state_dict is not None:
        model_name = f"best_loss_{best_loss:.4f}_in_{best_epoch}.pth"
        torch.save(best_state_dict, model_name)
        print(f"Saved model to {model_name}")

# 5. Prepare for training: checkpoint path and model placement
pretrain_path = 'transformer.pth'
transformer = transformer.to(device)

In [12]:
# Train for 50 epochs; pretrain=None means no checkpoint is loaded first.
train_model(transformer, train_dataloader, val_dataloader, num_epochs=50, device=device, pretrain=None)

Epoch 1/50


Training:   0%|          | 0/363 [00:00<?, ?it/s]

Training Loss: 6.1780


Validation:   0%|          | 0/46 [00:00<?, ?it/s]

New best validation loss: 5.0948 in epoch 1
Validation Loss: 5.0948
Epoch 2/50


Training:   0%|          | 0/363 [00:00<?, ?it/s]

In [None]:
# Write the model's current weights to transformer.pth.
torch.save(transformer.state_dict(), "transformer.pth")

---

## Eval

In [None]:
# Restore the weights stored in transformer.pth, mapping tensors onto `device`.
transformer.load_state_dict(torch.load('transformer.pth', map_location=device))

In [None]:
def predict_sample(transformer=transformer, dataloader=train_dataloader, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
    """Print one source/target pair and the model's teacher-forced prediction.

    A batch is drawn from ``dataloader`` and its first sentence pair is shown.
    The prediction is the per-position argmax of the model output when the
    decoder is fed the reference target shifted right (``tgt[:, :-1]``). It is
    not a free-running decode.

    Reads the notebook-level ``src_pad_idx``, ``tgt_pad_idx``,
    ``en_tokenizer`` and ``de_tokenizer``.

    Args:
        transformer: Model to evaluate; switched to evaluation mode.
        dataloader: Loader yielding ``(src, tgt)`` id-tensor batches.
        device: Device the batch is moved to. The default is fixed when the
            function is defined.

    Raises:
        StopIteration: If ``dataloader`` yields no batch.
    """
    eos_token_id = 3  # id printed as <EOS> in the sample-batch output of this notebook

    transformer.eval()
    with torch.no_grad():
        # Draw the batch from the loader that was passed in, rather than
        # relying on a `batch` variable left over from an earlier cell.
        src, tgt = next(iter(dataloader))
        src, tgt = src.to(device), tgt.to(device)

        # Decoder sees the target without its last position.
        tgt_input = tgt[:, :-1]

        src_mask = transformer.make_src_mask(src, src_pad_idx)
        tgt_mask = transformer.make_trg_mask(tgt_input, tgt_pad_idx)
        output = transformer(src, tgt_input, src_mask, tgt_mask)

        # Report the first pair of the batch that was actually fed to the model.
        src_sample = src[0]
        tgt_sample = tgt[0]

        src_sample_sentence = en_tokenizer.detokenize(src_sample.tolist())
        tgt_sample_sentence = de_tokenizer.detokenize(tgt_sample.tolist())

        print("Source Sentence (Token IDs):", src_sample.tolist())
        print("Source Sentence (Text):", src_sample_sentence)
        print("Target Sentence (Token IDs):", tgt_sample.tolist())
        print("Target Sentence (Text):", tgt_sample_sentence)

        # Argmax over the vocabulary for the first sentence only, so the EOS
        # search cannot run into predictions that belong to other sentences.
        predicted_tokens = output[0].argmax(dim=-1).cpu().tolist()
        if eos_token_id in predicted_tokens:
            # Keep everything up to and including the first predicted EOS.
            predicted_tokens = predicted_tokens[:predicted_tokens.index(eos_token_id) + 1]
        predicted_sentence = de_tokenizer.detokenize(predicted_tokens)

        print("Predicted Tokens:", predicted_tokens)
        print("Predicted Sentence:", predicted_sentence)

In [None]:
# Print a source/target pair and the predicted tokens, using the function's default arguments.
predict_sample()