In [1]:
%load_ext autoreload
%autoreload 2
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2Tokenizer
from tqdm import tqdm
import csv
from transformer import Transformer

In [2]:
# 文件路径
en_file_path = 'Multi30K/datasets/train/train.en'

# 打开并读取文件
with open(en_file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# 打印前几行数据
for line in lines[:5]:
    print(line.strip())
    
print(f'lines number: {len(lines)}')
    
####################################

print('\n')
de_file_path = 'Multi30K/datasets/train/train.de'

# 打开并读取文件
with open(de_file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()
    
# 打印前几行数据
for line in lines[:5]:
    print(line.strip())

print(f'lines number: {len(lines)}')


Two young, White males are outside near many bushes.
Several men in hard hats are operating a giant pulley system.
A little girl climbing into a wooden playhouse.
A man in a blue shirt is standing on a ladder cleaning a window.
Two men are at the stove preparing food.
lines number: 29001


Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.
Ein kleines Mädchen klettert in ein Spielhaus aus Holz.
Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster.
Zwei Männer stehen am Herd und bereiten Essen zu.
lines number: 29001


In [3]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [4]:
import torch
from torch.utils.data import Dataset

class TranslateDataset(Dataset):
    def __init__(self, en_file_path, de_file_path, seq_length):
        # 读取英文数据
        with open(en_file_path, 'r', encoding='utf-8') as file:
            self.en_lines = [line.strip() for line in file.readlines()]
        
        # 读取德文数据
        with open(de_file_path, 'r', encoding='utf-8') as file:
            self.de_lines = [line.strip() for line in file.readlines()]
        
        # 确保数据长度一致
        assert len(self.en_lines) == len(self.de_lines), "英文和德文数据长度不一致"
        
        self.seq_length = seq_length

    def __len__(self):
        return len(self.en_lines)

    def __getitem__(self, idx):
        # 获取源序列和目标序列
        src = self.en_lines[idx]
        tgt = self.de_lines[idx]
        
        # 将文本转换为 token（假设你已经有一个分词器）
        src_tokens = tokenizer.encode(src)
        tgt_tokens = tokenizer.encode(tgt)
        
        
        # 截取或填充序列到固定长度
        src_tokens = src_tokens[:self.seq_length]
        tgt_tokens = tgt_tokens[:self.seq_length]
        
        # 填充到固定长度
        src_tokens = src_tokens + [0] * (self.seq_length - len(src_tokens))
        tgt_tokens = tgt_tokens + [0] * (self.seq_length - len(tgt_tokens))
        
        return torch.tensor(src_tokens, dtype=torch.long), torch.tensor(tgt_tokens, dtype=torch.long)

In [5]:
from torch.utils.data import DataLoader
import random

# 文件路径
en_file_path = 'Multi30K/datasets/train/train.en'
de_file_path = 'Multi30K/datasets/train/train.de'

# 创建数据集
seq_length = 50  # 假设序列长度为50
dataset = TranslateDataset(en_file_path, de_file_path, seq_length)

# 创建 DataLoader
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 显示随机的样本
random_index = random.randint(0, len(dataset) - 1)
random_sample = dataset[random_index]
print("随机样本的英文句子:", random_sample[0])
print("随机样本的德文句子:", random_sample[1])

随机样本的英文句子: tensor([   32,   582,   287,   257, 28738,  4077, 19750, 32254,  4979,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
随机样本的德文句子: tensor([   36,   259, 20291,   287,   304,  7274,   300,   521,  2164,  9116,
        38572,  2688,    68,   842,  2120,  2853,  4643,   365, 11840,    13,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])


In [6]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from transformer import Transformer

model = Transformer(vocab_size=tokenizer.vocab_size, d_model=512, num_heads=8, num_layers=2, d_ff=2048, max_seq_len=50)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # 忽略填充 token
optimizer = optim.Adam(model.parameters(), lr=0.0001)

for epoch in range(10):
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}", leave=False)
    for src, tgt in progress_bar:
        optimizer.zero_grad()
        
        # 生成掩码
        src_mask = model.make_src_mask(src, src_pad_idx=0)
        print(f"src mask shape in notebook: {src_mask.shape}")
        tgt_mask = model.make_trg_mask(tgt, trg_pad_idx=0)
        print(f"tgt mask shape in notebook: {tgt_mask.shape}")
        # 前向传播
        output = model(src, tgt[:, :-1], src_mask, tgt_mask[:, :, :-1, :-1]) # TODO: Problems fucking here
        
        # 计算损失
        loss = criterion(output.view(-1, 10000), tgt[:, 1:].reshape(-1))
        
        # 反向传播
        loss.backward()
        optimizer.step()
    
        # 更新进度条显示
        progress_bar.set_postfix(loss=loss.item())
    
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1:   0%|          | 0/907 [00:00<?, ?it/s]

src mask shape in notebook: torch.Size([32, 1, 50, 1])
tgt mask shape in notebook: torch.Size([32, 1, 50, 50])


                                                

mask shape: torch.Size([32, 1, 49, 49])
scores shape: torch.Size([32, 8, 49, 49])




RuntimeError: shape '[32, 49, 8, 64]' is invalid for input of size 819200