In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import copy
import time
from datetime import timedelta

import spacy # 一个自然语言文本处理库

import torch
from torch import nn
from torch import optim
from torch.nn import functional as F

from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
from torchtext.data.functional import to_map_style_dataset  # 将迭代器转化为 Dataset 类型，可直接索引


from transformer_package import make_model, scheduler, LabelSmoothingKL

In [2]:
class ComputeLoss:
    def __init__(self, criterion):
        self.criterion = criterion

    def __call__(self, x, y):
        x = F.log_softmax(x, dim=-1)  # if KL divergence is used
        loss = (
            self.criterion(    # x: [b, len, vocab_size] ---> [b*len, vocab_size];  y: [b, len] ---> [b*len, ]
                x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)
            )
        )
        return loss

In [3]:
def train_epoch(epoch, data_loader, model, loss_compute, optimizer, scheduler, padding_idx):
    """Train a single epoch."""
    start = time.time()
    total_loss = 0
    total_tokens = 0
    model.train()
    for i, (src, tgt) in enumerate(data_loader):
        # src, tgt shape: [batch_size, max_len]
        tgt_y = copy.deepcopy(tgt[:, 1:])  # 真实的序列，用来构建loss，第一位往往是起始符
        tgt_seq = copy.deepcopy(tgt[:, :-1])   # 输入decoder的序列，最后一位的token是用不到的。因为按照decoder的工作原理，最后一个token的生成是不会依赖到最后一个token的信息的。
        ntokens = (tgt_y != padding_idx).data.sum()
        
        # get padding mask and sequence mask
        src_mask = model.padding_mask(src, padding_idx)
        tgt_mask = model.padding_mask(tgt_seq, padding_idx) & model.sequence_mask(tgt_seq.size(-1))
        
        # train
        logit = model(src, tgt_seq, src_mask, tgt_mask)
        loss = loss_compute(logit, tgt_y)
        
        
        total_loss += loss
        total_tokens += ntokens
        
        loss /= ntokens # mean loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad(set_to_none=True) # 优化内存使用
        scheduler.step()

        lr = optimizer.param_groups[0]["lr"]
        elapsed = time.time() - start
        print(
            (
                "| Epoch {:3d} | {:5d}/{:5d} batches | Loss: {:6.2f} "
                + "| Tokens: {:5d} | Learning Rate: {:6.1e} | Time: {} |"
            ).format(epoch, i, len(data_loader), loss, ntokens, lr, timedelta(seconds=elapsed))
        )
        
        del loss

    return total_loss / total_tokens


def eval_epoch(epoch, data_loader, model, loss_compute, padding_idx):
    """Eval a single epoch."""
    start = time.time()
    total_loss = 0
    total_tokens = 0
    model.eval()
    with torch.no_grad():
        for i, (src, tgt) in enumerate(data_loader):
            # src, tgt shape: [batch_size, max_len]
            tgt_y = copy.deepcopy(tgt[:, 1:])  # 真实的序列，用来构建loss，第一位往往是起始符
            tgt_seq = copy.deepcopy(tgt[:, :-1])   # 输入decoder的序列，最后一位的token是用不到的。因为按照decoder的工作原理，最后一个token的生成是不会依赖到最后一个token的信息的。
            ntokens = (tgt_y != padding_idx).data.sum()
            
            # get padding mask and sequence mask
            src_mask = model.padding_mask(src, padding_idx)
            tgt_mask = model.padding_mask(tgt_seq, padding_idx) & model.sequence_mask(tgt_seq.size(-1))
            
            # train
            logit = model(src, tgt_seq, src_mask, tgt_mask)
            loss = loss_compute(logit, tgt_y)
        

            total_loss += loss
            total_tokens += ntokens
            
            
            loss /= ntokens # mean loss
    
            elapsed = time.time() - start
            print(
                (
                    "| Epoch {:3d} | {:5d}/{:5d} batches | Loss: {:6.2f} "
                    + "| Tokens: {:5d} | Time: {} |"
                ).format(epoch, i, len(data_loader), loss, ntokens, timedelta(seconds=elapsed))
            )
            
            del loss
            
    return total_loss / total_tokens

# Multi30k German-English Translation task

## Tokenization

In [4]:
def load_tokenizers():

    try:
        spacy_de = spacy.load("de_core_news_sm")
    except IOError:
        os.system("python -m spacy download de_core_news_sm")  # 大概率要翻墙才行
        spacy_de = spacy.load("de_core_news_sm")

    try:
        spacy_en = spacy.load("en_core_web_sm")
    except IOError:
        os.system("python -m spacy download en_core_web_sm")
        spacy_en = spacy.load("en_core_web_sm")

    return spacy_de, spacy_en

# spacy_de, spacy_en = load_tokenizers()
# doc = spacy_en.tokenizer("This is a sentence.")
# print([(w.text, w.pos_) for w in doc])


def tokenize(text, tokenizer):
    return [tok.text for tok in tokenizer.tokenizer(text)]


def yield_tokens(data_iter, tokenizer, index):
    for from_to_tuple in data_iter:
        yield tokenizer(from_to_tuple[index])

## Build vocabulary

In [5]:
def build_vocabulary(spacy_de, spacy_en):
    def tokenize_de(text):
        return tokenize(text, spacy_de)

    def tokenize_en(text):
        return tokenize(text, spacy_en)

    print("Building German Vocabulary ...")
    train, val, test = datasets.Multi30k(language_pair=("de", "en"))
    vocab_src = build_vocab_from_iterator(
        yield_tokens(train + val + test, tokenize_de, index=0),  # 'de'
        min_freq=2,
        specials=["<s>", "</s>", "<blank>", "<unk>"],   # 分别代表起始符、终止符、padding字符、未知字符
    )

    print("Building English Vocabulary ...")
    train, val, test = datasets.Multi30k(language_pair=("de", "en"))
    vocab_tgt = build_vocab_from_iterator(
        yield_tokens(train + val + test, tokenize_en, index=1),  # 'en'
        min_freq=2,
        specials=["<s>", "</s>", "<blank>", "<unk>"],
    )

    vocab_src.set_default_index(vocab_src["<unk>"])  # This index will be returned when OOV token is queried.
    vocab_tgt.set_default_index(vocab_tgt["<unk>"])

    return vocab_src, vocab_tgt



def load_vocab(spacy_de, spacy_en):
    if not os.path.exists("vocab.pt"):
        vocab_src, vocab_tgt = build_vocabulary(spacy_de, spacy_en)
        torch.save((vocab_src, vocab_tgt), "vocab.pt")
    else:
        vocab_src, vocab_tgt = torch.load("vocab.pt")
    print("Finished.\nVocabulary sizes:")
    print(len(vocab_src))
    print(len(vocab_tgt))
    return vocab_src, vocab_tgt



# vocab_src, vocab_tgt = load_vocab(spacy_de, spacy_en)
# print(vocab_src.get_stoi())

## Data Loader

In [6]:
def collate_batch(batch, src_pipline, tgt_pipline, src_vocab, tgt_vocab, max_padding=128, pad_id=2):  # <blank> token id
    '''
    负责在 DataLoad 提取一个 batch 的样本时，完成一系列预处理工作。
    所以，我们将 collate_batch 函数通过参数 collate_fn 传入 DataLoader，
    即可实现对变长数据的处理。
    '''
    bs_id = torch.tensor([0])  # <s> token id
    eos_id = torch.tensor([1])  # </s> token id
    src_list, tgt_list = [], []
    
    for (src, tgt) in batch:
        # 为每一句话添加起始符和结束符
        processed_src = torch.cat([bs_id, 
                                  torch.as_tensor(src_vocab(src_pipline(src)), dtype=torch.int64),
                                  eos_id],
                                  dim=0)
        processed_tgt = torch.cat([bs_id, 
                                  torch.as_tensor(tgt_vocab(tgt_pipline(tgt)), dtype=torch.int64),
                                  eos_id],
                                  dim=0)
        
        # 给长度不足max_padding的句子打padding
        processed_src = F.pad(processed_src, (0, max_padding - len(processed_src)), value=pad_id)  # # warning - overwrites values for negative values of padding - len
        src_list.append(processed_src)
        processed_tgt = F.pad(processed_tgt, (0, max_padding - len(processed_tgt)), value=pad_id)
        tgt_list.append(processed_tgt)
        
       
    src_batch = torch.stack(src_list)
    tgt_batch = torch.stack(tgt_list)
    
    return (src_batch, tgt_batch)



def create_dataloaders(vocab_src, vocab_tgt, spacy_de, spacy_en, batch_size=512, max_padding=128):
    
    def tokenize_de(text):    # src_pipline
        return tokenize(text, spacy_de)

    def tokenize_en(text):   # tgt_pipline
        return tokenize(text, spacy_en)
        
    def collate_fn(batch):
        return collate_batch(batch, 
                             tokenize_de, 
                             tokenize_en, 
                             vocab_src, 
                             vocab_tgt,
                             max_padding=max_padding,
                             pad_id=vocab_src.get_stoi()['<blank>'])
    
    train_iter, valid_iter, test_iter = datasets.Multi30k(language_pair=("de", "en"))
    
    train_iter_map = to_map_style_dataset(train_iter)
    valid_iter_map = to_map_style_dataset(valid_iter)
    
    train_dataloader = DataLoader(train_iter_map,
                                  batch_size,
                                  collate_fn=collate_fn)

    valid_dataloader = DataLoader(valid_iter_map,
                                  batch_size,
                                  collate_fn=collate_fn)

    return train_dataloader, valid_dataloader


# train_dataloader, valid_dataloader = create_dataloaders(vocab_src, vocab_tgt, spacy_de, spacy_en, batch_size=512, max_padding=128)

## Training

In [9]:
spacy_de, spacy_en = load_tokenizers()
vocab_src, vocab_tgt = load_vocab(spacy_de, spacy_en)

src_vocab_size = len(vocab_src)
tgt_vocab_size = len(vocab_tgt)
pad_idx = vocab_src['<blank>']
d_model = 512
init_lr = 1.0
warmup = 3000
num_epochs = 8
batch_size = 32
max_padding = 72

train_dataloader, valid_dataloader = create_dataloaders(vocab_src, vocab_tgt, spacy_de, spacy_en, batch_size, max_padding)


model = make_model(src_vocab_size, tgt_vocab_size, d_model)

criterion = LabelSmoothingKL(vocab_size=tgt_vocab_size, padding_idx=pad_idx, smoothing=0.1)

optimizer = optim.Adam(model.parameters(), lr=init_lr, betas=(0.9, 0.98), eps=1e-9)
lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                           lr_lambda=lambda step: scheduler(step, d_model, factor=1, warmup=warmup))

train_losses = []
val_losses = []
for epoch in range(num_epochs):
    start = time.time()
    model.train()
    train_loss = train_epoch(epoch,
        train_dataloader,
        model,
        ComputeLoss(criterion),
        optimizer,
        lr_scheduler,
        pad_idx
    )
    train_losses.append(train_loss.item())
    
    model.eval()
    val_loss = eval_epoch(epoch,
        valid_dataloader,
        model,
        ComputeLoss(criterion),
        pad_idx
    )

    val_losses.append(val_loss.item())
    print('-' * 59)
    print('| End of epoch {:3d} | Train Loss: {:8.3f} | Val Loss: {:8.3f} | time: {} |'
          .format(epoch, train_loss, val_loss, timedelta(seconds=time.time()-start)))
    print('-' * 59)

Finished.
Vocabulary sizes:
8185
6291
| Epoch   0 |     0/  907 batches | Loss:   7.67 | Tokens:   434 | Learning Rate: 2.7e-07 | Time: 0:00:08.505929 |
| Epoch   0 |     1/  907 batches | Loss:   7.67 | Tokens:   457 | Learning Rate: 5.4e-07 | Time: 0:00:17.537343 |
| Epoch   0 |     2/  907 batches | Loss:   7.66 | Tokens:   451 | Learning Rate: 8.1e-07 | Time: 0:00:27.362524 |
| Epoch   0 |     3/  907 batches | Loss:   7.67 | Tokens:   450 | Learning Rate: 1.1e-06 | Time: 0:00:36.351264 |
| Epoch   0 |     4/  907 batches | Loss:   7.66 | Tokens:   461 | Learning Rate: 1.3e-06 | Time: 0:00:45.246767 |
| Epoch   0 |     5/  907 batches | Loss:   7.68 | Tokens:   428 | Learning Rate: 1.6e-06 | Time: 0:00:54.132383 |
| Epoch   0 |     6/  907 batches | Loss:   7.64 | Tokens:   426 | Learning Rate: 1.9e-06 | Time: 0:01:02.069751 |
| Epoch   0 |     7/  907 batches | Loss:   7.64 | Tokens:   480 | Learning Rate: 2.2e-06 | Time: 0:01:10.344749 |
| Epoch   0 |     8/  907 batches | Loss: 

KeyboardInterrupt: 

In [None]:
plt.plot(train_losses, label='train_loss')
plt.plot(val_losses, label='val_loss')
plt.legend()