In [1]:
#输入需要的库
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchtext
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import numpy as np
import spacy

import random
import math
import time
import datetime
import os
#设置随机数种子使结果可重复
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [2]:
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_de(seq):
    """
    tokenize the sentence and return a list
    """
    return [tok.text for tok in spacy_de.tokenizer(seq)]
def tokenize_en(seq):
    """
    tokenize the sentence and return a list
    """
    return [tok.text for tok in spacy_en.tokenizer(seq)]
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = BATCH_SIZE,
     device = device)

In [3]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device, **kwargs):
        super().__init__(**kwargs)

        assert hid_dim % n_heads == 0

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim//n_heads
        self.device = device

        self.dropout = nn.Dropout(dropout)

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)

        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        E = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        if mask is not None:
            E = E.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(E, dim=-1)

        X = torch.matmul(self.dropout(attention), V)

        X = X.permute(0, 2, 1, 3).contiguous()

        X = X. view(batch_size, -1, self.hid_dim)

        X = self.fc_o(X)

        return X, attention

In [4]:
class PositionWiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim: int, pf_dim: int, dropout: float):
        super().__init__()

        self.fc1 = nn.Linear(hid_dim, pf_dim)
        self.fc2 = nn.Linear(pf_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

In [5]:
class EncoderLayer(nn.Module):
    def __init__(self,
                 hid_dim:int,
                 pf_dim:int,
                 n_heads:int,
                 dropout:float,
                 device:torch.device,
                 **kwargs)->None:
        super().__init__(**kwargs)

        self.device = device

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionWiseFeedforwardLayer(hid_dim, pf_dim, dropout)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src:torch.tensor, src_mask:torch.tensor)->torch.tensor:
        _src,_ = self.self_attention(src, src, src, src_mask)

        src = self.self_attn_layer_norm(src + self.dropout(_src))

        _src = self.positionwise_feedforward(src)

        src = self.ff_layer_norm(src + self.dropout(_src))

        return src

In [6]:
class Encoder(nn.Module):
    def __init__(self,
                 input_dim:int,
                 hid_dim:int,
                 n_layers:int,
                 n_heads:int,
                 pf_dim:int,
                 dropout:float,
                 device:torch.device,
                 max_len=100, **kwargs)->None:
        super().__init__(**kwargs)
        self.device = device

        self.tok_embeddings = nn.Embedding(input_dim, hid_dim)
        self.pos_embeddings = nn.Embedding(max_len, hid_dim)

        self.layers = nn.ModuleList([EncoderLayer(hid_dim, pf_dim, n_heads, dropout, device) for _ in range(n_layers)])

        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask)->torch.tensor:
        batch_size = src.shape[0]
        src_len = src.shape[1]

        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        src = self.dropout((self.tok_embeddings(src) * self.scale) + self.pos_embeddings(pos))

        for layer in self.layers:
            src = layer(src, src_mask)

        return src

In [7]:
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device, **kwargs):
        super().__init__(**kwargs)

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.enc_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionWiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)

        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))

        _trg, attention = self.enc_attention(trg, enc_src, enc_src, src_mask)

        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))

        _trg = self.positionwise_feedforward(trg)

        trg = self.ff_layer_norm(trg + self.dropout(_trg))

        return trg, attention

In [8]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_len=100, **kwargs):
        super().__init__(*kwargs)

        self.device = device

        self.tok_embeddings = nn.Embedding(output_dim, hid_dim)
        self.pos_embeddings = nn.Embedding(max_len, hid_dim)

        self.layers = nn.ModuleList([DecoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, trg, enc_src, trg_mask, src_mask):

        batch_size = trg.shape[0]
        trg_len = trg.shape[1]

        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)

        trg = self.dropout((self.tok_embeddings(trg) * self.scale) + self.pos_embeddings(pos))

        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)

        output = self.fc_out(trg)

        return output, attention

In [9]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device, **kwargs):
        super().__init__(**kwargs)

        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

        return src_mask

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

        trg_len = trg.shape[1]

        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()

        trg_mask = trg_pad_mask & trg_sub_mask

        return trg_mask

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        enc_src = self.encoder(src, src_mask)

        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)

        return output, attention

In [10]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 128
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 256
DEC_PF_DIM = 256
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, device)

dec = Decoder(OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, DEC_DROPOUT, device)

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 3,539,077 trainable parameters


In [12]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [13]:
model.apply(initialize_weights);

In [14]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [15]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [17]:
def train(model, iterator, optimizer, criterion, clip):

    model.train()

    epoch_loss = 0

    for i, batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()

        output, _ = model(src, trg[:,:-1])

        output_dim = output.shape[-1]

        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)
def evaluate(model, iterator, criterion):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output, _ = model(src, trg[:,:-1])

            output_dim = output.shape[-1]

            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
N_EPOCHS = 10
CLIP = 1

LOG_DIR = os.path.join(os.curdir, ".logs", str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")))

os.makedirs(LOG_DIR)

MODEL_PATH = os.path.join(LOG_DIR, "Attention_is_All_You_Need.pt")

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_PATH)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 1m 9s
	Train Loss: 4.359 | Train PPL:  78.167
	 Val. Loss: 3.198 |  Val. PPL:  24.476
Epoch: 02 | Time: 1m 12s
	Train Loss: 3.022 | Train PPL:  20.523
	 Val. Loss: 2.488 |  Val. PPL:  12.042
Epoch: 03 | Time: 1m 12s
	Train Loss: 2.464 | Train PPL:  11.754
	 Val. Loss: 2.149 |  Val. PPL:   8.579
Epoch: 04 | Time: 1m 12s
	Train Loss: 2.115 | Train PPL:   8.286
	 Val. Loss: 1.957 |  Val. PPL:   7.080
Epoch: 05 | Time: 1m 32s
	Train Loss: 1.864 | Train PPL:   6.447
	 Val. Loss: 1.825 |  Val. PPL:   6.202
Epoch: 06 | Time: 1m 11s
	Train Loss: 1.677 | Train PPL:   5.347
	 Val. Loss: 1.759 |  Val. PPL:   5.806
Epoch: 07 | Time: 1m 12s
	Train Loss: 1.526 | Train PPL:   4.600
	 Val. Loss: 1.707 |  Val. PPL:   5.514
Epoch: 08 | Time: 1m 12s
	Train Loss: 1.398 | Train PPL:   4.046
	 Val. Loss: 1.681 |  Val. PPL:   5.372
Epoch: 09 | Time: 1m 12s
	Train Loss: 1.295 | Train PPL:   3.651
	 Val. Loss: 1.658 |  Val. PPL:   5.250
Epoch: 10 | Time: 1m 12s
	Train Loss: 1.203 | Train PPL:

In [18]:
model.load_state_dict(torch.load(MODEL_PATH))

test_loss = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f}')

Test Loss: 1.700 | Test PPL:   5.476


In [19]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len=100):
    model.eval()

    # Tokenize sentence if not already.
    if isinstance(sentence, str):
        spacy_de = spacy.load("de_core_news_sm")
        tokens = [tok.text.lower() for tok in spacy_de(sentence)]
    else:
        tokens = [tok.lower() for tok in sentence]

    # Add <sos> and <eos> tokens
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]

    # Convert tokens from text to numbers
    src = [src_field.vocab.stoi[tok] for tok in tokens]

    # Convert the list into tensor and add Batch dimension
    src = torch.LongTensor(src).unsqueeze(0).to(device)

    # Create source mask
    src_mask = model.make_src_mask(src)

    # pass the src and src_mask through encoder
    with torch.no_grad():
        enc_src = model.encoder(src, src_mask)
    # Define Target list
    trg_indx = [trg_field.vocab.stoi[trg_field.init_token]]

    # While length of target is less then max length
    for i in range(max_len):
        # Convert target list into tensor and add a batch dimension.
        trg = torch.LongTensor(trg_indx).unsqueeze(0).to(device)

        # Create target mask.
        trg_mask = model.make_trg_mask(trg)

        # Run the trg, trg_mask, enc_src, trg_mask and src_mask through decoder.
        with torch.no_grad():
            output, attention = model.decoder(trg, enc_src, trg_mask, src_mask)

        # Get the predicted token.
        predicted_token = output.argmax(2)[:,-1].item()

        #Added prediction to target list
        trg_indx.append(predicted_token)

        # if predicted token is <eos> break.
        if predicted_token == trg_field.vocab.stoi[trg_field.eos_token]:
            trg_indx.pop(-1)
            break

    # Convert target tokens from numbers into text
    trg_tokens = [trg_field.vocab.itos[tok] for tok in trg_indx]

    return trg_tokens[1:], attention

In [20]:
from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):

    trgs = []
    pred_trgs = []

    for datum in data:

        src = vars(datum)['src']
        trg = vars(datum)['trg']

        pred_trg, _ = translate_sentence(src, src_field, trg_field, model, device, max_len)

        #cut off <eos> token
        pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        trgs.append([trg])

    return bleu_score(pred_trgs, trgs)
bleu_score = calculate_bleu(test_data, SRC, TRG, model, device)

print(f'BLEU score = {bleu_score*100:.2f}')

BLEU score = 31.75
