In [1]:
%config IPCompleter.use_jedi = False

In [2]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import torch
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import spacy

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
%run contractions.ipynb
%run ../Utils.ipynb

In [3]:
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [4]:
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

SRC = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

TRG = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

In [5]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.en', '.de'), 
                                                    fields = (SRC, TRG))

In [6]:
for x in train_data:
    print(vars(x))
    break

{'src': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.'], 'trg': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']}


In [7]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [8]:
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 5893
Unique tokens in target (en) vocabulary: 7853


In [9]:
BATCH_SIZE = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = BATCH_SIZE,
     device = device)

In [10]:
for x in train_iterator:
    print(x.src.size())
    break

torch.Size([50, 24])


# Encoder
<img src="./picture/encoder.png">

# Multi head attention
## Overview

<!-- <img src="./multi head attention.png"> -->
<img src="./picture/multi head attention 2.png">

**Encoder use self attention to encode the relationships between each word in an input sentence. As you can see in this pic below, The combination of Input Embedding and Position Embedding is fed to the Query, Key and Value of the first Encoder in the stack.**

<img src="./picture/input into stack.png">

**The combination of Input Embedding and Position Embedding has the shape [batch_size, seq_len, embedding_size] (if you dont know why it has this shape, check out this [link](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html). So to make this simple, i just visualize a sample instead of a batch**

<img src="./picture/sample.png">



**The first step is to calculate the Query, Key, and Value matrices. We do that by packing our embeddings into a matrix X, and multiplying it by the weight matrices we’ve trained (WQ, WK, WV)**

<img src="./picture/weight.png">

**How do we get this weight? Well, We fetch the Query, Key, Value into the linear model, and each linear model has its own weight Wq, Wk, Wv. The input is passed through these Linear layers to product the Q, K, V matrices.(See this pic below for more detail)**

<img src="./picture/weight2.png">

**Then we split data across Attention heads so that each can process it independently .We can achieve this by choosing the Query Size: 
Query Size = Embedding Size / Number of heads. Example when we choose head is 2:**

<img src="./picture/heads.png">

**The first input has shape [batch_size, seq_len, embedding_size]. After splitting input into Attention heads, the shape becomes to [batch_size, seq_len, n_heads, query_size]. Now we reshape the Q, K, V matrices for compute Attention Score. We swap Head and Seq_len dimensions. This pic below visualizes one sample:**

<img src="./picture/swap.png">

**We summerize all step for reshaping in this pic:**

<img src="./picture/sumup.png">

## Now we compute the Attention Score for each head

<img src="./picture/score.png">

**The first step is to multiply Q and K matrix. (we must inverse one of these two matrix for matrix multiplication)** 

<img src="./picture/multi.png">

**Then a mask value is added to the result. In the Encoder Self-attention, the mask is used to mask out the Padding values so that they don’t participate in the Attention Score. The result of this step we call X**

<img src="./picture/mask.png">

**Finally, we compute Attention score by multiplying softmax of X/sqrt(query_size) with V matrix**

<img src="./picture/compute.png">

**We sum up all this step in this pic below:**

<img src="./picture/summerize.png">

# Finnaly, we merge each Head’s Attention Scores together

<img src="./picture/merge.png">


# End to end attention

<img src="./picture/end-to-end-attention.png">

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Before token and position embeddings are summed, The tokem embedding should be multiplied by a scaling factor which is sqrt(embedding_size). But why? I dont know. This aspect is not justified by the authors, either on the paper or anywhere else. It was specifically asked as an issue in the original implementation by Google with no response. [link](https://datascience.stackexchange.com/questions/87906/transformer-model-why-are-word-embeddings-scaled-before-adding-positional-encod)

In [12]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, pf_dim, n_layers, n_heads, max_length, p_drop):
        super().__init__()
        self.embedding_size = embedding_size
        self.input_size = input_size
        self.max_length = max_length
        self.pf_dim = pf_dim
        self.n_heads = n_heads
        
        self.input_embedding = nn.Embedding(input_size, embedding_size)
        self.positional_embedding = nn.Embedding(max_length, embedding_size)
        self.encoder_layer = nn.ModuleList([EncoderLayer(embedding_size, pf_dim, n_heads, p_drop) for _ in range(n_layers)])
        self.dropout = nn.Dropout(p_drop)
        self.scale = torch.sqrt(torch.FloatTensor([embedding_size])).to(device)
        
    def forward(self, x, src_mask):
        #x: [batch_size, seq_len]
        # prepare input before fetch it into n layer 
        out = self.input_embedding(x)
        #out:[batch_size, seq_len, embedding_size]
        batch_size = x.shape[0]
        src_len = x.shape[1]
        #position embedding
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(device)
        #pos:[batch_size, src_len]
        pos_embed = self.positional_embedding(pos)
        #input
        out = self.dropout(out * self.scale + pos_embed)
        
        for layer in self.encoder_layer:
            out = layer(out, src_mask)
        
        return out 

In [13]:
class EncoderLayer(nn.ModuleList):
    def __init__(self, embedding_size, pf_size, n_heads, p_drop):
        super().__init__()
        self.embedding_size = embedding_size
        self.n_heads = n_heads
        
        self.multi_head_attention = MultiHeadAtt(embedding_size, n_heads, p_drop)
        self.dropout = nn.Dropout(p_drop)
        self.attention_norm = nn.LayerNorm(embedding_size)
        self.feed_forward = FeedForward(embedding_size, pf_size, p_drop)
        self.ff_norm = nn.LayerNorm(embedding_size)
        self.pf_size = pf_size
    
    def forward(self,x, src_mask):
        out1, _ = self.multi_head_attention(x, x, x , src_mask)
        out = self.attention_norm(x + self.dropout(out1)) 
        out1 = self.feed_forward(out)
        out = self.ff_norm(out + self.dropout(out1)) 
        
        return out

In [14]:
class MultiHeadAtt(nn.Module):
    def __init__(self, embedding_size, n_heads, p_drop):
        super().__init__()
        assert embedding_size % n_heads == 0
        
        self.embedding_size = embedding_size
        self.n_heads = n_heads
        self.head_dim = embedding_size // n_heads
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
        
        self.fq = nn.Linear(embedding_size, embedding_size)
        self.fk = nn.Linear(embedding_size, embedding_size)
        self.fv = nn.Linear(embedding_size, embedding_size)
        self.fo = nn.Linear(embedding_size, embedding_size)
        
        self.dropout = nn.Dropout(p_drop)
    
    def forward(self, query, key, value, src_mask = None):
        Q = self.fq(query)
        K = self.fk(key)
        V = self.fv(value)
        batch_size = query.shape[0]
        Q = Q.view(batch_size, self.n_heads, -1, self.head_dim)
        K = K.view(batch_size, self.n_heads, -1, self.head_dim)
        V = V.view(batch_size, self.n_heads, -1, self.head_dim)
        
        QK = torch.matmul(Q, K.permute(0,1,3,2)) / self.scale
        if src_mask is not None:
            QK = QK.masked_fill(src_mask == 0, -1e10)
        #softmax for last dimension
        attention = torch.softmax(QK, dim = -1)
        
        out = torch.matmul(self.dropout(attention), V)
        out = out.permute(0, 2, 1, 3).contiguous()
        out = out.view(batch_size, -1, self.embedding_size)
        out = self.fo(out)
        return out, attention

In [15]:
class FeedForward(nn.Module):
    def __init__(self, embedding_dim, pf_dim, p_drop):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.pf_dim = pf_dim
        self.fc1 = nn.Linear(embedding_dim, pf_dim)
        self.fc2 = nn.Linear(pf_dim, embedding_dim)
        self.dropout = nn.Dropout(p_drop)
    
    def forward(self, x):
        #x = [batch_size, seq_len, embedding_dim]
        out = self.dropout(torch.relu(self.fc1(x)))
        #x = [batch size, seq len, pf dim]
        out = self.fc2(out)
        #x = [batch size, seq len, embedding dim]
        
        return out

# Decoder

<img src="./picture/decoder.png">

In [16]:
class Decoder(nn.Module):
    def __init__(self,output_size, embedding_dim, pf_dim, n_layers, n_heads ,max_length, p_drop):
        super().__init__()
        self.output_size = output_size
        self.embedding_dim = embedding_dim
        self.pf_dim = pf_dim
        self.max_length = max_length
        
        self.input_embedding = nn.Embedding(output_size, embedding_dim)
        self.pos_embedding = nn.Embedding(max_length, embedding_dim)
        self.dropout = nn.Dropout(p_drop)
        self.decoder_layer = nn.ModuleList([DecoderLayer(embedding_dim, pf_dim, n_heads, p_drop) for _ in range(n_layers)])
        self.fc = nn.Linear(embedding_dim, output_size)
        self.scale = torch.sqrt(torch.FloatTensor([embedding_dim])).to(device)
        
    def forward(self, x, enc_src, trg_mask, src_mask):
        out = self.input_embedding(x)
        batch_size = x.shape[0]
        src_len = x.shape[1]
        #position embedding
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(device)
        pos = self.pos_embedding(pos)
        
        out = self.dropout(out * self.scale + pos)
        for layer in self.decoder_layer:
            out, attention = layer(out, enc_src, trg_mask, src_mask)
        out = self.fc(out)
        
        return out, attention
            

In [17]:
class DecoderLayer(nn.Module):
    def __init__(self, embedding_size, pf_dim, n_heads, p_drop):
        super().__init__()
        self.embedding_size = embedding_size
        self.pf_dim = pf_dim
        self.n_heads = n_heads
        
        self.self_attention = MultiHeadAtt(embedding_size, n_heads, p_drop)
        self.encoder_attention = MultiHeadAtt(embedding_size, n_heads, p_drop)
        self.attention_norm = nn.LayerNorm(embedding_size)
        self.ff_norm = nn.LayerNorm(embedding_size)
        self.encode_norm = nn.LayerNorm(embedding_size)
        self.feed_forward = FeedForward(embedding_size,pf_dim, p_drop)
        self.dropout = nn.Dropout(p_drop)
    
    def forward(self, x, enc_output, trg_mask, src_mask):
        out1, _ = self.self_attention(x, x, x, trg_mask)
        out = self.attention_norm(x + self.dropout(out1))
        out1, attention = self.encoder_attention(out, enc_output, enc_output, src_mask)
        out = self.encode_norm(out + self.dropout(out1))
        out1 = self.feed_forward(out)
        out = self.ff_norm(out + self.dropout(out1))
        
        return out, attention
        

# For Debugging

In [18]:
# INPUT_SIZE = 15
# EMBEDDING_SIZE = 120
# PF_SIZE = 256
# N_LAYERS = 3
# N_HEADS = 5
# # encoder = Encoder(INPUT_SIZE, EMBEDDING_SIZE, PF_SIZE, N_HEADS, N_LAYERS).to(device)
# decoder = Decoder(OUTPUT_SIZE, EMBEDDING_SIZE, PF_SIZE, DEC_LAYERS, N_HEADS, MAX_LENGTH, DEC_DROPOUT).to(device)

# input = torch.randint(2, 5, (50, 10)).to(device)
# src_mask = (input != 1).unsqueeze(1).unsqueeze(2)
# # res_en = encoder(input, src_mask)
# res = decoder(input, res_en, 1, 1)
# res.size()

In [19]:
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

In [20]:
class Transformer(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def create_src_mask(self, src):
        #src = [batch size, src len]
        src_mask = (src != SRC_PAD_IDX).unsqueeze(1).unsqueeze(2)
        #src_mask = [batch size, 1, 1, src len]
        return src_mask
    
    def create_trg_mask(self, trg):
        
        #trg = [batch size, trg len]
        
        trg_pad_mask = (trg != TRG_PAD_IDX).unsqueeze(1).unsqueeze(2)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg len, trg len]
        
        return trg_mask
    
    def forward(self, src, trg):
        
        #src = [batch size, src len]
        #trg = [batch size, trg len]
                
        src_mask = self.create_src_mask(src)
        trg_mask = self.create_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #output = [batch size, trg len, output dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return output, attention

In [21]:
INPUT_SIZE = len(SRC.vocab)
OUTPUT_SIZE = len(TRG.vocab)
EMBEDDING_SIZE = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
N_HEADS = 8
PF_SIZE = 512
MAX_LENGTH = 100
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3
# input_size, embedding_size, pf_dim, n_layers, n_heads, max_length = 100, p_drop=0.5
encoder = Encoder(INPUT_SIZE, EMBEDDING_SIZE, PF_SIZE, ENC_LAYERS, N_HEADS, MAX_LENGTH, ENC_DROPOUT).to(device)
decoder = Decoder(OUTPUT_SIZE, EMBEDDING_SIZE, PF_SIZE, DEC_LAYERS, N_HEADS, MAX_LENGTH, DEC_DROPOUT).to(device)
transformer = Transformer(encoder, decoder).to(device)


In [22]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(transformer):,} trainable parameters')
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)
transformer.apply(initialize_weights);

The model has 9,542,061 trainable parameters


## i have chosen learning rate with 0.001 and 0.0005, but the valid loss not decreasing while the train loss is still going down, which means my model is overfitting, so i choose 0.0001.

In [23]:
LEARNING_RATE = 0.0001

optimizer = torch.optim.Adam(transformer.parameters(), lr = LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
N_EPOCHS = 10
CLIP = 1
best_valid_loss = float('inf')

In [24]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg
        optimizer.zero_grad()
        
        output, _ = transformer(src, trg[:,:-1])
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
    
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg
            output, _ = model(src, trg[:,:-1])
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)


In [25]:
import math
import time
valid_loss_arr = []
train_loss_arr =[]


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss = train(transformer, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(transformer, valid_iterator, criterion)
    train_loss_arr.append(train_loss)
    valid_loss_arr.append(valid_loss)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        print(epoch)
        best_valid_loss = valid_loss
        torch.save(transformer.state_dict(), 'transformer_traslation.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

0
Epoch: 01 | Time: 1m 42s
	Train Loss: 5.486 | Train PPL: 241.313
	 Val. Loss: 4.424 |  Val. PPL:  83.442
1
Epoch: 02 | Time: 1m 45s
	Train Loss: 4.256 | Train PPL:  70.494
	 Val. Loss: 4.069 |  Val. PPL:  58.519
2
Epoch: 03 | Time: 1m 45s
	Train Loss: 3.925 | Train PPL:  50.649
	 Val. Loss: 3.811 |  Val. PPL:  45.194
3
Epoch: 04 | Time: 1m 41s
	Train Loss: 3.708 | Train PPL:  40.753
	 Val. Loss: 3.657 |  Val. PPL:  38.753


KeyboardInterrupt: 

In [None]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    
    model.eval()
        
    if isinstance(sentence, str):
        nlp = spacy.load('de_core_news_sm')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    
    src_mask = model.create_src_mask(src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        trg_mask = model.create_trg_mask(trg_tensor)
        
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:], attention

In [None]:
example_idx = 8

src = vars(train_data.examples[example_idx])['src']
trg = vars(train_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'trg = {trg}')

In [None]:
translation, attention = translate_sentence(src, SRC, TRG, transformer, device)

print(f'predicted trg = {translation}')