In [1]:
!pip install torch torchvision==0.10.0
!pip install -U torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchvision==0.10.0
  Downloading torchvision-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (22.1 MB)
[K     |████████████████████████████████| 22.1 MB 1.2 MB/s 
Collecting torch
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.7 kB/s 
Installing collected packages: torch, torchvision
  Attempting uninstall: torch
    Found existing installation: torch 1.12.0+cu113
    Uninstalling torch-1.12.0+cu113:
      Successfully uninstalled torch-1.12.0+cu113
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.13.0+cu113
    Uninstalling torchvision-0.13.0+cu113:
      Successfully uninstalled torchvision-0.13.0+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency con

In [2]:
import random
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
!python -m spacy download en 
!python -m spacy download de

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 19.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'de' are deprecated. Please use the
full pipeline package name 'de_core_news_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.4.0/de_core_news_sm-3.4.0-py3-none-any.whl (14.6 

In [4]:
spacy_ger = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [5]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [6]:
def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [7]:
German = Field(tokenize = tokenize_ger, init_token = '<sos>', eos_token = '<eos>', lower = True, batch_first = True)
English = Field(tokenize = tokenize_en, init_token = '<sos>', eos_token = '<eos>', lower = True, batch_first = True)

In [8]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (German, English))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 421kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 114kB/s] 


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 108kB/s]


In [9]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [10]:
German.build_vocab(train_data, min_freq = 2)
English.build_vocab(train_data, min_freq = 2)

In [11]:
print(f"German vocabulary: {len(German.vocab)}")
print(f"English vocabulary: {len(English.vocab)}")

German vocabulary: 7853
English vocabulary: 5893


In [12]:
BATCH_SIZE = 32
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data), batch_size = BATCH_SIZE, device = device)

In [13]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.dropout(torch.relu(self.fc_1(x)))
        x = self.fc_2(x)        
        return x

In [14]:
class SelfAtt(nn.Module):
  def __init__(self, emb_size, heads,dropout):
    super(SelfAtt,self).__init__()
    self.emb_size = emb_size
    self.heads = heads
    self.head_dim = emb_size//heads
    assert(self.head_dim*heads == self.emb_size), "head_dim*heads != emb_size"

    self.query = nn.Linear(self.emb_size,self.emb_size)
    self.key = nn.Linear(self.emb_size,self.emb_size)
    self.value = nn.Linear(self.emb_size,self.emb_size)

    self.fc_out = nn.Linear(self.head_dim*heads, self.emb_size)
    self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
    self.dropout = nn.Dropout(dropout)

  def forward(self,values,keys,query,mask=None):
    N = query.shape[0]
    values = self.value(values)
    keys = self.key(keys)  
    query1 = self.query(query) 

    values = values.view(N, -1, self.heads, self.head_dim).permute(0, 2, 1, 3)
    keys = keys.view(N, -1, self.heads, self.head_dim).permute(0, 2, 1, 3)
    query1 = query1.view(N, -1, self.heads, self.head_dim).permute(0, 2, 1, 3)   

    #dot product of keys and query
    energy = torch.matmul(query1, keys.permute(0, 1, 3, 2)) / self.scale
    #print(energy.shape)

    if mask is not None:
      energy = energy.masked_fill(mask == 0, float("-1e20"))
      #print(energy)
    
    attention = torch.softmax(energy, dim= -1)
    #print(attention.shape)
    #print(values.shape)
    x = torch.matmul(self.dropout(attention), values)    
    x = x.permute(0, 2, 1, 3).contiguous()
    out = x.view(N, -1, self.emb_size)
    
    out =  self.fc_out(out)
    return out

In [15]:
class TransformerBlock(nn.Module):
  def __init__(self,emb_size, heads, dropout, forward_expansion):
    super(TransformerBlock,self).__init__()
    self.att = SelfAtt(emb_size,heads,dropout)
    self.norm1 = nn.LayerNorm(emb_size)
    self.norm2 = nn.LayerNorm(emb_size)

    self.feed_forward = PositionwiseFeedforwardLayer(emb_size, forward_expansion*emb_size, dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, value, key, query):
    att = self.att(value, key, query)
    x = self.norm1(query + self.dropout(att))
    forward = self.feed_forward(x)
    out  = self.norm2(x + self.dropout(forward))
    return out

In [16]:
class Encoder(nn.Module):
  def __init__(self, 
               src_vocab_size, 
               emb_size,
               num_layers,
               heads,
               device,
               forward_expansion,
               dropout,
               max_length,
               ):
    super(Encoder,self).__init__()
    self.emb_size = emb_size
    self.device = device
    self.word_embedding  = nn.Embedding(src_vocab_size,emb_size)
    self.position_embedding = nn.Embedding(max_length, emb_size)
    self.layers = nn.ModuleList(
        [
         TransformerBlock(
             emb_size,
             heads,
             dropout = dropout,
             forward_expansion = forward_expansion,
         )
         for _ in range(num_layers)
        ]
    )
    self.dropout = nn.Dropout(dropout)
    self.scale = torch.sqrt(torch.FloatTensor([emb_size])).to(device)
  def forward(self,x):
    N,seq_length = x.shape
    positions = torch.arange(0, seq_length).unsqueeze(0).repeat(N, 1).to(self.device)
    out = self.dropout(self.word_embedding(x)*self.scale + self.position_embedding(positions))
    for layer in self.layers:
      out  = layer(out,out,out)
      
    return out

In [17]:
class DecoderBlock(nn.Module):
  def __init__(self, emb_size, heads, forward_expansion, dropout, device):
    super(DecoderBlock,self).__init__()
    self.attention = SelfAtt(emb_size,heads,dropout)
    self.norm1 = nn.LayerNorm(emb_size)
    self.norm = nn.LayerNorm(emb_size)
    self.transformer_block = TransformerBlock(emb_size, heads, dropout, forward_expansion)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, value, key):
    attention = self.attention(x,x,x)
    query = self.norm1(self.dropout(attention) + x)
    out  = self.transformer_block(value, key, query)
    return out

In [18]:
class Decoder(nn.Module):
    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([embed_size])).to(device)

    def forward(self, x, enc_out):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).unsqueeze(0).repeat(N, 1).to(self.device)
        x = self.dropout((self.word_embedding(x)*self.scale) + self.position_embedding(positions))
        for layer in self.layers:
            x = layer(x, enc_out, enc_out)
        out = self.fc_out(x)

        return out

In [19]:
class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        embed_size=5,
        num_layers=6,
        forward_expansion=4,
        heads=1,
        dropout=0,
        device="cpu",
        max_length=10,
        src_pad_idx = 0,
        trg_pad_idx = 0,
        teacher_force = 0.5,
    ):

        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.device = device
        self.trg_vocab_size = trg_vocab_size
        self.teacher_force = teacher_force

    def forward(self, src, trg):
        enc_src = self.encoder(src)
        outputs = torch.zeros((trg.shape[0],trg.shape[1],self.trg_vocab_size)).to(self.device)
        trg_dec = trg[:,0:1]
        for i in range(trg.shape[1]):
            out = self.decoder(trg_dec, enc_src)
            outputs[:,i,:] = out[:,-1,:]
            trg_dec = torch.argmax(outputs[:,0:i+2,:],dim = 2) if random.random() < self.teacher_force else trg[:,0:i+2]
        return outputs


In [20]:
src_vocab_size = len(German.vocab)
trg_vocab_size = len(English.vocab)
embed_size= 256
num_layers= 3
forward_expansion= 2
heads=8
dropout= 0.1
max_length= 100
teacher_force = 0.5
TRG_PAD_IDX = English.vocab.stoi[English.pad_token]
SRC_PAD_IDX = German.vocab.stoi[German.pad_token]

model  = Transformer( src_vocab_size, trg_vocab_size, embed_size, num_layers, forward_expansion, heads, dropout, device, max_length, SRC_PAD_IDX, TRG_PAD_IDX, teacher_force).to(device)

In [21]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,039,877 trainable parameters


In [22]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)
model.apply(initialize_weights);

In [23]:
optimizer = torch.optim.Adam(model.parameters(),lr = 0.0005)

In [24]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [25]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        #print(trg.shape)
        output = model(src, trg[:,:-1])
                
        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]
            
        output_dim = output.shape[-1]
            
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [26]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg
            trg1 = trg

            output = model(src, trg[:,:-1])
            
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator), output, trg1

In [27]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [28]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss, output, trg = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tranformer_1.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 4m 26s
	Train Loss: 4.230 | Train PPL:  68.705
	 Val. Loss: 3.345 |  Val. PPL:  28.369
Epoch: 02 | Time: 4m 19s
	Train Loss: 3.202 | Train PPL:  24.579
	 Val. Loss: 2.765 |  Val. PPL:  15.876
Epoch: 03 | Time: 4m 19s
	Train Loss: 2.742 | Train PPL:  15.518
	 Val. Loss: 2.496 |  Val. PPL:  12.139
Epoch: 04 | Time: 4m 19s
	Train Loss: 2.454 | Train PPL:  11.639
	 Val. Loss: 2.422 |  Val. PPL:  11.270
Epoch: 05 | Time: 4m 20s
	Train Loss: 2.246 | Train PPL:   9.448
	 Val. Loss: 2.347 |  Val. PPL:  10.457
Epoch: 06 | Time: 4m 22s
	Train Loss: 2.098 | Train PPL:   8.151
	 Val. Loss: 2.221 |  Val. PPL:   9.219
Epoch: 07 | Time: 4m 20s
	Train Loss: 1.974 | Train PPL:   7.201
	 Val. Loss: 2.230 |  Val. PPL:   9.300
Epoch: 08 | Time: 4m 17s
	Train Loss: 1.877 | Train PPL:   6.534
	 Val. Loss: 2.138 |  Val. PPL:   8.484
Epoch: 09 | Time: 4m 15s
	Train Loss: 1.792 | Train PPL:   6.000
	 Val. Loss: 2.219 |  Val. PPL:   9.199
Epoch: 10 | Time: 4m 17s
	Train Loss: 1.701 | Train PPL

In [29]:
model.load_state_dict(torch.load('tranformer_1.pt'))

test_loss, output, trg = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

Test Loss: 2.238 | Test PPL:   9.378 |


In [30]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    model.eval()   
    if isinstance(sentence, str):
        nlp = spacy.load('de_core_news_sm')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    
    #src_mask = model.make_src_mask(src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        #trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            output = model.decoder(trg_tensor, enc_src)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens

In [33]:
example_idx = 10

src = vars(train_data.examples[example_idx])['src']
trg = vars(train_data.examples[example_idx])['trg']

print(f'src = {src}')
print(f'trg = {trg}')

src = ['eine', 'ballettklasse', 'mit', 'fünf', 'mädchen', ',', 'die', 'nacheinander', 'springen', '.']
trg = ['a', 'ballet', 'class', 'of', 'five', 'girls', 'jumping', 'in', 'sequence', '.']


In [34]:
translation = translate_sentence(src, German, English, model, device)

print(f'predicted trg = {translation}')

predicted trg = ['<sos>', 'a', 'ballet', 'of', 'five', 'girls', 'jumping', 'in', 'sequence', '.', '<eos>']
