In [2]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import time
import gzip
import time
import math
import spacy


In [4]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

In [5]:
!python -m spacy download de_core_news_sm


Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [6]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
spacy_en = spacy.load('en_core_web_sm')
spacy_de = spacy.load('de_core_news_sm')

In [8]:
SEED = 123 # to stop generating random values
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [9]:
class Multi30kDataset(Dataset):

    def __init__(self, src_file, trg_file, src_transform=None, trg_transform=None):
        self.src_data= self.load_data(src_file)
        self.trg_data =self.load_data(trg_file)
        self.src_transform=src_transform
        self.trg_transform = trg_transform

    def __len__(self):
        return len(self.src_data)

    def load_data(self, file_path):
        with gzip.open(file_path,'rt', encoding='utf-8') as f:
            data = f.readlines()
        return data
    
    def __getitem__(self, index):
        src_sentence= self.src_data[index].strip()
        trg_sentence =  self.trg_data[index].strip()

        if self.src_transform:
           src_sentence= self.src_transform(src_sentence)
           trg_sentence= self.trg_transform(trg_sentence)

        return { 'src':src_sentence, 'trg':trg_sentence}
    


In [10]:
def tokenize_de(text):
    return [token.text.lower() for token in spacy_de.tokenizer(text)]

In [11]:
def tokenize_en(text):
    return [token.text.lower() for token in spacy_en.tokenizer(text)]

In [12]:
train_de_path= './data/train.de.gz'
train_en_path = './data/train.en.gz'

val_de_path='./data/val.de.gz'
val_en_path='./data/val.en.gz'

test_de_path='./data/test2016.de.gz'
test_en_path='./data/test2016.en.gz'


In [13]:
train_data = Multi30kDataset(train_de_path, train_en_path, src_transform=tokenize_de, trg_transform=tokenize_en)
test_data = Multi30kDataset(test_de_path, test_en_path, src_transform=tokenize_de, trg_transform=tokenize_en)
val_data = Multi30kDataset(val_de_path, val_en_path, src_transform=tokenize_de, trg_transform=tokenize_en)


In [14]:
train_data.src_data[0]

'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.\n'

In [15]:
#special token
PAD_TOKEN='<pad>'
SOS_TOKEN='<sos>'
EOS_TOKEN='<eos>'
UNK_TOKEN='<unk>'
special_tokens = ['<pad>','<sos>','<eos>','<unk>']

In [16]:
def create_vocab(tokenized_sentences, special_tokens):
    vocab= { token: ind for ind,token in enumerate(special_tokens)}
    for sentence in tokenized_sentences:
        for token in sentence:
            if token not in vocab:
                vocab[token]=len(vocab)
    return vocab


In [17]:
train_de_tokenized = [tokenize_de(sentence.strip()) for sentence in train_data.src_data]
train_en_tokenized = [tokenize_en(sentence.strip()) for sentence in train_data.trg_data]

In [18]:
SRC_VOCAB= create_vocab(train_de_tokenized, special_tokens)
TRG_VOCAB= create_vocab(train_en_tokenized,special_tokens)

In [19]:
import torch.nn as nn
class PositionalEncoding(nn.Module):
    
    def __init__(self,d_model, max_length ):
        super().__init__()
        pe=torch.zeros(max_length,d_model)
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]
        





In [20]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        
        output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)

In [21]:
class PositionwiseFeedforward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [22]:
class PositionwiseFeedforward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [23]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x
        

In [24]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, trg_mask):
        attn_output = self.self_attn(x, x, x, trg_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

        

In [25]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        
        self.fc_out = nn.Linear(d_model, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([d_model]))
        
    def generate_mask(self, src, trg):
        src_mask = (src != SRC_VOCAB[PAD_TOKEN]).unsqueeze(1).unsqueeze(2)
        trg_mask = (trg != TRG_VOCAB[PAD_TOKEN]).unsqueeze(1).unsqueeze(3)
        seq_length = trg.shape[1]
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        trg_mask = trg_mask & nopeak_mask
        return src_mask, trg_mask
        
    def forward(self, src, trg):
        src_mask, trg_mask = self.generate_mask(src, trg)
        
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src) * self.scale))
        trg_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(trg) * self.scale))
        
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
            
        dec_output = trg_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, trg_mask)
        
        output = self.fc_out(dec_output)
        return output

In [26]:
SRC_VOCAB_SIZE = len(SRC_VOCAB)
TRG_VOCAB_SIZE = len(TRG_VOCAB)
D_MODEL = 512
NUM_HEADS = 8
NUM_LAYERS = 6
D_FF = 2048
MAX_SEQ_LENGTH = 100
DROPOUT = 0.1

In [27]:
model = Transformer(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, D_MODEL, NUM_HEADS, NUM_LAYERS, D_FF, MAX_SEQ_LENGTH, DROPOUT)


In [28]:
print(f"The model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters")

The model has 63,738,949 trainable parameters


In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
PAD_IDX = SRC_VOCAB[PAD_TOKEN]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [30]:
# Collate function for DataLoader
def collate_fn(batch):
    src_batch, trg_batch = [], []
    for sample in batch:
        src_batch.append(torch.tensor([SRC_VOCAB.get(token, SRC_VOCAB[UNK_TOKEN]) for token in [SOS_TOKEN] + sample['src'] + [EOS_TOKEN]]))
        trg_batch.append(torch.tensor([TRG_VOCAB.get(token, TRG_VOCAB[UNK_TOKEN]) for token in [SOS_TOKEN] + sample['trg'] + [EOS_TOKEN]]))
    
    src_batch = pad_sequence(src_batch, padding_value=SRC_VOCAB[PAD_TOKEN])
    trg_batch = pad_sequence(trg_batch, padding_value=TRG_VOCAB[PAD_TOKEN])
    
    return src_batch.transpose(0, 1), trg_batch.transpose(0, 1)

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    print(len(iterator))
    for src, trg in tqdm(iterator, desc="Training", leave=False):
        optimizer.zero_grad()
        
        output = model(src, trg[:, :-1])
        
        output_dim = output.shape[-1]
        
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src, trg = batch
            
            output = model(src, trg[:, :-1])
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)
            
            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def translate_sentence(sentence, src_vocab, trg_vocab, model, device, max_len=50):
    model.eval()
    
    tokens = [SOS_TOKEN] + tokenize_de(sentence) + [EOS_TOKEN]
    
    src_indexes = [src_vocab.get(token, src_vocab[UNK_TOKEN]) for token in tokens]
    
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    
    src_mask = model.generate_mask(src_tensor, src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder_embedding(src_tensor)
        for enc_layer in model.encoder_layers:
            enc_src = enc_layer(enc_src, src_mask[0])
    
    trg_indexes = [trg_vocab[SOS_TOKEN]]
    
    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        
        trg_mask = model.generate_mask(src_tensor, trg_tensor)
        
        with torch.no_grad():
            output = model.decoder_embedding(trg_tensor)
            for dec_layer in model.decoder_layers:
                output = dec_layer(output, enc_src, src_mask[0], trg_mask[1])
            output = model.fc_out(output)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)
        
        if pred_token == trg_vocab[EOS_TOKEN]:
            break
    
    trg_tokens = [list(trg_vocab.keys())[list(trg_vocab.values()).index(i)] for i in trg_indexes]
    
    return trg_tokens[1:-1]

In [31]:
# Training loop
N_EPOCHS = 100
CLIP = 1.0
BATCH_SIZE = 32

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, collate_fn=collate_fn)

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_dataloader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer-translation-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

# Load the best model for evaluation
model.load_state_dict(torch.load('transformer-translation-model.pt'))

907


                                                           

Epoch: 01 | Time: 11.0m 18.827818155288696s
	Train Loss: 3.770 | Train PPL:  43.373
	 Val. Loss: 2.994 |  Val. PPL:  19.973
907


                                                           

Epoch: 02 | Time: 10.0m 54.932904958724976s
	Train Loss: 2.786 | Train PPL:  16.222
	 Val. Loss: 2.645 |  Val. PPL:  14.084
907


                                                           

Epoch: 03 | Time: 8.0m 46.33641195297241s
	Train Loss: 2.395 | Train PPL:  10.972
	 Val. Loss: 2.402 |  Val. PPL:  11.044
907


                                                           

Epoch: 04 | Time: 8.0m 55.27112293243408s
	Train Loss: 2.120 | Train PPL:   8.334
	 Val. Loss: 2.244 |  Val. PPL:   9.426
907


                                                           

Epoch: 05 | Time: 8.0m 54.18117308616638s
	Train Loss: 1.901 | Train PPL:   6.691
	 Val. Loss: 2.213 |  Val. PPL:   9.141
907


                                                           

Epoch: 06 | Time: 9.0m 7.908630132675171s
	Train Loss: 1.708 | Train PPL:   5.520
	 Val. Loss: 2.137 |  Val. PPL:   8.474
907


                                                           

Epoch: 07 | Time: 8.0m 56.20693874359131s
	Train Loss: 1.533 | Train PPL:   4.633
	 Val. Loss: 2.123 |  Val. PPL:   8.356
907


                                                           

Epoch: 08 | Time: 8.0m 55.91201901435852s
	Train Loss: 1.372 | Train PPL:   3.944
	 Val. Loss: 2.115 |  Val. PPL:   8.293
907


                                                           

Epoch: 09 | Time: 8.0m 51.35969805717468s
	Train Loss: 1.222 | Train PPL:   3.394
	 Val. Loss: 2.167 |  Val. PPL:   8.732
907


                                                           

Epoch: 10 | Time: 8.0m 57.063929080963135s
	Train Loss: 1.083 | Train PPL:   2.955
	 Val. Loss: 2.176 |  Val. PPL:   8.815
907


                                                           

Epoch: 11 | Time: 8.0m 49.53402876853943s
	Train Loss: 0.955 | Train PPL:   2.600
	 Val. Loss: 2.245 |  Val. PPL:   9.436
907


                                                           

Epoch: 12 | Time: 8.0m 52.04048466682434s
	Train Loss: 0.837 | Train PPL:   2.309
	 Val. Loss: 2.274 |  Val. PPL:   9.715
907


                                                           

Epoch: 13 | Time: 8.0m 12.7898108959198s
	Train Loss: 0.734 | Train PPL:   2.082
	 Val. Loss: 2.315 |  Val. PPL:  10.123
907


                                                           

Epoch: 14 | Time: 8.0m 59.64967107772827s
	Train Loss: 0.647 | Train PPL:   1.909
	 Val. Loss: 2.381 |  Val. PPL:  10.813
907


                                                           

Epoch: 15 | Time: 9.0m 22.646697998046875s
	Train Loss: 0.570 | Train PPL:   1.768
	 Val. Loss: 2.502 |  Val. PPL:  12.201
907


                                                           

Epoch: 16 | Time: 8.0m 43.13524293899536s
	Train Loss: 0.506 | Train PPL:   1.658
	 Val. Loss: 2.533 |  Val. PPL:  12.594
907


                                                           

Epoch: 17 | Time: 8.0m 52.71027421951294s
	Train Loss: 0.455 | Train PPL:   1.576
	 Val. Loss: 2.564 |  Val. PPL:  12.990
907


                                                           

Epoch: 18 | Time: 8.0m 44.91269898414612s
	Train Loss: 0.411 | Train PPL:   1.508
	 Val. Loss: 2.619 |  Val. PPL:  13.718
907


                                                           

Epoch: 19 | Time: 9.0m 1.1401219367980957s
	Train Loss: 0.379 | Train PPL:   1.460
	 Val. Loss: 2.653 |  Val. PPL:  14.201
907


                                                           

Epoch: 20 | Time: 9.0m 6.861113786697388s
	Train Loss: 0.355 | Train PPL:   1.427
	 Val. Loss: 2.670 |  Val. PPL:  14.433
907


                                                           

Epoch: 21 | Time: 9.0m 2.417980909347534s
	Train Loss: 0.340 | Train PPL:   1.405
	 Val. Loss: 2.689 |  Val. PPL:  14.721
907


                                                           

Epoch: 22 | Time: 8.0m 58.057718992233276s
	Train Loss: 0.323 | Train PPL:   1.382
	 Val. Loss: 2.735 |  Val. PPL:  15.408
907


                                                           

Epoch: 23 | Time: 8.0m 57.7164888381958s
	Train Loss: 0.307 | Train PPL:   1.359
	 Val. Loss: 2.751 |  Val. PPL:  15.653
907


                                                           

Epoch: 24 | Time: 9.0m 2.3258819580078125s
	Train Loss: 0.294 | Train PPL:   1.341
	 Val. Loss: 2.765 |  Val. PPL:  15.873
907


                                                           

Epoch: 25 | Time: 8.0m 32.1035361289978s
	Train Loss: 0.283 | Train PPL:   1.327
	 Val. Loss: 2.797 |  Val. PPL:  16.401
907


                                                           

Epoch: 26 | Time: 8.0m 44.46570706367493s
	Train Loss: 0.270 | Train PPL:   1.310
	 Val. Loss: 2.788 |  Val. PPL:  16.248
907


                                                           

Epoch: 27 | Time: 8.0m 55.830233097076416s
	Train Loss: 0.261 | Train PPL:   1.298
	 Val. Loss: 2.832 |  Val. PPL:  16.985
907


                                                           

Epoch: 28 | Time: 9.0m 11.61724591255188s
	Train Loss: 0.253 | Train PPL:   1.287
	 Val. Loss: 2.843 |  Val. PPL:  17.161
907


                                                           

Epoch: 29 | Time: 9.0m 8.516356945037842s
	Train Loss: 0.245 | Train PPL:   1.277
	 Val. Loss: 2.884 |  Val. PPL:  17.892
907


                                                           

Epoch: 30 | Time: 8.0m 45.49085092544556s
	Train Loss: 0.238 | Train PPL:   1.269
	 Val. Loss: 2.909 |  Val. PPL:  18.335
907


                                                           

Epoch: 31 | Time: 9.0m 16.190500259399414s
	Train Loss: 0.232 | Train PPL:   1.261
	 Val. Loss: 2.864 |  Val. PPL:  17.525
907


                                                           

Epoch: 32 | Time: 8.0m 57.94319987297058s
	Train Loss: 0.228 | Train PPL:   1.256
	 Val. Loss: 2.946 |  Val. PPL:  19.035
907


                                                           

Epoch: 33 | Time: 8.0m 54.89789605140686s
	Train Loss: 0.222 | Train PPL:   1.249
	 Val. Loss: 2.942 |  Val. PPL:  18.954
907


                                                           

Epoch: 34 | Time: 8.0m 58.59556794166565s
	Train Loss: 0.218 | Train PPL:   1.244
	 Val. Loss: 2.985 |  Val. PPL:  19.792
907


                                                           

Epoch: 35 | Time: 8.0m 51.57616877555847s
	Train Loss: 0.215 | Train PPL:   1.239
	 Val. Loss: 2.954 |  Val. PPL:  19.176
907


                                                           

Epoch: 36 | Time: 8.0m 49.10818696022034s
	Train Loss: 0.211 | Train PPL:   1.235
	 Val. Loss: 2.965 |  Val. PPL:  19.393
907


                                                           

Epoch: 37 | Time: 8.0m 47.23843693733215s
	Train Loss: 0.207 | Train PPL:   1.231
	 Val. Loss: 2.946 |  Val. PPL:  19.031
907


                                                           

Epoch: 38 | Time: 8.0m 46.461331844329834s
	Train Loss: 0.205 | Train PPL:   1.228
	 Val. Loss: 2.960 |  Val. PPL:  19.304
907


                                                           

Epoch: 39 | Time: 8.0m 49.96688532829285s
	Train Loss: 0.202 | Train PPL:   1.224
	 Val. Loss: 2.992 |  Val. PPL:  19.920
907


                                                           

Epoch: 40 | Time: 8.0m 14.84033489227295s
	Train Loss: 0.200 | Train PPL:   1.221
	 Val. Loss: 3.048 |  Val. PPL:  21.070
907


                                                           

Epoch: 41 | Time: 8.0m 37.99388384819031s
	Train Loss: 0.198 | Train PPL:   1.219
	 Val. Loss: 2.987 |  Val. PPL:  19.830
907


                                                           

Epoch: 42 | Time: 9.0m 23.78581690788269s
	Train Loss: 0.196 | Train PPL:   1.216
	 Val. Loss: 3.018 |  Val. PPL:  20.459
907


                                                           

Epoch: 43 | Time: 8.0m 53.444684743881226s
	Train Loss: 0.194 | Train PPL:   1.214
	 Val. Loss: 3.030 |  Val. PPL:  20.688
907


                                                           

Epoch: 44 | Time: 8.0m 53.31152296066284s
	Train Loss: 0.191 | Train PPL:   1.211
	 Val. Loss: 2.998 |  Val. PPL:  20.044
907


                                                           

Epoch: 45 | Time: 8.0m 55.59679675102234s
	Train Loss: 0.190 | Train PPL:   1.209
	 Val. Loss: 3.036 |  Val. PPL:  20.812
907


                                                           

Epoch: 46 | Time: 8.0m 57.775217056274414s
	Train Loss: 0.186 | Train PPL:   1.205
	 Val. Loss: 3.045 |  Val. PPL:  21.002
907


                                                           

Epoch: 47 | Time: 9.0m 1.3599638938903809s
	Train Loss: 0.185 | Train PPL:   1.204
	 Val. Loss: 3.058 |  Val. PPL:  21.281
907


                                                           

Epoch: 48 | Time: 8.0m 38.35386800765991s
	Train Loss: 0.184 | Train PPL:   1.202
	 Val. Loss: 3.029 |  Val. PPL:  20.675
907


                                                           

Epoch: 49 | Time: 8.0m 56.09635305404663s
	Train Loss: 0.182 | Train PPL:   1.199
	 Val. Loss: 3.048 |  Val. PPL:  21.074
907


                                                           

Epoch: 50 | Time: 8.0m 56.92475986480713s
	Train Loss: 0.182 | Train PPL:   1.199
	 Val. Loss: 3.056 |  Val. PPL:  21.242
907


                                                           

Epoch: 51 | Time: 9.0m 2.482003927230835s
	Train Loss: 0.179 | Train PPL:   1.196
	 Val. Loss: 3.045 |  Val. PPL:  21.002
907


                                                           

Epoch: 52 | Time: 8.0m 50.11277794837952s
	Train Loss: 0.178 | Train PPL:   1.195
	 Val. Loss: 3.029 |  Val. PPL:  20.685
907


                                                           

Epoch: 53 | Time: 8.0m 50.21841096878052s
	Train Loss: 0.176 | Train PPL:   1.193
	 Val. Loss: 3.053 |  Val. PPL:  21.174
907


                                                           

Epoch: 54 | Time: 8.0m 52.633220195770264s
	Train Loss: 0.176 | Train PPL:   1.192
	 Val. Loss: 3.069 |  Val. PPL:  21.521
907


                                                           

Epoch: 55 | Time: 9.0m 24.011665105819702s
	Train Loss: 0.174 | Train PPL:   1.190
	 Val. Loss: 3.086 |  Val. PPL:  21.885
907


                                                           

Epoch: 56 | Time: 9.0m 5.684516191482544s
	Train Loss: 0.174 | Train PPL:   1.190
	 Val. Loss: 3.061 |  Val. PPL:  21.339
907


                                                           

Epoch: 57 | Time: 8.0m 55.579249143600464s
	Train Loss: 0.172 | Train PPL:   1.187
	 Val. Loss: 3.138 |  Val. PPL:  23.064
907


                                                           

Epoch: 58 | Time: 9.0m 2.2847049236297607s
	Train Loss: 0.172 | Train PPL:   1.188
	 Val. Loss: 3.090 |  Val. PPL:  21.982
907


                                                           

Epoch: 59 | Time: 8.0m 35.127341747283936s
	Train Loss: 0.170 | Train PPL:   1.186
	 Val. Loss: 3.077 |  Val. PPL:  21.698
907


                                                           

Epoch: 60 | Time: 8.0m 13.501152992248535s
	Train Loss: 0.169 | Train PPL:   1.185
	 Val. Loss: 3.128 |  Val. PPL:  22.829
907


                                                           

Epoch: 61 | Time: 8.0m 46.55457615852356s
	Train Loss: 0.169 | Train PPL:   1.184
	 Val. Loss: 3.076 |  Val. PPL:  21.671
907


                                                           

Epoch: 62 | Time: 8.0m 9.52866792678833s
	Train Loss: 0.168 | Train PPL:   1.183
	 Val. Loss: 3.108 |  Val. PPL:  22.372
907


                                                           

Epoch: 63 | Time: 8.0m 30.082476139068604s
	Train Loss: 0.167 | Train PPL:   1.182
	 Val. Loss: 3.093 |  Val. PPL:  22.047
907


                                                           

Epoch: 64 | Time: 7.0m 56.332679748535156s
	Train Loss: 0.167 | Train PPL:   1.182
	 Val. Loss: 3.073 |  Val. PPL:  21.617
907


                                                           

Epoch: 65 | Time: 8.0m 9.851258039474487s
	Train Loss: 0.165 | Train PPL:   1.179
	 Val. Loss: 3.104 |  Val. PPL:  22.280
907


                                                           

Epoch: 66 | Time: 8.0m 5.845634937286377s
	Train Loss: 0.164 | Train PPL:   1.178
	 Val. Loss: 3.121 |  Val. PPL:  22.663
907


                                                           

Epoch: 67 | Time: 8.0m 10.841559886932373s
	Train Loss: 0.164 | Train PPL:   1.178
	 Val. Loss: 3.065 |  Val. PPL:  21.438
907


                                                           

Epoch: 68 | Time: 7.0m 45.26745128631592s
	Train Loss: 0.163 | Train PPL:   1.177
	 Val. Loss: 3.122 |  Val. PPL:  22.699
907


                                                           

Epoch: 69 | Time: 8.0m 36.06680393218994s
	Train Loss: 0.163 | Train PPL:   1.177
	 Val. Loss: 3.097 |  Val. PPL:  22.132
907


                                                           

Epoch: 70 | Time: 8.0m 37.40251111984253s
	Train Loss: 0.162 | Train PPL:   1.176
	 Val. Loss: 3.094 |  Val. PPL:  22.068
907


                                                           

Epoch: 71 | Time: 8.0m 27.53780508041382s
	Train Loss: 0.160 | Train PPL:   1.174
	 Val. Loss: 3.080 |  Val. PPL:  21.763
907


                                                           

Epoch: 72 | Time: 8.0m 26.56277823448181s
	Train Loss: 0.160 | Train PPL:   1.173
	 Val. Loss: 3.121 |  Val. PPL:  22.664
907


                                                           

Epoch: 73 | Time: 9.0m 17.98567008972168s
	Train Loss: 0.159 | Train PPL:   1.172
	 Val. Loss: 3.119 |  Val. PPL:  22.614
907


                                                           

Epoch: 74 | Time: 8.0m 52.912083864212036s
	Train Loss: 0.159 | Train PPL:   1.172
	 Val. Loss: 3.053 |  Val. PPL:  21.188
907


                                                           

Epoch: 75 | Time: 8.0m 58.206515312194824s
	Train Loss: 0.158 | Train PPL:   1.171
	 Val. Loss: 3.120 |  Val. PPL:  22.641
907


                                                           

Epoch: 76 | Time: 8.0m 58.10264015197754s
	Train Loss: 0.158 | Train PPL:   1.171
	 Val. Loss: 3.125 |  Val. PPL:  22.760
907


                                                           

Epoch: 77 | Time: 9.0m 4.534527063369751s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 3.142 |  Val. PPL:  23.157
907


                                                           

Epoch: 78 | Time: 8.0m 56.29962396621704s
	Train Loss: 0.157 | Train PPL:   1.169
	 Val. Loss: 3.194 |  Val. PPL:  24.375
907


                                                           

Epoch: 79 | Time: 8.0m 53.55362582206726s
	Train Loss: 0.156 | Train PPL:   1.169
	 Val. Loss: 3.151 |  Val. PPL:  23.349
907


                                                           

Epoch: 80 | Time: 8.0m 43.125404834747314s
	Train Loss: 0.155 | Train PPL:   1.167
	 Val. Loss: 3.133 |  Val. PPL:  22.945
907


                                                           

Epoch: 81 | Time: 8.0m 33.87627291679382s
	Train Loss: 0.154 | Train PPL:   1.167
	 Val. Loss: 3.146 |  Val. PPL:  23.249
907


                                                           

Epoch: 82 | Time: 8.0m 38.536235094070435s
	Train Loss: 0.153 | Train PPL:   1.166
	 Val. Loss: 3.144 |  Val. PPL:  23.192
907


                                                           

Epoch: 83 | Time: 8.0m 53.328369140625s
	Train Loss: 0.154 | Train PPL:   1.166
	 Val. Loss: 3.140 |  Val. PPL:  23.112
907


                                                           

Epoch: 84 | Time: 8.0m 32.04570174217224s
	Train Loss: 0.154 | Train PPL:   1.167
	 Val. Loss: 3.121 |  Val. PPL:  22.672
907


                                                           

Epoch: 85 | Time: 7.0m 58.50881385803223s
	Train Loss: 0.154 | Train PPL:   1.166
	 Val. Loss: 3.147 |  Val. PPL:  23.264
907


                                                           

Epoch: 86 | Time: 7.0m 25.602509260177612s
	Train Loss: 0.152 | Train PPL:   1.165
	 Val. Loss: 3.133 |  Val. PPL:  22.952
907


                                                           

Epoch: 87 | Time: 7.0m 31.86610507965088s
	Train Loss: 0.152 | Train PPL:   1.164
	 Val. Loss: 3.148 |  Val. PPL:  23.299
907


                                                           

Epoch: 88 | Time: 8.0m 5.1698150634765625s
	Train Loss: 0.152 | Train PPL:   1.164
	 Val. Loss: 3.147 |  Val. PPL:  23.270
907


                                                           

Epoch: 89 | Time: 7.0m 47.43869996070862s
	Train Loss: 0.151 | Train PPL:   1.163
	 Val. Loss: 3.161 |  Val. PPL:  23.586
907


                                                           

Epoch: 90 | Time: 7.0m 45.97254204750061s
	Train Loss: 0.150 | Train PPL:   1.162
	 Val. Loss: 3.087 |  Val. PPL:  21.905
907


                                                           

Epoch: 91 | Time: 7.0m 42.52696394920349s
	Train Loss: 0.150 | Train PPL:   1.161
	 Val. Loss: 3.200 |  Val. PPL:  24.526
907


                                                           

Epoch: 92 | Time: 8.0m 38.30528736114502s
	Train Loss: 0.150 | Train PPL:   1.162
	 Val. Loss: 3.131 |  Val. PPL:  22.893
907


                                                           

Epoch: 93 | Time: 9.0m 0.9147191047668457s
	Train Loss: 0.149 | Train PPL:   1.161
	 Val. Loss: 3.151 |  Val. PPL:  23.363
907


                                                           

Epoch: 94 | Time: 8.0m 25.49916911125183s
	Train Loss: 0.149 | Train PPL:   1.161
	 Val. Loss: 3.180 |  Val. PPL:  24.059
907


                                                           

Epoch: 95 | Time: 8.0m 28.967164039611816s
	Train Loss: 0.149 | Train PPL:   1.160
	 Val. Loss: 3.147 |  Val. PPL:  23.260
907


                                                           

Epoch: 96 | Time: 7.0m 30.238784790039062s
	Train Loss: 0.149 | Train PPL:   1.160
	 Val. Loss: 3.167 |  Val. PPL:  23.736
907


                                                           

Epoch: 97 | Time: 8.0m 1.4299991130828857s
	Train Loss: 0.148 | Train PPL:   1.159
	 Val. Loss: 3.160 |  Val. PPL:  23.565
907


                                                           

Epoch: 98 | Time: 8.0m 12.562832117080688s
	Train Loss: 0.148 | Train PPL:   1.160
	 Val. Loss: 3.173 |  Val. PPL:  23.889
907


                                                           

Epoch: 99 | Time: 7.0m 47.48849415779114s
	Train Loss: 0.147 | Train PPL:   1.158
	 Val. Loss: 3.153 |  Val. PPL:  23.416
907


                                                           

Epoch: 100 | Time: 7.0m 58.13723921775818s
	Train Loss: 0.146 | Train PPL:   1.157
	 Val. Loss: 3.128 |  Val. PPL:  22.821


<All keys matched successfully>

In [34]:
for example_idx in range(3):  # Change the range to translate more examples
    src = test_data[example_idx]['src']
    trg = test_data[example_idx]['trg']

    print(f'Source: {" ".join(src)}')
    print(f'Target: {" ".join(trg)}')

    translation = translate_sentence(" ".join(src), SRC_VOCAB, TRG_VOCAB, model, torch.device('cpu' if torch.cuda.is_available() else 'cpu'))
    print(f'Predicted: {" ".join(translation)}')
    print()

Source: ein mann mit einem orangefarbenen hut , der etwas anstarrt .
Target: a man in an orange hat starring at something .
Predicted: a man in an orange hat is walking something with a man .

Source: ein boston terrier läuft über saftig-grünes gras vor einem weißen zaun .
Target: a boston terrier is running on lush green grass in front of a white fence .
Predicted: a black and white dog runs over a white fence while on grass .

Source: ein mädchen in einem karateanzug bricht ein brett mit einem tritt .
Target: a girl in karate uniform breaking a stick with a front kick .
Predicted: a girl in a suit is performing a board game in a board game .



In [35]:
# Save the model
torch.save(model.state_dict(), 'final_transformer_translation_model.pt')
print("Model saved as 'final_transformer_translation_model.pt'")

Model saved as 'final_transformer_translation_model.pt'
