In [87]:
#install quietly
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
!pip install --quiet sentencepiece
!pip install --quiet datasets
!pip install --quiet subword-nmt
!pip install --quiet tokenizers

### Dataset 

In [2]:
from datasets import load_dataset

# Load the pre-processed dataset
dataset = load_dataset("stas/wmt14-en-de-pre-processed",verification_mode="no_checks")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
len(dataset['train']['translation'])

4548885

In [4]:
train_dataset = dataset['train']['translation']
val_dataset = dataset['validation']['translation']
test_dataset = dataset['test']['translation']

def create_file(dataset, file_name):
    with open(file_name + ".txt","w",encoding="utf-8") as f:
        for item in dataset:
            f.write('[DE] ' + item['de'] + '\n')
            f.write('[EN] ' + item['en'] + '\n')

create_file(train_dataset, "train")
create_file(val_dataset, "val")
create_file(test_dataset, "test")

### BPE Tokenizer

In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(vocab_size=37000, show_progress=True, special_tokens=["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]","[DE]","[EN]"], min_frequency=2, continuing_subword_prefix="@@")
tokenizer.train(files=["train.txt"], trainer=trainer)
tokenizer.save("tokenizer.json")






In [7]:
sentence = "Ich bin ein Berliner"
print(tokenizer.encode(sentence).ids)
print(tokenizer.encode(sentence).tokens)

[7072, 8397, 6671, 20789]
['Ich', 'bin', 'ein', 'Berliner']


### INPUT EMBEDDING

In [75]:
import torch.nn as nn
import torch
import numpy as np

#we need to move vocab size as a hardcode input

class InputEmbedding(nn.Module):
    """_summary_:Performs normalized embedddings. Returns N x d_k dimensions
    """
    def __init__(self, vocab_size, d_k):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_k = d_k
        #This will create hypervectors for the whole vocab_size with dimension d_k
        self.embedding = nn.Embedding(vocab_size, d_k)

    def forward(self, x):
        #the reason why we multiply by sqrt(d_k) is because we compute dot products of the embeddings
        #they would grow up very large so we need to normalize by d_k
        return self.embedding(x)*torch.sqrt(torch.tensor(self.d_k))

class PositionalEncoding(nn.Module):
    
    def __init__(self):
        super().__init__()
        
    def forward(self, x):
        #x is the input embedding
        pos = torch.arange(x.size(1)).unsqueeze(1)
        i = torch.arange(x.size(2))        
        print(pos.shape,i.shape)
        elements = pos/torch.pow(10000, 2*i/x.size(2))
        print(elements.shape)
        self.pe = torch.zeros(x.shape)
        self.pe[:,:,1::2] = torch.sin(elements[:,0::2])
        self.pe[:,:,0::2] = torch.cos(elements[:,1::2])
        
        return x+self.pe

In [74]:
sentence = "Ich bin ein Berliner"
ids = tokenizer.encode(sentence).ids
ids = torch.tensor(ids).unsqueeze(0)
print("IDs: ", ids)
input_embedding = InputEmbedding(37000, 10)
embeddings = input_embedding(ids)
print("Embeddings: ", embeddings.shape)
positional_encoding = PositionalEncoding()
position_encoded = positional_encoding(embeddings)
print("Encodings",position_encoded.shape)

IDs:  tensor([[ 7072,  8397,  6671, 20789]])
Embeddings:  torch.Size([1, 4, 10])
torch.Size([4, 1]) torch.Size([10])
torch.Size([4, 10])
Encodings torch.Size([1, 4, 10])


### Encoder

In [83]:
import torch.nn as nn
import torch

class MultiHeadAttention(nn.Module):
    """_summary_:Performs multihead attention
       _inputs_: input_embeddings
    """
    def __init__(self, d_model, num_heads):
        self.d_model = d_model
        self.num_heads = num_heads
        self.linear_values = nn.Linear(d_model, d_model)
        self.Q_proj = nn.Linear(d_model, d_model)
        self.K_proj = nn.Linear(d_model, d_model)
        self.V_proj = nn.Linear(d_model, d_model)
        
    def forward(self,query,key,value):
        #we need to project the query, key and value matrices to the same dimension
        batch_size, seq_len, d_model = query.shape
        Q = self.Q_proj(query)
        K = self.K_proj(key)
        V = self.V_proj(value)
        
        self.d_k = d_model//self.num_heads
        self.d_v = d_model//self.num_heads
        self.scale = 1/torch.sqrt(torch.tensor(self.d_k))
        
        #We add head dimension so we split the embeddings into different heads
        Q = Q.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        K = K.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = V.view(batch_size, seq_len, self.num_heads, self.d_v).transpose(1, 2)
        
        #dimensions are now batch_size, num_heads, seq_len, d_k/d_v
        attention_weights = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
        attention_weights = torch.softmax(attention_weights, dim=-1)
        
        #attention weights dimensions are now batch_size, num_heads, seq_len, seq_len
        values = torch.matmul(attention_weights, V)
        
        #values is now batch_size, num_heads, seq_len, d_v
        values = values.transpose(1,2)
        
        #values is now batch_size, seq_len, num_heads, d_v
        #contiguous() is used to make the tensor contiguous in memory otherwise view() will fail
        values = values.contiguous().view(batch_size, seq_len, d_model)
        values = self.linear_values(values)
        return values
    

In [84]:
class EncoderBlock(nn.Module):
    """_summary_:Performs one encoder layer
       _inputs_: d_model, num_heads
    """
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        
    def forward(self, x):
        x = self.norm1(x + self.multi_head_attention(query=x,key=x,value=x))
        x = self.norm2(x + self.feed_forward(x))
        return x


class Encoder(nn.Module):
    """_summary_:Performs one encoder layer
       _inputs_: d_model, num_heads, num_layers, input_embedding
    """
    def __init__(self, d_model, num_heads, num_layers):
        super().__init__()
        self.encoder_blocks = nn.ModuleList([EncoderBlock(d_model, num_heads) for _ in range(num_layers)])
        
    def forward(self, x):
        for encoder_block in self.encoder_blocks:
            x = encoder_block(x)
        return x

In [88]:
class DecoderBlock(nn.Module):
    """_summary_:Performs one decoder layer
       _inputs_: d_model, num_heads
    """
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.multi_head_attention_2 = MultiHeadAttention(d_model, num_heads)
        self.norm2 = nn.LayerNorm(d_model)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model)
        )
        self.norm3 = nn.LayerNorm(d_model)
    
    def forward(self, x, encoder_output):
        x = self.norm1(x + self.multi_head_attention(query=x,key=x,value=x))
        x = self.norm2(x + self.multi_head_attention_2(query=encoder_output,key=encoder_output,value=x))
        x = self.norm3(x + self.feed_forward(x))
        return x

class Decoder(nn.Module):
    """_summary_:Performs one decoder layer
       _inputs_: d_model, num_heads, num_layers
    """
    def __init__(self, d_model, num_heads, num_layers):
        super().__init__()
        self.decoder_blocks = nn.ModuleList([DecoderBlock(d_model, num_heads) for _ in range(num_layers)])
        
    def forward(self, x, encoder_output):
        for decoder_block in self.decoder_blocks:
            x = decoder_block(x, encoder_output)
        return x

class Transformer(nn.Module):
    """_summary_:Performs one transformer layer
       _inputs_: d_model, num_heads, num_layers, input_embedding
    """
    def __init__(self, d_model, num_heads, num_layers,vocab_size):
        super().__init__()
        self.encoder = Encoder(d_model, num_heads, num_layers)
        self.decoder = Decoder(d_model, num_heads, num_layers)
        self.linear_projection = nn.Linear(d_model, d_model)
        self.input_embedding = InputEmbedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding()
    
    def forward(self, input, output):
        x1 = self.input_embedding(input)
        x1 = self.positional_encoding(x1)
        x1 = self.encoder(x1)
        
        x2 = self.input_embedding(output)
        x2 = self.positional_encoding(x2)
        x2 = self.decoder(x2, x1)
        
        x2 = self.linear_projection(x2)
        return x2
    
    