In [1]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

In [2]:
class InputEmbeddings(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(d_model)."""

    def __init__(self, d_model: int, vocab_size: int):
        """Create an embedding table with `vocab_size` rows of width `d_model`."""
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Look up the embeddings of the token ids in `x` and scale them.

        The scaling factor sqrt(d_model) follows the original Transformer paper.
        """
        scale = math.sqrt(self.d_model)
        embedded = self.embedding(x)
        return embedded * scale

In [3]:
torch.arange(0, 10, 2)

tensor([0, 2, 4, 6, 8])

In [4]:
# Frequency terms of the sinusoidal encoding for a width of 512:
# one value per even feature index, hence 256 entries.
a = torch.exp(torch.arange(0, 512, 2).float() * (-math.log(10000.0) / 512))
a.shape

torch.Size([256])

In [5]:
position = torch.arange(0, 10, dtype=torch.float).unsqueeze(1)
position.shape

torch.Size([10, 1])

In [6]:
(position * a).shape

torch.Size([10, 256])

In [7]:
class PositionEncoding(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    A table of shape (1, seq_len, d_model) is precomputed once: even feature
    columns hold sin(position * freq) and odd columns hold cos(position * freq).
    The table is registered as a buffer, so it is saved with the module and
    follows it across devices, but it is never trained.

    Args:
        d_model: feature width of the inputs (even or odd).
        seq_len: number of positions to precompute.
        dropout: dropout probability applied after adding the encoding.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Positional table, filled column-wise below.
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions, shape (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # One frequency per even column index, shape (ceil(d_model / 2),).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (seq_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 frequencies are used for the cosines.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # (1, seq_len, d_model) so the table broadcasts over the batch dimension.
        pe = pe.unsqueeze(0)

        # Buffers are stored in the state dict but are not parameters.
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor):
        """Add the encoding of the first `x.shape[1]` positions to `x`, then apply dropout."""
        # Buffers never require grad, so no explicit requires_grad_(False) is needed.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

In [8]:
a = torch.tensor([1, 2, 3])
torch.sqrt(a)

tensor([1.0000, 1.4142, 1.7321])

In [9]:
a = nn.Parameter(torch.zeros(2, 2), requires_grad=True)
a * 1

tensor([[0., 0.],
        [0., 0.]], grad_fn=<MulBackward0>)

In [10]:
class LayerNormalization(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Args:
        last_dim: size of the last dimension of the inputs.
        epsilon: constant added to the variance before the square root to
            avoid division by zero.
    """

    def __init__(self, last_dim: int, epsilon: float=1e-6):
        super().__init__()
        self.last_dim = last_dim
        self.alpha = nn.Parameter(torch.ones(last_dim), requires_grad=True)  # scale
        self.beta = nn.Parameter(torch.zeros(last_dim), requires_grad=True)  # shift
        self.epsilon = epsilon

    def forward(self, x: torch.Tensor):
        """Normalize `x` along its last dimension and apply `alpha * x_norm + beta`."""
        xmean = x.mean(dim=-1, keepdim=True)
        # Biased (population) variance, as layer normalization is defined.
        # The default unbiased estimator divides by n - 1 and returns NaN
        # when the last dimension has a single element.
        xvar = x.var(dim=-1, keepdim=True, unbiased=False)
        xnorm = (x - xmean) / torch.sqrt(xvar + self.epsilon)
        return self.alpha * xnorm + self.beta

In [11]:
last_dim = 5
gamma = nn.Parameter(torch.ones(last_dim), requires_grad=True)
a = torch.ones((10, 5))

In [12]:
gamma * a

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]], grad_fn=<MulBackward0>)

In [13]:
layer_norm = LayerNormalization(128)

In [14]:
x = F.softmax(torch.rand(28, 128) * 2, dim=-1)
x.mean(dim=-1, keepdim=True).shape
x

tensor([[0.0139, 0.0025, 0.0088,  ..., 0.0044, 0.0067, 0.0110],
        [0.0029, 0.0125, 0.0068,  ..., 0.0085, 0.0038, 0.0057],
        [0.0041, 0.0034, 0.0026,  ..., 0.0026, 0.0049, 0.0086],
        ...,
        [0.0046, 0.0130, 0.0075,  ..., 0.0036, 0.0043, 0.0060],
        [0.0101, 0.0027, 0.0103,  ..., 0.0155, 0.0136, 0.0127],
        [0.0053, 0.0048, 0.0031,  ..., 0.0026, 0.0045, 0.0097]])

In [15]:
# layer_norm(x)

In [16]:
class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model: int, d_ff: int, dropout: float):
        """Build the two projections (d_model -> d_ff -> d_model) and the dropout layer."""
        super().__init__()
        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff)  # W1, b1
        self.dropout = nn.Dropout(p=dropout)
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)  # W2, b2

    def forward(self, x: torch.Tensor):
        """Expand to d_ff, apply ReLU and dropout, then project back to d_model."""
        hidden = self.linear_1(x)
        hidden = torch.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

### Matrix multiplication in PyTorch

In [17]:
tensor1 = torch.randn(10, 3, 4)
tensor2 = torch.randn(10, 4, 5)
(tensor1 @ tensor2).shape

torch.Size([10, 3, 5])

In [18]:
# (tensor2 @ tensor1).shape (fails)

In [19]:
tensor3 = torch.randn(10, 3, 4)
# (tensor1 @ tensor3).shape (fails)

In [20]:
tensor5 = torch.randn(4)
(tensor1 @ tensor5).shape

torch.Size([10, 3])

In [21]:
tensor1 = torch.randn(10, 12, 3, 4)
tensor2 = torch.randn(10, 12, 4, 5)
(tensor1 @ tensor2).shape

torch.Size([10, 12, 3, 5])

In [22]:
tensor1 = torch.randn(10, 12, 3, 4)
tensor3 = torch.randn(5, 5)
# (tensor1 @ tensor3).shape (fails)

In [23]:
batch, seq_len, _,_ = tensor1.shape
batch, seq_len

(10, 12)

In [24]:
tensor1.contiguous().shape

torch.Size([10, 12, 3, 4])

In [25]:
tensor1.shape

torch.Size([10, 12, 3, 4])

In [26]:
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are projected with bias-free linear layers,
    split into `h` heads of width `d_k = d_model // h`, attended
    independently, concatenated, and projected once more with `w_o`.

    Args:
        d_model: feature width of inputs and outputs; must be divisible by `h`.
        h: number of attention heads.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, d_model: int, h: int, dropout: float):
        super().__init__()
        self.d_model = d_model
        self.h = h # number of heads
        assert d_model % h == 0, "d_model it not divisible by h"

        self.d_k = d_model // h # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False) # w_q
        self.w_k = nn.Linear(d_model, d_model, bias=False) # w_k
        self.w_v = nn.Linear(d_model, d_model, bias=False) # w_v
        self.w_o = nn.Linear(d_model, d_model, bias=False) # w_o

        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(q, k, v, mask, dropout: nn.Dropout):
        """Compute scaled dot-product attention.

        Args:
            q, k, v: tensors of shape (batch, h, seq_len, d_k).
            mask: optional tensor broadcastable to (batch, h, seq_len, seq_len);
                positions where `mask == 0` receive no attention weight.
            dropout: optional dropout module applied to the attention weights.

        Returns:
            A tuple `(output, weights)` of shapes (batch, h, seq_len, d_k)
            and (batch, h, seq_len, seq_len).
        """
        d_k = q.shape[-1]

        # (batch, h, seq_len, seq_len)
        attention_scores = (q @ k.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            # -1e9 is not representable in float16, so clamp the fill value to
            # the dtype's range. For float32 the value stays exactly -1e9.
            fill_value = max(-1e9, torch.finfo(attention_scores.dtype).min)
            attention_scores = attention_scores.masked_fill(mask == 0, fill_value)

        attention_scores = attention_scores.softmax(dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)

        # (batch, h, seq_len, d_k), (batch, h, seq_len, seq_len)
        return (attention_scores @ v), attention_scores

    def _split_heads(self, projected):
        """Reshape (batch, seq_len, d_model) into (batch, h, seq_len, d_k)."""
        batch, length = projected.shape[0], projected.shape[1]
        return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask):
        """Attend from `q` to `k`/`v` (each (batch, seq_len, d_model)) under `mask`.

        Returns a tensor of shape (batch, seq_len_q, d_model).
        """
        # Project, then split into heads: (batch, h, seq_len, d_k)
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        # The attention weights are not needed here, only the attended values.
        x, _ = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # Merge heads back: (batch, h, seq_len, d_k) -> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # (batch, seq_len, d_model)
        return self.w_o(x)

In [27]:
class ResidualConnection(nn.Module):
    """Pre-norm residual wrapper computing `x + dropout(sublayer(norm(x)))`."""

    def __init__(self, features: int, dropout: float):
        """Create the dropout layer and a `LayerNormalization` over `features`."""
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        """Apply `sublayer` to the normalized input and add the result back onto `x`."""
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

In [28]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each inside a `ResidualConnection`."""

    def __init__(
        self,
        features: int,
        self_attention_block: MultiHeadAttentionBlock,
        feed_forward_block: FeedForwardBlock,
        dropout: float,
    ):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0 wraps the attention sub-layer, index 1 the feed-forward sub-layer.
        self.residual_connections = nn.ModuleList(
            [ResidualConnection(features, dropout) for _ in range(2)]
        )

    def forward(self, x, src_mask):
        """Run both sub-layers on `x`; `src_mask` is forwarded to the attention block."""

        def self_attend(normed):
            # Query, key and value are all the same (normalized) tensor.
            return self.self_attention_block(normed, normed, normed, src_mask)

        attention_residual, feed_forward_residual = self.residual_connections
        attended = attention_residual(x, self_attend)
        return feed_forward_residual(attended, self.feed_forward_block)

In [29]:
class Encoder(nn.Module):
    """Stack of encoder blocks followed by a final layer normalization."""

    def __init__(self, features: int, encoder_blocks: nn.ModuleList):
        """Store the given blocks and create a `LayerNormalization` over `features`."""
        super().__init__()
        self.encoder_blocks = encoder_blocks
        self.norm = LayerNormalization(features)

    def forward(self, x, src_mask):
        """Pass `x` through every block in order (each receives `src_mask`), then normalize."""
        hidden = x
        for block in self.encoder_blocks:
            hidden = block(hidden, src_mask)
        return self.norm(hidden)

In [30]:
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention over the encoder output, feed-forward.

    Each of the three sub-layers is wrapped in its own `ResidualConnection`.
    """

    def __init__(
        self,
        features: int,
        self_attention_block: MultiHeadAttentionBlock,
        cross_attention_block: MultiHeadAttentionBlock,
        feed_forward_block: FeedForwardBlock,
        dropout: float,
    ):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList(
            [ResidualConnection(features, dropout) for _ in range(3)]
        )

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Apply the three sub-layers in order and return the updated decoder states."""

        def self_attend(normed):
            # Decoder states attend to themselves under the target mask.
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attend(normed):
            # Queries come from the decoder; keys and values from the encoder output.
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        sublayers = (self_attend, cross_attend, self.feed_forward_block)
        for residual, sublayer in zip(self.residual_connections, sublayers):
            x = residual(x, sublayer)
        return x

In [31]:
class Decoder(nn.Module):
    """Stack of decoder blocks followed by a final layer normalization."""

    def __init__(self, features: int, decoder_blocks: nn.ModuleList):
        """Create a `LayerNormalization` over `features` and store the given blocks."""
        super().__init__()
        self.norm = LayerNormalization(features)
        self.decoder_blocks = decoder_blocks

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed `x` through every block in order, then normalize the result.

        `encoder_output`, `src_mask` and `tgt_mask` are passed unchanged to each block.
        """
        hidden = x
        for block in self.decoder_blocks:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

In [32]:
class ProjectionLayer(nn.Module):
    """Linear map from `d_model` features to `vocab_size` scores per position."""

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.proj = nn.Linear(in_features=d_model, out_features=vocab_size)

    def forward(self, x):
        """Project the last dimension of `x` from d_model to vocab_size (no softmax applied)."""
        scores = self.proj(x)
        return scores

In [33]:
class Transformer(nn.Module):
    """Container wiring embeddings, positional encodings, encoder, decoder and projection.

    No `forward` is defined; callers use `encode`, `decode` and `project` separately.
    """

    def __init__(
        self,
        encoder: Encoder,
        decoder: Decoder,
        src_embed: InputEmbeddings,
        tgt_embed: InputEmbeddings,
        src_pos: PositionEncoding,
        tgt_pos: PositionEncoding,
        projection_layer: ProjectionLayer,
    ):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed `src`, add positional encoding, and run the encoder under `src_mask`."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, tgt, encoder_output, src_mask, tgt_mask):
        """Embed `tgt`, add positional encoding, and run the decoder against `encoder_output`."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Map decoder states `x` through the projection layer."""
        return self.projection_layer(x)

In [34]:
from dataclasses import dataclass

In [35]:
@dataclass
class ModelArgs:
    """Hyperparameters consumed by `build_transformer`."""

    # Vocabulary sizes: source for the input embedding, target for the
    # output embedding and the projection layer.
    src_vocab_size: int
    tgt_vocab_size: int
    # Number of positions precomputed by each positional encoding.
    src_seq_len: int
    tgt_seq_len: int
    # Feature width shared by every layer.
    d_model: int = 512
    # Number of encoder blocks and, equally, of decoder blocks.
    N: int = 6
    # Number of heads in each multi-head attention block.
    h: int = 8
    # Dropout probability used throughout the model.
    dropout: float = 0.1
    # Hidden width of the feed-forward blocks.
    d_ff: int = 2048

In [36]:
# DecoderBlock??

In [37]:
def build_transformer(model_args: ModelArgs):
    """Assemble a `Transformer` from `model_args` and Xavier-initialize its weights.

    Modules are constructed in a fixed order (embeddings, positional encodings,
    encoder blocks, decoder blocks, encoder, decoder, projection) and every
    parameter with more than one dimension is re-initialized with
    `xavier_uniform_` at the end.
    """
    d_model = model_args.d_model
    heads = model_args.h
    d_ff = model_args.d_ff
    dropout = model_args.dropout
    num_blocks = model_args.N

    # Token embeddings: source first, then target.
    src_embed = InputEmbeddings(d_model, model_args.src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, model_args.tgt_vocab_size)

    # Positional encodings, one per side.
    src_pos = PositionEncoding(d_model, model_args.src_seq_len, dropout)
    tgt_pos = PositionEncoding(d_model, model_args.tgt_seq_len, dropout)

    # Encoder blocks: every block owns its own attention and feed-forward modules.
    encoder_blocks = [
        EncoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, heads, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(num_blocks)
    ]

    # Decoder blocks: self-attention, cross-attention and feed-forward per block.
    decoder_blocks = [
        DecoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, heads, dropout),
            MultiHeadAttentionBlock(d_model, heads, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(num_blocks)
    ]

    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))

    projection_layer = ProjectionLayer(d_model, model_args.tgt_vocab_size)

    transformer = Transformer(
        encoder,
        decoder,
        src_embed,
        tgt_embed,
        src_pos,
        tgt_pos,
        projection_layer,
    )

    # Xavier-uniform init for weight matrices; 1-D parameters keep their defaults.
    for param in transformer.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)

    return transformer

### Build the dataset and tokenizer

In [38]:
from torch.utils.data import random_split, Dataset

In [39]:
# torch.tril??

In [40]:
# import torch.utils.tensorboard 

In [41]:
# decoder_input = torch.empty(1, 1).fill_(sos_idx)

In [42]:
%pip install altair

Note: you may need to restart the kernel to use updated packages.


In [43]:
%pip install tokenizers
%pip install datasets
%pip install torchmetrics
%pip install tensorboard

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [44]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

import torchmetrics
from torch.utils.tensorboard import SummaryWriter

  from .autonotebook import tqdm as notebook_tqdm


In [45]:
import warnings
from tqdm import tqdm
import os
from pathlib import Path

In [46]:
from torch.optim.lr_scheduler import LambdaLR

### config.py

In [47]:
from pathlib import Path

In [48]:
@dataclass
class ModelConfig:
    """Settings for the dataset, tokenizers, checkpoints and training run."""

    # Training-loop settings.
    batch_size: int = 8
    num_epochs: int = 2
    lr: float = 1e-4
    # Fixed length every encoded sentence is padded to, and the model width.
    seq_len: int = 350
    d_model: int = 512
    # Dataset name and the language pair (also used as the dataset config "src-tgt").
    datasource: str = 'opus_books'
    lang_src: str = 'en'
    lang_tgt: str = 'it'
    # Checkpoints live in "<datasource>_<model_folder>/<model_basename><epoch>.pt".
    model_folder: str = 'weights'
    model_basename: str = 'tmodel_'
    # 'latest' resumes from the newest checkpoint; any other non-empty value is
    # treated as the epoch to load; an empty value starts from scratch.
    preload: str = 'latest'
    # Template formatted with the language code; the spelling is kept as-is
    # because it determines the on-disk file names.
    tokenizer_file: str = "tokeninzer_{0}.json"
    # Directory handed to the TensorBoard SummaryWriter.
    experiment_name: str = 'runs/tmodel'

In [49]:
config = ModelConfig()
config.seq_len

350

In [50]:
config.tokenizer_file.format("json")

'tokeninzer_json.json'

In [51]:
def get_weights_file_path(config: ModelConfig, epoch: str):
    """Return the relative checkpoint path "<datasource>_<model_folder>/<model_basename><epoch>.pt" as a string."""
    folder_name = "{}_{}".format(config.datasource, config.model_folder)
    file_name = "{}{}.pt".format(config.model_basename, epoch)
    checkpoint_path = Path('.').joinpath(folder_name, file_name)
    return str(checkpoint_path)

In [52]:
get_weights_file_path(config, 1)

'opus_books_weights/tmodel_1.pt'

In [53]:
str(Path('.') / 'test')

'test'

In [54]:
def latest_weights_file_path(config):
    """Return the path of the most recent checkpoint, or None if there is none.

    Checkpoints are looked up in "<datasource>_<model_folder>" and must start
    with `config.model_basename`. Files whose name is the basename followed by
    a number are ordered by that number, so "tmodel_10.pt" is newer than
    "tmodel_9.pt" (a plain string sort would get this wrong). Files with a
    non-numeric suffix rank below all numbered ones and are ordered by name.

    Prints the selected path before returning it.
    """
    model_folder = f"{config.datasource}_{config.model_folder}"
    prefix = config.model_basename
    weight_files = list(Path(model_folder).glob(f"{prefix}*"))

    if len(weight_files) == 0:
        return None

    def _epoch_key(path):
        # Text between the basename and the file extension, e.g. "10".
        suffix = path.stem[len(prefix):]
        if suffix.isdecimal():
            return (1, int(suffix), path.name)
        return (0, 0, path.name)

    weights_file = str(max(weight_files, key=_epoch_key))
    print(f"loading {weights_file}")
    return weights_file

In [55]:
latest_weights_file_path(config)

### Hugging Face `tokenizers` library

In [56]:
# !wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
# !unzip wikitext-103-raw-v1.zip

#### train.py

In [57]:
def get_all_sentences(ds, lang):
    """Lazily yield the `lang` entry of each item's 'translation' mapping in `ds`."""
    yield from (item['translation'][lang] for item in ds)

In [58]:
def get_or_build_tokenizer(config: ModelConfig, ds, lang):
    """Load the tokenizer for `lang` from disk, training and saving it first if absent.

    The file name comes from `config.tokenizer_file` formatted with `lang`.
    A new tokenizer is a whitespace-split word-level model trained on the
    `lang` sentences of `ds`, keeping words that occur at least twice.
    """
    tokenizer_path = Path(config.tokenizer_file.format(lang))

    # Fast path: a previously trained tokenizer is reused as-is.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
        min_frequency=2,
    )
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

In [59]:
dataset = load_dataset(config.datasource, f"{config.lang_src}-{config.lang_tgt}", split='train')

In [60]:
len(dataset)

32332

In [61]:
tokenizer_src = get_or_build_tokenizer(config, dataset, config.lang_src)

In [62]:
tokenizer_tgt = get_or_build_tokenizer(config, dataset, config.lang_tgt)

In [63]:
wiki_text_data = load_dataset('wikitext', 'wikitext-103-raw-v1')
len(wiki_text_data)

3

In [64]:
wiki_text_data

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [65]:
from tokenizers.models import BPE

In [66]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

In [67]:
from tokenizers.trainers import BpeTrainer
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [68]:
tokenizer.pre_tokenizer = Whitespace()

In [69]:
tokenizer.train_from_iterator(get_all_sentences(dataset, 'en'), trainer)






In [70]:
tokenizer.save('tokenizer_en_bpe.json')

In [71]:
tokenizer = Tokenizer.from_file('tokenizer_en_bpe.json')

In [72]:
tokenizer

<tokenizers.Tokenizer at 0x169f79740>

In [73]:
output = tokenizer.encode('how are you doing')

In [74]:
output??

[0;31mType:[0m        Encoding
[0;31mString form:[0m Encoding(num_tokens=4, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
[0;31mLength:[0m      4
[0;31mFile:[0m        ~/anaconda3/envs/myenv/lib/python3.8/site-packages/tokenizers/__init__.py
[0;31mDocstring:[0m   The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.

In [75]:
output.tokens

['how', 'are', 'you', 'doing']

In [76]:
output.ids

[336, 210, 155, 1431]

In [77]:
output.offsets

[(0, 3), (4, 7), (8, 11), (12, 17)]

In [78]:
tokenizer.token_to_id("[SEP]")

2

### Build the dataset

In [79]:
from torch.utils.data import Dataset

In [80]:
# torch.triu??

In [81]:
# triu(diagonal=1) keeps ones strictly above the main diagonal, so comparing
# with 0 is True on and below the diagonal.
mask = torch.triu(torch.ones((1, 5, 5)), diagonal=1)
mask == 0

tensor([[[ True, False, False, False, False],
         [ True,  True, False, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True,  True]]])

In [82]:
mask = torch.tril(torch.ones((1, 5, 5), dtype=torch.int))
mask == 1

tensor([[[ True, False, False, False, False],
         [ True,  True, False, False, False],
         [ True,  True,  True, False, False],
         [ True,  True,  True,  True, False],
         [ True,  True,  True,  True,  True]]])

In [83]:
mask2 = torch.tril(torch.ones((1, 5, 5))).type(torch.int)
mask2 & (mask == 1)

tensor([[[1, 0, 0, 0, 0],
         [1, 1, 0, 0, 0],
         [1, 1, 1, 0, 0],
         [1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1]]], dtype=torch.int32)

In [84]:
torch.tril??

[0;31mDocstring:[0m
tril(input, diagonal=0, *, out=None) -> Tensor

Returns the lower triangular part of the matrix (2-D tensor) or batch of matrices
:attr:`input`, the other elements of the result tensor :attr:`out` are set to 0.

The lower triangular part of the matrix is defined as the elements on and
below the diagonal.

The argument :attr:`diagonal` controls which diagonal to consider. If
:attr:`diagonal` = 0, all elements on and below the main diagonal are
retained. A positive value includes just as many diagonals above the main
diagonal, and similarly a negative value excludes just as many diagonals below
the main diagonal. The main diagonal are the set of indices
:math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where
:math:`d_{1}, d_{2}` are the dimensions of the matrix.

Args:
    input (Tensor): the input tensor.
    diagonal (int, optional): the diagonal to consider

Keyword args:
    out (Tensor, optional): the output tensor.

Example::

    

In [85]:
class BilingualDataset(Dataset):
    """Map-style dataset turning translation pairs into fixed-length tensors.

    Every item of `ds` must provide `item['translation'][lang]` for both
    languages. Sentences are tokenized, wrapped with special tokens and
    padded to exactly `seq_len` ids.

    Args:
        ds: indexable collection of translation pairs.
        tokenizer_src: tokenizer for the source language.
        tokenizer_tgt: tokenizer for the target language.
        src_lang: key of the source language inside 'translation'.
        tgt_lang: key of the target language inside 'translation'.
        seq_len: length of every returned id sequence.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # NOTE(review): the special-token ids are read from the source tokenizer
        # and reused for the target side, which assumes both tokenizers assign
        # the same ids to [SOS], [EOS] and [PAD] - confirm if that ever changes.
        self.sos_token = torch.tensor([tokenizer_src.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_src.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_src.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        return len(self.ds)

    def _padding(self, count):
        """Return a 1-D int64 tensor holding `count` copies of the pad id."""
        # torch.full avoids building a Python list of tensors for every item.
        return torch.full((count,), self.pad_token.item(), dtype=torch.int64)

    def __getitem__(self, index):
        """Encode the pair at `index`.

        Returns a dict with:
            encoder_input: [SOS] + source ids + [EOS] + padding, shape (seq_len,)
            decoder_input: [SOS] + target ids + padding, shape (seq_len,)
            label: target ids + [EOS] + padding, shape (seq_len,)
            src_text / tgt_txt: the raw sentences
            encoder_mask: 1 on non-pad positions, shape (1, 1, seq_len)
            decoder_mask: non-pad mask combined with the causal mask,
                shape (1, seq_len, seq_len)

        Raises:
            ValueError: if either sentence does not fit into `seq_len`.
        """
        src_target_pair = self.ds[index]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder side gets both [SOS] and [EOS] (-2); the decoder input
        # gets only [SOS] and the label only [EOS] (-1 each).
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        enc_ids = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_ids = torch.tensor(dec_input_tokens, dtype=torch.int64)
        dec_padding = self._padding(dec_num_padding_tokens)

        # [SOS] source [EOS] pad... -> shape (seq_len,)
        encoder_input = torch.cat(
            [self.sos_token, enc_ids, self.eos_token, self._padding(enc_num_padding_tokens)],
            dim=0,
        )

        # [SOS] target pad...
        decoder_input = torch.cat([self.sos_token, dec_ids, dec_padding], dim=0)

        # target [EOS] pad...  (the decoder input shifted left by one position)
        label = torch.cat([dec_ids, self.eos_token, dec_padding], dim=0)

        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,
            "decoder_input": decoder_input,
            "label": label,
            "src_text": src_text,
            "tgt_txt": tgt_text,
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & self.causal_mask(self.seq_len)
        }

    @staticmethod
    def causal_mask(size):
        """Return a (1, size, size) int64 lower-triangular mask (1 = may attend)."""
        return torch.tril(torch.ones((1, size, size))).type(torch.int64)

In [86]:
len(dataset)
type(config)

__main__.ModelConfig

In [87]:
train_ds = BilingualDataset(dataset, tokenizer_src, tokenizer_tgt, config.lang_src, config.lang_tgt, config.seq_len)

In [88]:
len(train_ds)

32332

In [89]:
BilingualDataset.causal_mask(12)

tensor([[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]])

In [90]:
from torch.utils.data import Dataset, DataLoader, random_split

In [91]:
from tokenizers.pre_tokenizers import Whitespace

In [92]:
train_dataloader = DataLoader(train_ds, batch_size=4, shuffle=True)

In [93]:
# Pull a single batch and show which fields it contains, then stop.
for i in train_dataloader:
    print(i.keys())
    break

dict_keys(['encoder_input', 'decoder_input', 'label', 'src_text', 'tgt_txt', 'encoder_mask', 'decoder_mask'])


### Build the model

In [94]:
build_transformer?

[0;31mSignature:[0m [0mbuild_transformer[0m[0;34m([0m[0mmodel_args[0m[0;34m:[0m [0m__main__[0m[0;34m.[0m[0mModelArgs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      /var/folders/c8/mt_y_mg14_s14_slht8ds95w0000gn/T/ipykernel_18433/651523827.py
[0;31mType:[0m      function

In [95]:
ModelArgs??

[0;31mInit signature:[0m
[0mModelArgs[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msrc_vocab_size[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtgt_vocab_size[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msrc_seq_len[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtgt_seq_len[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0md_model[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m512[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mN[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m6[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mh[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m8[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdropout[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0md_ff[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m2048[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;

In [96]:
BilingualDataset??

[0;31mInit signature:[0m [0mBilingualDataset[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwds[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
An abstract class representing a :class:`Dataset`.

All datasets that represent a map from keys to data samples should subclass
it. All subclasses should overwrite :meth:`__getitem__`, supporting fetching a
data sample for a given key. Subclasses could also optionally overwrite
:meth:`__len__`, which is expected to return the size of the dataset by many
:class:`~torch.utils.data.Sampler` implementations and the default options
of :class:`~torch.utils.data.DataLoader`. Subclasses could also
optionally implement :meth:`__getitems__`, for speedup batched samples
loading. This method accepts list of indices of samples of batch and returns
list of samples.

.. note::
  :class:`~torch.utils.data.DataLoader` by default constructs a index
  sampler that yields integral indices.  To make it work with a map-

In [97]:
def get_model(config: ModelConfig, vocab_src_len, vocab_tgt_len):
    """Build a transformer for the given vocabulary sizes.

    `config.seq_len` is used for both the source and target sequence length
    and `config.d_model` for the model width; every other hyperparameter
    keeps its `ModelArgs` default.
    """
    model_args = ModelArgs(
        src_vocab_size=vocab_src_len,
        tgt_vocab_size=vocab_tgt_len,
        src_seq_len=config.seq_len,
        tgt_seq_len=config.seq_len,
        d_model=config.d_model,
    )
    return build_transformer(model_args)

In [98]:
def get_ds(config: ModelConfig):
    """Load the translation dataset and return train/validation loaders plus tokenizers.

    Steps: load the 'train' split of `config.datasource` for the configured
    language pair, get or build both tokenizers, split the data 90/10, wrap
    each part in a `BilingualDataset`, and create the data loaders.

    Returns:
        (train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)
    """
    dataset = load_dataset(config.datasource, f"{config.lang_src}-{config.lang_tgt}", split='train')

    # Build (or load from disk) one tokenizer per language.
    tokenizer_src = get_or_build_tokenizer(config, dataset, config.lang_src)
    tokenizer_tgt = get_or_build_tokenizer(config, dataset, config.lang_tgt)

    # 90/10 train/validation split. No generator is passed to random_split,
    # so the split differs between runs unless the global seed is fixed.
    train_ds_size = int(0.9 * len(dataset))
    val_ds_size = len(dataset) - train_ds_size
    train_split, val_split = random_split(dataset, [train_ds_size, val_ds_size])

    train_ds = BilingualDataset(train_split, tokenizer_src, tokenizer_tgt, config.lang_src, config.lang_tgt, config.seq_len)
    val_ds = BilingualDataset(val_split, tokenizer_src, tokenizer_tgt, config.lang_src, config.lang_tgt, config.seq_len)

    train_dataloader = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True)
    # Validation is run one sentence at a time, in a fixed order.
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=False)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

In [99]:
get_ds(config)

(<torch.utils.data.dataloader.DataLoader at 0x1698f88b0>,
 <torch.utils.data.dataloader.DataLoader at 0x1698f8df0>,
 <tokenizers.Tokenizer at 0x169fea580>,
 <tokenizers.Tokenizer at 0x177c86c90>)

In [100]:
config.experiment_name = 'opus_books_tb'

In [101]:
dataset = load_dataset(config.datasource, f"{config.lang_src}-{config.lang_tgt}", split='train')
config

ModelConfig(batch_size=8, num_epochs=2, lr=0.0001, seq_len=350, d_model=512, datasource='opus_books', lang_src='en', lang_tgt='it', model_folder='weights', model_basename='tmodel_', preload='latest', tokenizer_file='tokeninzer_{0}.json', experiment_name='opus_books_tb')

In [102]:
config.experiment_name

'opus_books_tb'

In [103]:
# torch.optim.Adam??

In [104]:
def train_model(config: ModelConfig):
    """Train the translation model described by ``config`` and return it.

    Builds the dataloaders and tokenizers with ``get_ds`` and the model with
    ``get_model``, optionally resumes from a saved checkpoint, then runs the
    epochs from the resume point up to ``config.num_epochs``. After every epoch
    it calls ``run_validation`` and saves a checkpoint (model state, optimizer
    state, epoch and global step) to ``get_weights_file_path(config, "<epoch>")``.

    Args:
        config: Run settings. Fields read here: ``datasource``, ``model_folder``,
            ``experiment_name`` (TensorBoard log directory), ``lr``, ``preload``
            (``'latest'``, a specific checkpoint suffix, or a falsy value for a
            fresh start), ``num_epochs`` and ``seq_len``.

    Returns:
        The trained model, on the device that was used for training.
    """
    # Pick the accelerator. `is_available()` (not `is_built()`) is the runtime check:
    # a build can include MPS support on a machine that cannot actually use it.
    if torch.cuda.is_available():
        device_name = 'cuda'
    elif torch.backends.mps.is_available():
        device_name = 'mps'
    else:
        device_name = 'cpu'
    device = torch.device(device_name)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

    Path(f"{config.datasource}_{config.model_folder}").mkdir(parents=True, exist_ok=True)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

    # tensorboard
    writer = SummaryWriter(config.experiment_name)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, eps=1e-9)

    # Resolve which checkpoint (if any) to resume from
    initial_epoch = 0
    global_step = 0
    preload = config.preload
    if preload == 'latest':
        model_filename = latest_weights_file_path(config)
    elif preload:
        model_filename = get_weights_file_path(config, preload)
    else:
        model_filename = None

    if model_filename:
        print(f"preloading model: {model_filename}")
        # map_location lets a checkpoint written on one device be resumed on another
        state = torch.load(model_filename, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
    else:
        print('No model to preload')

    # Labels are target-language token ids, so the padding id to ignore must come
    # from the target tokenizer.
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)
    tgt_vocab_size = tokenizer_tgt.get_vocab_size()

    try:
        for epoch in range(initial_epoch, config.num_epochs):
            torch.cuda.empty_cache()
            # run_validation may leave the model in eval mode; switch back every epoch
            model.train()
            batch_iterator = tqdm(train_dataloader, desc=f"Processing epoch {epoch:02d}")

            for batch in batch_iterator:
                encoder_input = batch['encoder_input'].to(device) # [B, seq_len]
                decoder_input = batch['decoder_input'].to(device) # [B, seq_len]
                encoder_mask = batch['encoder_mask'].to(device) # [B, 1, 1, seq_len]
                decoder_mask = batch['decoder_mask'].to(device) # [B, 1, seq_len, seq_len]
                label = batch['label'].to(device) # [B, seq_len]

                encoder_output = model.encode(encoder_input, encoder_mask) # [B, seq_len, d_model]
                decoder_output = model.decode(decoder_input, encoder_output, encoder_mask, decoder_mask) # [B, seq_len, d_model]
                proj_out = model.project(decoder_output) # [B, seq_len, vocab_size]

                # Flatten so every target position is one classification example
                proj_out = proj_out.view(-1, tgt_vocab_size)
                label = label.view(-1)

                loss = loss_fn(proj_out, label)
                batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

                # log the loss
                writer.add_scalar('train_loss', loss.item(), global_step)
                writer.flush()

                loss.backward()
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)

                global_step += 1

            # run validation
            run_validation(model, val_dataloader, tokenizer_tgt, config.seq_len, device, global_step, writer)

            # save model
            model_filename = get_weights_file_path(config, f"{epoch:02d}")
            print(model_filename)
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename)
    finally:
        # Close the event file even when training is interrupted (e.g. KeyboardInterrupt)
        writer.close()

    return model

In [105]:
# writer = SummaryWriter(config.experiment_name)
# writer.flush??

In [106]:
# ! pip install torch-tb-profiler


In [107]:
# Start training with the current config.
# NOTE(review): train_model calls run_validation at the end of each epoch, and in this
# part of the notebook run_validation / greedy_decode are only defined in later cells.
# Unless they are also defined earlier, a fresh-kernel run would fail there - confirm.
train_model(config)

No model to preload


Processing epoch 00:  16%|█▌        | 578/3638 [07:25<39:19,  1.30it/s, loss=6.697] 


KeyboardInterrupt: 

In [None]:
# Display the current config object
config

In [None]:
# 2x3 tensor of integer ids (0..5), used below as input to InputEmbeddings.
# Only the last expression (type(a)) is displayed; a.shape is evaluated but not shown.
a = torch.tensor([0, 1, 2, 3, 4, 5]).view(2, 3)
a.shape
type(a)

In [None]:
# Positional order is (d_model, vocab_size): d_model=128, vocab_size=64
e = InputEmbeddings(128, 64)

In [None]:
# Each of the 2x3 ids is mapped to a 128-dim vector, so this should be (2, 3, 128)
e(a).shape

In [None]:
# nn.CrossEntropyLoss??

In [None]:
# Plain cross-entropy for experimentation (no ignore_index / label_smoothing arguments,
# unlike the loss built inside train_model)
loss_func = nn.CrossEntropyLoss()

In [None]:
# Dummy data: random scores of shape (12, 10, 128) and int64 labels of shape (12, 10),
# all set to class 1
output = torch.randn(12, 10, 128)
label = torch.ones(12, 10).type(torch.int64)
output.shape, label.shape

In [None]:
# Size of the last dimension (128 for the dummy tensor above)
output.size(-1)

In [None]:
# Same flattening as in train_model: scores become (12*10, 128) and labels (12*10,)
# before being passed to the loss
loss_func(output.view(-1, output.size(-1)), label.view(-1))

In [None]:
import torchmetrics

In [None]:
def greedy_decode(model, source, source_mask, tokenizer_tgt, max_len, device):
    """Generate target token ids one at a time, always keeping the top-scoring token.

    The source is encoded once; the decoder is then re-run on the growing output
    sequence until it emits '[EOS]' or the sequence reaches ``max_len`` tokens.
    The running sequence is created with batch size 1 and ``.item()`` is called on
    each prediction, so this handles a single example per call.

    Args:
        model: Object providing ``encode``, ``decode`` and ``project``.
        source: Source token ids; also supplies the dtype for the generated ids.
        source_mask: Mask passed to ``encode``/``decode`` for the source; also
            supplies the dtype for the causal mask.
        tokenizer_tgt: Target tokenizer, used to look up '[SOS]' and '[EOS]' ids.
        max_len: Length at which generation stops.
        device: Device the generated tensors are moved to.

    Returns:
        The generated ids with the leading batch dimension squeezed out. Starts
        with the '[SOS]' id and ends with the '[EOS]' id if one was produced.
    """
    start_id = tokenizer_tgt.token_to_id('[SOS]')
    stop_id = tokenizer_tgt.token_to_id('[EOS]')

    # Encode the source once and reuse it at every decoding step
    memory = model.encode(source, source_mask)

    # Running decoder input, seeded with the start-of-sentence token
    generated = torch.empty(1, 1).fill_(start_id).type_as(source).to(device)

    while generated.size(1) != max_len:
        # Causal mask sized to the tokens generated so far
        causal = BilingualDataset.causal_mask(generated.size(1))
        causal = causal.type_as(source_mask).unsqueeze(0).to(device)

        hidden = model.decode(generated, memory, source_mask, causal)

        # Only the last position is needed to choose the next token
        scores = model.project(hidden[:, -1])
        best_id = torch.max(scores, dim=1).indices.item()

        appended = torch.ones(1, 1).fill_(best_id).type_as(source).to(device)
        generated = torch.cat([generated, appended], dim=1)

        if best_id == stop_id:
            break

    return generated.squeeze(0)


In [None]:
# Pick the accelerator. `torch.backends.mps.is_available()` is the runtime check;
# `is_built()` only says the binary was compiled with MPS support.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
device = torch.device(device)
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

In [None]:
# Later cells pass `model` to greedy_decode / run_validation, so this relies on
# train_model returning the trained model - verify it does, otherwise `model` is None.
model = train_model(config)

In [None]:
# Smoke test: greedy-decode only the first validation batch.
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for batch in val_dataloader:
        print(batch.keys())
        encoder_input = batch["encoder_input"].to(device)
        encoder_mask = batch['encoder_mask'].to(device) # (b, 1, 1, seq_len)
        print(encoder_input.shape, encoder_mask.shape, batch['decoder_input'].shape)
        # Use the configured sequence length instead of a hard-coded 350 so this
        # cell stays in sync with the dataset settings
        model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_tgt, config.seq_len, device)
        break

In [None]:
# Random stand-in of shape (1, 12, 128) for trying out the last-position slicing
# that greedy_decode applies to the decoder output
decoder_output = torch.randn(1, 12, 128)
decoder_output.shape

In [None]:
# Id of the end-of-sentence token that greedy_decode uses as its stop condition
tokenizer_tgt.token_to_id('[EOS]')

In [None]:
# Last position of the dummy tensor (shape (1, 128)) and one scalar entry from it
decoder_output[:, -1], decoder_output[:, -1][0, 28]

In [None]:
# torch.max over dim=1 returns (values, indices); greedy_decode keeps the index
torch.max(decoder_output[:, -1], dim=1)

In [None]:
# Same call as the earlier cell: re-runs train_model and rebinds `model`.
# With config.preload == 'latest' train_model first looks for a saved checkpoint to resume from.
model = train_model(config)

In [None]:
def run_validation(model, validation_ds, tokenizer_tgt, max_len, device, global_step=-1, writer=None, num_examples=2):
    """Greedy-decode a few validation examples, print them and optionally log metrics.

    The model is switched to eval mode and is left in eval mode on return, so the
    caller must call ``model.train()`` before training again.

    Args:
        model: Model to evaluate; passed through to ``greedy_decode``.
        validation_ds: Iterable of batches. Each batch is indexed with the keys
            'encoder_input', 'encoder_mask', 'src_text' and 'tgt_txt'; only
            element 0 of the text fields is used.
        tokenizer_tgt: Target tokenizer used to turn predicted ids back into text.
        max_len: Maximum generated length, passed to ``greedy_decode``.
        device: Device the batch tensors are moved to.
        global_step: Step value attached to the logged scalars.
        writer: Optional TensorBoard-style writer (``add_scalar`` / ``flush``).
            When None, nothing is logged.
        num_examples: Number of validation batches to decode.

    Returns:
        None.
    """
    # Without eval mode dropout stays active and greedy decoding is non-deterministic
    model.eval()
    source_texts = []
    expected = []
    predicted = []

    with torch.no_grad():
        for count, batch in enumerate(validation_ds, start=1):
            encoder_input = batch["encoder_input"].to(device) # (b, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (b, 1, 1, seq_len)

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_tgt, max_len, device)

            src_text = batch['src_text'][0]
            # NOTE(review): key is spelled 'tgt_txt' while the source key is 'src_text';
            # confirm this matches what the dataset actually emits.
            tgt_text = batch['tgt_txt'][0]
            model_out_text = tokenizer_tgt.decode(model_out.tolist())

            source_texts.append(src_text)
            expected.append(tgt_text)
            predicted.append(model_out_text)

            if count >= num_examples:
                break

    # Nothing was decoded (empty loader): no examples to show, no metrics to compute
    if not predicted:
        return

    print(source_texts)
    print(expected)
    print(predicted)

    if writer is not None:
        # Log metrics
        cer = torchmetrics.CharErrorRate()(predicted, expected)
        writer.add_scalar('validation cer', cer, global_step)

        wer = torchmetrics.WordErrorRate()(predicted, expected)
        writer.add_scalar('validation wer', wer, global_step)

        # BLEU takes a list of reference translations per prediction, so each
        # single reference is wrapped in its own list
        bleu = torchmetrics.BLEUScore()(predicted, [[reference] for reference in expected])
        writer.add_scalar('validation bleu', bleu, global_step)
        writer.flush()

In [None]:
# greedy_decode??

In [None]:
# No writer is passed, so run_validation skips the metric logging branch here
run_validation(model, val_dataloader, tokenizer_tgt, config.seq_len, device)

In [None]:
model??

### Open questions

1) Why does encoder mask need to be (B, 1, 1, seq_len)?
2) Why does decoder mask need to be (B, 1, seq_len, seq_len)?
