In [None]:
import torch

In [None]:
import math
import torch.nn as nn
class InputEmbeddings(nn.Module):
  """Token-embedding lookup whose output is multiplied by sqrt(d_model)."""

  def __init__(self, d_model: int, vocab_size: int):
    """
    Args:
      d_model: width of each embedding vector.
      vocab_size: number of rows in the embedding table.
    """
    super().__init__()
    self.d_model = d_model
    self.vocab_size = vocab_size
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

  def forward(self, x):
    """Look up the embeddings for the ids in ``x`` and scale them by sqrt(d_model)."""
    scale = math.sqrt(self.d_model)
    looked_up = self.embedding(x)
    return looked_up * scale




In [None]:
# Positional encoding
# Sinusoidal position table; handles both even and odd d_model.
class PositionalEncoding(nn.Module):
  """Adds a fixed sine/cosine position table to a batch of embeddings, then applies dropout."""

  def __init__(self , d_model : int , seq_len : int , dropout: float) -> None :
    """
    Args:
      d_model: size of the last dimension of the inputs.
      seq_len: number of positions pre-computed in the table.
      dropout: probability handed to ``nn.Dropout``.
    """
    super().__init__()
    self.d_model = d_model
    self.seq_len = seq_len
    self.dropout = nn.Dropout(dropout)
    pe = torch.zeros(seq_len , d_model)  # (seq_len, d_model)
    position = torch.arange(seq_len , dtype = torch.float).unsqueeze(1)  # (seq_len, 1)
    div_term = torch.exp(torch.arange(0 , d_model , 2).float() * (-math.log(10000.0) / d_model))
    angles = position * div_term  # (seq_len, ceil(d_model / 2))
    pe[: , 0::2] = torch.sin(angles)
    # There are d_model // 2 odd columns, one fewer than the even columns when
    # d_model is odd, so the angle table is trimmed to match.
    pe[: , 1::2] = torch.cos(angles[: , : d_model // 2])

    pe = pe.unsqueeze(0)  # (1, seq_len, d_model)

    # Stored as a buffer: moves with the module and is saved in the state dict,
    # but is not a trainable parameter.
    self.register_buffer('pe' , pe)

  def forward(self , x):
    """Add the first ``x.shape[1]`` rows of the table to ``x`` and apply dropout."""
    x = x + self.pe[: , : x.shape[1] , :].requires_grad_(False)
    return self.dropout(x)

In [None]:
class Layer_Normalization(nn.Module):
  """Normalizes over the last dimension with one learnable scale and one learnable shift."""

  def __init__(self, eps=1e-6):
    """
    Args:
      eps: constant added to the standard deviation before dividing.
    """
    super().__init__()
    self.eps = eps
    self.alpha = nn.Parameter(torch.ones(1))   # multiplicative term
    self.bias = nn.Parameter(torch.zeros(1))   # additive term

  def forward(self, x):
    """Return ``alpha * (x - mean) / (std + eps) + bias`` computed along dim -1."""
    centered = x - x.mean(dim=-1, keepdim=True)
    spread = x.std(dim=-1, keepdim=True) + self.eps
    return self.alpha * centered / spread + self.bias



In [None]:
class FeedForwardBlock(nn.Module):
  """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

  def __init__(self , d_model : int , d_hidden : int  , dropout : float):
    """
    Args:
      d_model: input and output feature size.
      d_hidden: feature size of the inner layer.
      dropout: probability handed to ``nn.Dropout``.
    """
    super().__init__()
    self.dropout = dropout
    self.d_model = d_model
    # No activation after the second Linear: a trailing ReLU would force the
    # block's output to be non-negative before it is added back to the residual.
    # The Linear layers stay at indices 0 and 3, so state-dict keys are unchanged.
    self.layers = nn.Sequential(
        nn.Linear(d_model , d_hidden),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(d_hidden , d_model),
    )

  def forward(self , x):
    """Apply the feed-forward stack to ``x``; the last dimension must be ``d_model``."""
    return self.layers(x)



In [None]:
class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product attention with learned q/k/v/output projections."""

  def __init__(self , d_model : int , h : int , dropout : float ) -> None :
    """
    Args:
      d_model: feature size of the inputs and outputs.
      h: number of heads; must divide ``d_model``.
      dropout: probability handed to ``nn.Dropout`` (applied to the attention weights).
    """
    super().__init__()
    self.d_model = d_model
    self.h = h
    assert d_model % h ==0
    self.d_k = d_model // h
    self.w_q = nn.Linear(d_model , d_model)
    self.w_k = nn.Linear(d_model , d_model)
    self.w_v = nn.Linear(d_model , d_model)
    self.w_o = nn.Linear(d_model , d_model)
    self.dropout = nn.Dropout(dropout)

  @staticmethod
  def attention(query , key , value , mask , dropout : nn.Dropout):
    """Scaled dot-product attention.

    Positions where ``mask == 0`` receive a score of -1e9 before the softmax.
    Returns the weighted values together with the attention weights.
    """
    d_k = query.shape[-1]
    attention_scores = (query @ key.transpose(-2 , -1)) / math.sqrt(d_k)
    if mask is not None:
      attention_scores = attention_scores.masked_fill(mask == 0 , -1e9)
    attention_scores = attention_scores.softmax(dim = -1)
    if dropout is not None:
      attention_scores = dropout(attention_scores)
    return (attention_scores @ value) , attention_scores

  def _split_heads(self , t):
    """(batch, seq, d_model) -> (batch, h, seq, d_k)."""
    return t.view(t.shape[0] , t.shape[1] , self.h , self.d_k).transpose(1 , 2)

  def forward(self , q , k , v , mask):
    """Project q/k/v, attend per head, merge the heads and apply the output projection."""
    query = self._split_heads(self.w_q(q))
    key = self._split_heads(self.w_k(k))
    value = self._split_heads(self.w_v(v))
    x , self.attention_scores = MultiHeadAttention.attention(query , key , value , mask , self.dropout)
    # (batch, h, seq, d_k) -> (batch, seq, h, d_k). The transposed tensor must be
    # kept: viewing the un-transposed layout would interleave heads and positions.
    x = x.transpose(1 , 2).contiguous()
    x = x.view(x.shape[0] , -1 , self.h * self.d_k)
    return self.w_o(x)




In [None]:
class ResidualConnection(nn.Module):
  """Skip connection that computes ``x + dropout(sublayer(norm(x)))``."""

  def __init__(self, dropout: float):
    """
    Args:
      dropout: probability handed to ``nn.Dropout``.
    """
    super().__init__()
    self.dropout = nn.Dropout(p=dropout)
    self.norm = Layer_Normalization()

  def forward(self, x, sublayer):
    """Normalize ``x``, run ``sublayer`` on it, apply dropout and add the result to ``x``."""
    normalized = self.norm(x)
    branch = self.dropout(sublayer(normalized))
    return x + branch




In [None]:
class EncoderBlock(nn.Module):
  """One encoder layer: self-attention then feed-forward, each wrapped in a ResidualConnection."""

  def __init__(self , self_attention_block : MultiHeadAttention , feed_forward_block : FeedForwardBlock , dropout : float):
    """
    Args:
      self_attention_block: attention module called as ``(x, x, x, src_mask)``.
      feed_forward_block: module applied after the attention sub-layer.
      dropout: probability passed to both ResidualConnection wrappers.
    """
    super().__init__()
    self.self_attention_block = self_attention_block
    self.feed_forward_block = feed_forward_block
    self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

  def forward(self, x , src_mask):
    """Run both sub-layers on ``x``; ``src_mask`` is forwarded to the attention module."""
    x = self.residual_connections[0](x , lambda x : self.self_attention_block(x , x , x , src_mask))
    # No extra normalization between the sub-layers: each ResidualConnection
    # already normalizes its input, and this class defines no
    # `layer_normalization` attribute to call.
    x = self.residual_connections[1](x , lambda x : self.feed_forward_block(x))

    return x



In [None]:
class Encoder(nn.Module):
  """Applies a list of layers in order and normalizes the final result."""

  def __init__(self, layers: nn.ModuleList) -> None:
    """
    Args:
      layers: modules that are each called as ``layer(x, mask)``.
    """
    super().__init__()
    self.layers = layers
    self.norm = Layer_Normalization()

  def forward(self, x, mask):
    """Thread ``x`` through every layer with ``mask``, then apply the final norm."""
    hidden = x
    for block in self.layers:
      hidden = block(hidden, mask)
    normalized = self.norm(hidden)
    return normalized



In [None]:
class DecoderBlock(nn.Module):
  """Early draft of a decoder layer (self-attention, cross-attention, feed-forward).

  NOTE(review): a later cell defines another class with the same name
  ``DecoderBlock`` and a different constructor; once that cell runs, it replaces
  this definition. Consider deleting this cell.
  """

  def __init__(self , self_attention_block : MultiHeadAttention ,feed_forward_block : FeedForwardBlock , dropout : float):
    super().__init__()
    # NOTE(review): the same module object is stored twice, so entries 0 and 1
    # share their weights.
    self.self_attention_block = nn.ModuleList([self_attention_block for _ in range(2) ])
    self.feed_forward_block = feed_forward_block
    self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])
    self.norm = Layer_Normalization()
    self.dropout = dropout

  def forward(self , x , self_attention_block ,v , k , src_mask):
    """Run the three sub-layers on ``x``.

    The ``self_attention_block`` argument is accepted but not used. ``k`` and
    ``v`` are the key/value inputs of the second attention call, which runs
    without a mask.
    """
    x =(self.residual_connections[0](x , lambda x : self.self_attention_block[0](x , x , x ,src_mask)))
    # MultiHeadAttention.forward takes (q, k, v, mask); passing the dropout
    # value as a fifth positional argument raised TypeError.
    x = (self.residual_connections[1](x , lambda x : self.self_attention_block[1](x , k , v , None)))
    # last feed forward layer
    x = (self.residual_connections[2](x , lambda x : self.feed_forward_block(x)))
    return x



In [None]:
class DecoderBlock(nn.Module):
  """Decoder layer: masked self-attention, cross-attention over the encoder output, feed-forward."""

  def __init__(self, self_attention_block: MultiHeadAttention, cross_attention_block: MultiHeadAttention, feed_forward_block: FeedForwardBlock, dropout: float):
    """
    Args:
      self_attention_block: attention module called as ``(x, x, x, tgt_mask)``.
      cross_attention_block: attention module called as
        ``(x, encoder_output, encoder_output, src_mask)``.
      feed_forward_block: module applied as the last sub-layer.
      dropout: probability passed to the three ResidualConnection wrappers.
    """
    super().__init__()
    self.self_attention_block = self_attention_block
    self.cross_attention_block = cross_attention_block
    self.feed_forward_block = feed_forward_block
    self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])
    self.norm = Layer_Normalization()

  def forward(self, x, encoder_output, src_mask, tgt_mask):
    """Apply the three residual sub-layers in order and return the result."""

    def masked_self_attention(y):
      return self.self_attention_block(y, y, y, tgt_mask)

    def cross_attention(y):
      return self.cross_attention_block(y, encoder_output, encoder_output, src_mask)

    self_residual, cross_residual, ff_residual = self.residual_connections
    after_self = self_residual(x, masked_self_attention)
    after_cross = cross_residual(after_self, cross_attention)
    return ff_residual(after_cross, self.feed_forward_block)

In [None]:
class Decoder(nn.Module):
  """Applies a list of decoder layers in order and normalizes the final result."""

  def __init__(self, layers: nn.ModuleList):
    """
    Args:
      layers: modules that are each called as
        ``layer(x, encoder_output, src_mask, tgt_mask)``.
    """
    super().__init__()
    self.layers = layers
    self.norm = Layer_Normalization()

  def forward(self, x, encoder_output, src_mask, tgt_mask):
    """Thread ``x`` through every layer, then apply the final norm."""
    hidden = x
    for block in self.layers:
      hidden = block(hidden, encoder_output, src_mask, tgt_mask)
    normalized = self.norm(hidden)
    return normalized

In [None]:
class ProjectionLayer(nn.Module):
  """Linear map from d_model to vocab_size followed by log-softmax over the last dimension."""

  def __init__(self, d_model: int, vocab_size: int):
    """
    Args:
      d_model: input feature size.
      vocab_size: output feature size.
    """
    super().__init__()
    self.proj = nn.Linear(in_features=d_model, out_features=vocab_size)

  def forward(self, x):
    """Return log-probabilities computed along dim -1."""
    logits = self.proj(x)
    return logits.log_softmax(dim=-1)

In [None]:
# The model is assembled inside build_transformer(), defined in a later cell.
# A direct Tranformer(...) call used to sit here; it ran before the class was
# defined and used component names that exist only as locals of
# build_transformer(), so it always raised NameError. It has been removed.

NameError: name 'Tranformer' is not defined

In [None]:
class Tranformer(nn.Module):
  """Encoder-decoder model made of embeddings, positional encodings, encoder, decoder and projection."""

  # Annotations are quoted so this cell does not depend on the order in which
  # the component classes were defined.
  def __init__(self , encoder : "Encoder" , decoder : "Decoder" , src_embed  , tgt_embeddings , src_pos : "PositionalEncoding" , tgt_pos : "PositionalEncoding" , projection : "ProjectionLayer") -> None :
    """
    Args:
      encoder: called as ``encoder(src, src_mask)``.
      decoder: called as ``decoder(tgt, encoder_output, src_mask, tgt_mask)``.
      src_embed: applied to the source input before ``src_pos``.
      tgt_embeddings: applied to the target input before ``tgt_pos``; stored as ``tgt_embed``.
      src_pos: applied to the embedded source.
      tgt_pos: applied to the embedded target.
      projection: applied by ``project``.
    """
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.src_embed = src_embed
    self.tgt_embed = tgt_embeddings
    self.tgt_pos = tgt_pos
    self.src_pos = src_pos
    self.projection = projection

  def encode(self , src , src_mask):
    """Embed ``src``, add positions and run the encoder."""
    src = self.src_embed(src)
    src = self.src_pos(src)
    return self.encoder(src , src_mask)

  def decode(self , encoder_output , src_mask , tgt ,  tgt_mask):
    """Embed ``tgt``, add positions and run the decoder against ``encoder_output``."""
    tgt = self.tgt_embed(tgt)
    tgt = self.tgt_pos(tgt)
    return self.decoder(tgt , encoder_output , src_mask , tgt_mask)

  def project(self , x ):
    """Apply the projection layer to ``x``."""
    return self.projection(x)

In [None]:
#building for translation
def build_transformer(src_vocab_size : int , tgt_vocab_size : int , src_seq_len : int , tgt_seq_len : int , d_model : int = 512 , N : int = 6 , h : int = 8 , dropout : float = 0.1 , d_ff:  int =  2048 ):
  """Assemble a Tranformer and Xavier-initialize its weight matrices.

  Args:
    src_vocab_size: vocabulary size for the source embeddings.
    tgt_vocab_size: vocabulary size for the target embeddings and the projection.
    src_seq_len: number of positions in the source positional table.
    tgt_seq_len: number of positions in the target positional table.
    d_model: feature size used throughout the model.
    N: number of encoder blocks and of decoder blocks.
    h: number of attention heads.
    dropout: dropout probability passed to every component.
    d_ff: inner size of the feed-forward blocks.

  Returns:
    The constructed Tranformer instance.
  """
  # create the embedding layers
  src_embed = InputEmbeddings(d_model , src_vocab_size)
  tgt_embed = InputEmbeddings(d_model , tgt_vocab_size)
  # create the positional encoding layers
  src_pos = PositionalEncoding(d_model , src_seq_len , dropout)
  tgt_pos = PositionalEncoding(d_model , tgt_seq_len , dropout)

  # create the encoder blocks
  encoder_blocks = []
  for _ in range(N):
    encoder_self_attention_block = MultiHeadAttention(d_model , h , dropout)
    feed_forward_block = FeedForwardBlock(d_model , d_ff, dropout)
    encoder_block = EncoderBlock(encoder_self_attention_block , feed_forward_block , dropout)
    encoder_blocks.append(encoder_block)

  # create the decoder blocks
  decoder_blocks = []
  for _ in range(N):
    decoder_self_attention_block = MultiHeadAttention(d_model , h , dropout)
    decoder_cross_attention_block = MultiHeadAttention(d_model , h , dropout)
    feed_forward_block = FeedForwardBlock(d_model , d_ff, dropout)
    decoder_block = DecoderBlock(decoder_self_attention_block , decoder_cross_attention_block , feed_forward_block , dropout)
    decoder_blocks.append(decoder_block)

  # create the encoder and decoder once each
  encoder = Encoder(nn.ModuleList(encoder_blocks))
  decoder = Decoder(nn.ModuleList(decoder_blocks))
  # create the projection layer
  projection = ProjectionLayer(d_model , tgt_vocab_size)

  # create the transformer
  transformer = Tranformer(encoder , decoder , src_embed , tgt_embed , src_pos , tgt_pos , projection)

  # initialize every parameter with more than one dimension (weight matrices);
  # one-dimensional parameters keep their default initialization
  for p in transformer.parameters():
    if p.dim() > 1 :
      nn.init.xavier_uniform_(p)
  return transformer





In [None]:
!pip install datasets
!pip install tokenizers


Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [None]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace


In [None]:
from pathlib import Path

In [None]:
def get_all_sentences(ds, lang):
  """Lazily yield ``item['translation'][lang]`` for every item in ``ds``."""
  yield from (entry['translation'][lang] for entry in ds)


In [None]:
def get_build_tokenizer(config, ds, lang):
  """Load the tokenizer for ``lang`` from disk, or train and save it when the file is missing.

  The file path is ``config['tokenizer_file']`` formatted with ``lang``.
  """
  tokenizer_path = Path(config['tokenizer_file'].format(lang))

  # Reuse a previously saved tokenizer when one exists.
  if tokenizer_path.exists():
    return Tokenizer.from_file(str(tokenizer_path))

  # Otherwise train a word-level tokenizer on the sentences of `ds` and save it.
  tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
  tokenizer.pre_tokenizer = Whitespace()
  special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"]
  trainer = WordLevelTrainer(special_tokens=special_tokens, min_frequency=2)
  sentences = get_all_sentences(ds, lang)
  tokenizer.train_from_iterator(sentences, trainer=trainer)
  tokenizer.save(str(tokenizer_path))
  return tokenizer



In [None]:
from torch.utils.data import Dataset , DataLoader  , random_split

In [None]:
def causal_mask(size):
  """Return a (1, size, size) boolean tensor that is True on and below the diagonal."""
  strictly_upper = torch.ones(1, size, size).triu(diagonal=1).to(torch.int)
  return strictly_upper.eq(0)

In [None]:
class BilingualDataset(Dataset):
  """Wraps a translation dataset and returns fixed-length tensors for the encoder and decoder.

  Each row of ``ds`` is read as ``row['translation'][lang]``, the same access
  pattern the rest of this notebook uses on the raw dataset.
  """

  def __init__(self , ds , tokenizer_src , tokenizer_tgt , src_lang , tgt_lang , seq_len) -> None :
    """
    Args:
      ds: indexable collection of rows with a ``'translation'`` mapping.
      tokenizer_src: tokenizer for the source text; also supplies the ids of
        the ``[SOS]``, ``[EOS]`` and ``[PAD]`` tokens.
      tokenizer_tgt: tokenizer for the target text.
      src_lang: key of the source language inside ``'translation'``.
      tgt_lang: key of the target language inside ``'translation'``.
      seq_len: length every returned sequence is padded to.
    """
    super().__init__()
    self.seq_len = seq_len
    self.ds = ds
    self.tokenizer_src = tokenizer_src
    self.tokenizer_tgt = tokenizer_tgt
    self.src_lang = src_lang
    self.tgt_lang = tgt_lang
    self.sos_token = torch.tensor([tokenizer_src.token_to_id('[SOS]') ],  dtype = torch.int64 )
    self.eos_token = torch.tensor([tokenizer_src.token_to_id('[EOS]')] , dtype = torch.int64 )
    self.pad_token = torch.tensor([tokenizer_src.token_to_id('[PAD]')] , dtype = torch.int64 )

  def __len__(self):
    """Number of rows in the wrapped dataset."""
    return len(self.ds)

  def _padding(self , count):
    """Return a 1-D int64 tensor holding ``count`` copies of the pad id."""
    return self.pad_token.repeat(count)

  def __getitem__(self, index ):
    """Tokenize row ``index`` and build encoder/decoder inputs, masks and the label.

    Raises:
      ValueError: if either sentence does not fit into ``seq_len`` once the
        special tokens are added.
    """
    src_target_pair = self.ds[index]
    src_text = src_target_pair['translation'][self.src_lang]
    tgt_text = src_target_pair['translation'][self.tgt_lang]

    enc_input_tokens = self.tokenizer_src.encode(src_text).ids
    dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

    # Encoder input carries [SOS] and [EOS]; decoder input carries only [SOS]
    # and the label carries only [EOS].
    enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
    dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1
    if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0 :
      raise ValueError('Sentence is too long')

    enc_ids = torch.tensor(enc_input_tokens , dtype = torch.int64)
    dec_ids = torch.tensor(dec_input_tokens , dtype = torch.int64)

    encoder_input = torch.cat([
        self.sos_token ,
        enc_ids ,
        self.eos_token ,
        self._padding(enc_num_padding_tokens) ,
    ])
    decoder_input = torch.cat([
        self.sos_token ,
        dec_ids ,
        self._padding(dec_num_padding_tokens) ,
    ])
    label = torch.cat([
        dec_ids ,
        self.eos_token ,
        self._padding(dec_num_padding_tokens) ,
    ])
    assert encoder_input.size(0) == self.seq_len
    assert decoder_input.size(0) == self.seq_len
    assert label.size(0) == self.seq_len
    return {
        "encoder_input" : encoder_input ,  # (seq_len)
        "decoder_input" : decoder_input ,  # (seq_len)
        # (1, 1, seq_len): non-zero where the encoder token is not padding
        "encoder_mask" : (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() ,
        # (1, 1, seq_len) & (1, seq_len, seq_len) broadcasts to (1, seq_len, seq_len):
        # a position is visible when it is not padding and not in the future
        "decoder_mask" : (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() & causal_mask(decoder_input.size(0)) ,
        "label" :  label ,  # (seq_len)
        "src_text" : src_text ,
        "tgt_text" : tgt_text
    }


In [None]:
def get_ds(config):
  """Load the opus_books pair named in ``config``, build tokenizers and return data loaders.

  Returns:
    ``(train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt)``.
  """
  ds_raw = load_dataset('opus_books' , f'{config["lang_src"]}-{config["lang_tgt"]}' , split = 'train')
  tokenizer_src = get_build_tokenizer(config , ds_raw , config['lang_src'])
  tokenizer_tgt = get_build_tokenizer(config , ds_raw , config['lang_tgt'])

  # keep 90% for training and 10% for validation
  train_ds_size = int(0.9 * len(ds_raw))
  val_ds_size = len(ds_raw) - train_ds_size
  train_ds_raw , val_ds_raw = random_split(ds_raw , [train_ds_size , val_ds_size])
  train_ds = BilingualDataset(train_ds_raw , tokenizer_src , tokenizer_tgt , config['lang_src'] , config['lang_tgt'] , config['seq_len'])
  val_ds = BilingualDataset(val_ds_raw , tokenizer_src , tokenizer_tgt , config['lang_src'] , config['lang_tgt'] , config['seq_len'])

  # report the longest tokenized sentence on each side, to help choose seq_len
  max_len_src = 0
  max_len_tgt = 0
  for item in ds_raw :
    src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
    tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_tgt']]).ids
    max_len_src = max(len(src_ids) , max_len_src )
    max_len_tgt = max(len(tgt_ids) , max_len_tgt)
  print(f'Max length of source sentence : {max_len_src}')
  print(f'Max length of target sentence : {max_len_tgt}')

  train_dataloader = DataLoader(train_ds , batch_size = config['batch_size'] , shuffle = True )
  # The validation loader must read the held-out split, not the training split.
  val_dataloader = DataLoader(val_ds  , batch_size = config['batch_size'] , shuffle = True )
  return train_dataloader  , val_dataloader , tokenizer_src , tokenizer_tgt



In [None]:
def get_model(config, vocab_src_len, vocab_tgt_len):
  """Build the transformer, using ``config['seq_len']`` for both sides and ``config['d_model']``."""
  seq_len = config['seq_len']
  return build_transformer(vocab_src_len, vocab_tgt_len, seq_len, seq_len, config['d_model'])

In [None]:
from pathlib import Path

In [None]:
def get_config():
  """Return a fresh dictionary with the default training configuration."""
  config = dict(
      batch_size=8,
      num_epochs=20,
      lr=10**-4,
      seq_len=350,
      d_model=512,
      lang_src="en",
      lang_tgt="it",
      model_folder="content/drive/MyDrive/weights",
      model_filename="tmodel_",
      preload=None,
      tokenizer_file="tokenizer_{0}.json",
      experiment_name="runs/model",
  )
  return config
def get_weights_file_path(config , epoch : str) :
  """Return the checkpoint path ``./<model_folder>/<prefix><epoch>.pt`` as a string.

  The file-name prefix is read from ``config['model_basename']`` when that key
  is present, otherwise from ``config['model_filename']`` (the key that
  get_config() defines).

  Raises:
    KeyError: if ``model_folder`` or both prefix keys are missing.
  """
  if 'model_basename' in config:
    model_basename = config['model_basename']
  else:
    model_basename = config['model_filename']
  model_folder = config['model_folder']
  model_filename = f"{model_basename}{epoch}.pt"
  return str(Path('.') / model_folder / model_filename)


In [None]:
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import os

In [None]:
def train_model(config):
  """Train the transformer described by ``config`` and write one checkpoint per epoch.

  When ``config['preload']`` is set, model weights, optimizer state, epoch and
  global step are restored from that checkpoint before training continues.
  """
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Using device {device}")
  Path(config['model_folder']).mkdir(parents = True  , exist_ok = True)
  train_loader , val_loader , tokenizer_src , tokenizer_tgt = get_ds(config)
  model = get_model(config , tokenizer_src.get_vocab_size() , tokenizer_tgt.get_vocab_size()).to(device)
  writer = SummaryWriter(config['experiment_name'])
  optimizer = torch.optim.Adam(model.parameters() , lr = config['lr'] , eps = 1e-9)
  initial_epoch = 0
  global_step = 0
  if config['preload'] :
    model_filename = get_weights_file_path(config , config['preload'])
    print(f"Preloading model {model_filename}")
    # map_location lets a checkpoint saved on another device load here
    state = torch.load(model_filename , map_location = device)
    # restore the weights as well; without this, training would resume from a
    # freshly initialized model with a stale optimizer state
    model.load_state_dict(state['model_state_dict'])
    initial_epoch = state['epoch'] + 1
    optimizer.load_state_dict(state['optimizer_state_dict'])
    global_step = state['global_step']
  # The padding id comes from tokenizer_src because BilingualDataset pads the
  # labels with the source tokenizer's [PAD] id.
  loss_fn = nn.CrossEntropyLoss(ignore_index = tokenizer_src.token_to_id('[PAD]') , label_smoothing = 0.1 ).to(device)
  for epoch in range(initial_epoch , config['num_epochs']):
    model.train()
    batch_iterator = tqdm(train_loader , desc = f'preprocessing epoch {epoch}')
    for batch in batch_iterator:
      encoder_input = batch['encoder_input'].to(device) # (B , seq_len)
      decoder_input = batch['decoder_input'].to(device) # (B , seq_len)
      encoder_mask = batch['encoder_mask'].to(device) # (B , 1 , 1 , seq_len)
      decoder_mask = batch['decoder_mask'].to(device) # (B , 1 , seq_len , seq_len)
      encoder_output = model.encode(encoder_input , encoder_mask) # (B , seq_len , d_model)
      decoder_output = model.decode(encoder_output , encoder_mask , decoder_input , decoder_mask)
      proj_output = model.project(decoder_output) # (B , seq_len , tgt_vocab_size)
      label = batch['label'].to(device) # (B , seq_len)
      # (B , seq_len , tgt_vocab_size) --> (B * seq_len , tgt_vocab_size)
      loss = loss_fn(proj_output.view(-1 , tokenizer_tgt.get_vocab_size()) , label.view(-1))
      batch_iterator.set_postfix({"Loss" : f"{loss.item():6.3f}"})

      # log the loss on tensorboard
      writer.add_scalar('train loss' , loss.item() , global_step )
      writer.flush()

      # backpropagate and update the weights
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

      global_step += 1

    # save the model once per epoch, after all of its batches
    model_filename = get_weights_file_path(config , f'{epoch}')
    torch.save({
        'epoch' : epoch ,
        'model_state_dict' : model.state_dict() ,
        'optimizer_state_dict' : optimizer.state_dict() ,
        'global_step' : global_step ,
    } , model_filename)

  writer.close()


In [None]:
# Build the default configuration and start training with it.
config = get_config()
train_model(config)

Using device cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.73M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32332 [00:00<?, ? examples/s]

Max length of source sentence : 309
Max length of target sentence : 274


TypeError: Tranformer.__init__() missing 1 required positional argument: 'projection'