In [3]:
!pip install Tokenizer  Datasets torchmetrics



In [4]:
import torch
import torch.nn as nn

In [5]:
import math
class InputEmbedding(nn.Module):
  def __init__(self,d_model:int,vocab_size:int):
    super().__init__()
    self.d_model=d_model
    self.input_embedding=nn.Embedding(vocab_size,d_model)
  def forward(self,x):
    return self.input_embedding(x)*math.sqrt(self.d_model)

In [6]:
class PositionalEncoding(nn.Module):
  def __init__(self,d_model:int,max_seq_len:int,dropout:float):
    super().__init__()
    self.dropout=nn.Dropout(dropout)

    pe=torch.zeros(max_seq_len,d_model)
    position=torch.arange(0,max_seq_len,dtype=torch.float).unsqueeze(1)
    divide_term=torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    pe[:,0::2]=torch.sin(position*divide_term)
    pe[:,1::2]=torch.cos(position*divide_term)
    pe=pe.unsqueeze(0)
    self.register_buffer('pe',pe)

  def forward(self,x):
    x= x+(self.pe[:,:x.shape[1],:]).requires_grad_(False)
    return self.dropout(x)

In [7]:
class MultiHeadAttention(nn.Module):
  def __init__(self,d_model:int,head:int,dropout:float):
    super().__init__()
    self.head=head
    self.dk=d_model//head
    self.wq=nn.Linear(d_model,d_model)
    self.wk=nn.Linear(d_model,d_model)
    self.wv=nn.Linear(d_model,d_model)
    self.wo=nn.Linear(d_model,d_model)
    self.dropout=nn.Dropout(dropout)
  @staticmethod
  def attention(query,key,value,mask,dropout:nn.Dropout):
    d_k=query.shape[-1]
    attention_score=(query@key.transpose(-2,-1))/math.sqrt(d_k)
    if mask is not None:
      attention_score.masked_fill_(mask==0,-1e7)
    attention_score=dropout(attention_score)

    return (attention_score@value),attention_score
  def forward(self,q,k,v,mask):
    query=self.wq(q)
    key=self.wk(k)
    value=self.wv(v)

    query=query.view(query.shape[0],query.shape[1],self.head,self.dk).transpose(1,2)
    key=key.view(key.shape[0],key.shape[1],self.head,self.dk).transpose(1,2)
    value=value.view(value.shape[0],value.shape[1],self.head,self.dk).transpose(1,2)

    x,self.attention_score=MultiHeadAttention.attention(query,key,value,mask,self.dropout)
    x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.head*self.dk)

    return self.wo(x)

In [8]:
class FeedForward(nn.Module):
  def __init__(self,d_model:int,d_ff:int,dropout:float):
    super().__init__()
    self.layer_1=nn.Linear(d_model,d_ff)
    self.layer_2=nn.Linear(d_ff,d_model)
    self.dropout=nn.Dropout(dropout)
  def forward(self,x):
    return self.dropout(self.layer_2(torch.relu(self.layer_1(x))))

In [9]:
class LayerNormalization(nn.Module):
  def __init__(self,epsilon:float=10**-5):
    super().__init__()
    self.epsilon=epsilon
  def forward(self,x):
    mean=x.mean(dim=-1,keepdims=True)
    std=x.std(dim=-1,keepdims=True)
    return (x-mean)/(std+self.epsilon)

In [10]:
class ResidualConnection(nn.Module):
  def __init__(self,dropout):
    super().__init__()
    self.dropout=nn.Dropout(dropout)
    self.layer_norm=LayerNormalization()
  def forward(self,x,sublayer):
    return x+ self.dropout(self.layer_norm(sublayer(x)))

In [11]:
class ProjectionLayer(nn.Module):
  def __init__(self,d_model:int,vocab_size:int):
    super().__init__()
    self.projection_layer=nn.Linear(d_model,vocab_size)
  def forward(self,x):
    return torch.log_softmax(self.projection_layer(x),dim=-1)

In [12]:
class EncoderBlock(nn.Module):
  def __init__(self,self_attention:MultiHeadAttention,feed_forward:FeedForward,dropout:float):
    super().__init__()
    self.self_attention=self_attention
    self.feed_forward=feed_forward

    self.residual=nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])
  def forward(self,x,src_mask):
    x=self.residual[0](x,lambda x:self.self_attention(x,x,x,src_mask))
    x=self.residual[1](x,self.feed_forward)

    return x

class Encoder(nn.Module):
  def __init__(self,layers:nn.ModuleList):
    super().__init__()
    self.layer=layers
    self.norm=LayerNormalization()
  def forward(self,x,mask):
    for l in self.layer:
      x=l(x,mask)
    return self.norm(x)

In [13]:
class DecoderBlock(nn.Module):
  def __init__(self,self_attention:MultiHeadAttention,cross_attention:MultiHeadAttention,feed_forward:FeedForward,dropout:float):
    super().__init__()
    self.self_attention=self_attention
    self.cross_attention=cross_attention
    self.feed_forward=feed_forward

    self.residual=nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])
  def forward(self,x,encoder_output,src_mask,trg_mask):
    x=self.residual[0](x,lambda x:self.self_attention(x,x,x,trg_mask))
    x=self.residual[1](x,lambda x:self.cross_attention(x,encoder_output,encoder_output,src_mask))
    x=self.residual[2](x,self.feed_forward)

    return x

class Decoder(nn.Module):
  def __init__(self,layers:nn.ModuleList):
    super().__init__()
    self.layer=layers
    self.norm=LayerNormalization()
  def forward(self,x,encoder_output,src_mask,trg_mask):
    for l in self.layer:
      x=l(x,encoder_output,src_mask,trg_mask)
    return self.norm(x)

In [14]:
class Transformer(nn.Module):
  def __init__(self,src_embedding:InputEmbedding,trg_embedding:InputEmbedding,src_position:PositionalEncoding,trg_position:PositionalEncoding,encoder:Encoder,decoder:Decoder,projection_layer:ProjectionLayer):
    super().__init__()
    self.src_embedding=src_embedding
    self.src_position=src_position
    self.encoder=encoder
    self.trg_embedding=trg_embedding
    self.trg_position=trg_position
    self.decoder=decoder
    self.projection_layer=projection_layer
  def encode(self,src,src_mask):
    src=self.src_embedding(src)
    src=self.src_position(src)
    return self.encoder(src,src_mask)
  def decode(self,trg,encoder_output,src_mask,trg_mask):
    trg=self.trg_embedding(trg)
    trg=self.trg_position(trg)
    return self.decoder(trg,encoder_output,src_mask,trg_mask)
  def project(self,x):
    return self.projection_layer(x)

def BuildTransformer(src_vocab_size:int,trg_vocab_size:int,src_max_seq_len:int,trg_max_seq_len:int,d_model:int=512,N:int=4,head:int=4,dropout:float=0.1,d_ff:int=2048):
  src_embedding=InputEmbedding(d_model,src_vocab_size)
  src_position=PositionalEncoding(d_model,src_max_seq_len,dropout)

  encoder_array=[]
  for _ in range(N):
    encoder_attention=MultiHeadAttention(d_model,head,dropout)
    encoder_feed_forward=FeedForward(d_model,d_ff,dropout)
    encoder_block=EncoderBlock(encoder_attention,encoder_feed_forward,dropout)
    encoder_array.append(encoder_block)
  encoder=Encoder(nn.ModuleList(encoder_array))

  trg_embedding=InputEmbedding(d_model,trg_vocab_size)
  trg_position=PositionalEncoding(d_model,trg_max_seq_len,dropout)

  decoder_array=[]
  for _ in range(N):
    decoder_self_attention=MultiHeadAttention(d_model,head,dropout)
    decoder_cross_attention=MultiHeadAttention(d_model,head,dropout)
    decoder_feed_forward=FeedForward(d_model,d_ff,dropout)
    decoder_block=DecoderBlock(decoder_self_attention,decoder_cross_attention,decoder_feed_forward,dropout)
    decoder_array.append(decoder_block)
  decoder=Decoder(nn.ModuleList(decoder_array))

  projection_layer=ProjectionLayer(d_model,trg_vocab_size)

  transformer=Transformer(src_embedding,trg_embedding,src_position,trg_position,encoder,decoder,projection_layer)

  for p in transformer.parameters():
    if p.dim()>1:
      nn.init.xavier_uniform_(p)
  return transformer

In [15]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [16]:
from pathlib import Path
def get_all_sentences(ds,lang):
  for item in ds:
    yield item["translation"][lang]

def build_tokenizer(config,ds,lang):
  tokenizer_path=Path(config["tokenizer_file"].format(lang))#file name to save json file of saved tokenizer
  if not Path.exists(tokenizer_path):#if tokenizer not exist lets make one
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))#initilize the tokenizer
    tokenizer.pre_tokenizer=Whitespace()#to seprate the sentence to words so no token is greater than a word
    trainer = BpeTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]","[EOS]"],min_frequency=2)#initilize trainer
    tokenizer.train_from_iterator(get_all_sentences(ds,lang),trainer=trainer)#train from a list given one by one by get_all_sentence
    tokenizer.save(str(tokenizer_path))#save the model in current director with this name
  else:
    tokenizer=Tokenizer.from_file(str(tokenizer_path))#if tokeizer exist then call it with file name
  return tokenizer

In [17]:
class BilingualDataset(nn.Module):
  def __init__(self,ds,src_tokenizer,trg_tokenizer,max_seq_len,src_lang,trg_lang):
    super().__init__()
    self.ds=ds
    self.src_tokenizer=src_tokenizer
    self.trg_tokenizer=trg_tokenizer
    self.max_seq_len=max_seq_len
    self.src_lang=src_lang
    self.trg_lang=trg_lang

    self.sos_token=torch.tensor([src_tokenizer.token_to_id("[SOS]")],dtype=torch.int64)
    self.eos_token=torch.tensor([src_tokenizer.token_to_id("[EOS]")],dtype=torch.int64)
    self.pad_token=torch.tensor([src_tokenizer.token_to_id("[PAD]")],dtype=torch.int64)

  def __len__(self):
    return len(self.ds)

  def __getitem__(self,index):
    pair_text=self.ds[index]

    src_text=pair_text["translation"][self.src_lang]
    trg_text=pair_text["translation"][self.trg_lang]

    encoder_input_tokens=self.src_tokenizer.encode(src_text).ids
    decoder_input_tokens=self.trg_tokenizer.encode(trg_text).ids

    enc_num_padding_tokens = self.max_seq_len - len(encoder_input_tokens) - 2
    dec_num_padding_tokens = self.max_seq_len - len(decoder_input_tokens) - 1

    encoder_input=torch.cat([
        self.sos_token,
        torch.tensor(encoder_input_tokens,dtype=torch.int64),
        self.eos_token,
        torch.tensor([self.pad_token]*enc_num_padding_tokens,dtype=torch.int64)
    ],dim=0)

    decoder_input=torch.cat([
        self.sos_token,
        torch.tensor(decoder_input_tokens,dtype=torch.int64),
        torch.tensor([self.pad_token]*dec_num_padding_tokens,dtype=torch.int64)
    ],dim=0)

    label=torch.cat([
        torch.tensor(decoder_input_tokens,dtype=torch.int64),
        self.eos_token,
        torch.tensor([self.pad_token]*dec_num_padding_tokens,dtype=torch.int64),
    ],dim=0)

    return {
        "encoder_input":encoder_input,
        "decoder_input":decoder_input,
        "encoder_mask":(encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
        "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
        "label": label,
        "src_text": src_text,
        "trg_text": trg_text,
    }

def causal_mask(size):
  mask=torch.triu(torch.ones((1,size,size)),diagonal=1).type(torch.int)
  return mask==0

In [18]:
from torch.utils.data import Dataset,DataLoader,random_split
def get_ds(config):
  ds_raw=load_dataset('opus_books',f'{config["src_lang"]}-{config["trg_lang"]}',split="train")

  src_tokenizer=build_tokenizer(config,ds_raw,config["src_lang"])
  trg_tokenizer=build_tokenizer(config,ds_raw,config["trg_lang"])

  train_ds_size=int(0.9*len(ds_raw))
  val_ds_size=len(ds_raw)-train_ds_size
  train_ds_raw,val_ds_raw=random_split(ds_raw,[train_ds_size,val_ds_size])

  train_ds=BilingualDataset(train_ds_raw,src_tokenizer,trg_tokenizer,config['seq_len'],config["src_lang"],config['trg_lang'])
  val_ds=BilingualDataset(val_ds_raw,src_tokenizer,trg_tokenizer,config['seq_len'],config["src_lang"],config['trg_lang'])

  max_len_src = 0
  max_len_tgt = 0

  for item in ds_raw:
    src_ids = src_tokenizer.encode(item['translation'][config['src_lang']]).ids
    tgt_ids = trg_tokenizer.encode(item['translation'][config['trg_lang']]).ids
    max_len_src = max(max_len_src, len(src_ids))
    max_len_tgt = max(max_len_tgt, len(tgt_ids))

  print(f'Max length of source sentence: {max_len_src}')
  print(f'Max length of target sentence: {max_len_tgt}')

  train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
  val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

  return train_dataloader, val_dataloader, src_tokenizer, trg_tokenizer

def get_model(config, vocab_src_len, vocab_tgt_len):
    model = BuildTransformer(vocab_src_len, vocab_tgt_len, config["seq_len"], config['seq_len'], d_model=config['d_model'])
    return model

In [19]:
def get_config():
    return{
        "batch_size":24,
        "num_epochs":100,
        "lr":10**-4,
        "seq_len":350,
        "d_model":512,
        "datasource": 'opus_books',
        "src_lang": "en",
        "trg_lang": "it",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": None,
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
    }
def get_weights_file_path(config, epoch: str):
    model_folder = f"{config['datasource']}_{config['model_folder']}"
    model_filename = f"{config['model_basename']}{epoch}.pt"
    return str(Path('.') / model_folder / model_filename)

# Find the latest weights file in the weights folder
def latest_weights_file_path(config):
    model_folder = f"{config['datasource']}_{config['model_folder']}"
    model_filename = f"{config['model_basename']}*"
    weights_files = list(Path(model_folder).glob(model_filename))
    if len(weights_files) == 0:
        return None
    weights_files.sort()
    return str(weights_files[-1])

In [20]:
import torchmetrics
def greedy_decode(model,source,source_mask,src_tokenizer,trg_tokenizer,max_len,device):
  sos_idx = src_tokenizer.token_to_id('[SOS]')
  eos_idx = src_tokenizer.token_to_id('[EOS]')

  encoder_output = model.encode(source, source_mask)
  decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)
  while True:
    if decoder_input.size(1) == max_len:
        break

    decoder_mask = causal_mask(decoder_input.size(1)).type_as(source_mask).to(device)
    out = model.decode(decoder_input,encoder_output, source_mask, decoder_mask)

    prob = model.project(out[:, -1])
    _, next_word = torch.max(prob, dim=1)
    decoder_input = torch.cat(
        [decoder_input, torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)], dim=1
    )

    if next_word == eos_idx:
      break
  return decoder_input.squeeze(0)

def run_validation(model,validation_ds,src_tokenizer,trg_tokenizer,max_len,print_msg,global_steps,device,num_examples=2):
  model.eval()
  count=0

  source_texts=[]
  expected=[]
  predicted=[]

  with torch.no_grad():
    for batch in validation_ds:
      count+=1
      encoder_input=batch['encoder_input'].to(device)
      encoder_mask=batch['encoder_mask'].to(device)

      model_out=greedy_decode(model,encoder_input,encoder_mask,src_tokenizer,trg_tokenizer,max_len,device)

      source_text = batch["src_text"][0]
      target_text = batch["trg_text"][0]
      model_out_text = trg_tokenizer.decode(model_out.detach().cpu().numpy())

      source_texts.append(source_text)
      expected.append(target_text)
      predicted.append(model_out_text)

      print_msg(f"{f'SOURCE: ':>12}{source_text}")
      print_msg(f"{f'TARGET: ':>12}{target_text}")
      print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")

      if count == num_examples:
        metric = torchmetrics.BLEUScore()
        bleu = metric(predicted, expected)
        print_msg(f"{f'BLEU: ':>12}{bleu}")
        break



In [21]:
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
def train_model(config):
  device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(device)
  Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)
  train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt=get_ds(config)
  model=get_model(config,tokenizer_src.get_vocab_size(),tokenizer_tgt.get_vocab_size()).to(device)
  writer=SummaryWriter(config['experiment_name'])
  optimizer=torch.optim.Adam(model.parameters(),lr=config['lr'],eps=1e-9)

  initial_epoch=0
  global_step=0
  preload = config['preload']
  model_filename = latest_weights_file_path(config) if preload == 'latest' else get_weights_file_path(config, preload) if preload else None
  if model_filename:
    print(f'Preloading model {model_filename}')
    state = torch.load(model_filename)
    model.load_state_dict(state['model_state_dict'])
    initial_epoch = state['epoch'] + 1
    optimizer.load_state_dict(state['optimizer_state_dict'])
    global_step = state['global_step']
  else:
      print('No model to preload, starting from scratch')

  loss_fn=nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

  for epoch in range(initial_epoch, config['num_epochs']):

    torch.cuda.empty_cache()
    model.train()
    batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
    for batch in batch_iterator:

      encoder_input = batch['encoder_input'].to(device)
      decoder_input = batch['decoder_input'].to(device)
      encoder_mask = batch['encoder_mask'].to(device)
      decoder_mask = batch['decoder_mask'].to(device)

      encoder_output = model.encode(encoder_input, encoder_mask)
      decoder_output = model.decode(decoder_input,encoder_output, encoder_mask, decoder_mask)
      proj_output = model.project(decoder_output)

      label = batch['label'].to(device)

      loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
      batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

      writer.add_scalar('train loss', loss.item(), global_step)
      writer.flush()

      loss.backward()

      optimizer.step()
      optimizer.zero_grad(set_to_none=True)
      global_step+=1

    run_validation(model,val_dataloader,tokenizer_src,tokenizer_tgt,config['seq_len'],lambda msg:batch_iterator.write(msg),global_step,device)

    model_filename = get_weights_file_path(config, f"{epoch:02d}")
    torch.save({
          'epoch': epoch,
          'model_state_dict': model.state_dict(),
          'optimizer_state_dict': optimizer.state_dict(),
          'global_step': global_step
      }, model_filename)


In [None]:
import warnings
if __name__ == '__main__':
    warnings.filterwarnings("ignore")
    config = get_config()
    train_model(config)

cuda


Downloading builder script:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.98k [00:00<?, ?B/s]

Downloading and preparing dataset opus_books/en-it (download: 3.14 MiB, generated: 8.58 MiB, post-processed: Unknown size, total: 11.72 MiB) to /root/.cache/huggingface/datasets/opus_books/en-it/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf...


Downloading data:   0%|          | 0.00/3.30M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32332 [00:00<?, ? examples/s]

Dataset opus_books downloaded and prepared to /root/.cache/huggingface/datasets/opus_books/en-it/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf. Subsequent calls will reuse this data.






Max length of source sentence: 316
Max length of target sentence: 287
No model to preload, starting from scratch


Processing Epoch 00: 100%|██████████| 1213/1213 [17:24<00:00,  1.16it/s, loss=6.217]


    SOURCE: 'As candidate for the post of Provincial Marshal, Captain Eugene Ivanich Apukhtin will now be balloted for.'
    TARGET: — È proposto come candidato alla carica di maresciallo del governatorato il capitano in seconda di cavalleria Evgenij Ivanovic Apuchtin!
 PREDICTED: 
    SOURCE: No one else was in the church except a soldier-beggar, two old women, and the clergy.
    TARGET: In chiesa non c’era nessuno all’infuori di un povero soldato, due vecchiette e i sacrestani.
 PREDICTED: 
      BLEU: 0.0


Processing Epoch 01: 100%|██████████| 1213/1213 [17:28<00:00,  1.16it/s, loss=5.496]


    SOURCE: This plan would be all the more convenient because the young couple intended immediately after the wedding to go to the country, where the larger part of the trousseau would not be required.
    TARGET: Una decisione simile era quanto mai opportuna, perché la giovane coppia, subito dopo il matrimonio, si sarebbe recata in campagna, dove il corredo di casa non sarebbe stato necessario.
 PREDICTED: 
    SOURCE: I love you, and it's all the same to me,' she said, changing from French into Russian, while her eyes as she looked at him glittered with a light he could not understand, 'so long as you have not changed toward me!
    TARGET: Io ti amo e per me tutto il resto è indifferente — ella disse in russo, guardandolo con uno scintillio particolare, per lui incomprensibile — se tu non sei cambiato.
 PREDICTED: 
      BLEU: 0.0


Processing Epoch 02: 100%|██████████| 1213/1213 [17:29<00:00,  1.16it/s, loss=5.234]


    SOURCE: Different as were those two women, Agatha Mikhaylovna and Kitty – or 'Kate' as Nicholas called her, and as Levin was also fond of calling her now – in that respect they were exactly alike.
    TARGET: Per quanto diverse fossero queste due donne, Agaf’ja Michajlovna e Katja, come la chiamava suo fratello Nikolaj, e come adesso era particolarmente caro per Levin chiamarla, in questo erano perfettamente simili.
 PREDICTED: 
    SOURCE: "When I go to India, Jane, will I leave you!
    TARGET: — Quando andrò nell'India vi lascierò forse, Jane?
 PREDICTED: 
      BLEU: 0.0


Processing Epoch 03:   1%|          | 9/1213 [00:07<17:16,  1.16it/s, loss=5.268]