In [35]:
from pathlib import Path
from collections import defaultdict
import re 

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data.sampler import BatchSampler,Sampler,SequentialSampler

from torch.utils.tensorboard import SummaryWriter

import importlib
import sys
sys.path.append("../utils")

import data_helper
import dataset
import sampler
import models
import masked_loss
import trainer

import itertools
from functools import partial

importlib.reload(dataset)
importlib.reload(data_helper)
importlib.reload(sampler)
importlib.reload(models)
importlib.reload(masked_loss)
importlib.reload(trainer)

<module 'trainer' from '/Users/khalid/personal_nlp_playground/seq-seq-translation/notebooks/../utils/trainer.py'>

# dataset

In [3]:
dataset_root = Path("../dataset/ordered/valid")
ar_path  = dataset_root / "ar-en.ar"
en_path  = dataset_root / "ar-en.en"

In [4]:
# en_tokenizer,ar_tokenizer = dataset._get_tokenizers()

# en_itr,ar_itr = data_helper._get_data_itr(en_path,ar_path)

# print("getting en vocab ... ",end=" ")
# en_vocab = data_helper._build_vocab(en_itr,en_tokenizer)
# print("Done!")

# print("getting ar vocab ... ",end = " ")
# ar_vocab = data_helper._build_vocab(ar_itr,ar_tokenizer)
# print("Done!")

In [5]:
temp = dataset.en_to_arr_dataset(en_path,ar_path)

Reading data from txt files ...  Done!
building x_vocab ...  Done!
building y_vocab ...  Done!


In [390]:
len(temp)

92670

In [391]:
for x,y in temp :
    break

In [392]:
len(x)

235

In [393]:
print(" ".join(temp.x_vocab.lookup_tokens(x)))
print()
print(" ".join(temp.y_vocab.lookup_tokens(y)))

57 . at its <unk> meeting on 19 may 2009 , the commission considered and provisionally adopted the following draft guidelines , of which it had taken note at its sixtieth session 2 . 8 . 1 ( <unk> acceptance of reservations ) , 2 . 8 . 2 ( unanimous acceptance of reservations ) , 2 . 8 . 3 ( express acceptance of a reservation ) , 2 . 8 . 4 ( written form of express acceptance ) , 2 . 8 . 5 ( procedure for formulating express acceptance ) , 2 . 8 . 6 ( <unk> of confirmation of an acceptance made prior to formal confirmation of a reservation ) , 2 . 8 . 7 ( acceptance of a reservation to the constituent instrument of an international organization ) , 2 . 8 . 8 ( organ competent to accept a reservation to a constituent instrument ) , 2 . 8 . 9 ( modalities of the acceptance of a reservation to a constituent instrument ) , 2 . 8 . 10 ( acceptance of a reservation to a constituent instrument that has not yet entered into force ) , 2 . 8 . 11 ( reaction by a member of an international organ

# Sampler

In [394]:
batch_size = 32
drop_last = False
s = sampler.RandomSameLengthSampler(temp,num_samples=batch_size)
bs = sampler.CustomBatchSampler(s,batch_size,drop_last)

In [395]:
counter = 0
for x in bs:
    counter+=1
    # print(x)
print(counter)

3014


In [396]:
bs = sampler.CustomBatchSampler([1,2,3,4,5,6,7,8,9,10,11],batch_size=5,drop_last=False)

# DataLoader

In [397]:
a1 = torch.zeros(2,4)
a2 = torch.zeros(3,4)
torch.concat([a1,a2]).shape

torch.Size([5, 4])

In [398]:
torch.unsqueeze(a1, 0).shape

torch.Size([1, 2, 4])

In [399]:
def get_mask(Y,padding_fill_value):
    mask = []
    
    for seq in Y:
        seq_mask = []
        for token in seq:
            if token == padding_fill_value:
                seq_mask.append(0)
            else:
                seq_mask.append(1)
        mask.append(seq_mask)
        
    return mask
            
def collate_fn(batch,padding_fill_value):
    # get X and Y
    X = torch.concat( [torch.unsqueeze(torch.tensor(x[0]), 0) for x in batch] ,dim=0 )
    Y = torch.tensor(list(itertools.zip_longest(*[ x[1] for x in batch  ], fillvalue=3))).T
    # get mask
    mask = torch.tensor(get_mask(Y,padding_fill_value))
    
    return X,Y,mask

In [400]:
padding_fill_value=3
dl = torch.utils.data.DataLoader(temp)

In [401]:
for x,y in dl:
    #
    break
len(x)

235

In [402]:
x.shape

AttributeError: 'list' object has no attribute 'shape'

In [403]:
y.shape

AttributeError: 'list' object has no attribute 'shape'

In [404]:
mask

tensor([[1, 1, 1, 1, 1, 1, 1]])

# all togather

In [476]:
drop_last=False
batch_size=32
dl,x_vocab,y_vocab = dataset.get_data_loader(en_path,ar_path,batch_size,drop_last,x_vocab=None,y_vocab=None)

Reading data from txt files ...  Done!
building x_vocab ...  Done!
building y_vocab ...  Done!


In [477]:
counter = 0
for x in dl.batch_sampler:
    counter+=1
print(counter)

3014


In [478]:
counter = 0
for x in dl.sampler:
    counter+=1
print(counter)

92670


In [479]:
counter = 0
num_of_steps = 0
for i,(x,y,mask) in enumerate(dl,1):
    # print(x.shape)
    if i% 10000 == 0:
        print(f"Finished {i} steps")
    counter+=x.shape[0]
    num_of_steps+=1
    
print(counter)
print(num_of_steps)

92670
3014


In [480]:
len(dl)

3014

In [20]:
len(dl)

57869

In [929]:
x.shape[0]

16

In [930]:
x.shape

torch.Size([16, 127])

In [931]:
y.shape

torch.Size([16, 206])

In [932]:
mask.shape

torch.Size([16, 206])

# model

In [935]:
model = models.seq2seq(len(x_vocab),len(y_vocab),embed_size=300,hidden_size=120,num_layers=1)
model

seq2seq(
  (encoder): Encoder(
    (embed): Embedding(27379, 300)
    (lstm): LSTM(300, 120, batch_first=True)
  )
  (decoder): Decoder(
    (embed): Embedding(70948, 300)
    (lstm): LSTM(300, 120, batch_first=True)
    (fc): Linear(in_features=120, out_features=70948, bias=True)
  )
)

In [936]:
output = model(x,y)
output.shape

torch.Size([3296, 70948])

In [937]:
y.shape

torch.Size([16, 206])

In [938]:
y.flatten().shape

torch.Size([3296])

# loss

In [951]:
criterion = masked_loss.masked_crossEntropyLoss

In [952]:
y.flatten().shape

torch.Size([3296])

In [953]:
output.shape

torch.Size([3296, 70948])

In [954]:
mask = torch.ones(y.flatten().shape)
mask.shape

torch.Size([3296])

In [955]:
criterion(output,y.flatten(),mask)

tensor(11.3168, grad_fn=<MeanBackward0>)

# training loop

In [910]:

model_name = "seq2seq"
batch_size = 16
drop_last = False
embed_size = 300
hidden_size = 100
num_layers = 1

# en_path=
# ar_path=

number_of_epochs = 10
criterion = masked_loss.masked_crossEntropyLoss
scheduler = None
lr = 0.05
optimizer = torch.optim.SGD(model.parameters(),lr=lr)
device = torch.device("mps" if torch.has_mps else "cpu")
device = "cpu"
model_path = f"../weights/{model_name}"


train_loader,x_vocab, y_vocab = dataset.get_data_loader(en_path,ar_path,batch_size,drop_last,x_vocab=None,y_vocab=None)
val_loader,_,_ = dataset.get_data_loader(en_path,ar_path,batch_size,drop_last,x_vocab=x_vocab,y_vocab=y_vocab)

model = models.seq2seq(len(x_vocab),len(y_vocab),embed_size=embed_size,hidden_size=hidden_size,num_layers=num_layers)
writer = SummaryWriter(f".runs/{model_name}")


Reading data from txt files ...  Done!
building x_vocab ...  Done!
building y_vocab ...  Done!
Reading data from txt files ...  Done!
setting previously calculated vocabs ...  Done!


In [911]:
len(train_loader.dataset)

925990

In [912]:
for x,y,z in train_loader:
    break

In [913]:
y.shape

torch.Size([2, 126])

In [919]:
trainer = trainer.Trainer(model,train_loader,val_loader,number_of_epochs,criterion,optimizer,scheduler,device,model_path,model_name,writer)

In [920]:
trainer.train()

TRAINING STARTED using device = cpu .... training the model seq2seq, the training will continue for 10 epochs
 
    epoch #1
        training ... 

KeyboardInterrupt: 

# Inference

In [203]:
dataset_root = Path("../dataset/ordered/train")
ar_path  = dataset_root / "ar-en.ar"
en_path  = dataset_root / "ar-en.en"

batch_size = 8
drop_last = False

In [204]:
model_name = "seq2seq"
path = f"../weights/{model_name}/model.pth"
train_loader,x_vocab, y_vocab = dataset.get_data_loader(en_path,ar_path,batch_size,drop_last,x_vocab=None,y_vocab=None)
model = models.seq2seq(len(x_vocab),len(y_vocab),embed_size=300,hidden_size=100,num_layers=1)

Reading data from txt files ...  Done!
building x_vocab ...  Done!
building y_vocab ...  Done!


In [205]:
len(x_vocab)

27250

In [206]:
model.load_state_dict(torch.load(path))

<All keys matched successfully>

## get validation loss of best model

In [28]:
dataset_root = Path("../dataset/ordered/valid")
ar_path  = dataset_root / "ar-en.ar"
en_path  = dataset_root / "ar-en.en"

batch_size = 8
drop_last = False
device = torch.device("cpu")
criterion = masked_loss.masked_crossEntropyLoss
val_loader,x_vocab, y_vocab = dataset.get_data_loader(en_path,ar_path,batch_size,drop_last,x_vocab=x_vocab,y_vocab=y_vocab)

In [32]:
model.eval()

running_loss = []

for i,(x_source,y,mask) in enumerate(val_loader,1):
    # get x_target and y_true
    x_target = y[:,0:-1]
    y_true   = y[:,1:].flatten()
    # send tensors to device
    x_source,x_target,y_true,mask = x_source.to(device),x_target.to(device),y_true.to(device),mask[:,1:].to(device).flatten()

    with torch.no_grad():
        # get model's predicitions
        y_pred = model(x_source,x_target)

        # calculate loss
        loss = criterion(y_pred,y_true,mask)

        # reigister running loss
        step_loss = loss.detach().item()
        running_loss.append(step_loss)

epoch_loss = np.mean(running_loss)
epoch_loss

NameError: name 'np' is not defined

In [36]:
epoch_loss

5.067629406700863

## InferenceDecoder

In [197]:
class Encoder(nn.Module):
    
    def __init__(self,vocab_size,embed_size,hidden_size,num_layers):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embed = nn.Embedding(self.vocab_size,self.embed_size)
        self.lstm = nn.LSTM(input_size=self.embed_size,hidden_size=self.hidden_size,num_layers=self.num_layers,batch_first=True)
        
        
    def forward(self,x):
        x                = self.embed(x)
        output,(h_n,c_n) =  self.lstm(x)
        
        return output,(h_n,c_n)
    
class Decoder(nn.Module):
    
    def __init__(self,vocab_size,embed_size,hidden_size,num_layers):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embed = nn.Embedding(self.vocab_size,self.embed_size)
        self.lstm  = nn.LSTM(input_size=self.embed_size,hidden_size=self.hidden_size,num_layers=self.num_layers,batch_first=True)
        self.fc    = nn.Linear(self.hidden_size,self.vocab_size)
        
    def forward(self,x,h_n,c_n):
        x = self.embed(x)
        output, (hn, cn) = self.lstm(x,(h_n,c_n))
        logits = self.fc(output)
        
        return logits.reshape(-1,self.vocab_size)
    
class seq2seq(nn.Module):
    
    def __init__(self,x_vocab_size,y_vocab_size,embed_size,hidden_size,num_layers):
        super().__init__()
        self.encoder = Encoder(x_vocab_size,embed_size,hidden_size,num_layers)
        self.decoder = Decoder(y_vocab_size,embed_size,hidden_size,num_layers)
        
    def forward(self,x_source,x_target):
        output,(h_n,c_n) = self.encoder(x_source)
        logits           = self.decoder(x_target,h_n,c_n)
        
        return logits
        

In [None]:
class InferenceSeq2seq(seq2seq):
    
    def __init__(self,x_vocab_size,y_vocab_size,embed_size,hidden_size,num_layers,y_vocab,k=5):
        super().__init__(x_vocab_size,y_vocab_size,embed_size,hidden_size,num_layers)
        
        self.k = k
        self.y_vocab = y_vocab
        
    def forward(self,x_source):
        # encode the input sentence
        output,(h_n,c_n) = self.encoder(x_source)
        
        #
        sos_int = torch.tensor(self.y_vocab(["<SOS>"])).reshape(1,-1)
        top_k_tokens,top_k_probs,(h_n, c_n) = self.get_top_k(sos_int,h_n,c_n)        
        top_k_hidden_states = [(h_n,c_n) for _ in range(self.k)]
        top_k_paths = [ [x.item()] for x in top_k_tokens ]
        
        is_done = False
        while True:
            # initialize beam search variables
            k_times_k_tokens = []
            k_times_k_probs = []
            for i,token in enumerate(top_k_tokens):
                if self.y_vocab.lookup_token(token.item()) == "<EOS>":
                    is_done = True
                    continue
                token = token.reshape(1,-1)
                h_n,c_n = top_k_hidden_states[i]
                # get best k tokens and one (h_n,c_n)
                tokens,probs,(h_n,c_n) = self.get_top_k(token,h_n,c_n)
                k_times_k_tokens.append(tokens)
                k_times_k_probs.append(probs)
                top_k_hidden_states[i] = (h_n,c_n)
                
            if is_done:
                break
            k_times_k_tokens = torch.vstack(k_times_k_tokens)
            k_times_k_probs = torch.vstack(k_times_k_probs)


            # aggregate results so that u have best k path so far
            top_k_probs,top_k_tokens,top_k_hidden_states,top_k_paths = self.beam_step(top_k_probs,k_times_k_probs,top_k_tokens,k_times_k_tokens,top_k_hidden_states,top_k_paths)
        
        
        return top_k_paths,top_k_probs
        
    
    def get_top_k(self,token,h_n,c_n):
        # 
        x = self.decoder.embed(token)
        output, (h_n, c_n)  = self.decoder.lstm(x,(h_n,c_n))
        logits = self.decoder.fc(output).flatten()
        probs = torch.softmax(logits,dim=0)
        ## pick top k tokens
        top_k = torch.topk(probs,k=self.k)
        top_k_tokens = top_k.indices
        top_k_probs = torch.log(top_k.values)
        
        return top_k_tokens,top_k_probs,(h_n, c_n)
    
    def beam_step(self,top_k_probs,k_times_k_probs,top_k_tokens,k_times_k_tokens,top_k_hidden_states,top_k_paths):
        
        # get accumulative prob
        for i in range(k_times_k_probs.shape[0]):
            k_times_k_probs[i] = k_times_k_probs[i] + top_k_probs[i]
            
        # get top k probs from k_times_k_probs
        top_k_probs_object = torch.topk(k_times_k_probs.flatten(),k=self.k)
        new_top_k_probs = top_k_probs_object.values
        prev_index , curr_index = top_k_probs_object.indices // self.k , top_k_probs_object.indices % self.k
        # get top k tokens from prev_index, curr_index 
        new_top_k_tokens = k_times_k_tokens[prev_index,curr_index]
        # get hidden states associated with new_top_k_tokens
        new_top_k_hidden_states = [0] * len(top_k_hidden_states)
        for i,idx in enumerate(prev_index):
            new_top_k_hidden_states[i] = top_k_hidden_states[idx]
        
        # update top_k_paths
        new_top_k_paths = [0]*len(top_k_paths)
        for i,idx in enumerate(prev_index):
            new_top_k_paths[i] = top_k_paths[idx] + [k_times_k_tokens[idx][curr_index[i]].item()]
        
            
        
        return new_top_k_probs,new_top_k_tokens,new_top_k_hidden_states,new_top_k_paths

In [1034]:
class Node():
    def __init__(self,token,prob,h_n,c_n,prev):
        self.token = token
        self.prob = prob
        self.h_n = h_n
        self.c_n = c_n
        self.prev = prev
        
        if self.prev is not None:
            self.acc_prob = self.prob + self.prev.acc_prob
        else:
            self.acc_prob = self.prob
            
        self.acc_prob_per_token = self.calculate_acc_prob_per_token()
            
    def calculate_acc_prob_per_token(self):
        counter = 1
        prev_node = self.prev
        while prev_node is not None:
            counter+=1
            prev_node = prev_node.prev
            
        return self.acc_prob/counter

In [1093]:
class InferenceSeq2seq(seq2seq):
    
    def __init__(self,x_vocab_size,y_vocab_size,embed_size,hidden_size,num_layers,y_vocab,k=5):
        super().__init__(x_vocab_size,y_vocab_size,embed_size,hidden_size,num_layers)
        
        self.k = k
        self.y_vocab = y_vocab
        
    def forward(self,x_source):
        # encode the input sentence
        output,(h_n,c_n) = self.encoder(x_source)
        
        #
        sos_int = torch.tensor(self.y_vocab(["<SOS>"])).reshape(1,-1)
        sos_node = Node(token=sos_int,prob=0,h_n=h_n,c_n=c_n,prev=None)
        
        top_k_nodes = self.get_top_k(sos_node)   
        
        candidate_nodes = []
        while len(candidate_nodes) < self.k:
            # initialize beam search variables
            k_times_k_nodes = []
            for i,node in enumerate(top_k_nodes):
                if self.y_vocab.lookup_token(node.token.flatten().item()) == "<EOS>":
                    candidate_nodes.append(node)
                    continue
                curr_top_k_nodes = self.get_top_k(node)
                k_times_k_nodes.extend(curr_top_k_nodes)

            # aggregate results so that u have best k nodes so far
            top_k_nodes = self.beam_step(k_times_k_nodes)
        
        return sorted(candidate_nodes,key = lambda x:x.acc_prob,reverse=True)
        
    
    def get_top_k(self,node):
        # 
        token,h_n,c_n = node.token,node.h_n,node.c_n
        
        x = self.decoder.embed(token)
        output, (h_n, c_n)  = self.decoder.lstm(x,(h_n,c_n))
        logits = self.decoder.fc(output).flatten()
        probs = torch.softmax(logits,dim=0)
        ## pick top k tokens
        top_k = torch.topk(probs,k=self.k)
        top_k_tokens = top_k.indices
        top_k_probs = torch.log(top_k.values)
        
        top_k_nodes = []
        for i in range(self.k):
            curr_node =  Node(token=top_k_tokens[i].reshape(1,-1),prob=top_k_probs[i],h_n=h_n,c_n=c_n,prev=node)
            top_k_nodes.append(curr_node)
        return top_k_nodes
    
    def beam_step(self,k_times_k_nodes):
        return sorted(k_times_k_nodes,key = lambda x:x.acc_prob,reverse=True)[0:self.k]

In [1086]:
# dataset_root = Path("../dataset/ordered/train")
# ar_path  = dataset_root / "ar-en.ar"
# en_path  = dataset_root / "ar-en.en"

# batch_size = 8
# drop_last = False
# device = torch.device("cpu")
# criterion = masked_loss.masked_crossEntropyLoss
# train_loader,x_vocab, y_vocab = dataset.get_data_loader(en_path,ar_path,batch_size,drop_last,x_vocab=None,y_vocab=None)

In [1103]:
InferenceModel =  InferenceSeq2seq(len(x_vocab),len(y_vocab),embed_size=300,hidden_size=100,num_layers=1,y_vocab=y_vocab,k=5)
InferenceModel.load_state_dict(torch.load(path))

<All keys matched successfully>

In [1088]:
# for input_ in train_loader:
#     break
# example_input = input_[0][0].reshape(1,-1)
# " ".join(x_vocab.lookup_tokens(input_[0][0].tolist()))

'the greenhouse gas emissions by sources and removals by sinks resulting from additional human induced land use , land-use change and forestry activities may be used to meet the commitments under subparagraph ( a ) above of each party included in annex i , provided that these activities have taken place since 1990 .'

In [None]:
'''
'the greenhouse gas emissions by sources and removals by sinks resulting from additional human induced land use , land-use change and forestry activities may be used to meet the commitments under subparagraph ( a ) above of each party included in annex i , provided that these activities have taken place since 1990 .
'''

In [1104]:
with torch.no_grad():
    t = InferenceModel(example_input)

In [1110]:
node = t[0]
tokens = []
while node is not None:
    tokens.append(node.token.flatten().item())
    node = node.prev
" ".join(y_vocab.lookup_tokens(tokens))

'<EOS> . <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> البرامج هذه أن المتوقع ومن <SOS>'