
### Package imports

In [23]:
count = 1

In [24]:
import torch
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import torch.nn as nn
from torch.nn import Transformer
from torchmetrics.functional import precision_recall,f1_score,accuracy
import torch.optim as optim
import math
from torch import Tensor

from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
import numpy as np

import random
import time
from datetime import datetime
from collections import Counter

# from imblearn.over_sampling import RandomOverSampler
from torch.utils.tensorboard import SummaryWriter

### parameters

In [25]:
BATCH_SIZE = 64 #1,2,4,8,16,32,64,128,256,512,1028
path = "data_filter/"
# path = "small_data/" 
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

### seed initializing

In [26]:
seed=1234

random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)
torch.cuda.manual_seed(1234)
torch.backends.cudnn.determininistic=True

### Data Processing

In [27]:
# step 1

tokens = Field(sequential=True,use_vocab=True,batch_first = True,lower=True,pad_token="<pad>", init_token = '<sos>', eos_token = '<eos>')
edits = Field(sequential=True,use_vocab=True,batch_first = True,lower=True,pad_token="<pad>", init_token = '<sos>', eos_token = '<eos>')

fields = {'tokens':('tokens',tokens),'labels':('edits',edits)}

train_data, val_data, test_data = TabularDataset.splits(path=path,train='ptrain.jsonl',validation='val.jsonl',
                                                        test='test.jsonl',format='json',fields=fields)

In [28]:
# step 2  Build a vocab
tokens.build_vocab(train_data,min_freq=1)
edits.build_vocab(train_data,min_freq=1)

In [29]:
# Step 3 (Create a iterator to loop over the data. Also separate batchs with 
#         similar length and pad the extra space)
sort_key = lambda x: len(x.tokens)
train_data_iterator,val_data_iterator,test_data_iterator = BucketIterator.splits((train_data,val_data,test_data),
                                            batch_size=BATCH_SIZE, device= device,shuffle=True,sort_key=sort_key, sort_within_batch = True)

#### data processing result

In [30]:
len(train_data)

49896

In [31]:
len(val_data.examples)

378

In [32]:
print(train_data)              # Tabular Data set object

print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())

<torchtext.legacy.data.dataset.TabularDataset object at 0x7f7d0e562b90>
dict_keys(['tokens', 'edits'])
dict_values([['[cls]', 'alistair', 'darling', 'is', 'expected', 'to', 'announce', 'details', 'of', 'tax', 'cuts', 'and', 'plans', 'to', 'increases', 'public', 'spending', '[sep]'], ['$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$transform_verb_vbz_vb', '$keep', '$append_.', '$keep']])


In [33]:
batch_1 = next(iter(train_data_iterator))
print(batch_1.edits.shape)
print(batch_1.tokens.shape)

torch.Size([64, 23])
torch.Size([64, 23])


In [34]:
#string to index
print(f"tokens.vocab.stoi['0'] = {tokens.vocab.stoi['']}")
print(f"tokens.vocab.itos[0] = {tokens.vocab.itos[4]}")
print(f"edits.vocab.stoi['$keep'] = {edits.vocab.stoi['0']}")
print(f"edits.vocab.itos[1] = {edits.vocab.itos[4]}")

tokens.vocab.stoi['0'] = 0
tokens.vocab.itos[0] = the
edits.vocab.stoi['$keep'] = 0
edits.vocab.itos[1] = $keep


In [35]:
#length of vocabular create from the data set
print(f"len(tokens.vocab) = {len(tokens.vocab)}")
print(f"len(edits.vocab) = {len(edits.vocab)}")

len(tokens.vocab) = 64172
len(edits.vocab) = 24


In [36]:
# no. of unique words in tokens and edits
print(f"len(tokens.vocab.freqs.keys()) = {len(tokens.vocab.freqs.keys())}")
print(f"len(edits.vocab.freqs.keys()) = {len(edits.vocab.freqs.keys())} \n")
print(f"edits.vocab.freqs = {edits.vocab.freqs}")

len(tokens.vocab.freqs.keys()) = 64168
len(edits.vocab.freqs.keys()) = 20 

edits.vocab.freqs = Counter({'$keep': 1191726, '$delete': 39872, '$replace_.': 7235, '$replace_,': 7183, '$transform_agreement_singular': 6220, '$append_.': 5167, '$append_,': 4905, '$append_the': 4686, '$replace_to': 3634, '$replace_the': 3574, '$replace_of': 3458, '$transform_verb_vbz_vb': 3253, '$replace_in': 2898, '$transform_verb_vbg_vb': 2714, '$transform_verb_vbn_vb': 2637, '$append_to': 2499, '$append_of': 2413, '$transform_agreement_plural': 2340, '$append_and': 2272, '$append_a': 2204})


In [37]:
#shape of vocabular create from the data set
print(f"tokens.vocab.vectors.shape = {tokens}")
print(f"edits.vocab.vectors.shape = {edits}")
print()

tokens.vocab.vectors.shape = <torchtext.legacy.data.field.Field object at 0x7f7d0e5629d0>
edits.vocab.vectors.shape = <torchtext.legacy.data.field.Field object at 0x7f7d0e562990>



build vocab just takes unique tokens from the dataset and given a position and stores
as a dictionary. when it is applied to the dataset the result comming from the 
bucket iteartor is just a postion no. from the build vocab and the rest is padded

### Model

In [38]:
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 encoder_layer:int, # num of layer in encoder
                 emb_dim:int, #embedding dimension
                 head:int, #num of head
                 src_vocab_size:int,
                 trg_vocab_size:int,
                 feedforward_dim:int, 
                 src_pad_idx:int,
                 trg_pad_idx:int,
                 device:str,
                 dropout:float=0.0):
        
        super().__init__()
        
        self.head = head
        self.emb_dim = emb_dim
        self.device = device
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        
        #Embedding layer
        self.src_embedding_layer = nn.Embedding(src_vocab_size,emb_dim,device=device)
        self.trg_embedding_layer = nn.Embedding(trg_vocab_size,emb_dim,device=device)
        
        #transformer layer
        encoder_layers = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=head, dim_feedforward=feedforward_dim, dropout=dropout,batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers,encoder_layer)
        
        # self.transformer = nn.Transformer(d_model = emb_dim,
        #                                nhead = head,
        #                                num_encoder_layers = encoder_layer,
        #                                num_decoder_layers = decoder_layer,
        #                                dropout = dropout,
        #                                batch_first = True,
        #                                device = device)
        
        #Linear Layer
        self.linear_layer = nn.Linear(emb_dim,trg_vocab_size)
        
        # Dropout Layer
        self.dropout = nn.Dropout(dropout)

        # initrange = 1
        # self.src_embedding_layer.weight.data.uniform_(-initrange, initrange)
        # self.trg_embedding_layer.weight.data.uniform_(-initrange, initrange)

        # self.linear_layer.bias.data.zero_()
        # self.linear_layer.weight.data.uniform_(-initrange, initrange)

    def positional_embedding(self,length = 200):
        
        position = torch.arange(length).unsqueeze(1).to(self.device)    # [batch_size, num_of_tokens]
        denominator = torch.exp(torch.arange(0, self.emb_dim, 2) * (-math.log(10000.0) / self.emb_dim)).to(self.device)
        
        position_embedding = torch.zeros((length, self.emb_dim),device=self.device)
        position_embedding[:,0::2] = torch.sin(position*denominator)
        position_embedding[:,1::2] = torch.cos(position*denominator)
        
        position_embedding = position_embedding.unsqueeze(0)
        # position_embedding = (1,lenght,emb_dim)
        position_embedding.requires_grad = True
        return position_embedding
                                                      
    def make_padding_mask(self,template,idx):
        #mask = [batch size, src_len/trg_len]
        return (template == idx)
    
    def trg_mask(self,trg):
        mask = (torch.triu(torch.ones((trg, trg), device=self.device)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
    
    def forward(self,
                src : Tensor, #(batch_size,src_len)
                trg : Tensor): #(batch_size,trg_len) 
                #in this case src_len == trg_len
               
        
        batch_size , src_len  = src.shape
        trg_len = src_len # depends upon the senario
        
        # Applying embedding layer
        trg_mask = self.trg_mask(trg_len)
        src_pad_mask = self.make_padding_mask(src,self.src_pad_idx)
        trg_pad_mask = self.make_padding_mask(trg,self.trg_pad_idx)
        
        src_emb = self.src_embedding_layer(src)+self.positional_embedding(src_len)
        trg_emb = self.trg_embedding_layer(trg)+self.positional_embedding(trg_len)
        src_emb = self.dropout(src_emb).to(self.device)
        trg_emb = self.dropout(trg_emb).to(self.device)
        # src_emb = trg_emb = (batch_size,src_len/trg_len,emb_dim)
        
        # print(f"src_emb[0] {src.shape}")
        # print(f"trg_emb[0] {trg.shape}")
            
        # Apply transformer layer
        transformer_output = self.transformer_encoder(src_emb)
        
        # print(f"encoder output {transformer_output.shape}")
        
        # transformer_output = (batch_size,trg_len,emb_dim)
        # print(f"transformer_output {transformer_output[0]}")
        
        # Apply Linear layer
        output = self.linear_layer(transformer_output)
        # print(f"output {output.shape} ")
        # output = (batch_size,trg_len,num_class)
        
        return output.permute(0,2,1)
        
    


### Utility functions

In [39]:
def train_model(model,data_iterator,optimizer,criterion,clip,n_classes):
    
    model.train()
    epoch_loss,acc,f1_point = 0,0,0
        
    for i, batch in enumerate(train_data_iterator):
        
        optimizer.zero_grad()

        src = batch.tokens.to(device)
        trg = batch.edits.to(device)

        
        output = model(src, trg)
 
        loss = criterion(output, trg)
        
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        
        predicted = torch.argmax(output, dim=1)
        
        acc += accuracy(predicted, trg).item()
        f1_point += f1_score(predicted, trg,average="macro",num_classes=n_classes,mdmc_average='global').item()
        
    acc = 100.0 * acc / len(data_iterator)
    f1_point = f1_point / len(data_iterator)
    epoch_loss = epoch_loss / len(data_iterator)
    
    return (epoch_loss,acc,f1_point)

In [40]:
def evaluate_model(model, data_iterator, criterion, n_classes):
    
    model.eval()

    epoch_loss,acc,f1_point = 0,0,0
    f1_score_n_classes = torch.zeros(n_classes).to(device)

    with torch.no_grad():
        for i, batch in enumerate(data_iterator):
            
            src = batch.tokens.to(device)
            trg = batch.edits.to(device)
            
            output = model(src, trg)

            loss = criterion(output,trg)
            
            epoch_loss += loss.item()
            
            
            predicted = torch.argmax(output, dim=1)
            
            acc += accuracy(predicted, trg).item() 
            f1_point += f1_score(predicted, trg,average="macro",num_classes=n_classes,mdmc_average='global').item()
            
            
            f1_score_n_classes += torch.nan_to_num(f1_score(predicted,trg, mdmc_average = 'global', average = 'none', num_classes = n_classes), nan = 0)
            
    f1_score_n_classes = f1_score_n_classes/len(data_iterator)
    acc = 100.0 * acc / len(data_iterator)
    f1_point = f1_point / len(data_iterator)
    epoch_loss = epoch_loss / len(data_iterator)
    
    return (epoch_loss,acc,f1_point,f1_score_n_classes)

### Hyper parameters

In [44]:
learning_rate = 0.001
num_epochs = 50
clip = 0.1
num_encoder_layer = 6
num_decoder_layer = 3
INPUT_DIM = len(tokens.vocab)
OUTPUT_DIM = len(edits.vocab)
HIDDEN_DIM = 256
EMBEDDING_DIM = 100
heads = 4
DROPOUT = 0.1
PAD_IDX = tokens.vocab.stoi[tokens.pad_token]
UNK_IDX = tokens.vocab.stoi[tokens.unk_token]
EDIT_PAD_IDX = edits.vocab.stoi[edits.pad_token]

In [45]:
512/8

64.0

### Model creation

In [46]:
# model is created

model = Seq2SeqTransformer(num_encoder_layer,EMBEDDING_DIM,heads,INPUT_DIM,OUTPUT_DIM,HIDDEN_DIM,PAD_IDX,EDIT_PAD_IDX,device,DROPOUT)
# (num_encoder_layer,num_decoder_layer,EMBEDDING_DIM,heads,INPUT_DIM,OUTPUT_DIM,HIDDEN_DIM,PAD_IDX,EDIT_PAD_IDX,device,DROPOUT)


In [47]:
# Initialize optimizer and loss function

criterion = nn.CrossEntropyLoss(ignore_index=EDIT_PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)#,weight_decay=weight_decay, amsgrad=amsgrad)
# scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

n_total_steps = len(train_data_iterator)
model = model.to(device)
criterion = criterion.to(device)

In [48]:
# initialize zero weights for unknown and padding tokens.

# trainable parameters are printed

count_parameters= lambda model:sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,976,160 trainable parameters


### Train and testing

In [49]:
writer = SummaryWriter(f'runs/transformerEncoder/big_dataset/test_3')
count+=1

In [50]:
start = time.perf_counter_ns()
try:
    for epoch in range(num_epochs):
        
        train_loss,train_acc,train_f1_score = train_model(model,train_data_iterator,optimizer,criterion,clip,OUTPUT_DIM)
        val_loss,val_acc,val_f1_score,val_f1_score_n_class = evaluate_model(model, val_data_iterator, criterion,OUTPUT_DIM)
        
        if epoch%5 == 0:
            pass

        print (f'''Epoch [{epoch+1}/{num_epochs}],
        Train:       Loss: {train_loss:.3f}
        Validation:  Loss: {val_loss:.3f}, Accuracy: {val_acc:.3f},  F1 score: {val_f1_score:.3f}
        f1_score_n_class:-    
        {val_f1_score_n_class}''')
        print("----------------------------------------------------------------------------------")
        
        writer.add_scalars('LOSS',     { 'Train' : train_loss   ,'Validation' : val_loss    },  epoch)
        writer.add_scalars('ACCURACY', { 'Train': train_acc     ,'Validation': val_acc      }, epoch)
        writer.add_scalars('F1 SCORE', { 'Train': train_f1_score,'Validation': val_f1_score }, epoch)
    
finally:
    end = time.perf_counter_ns()
    timetaken = (end-start)*1.66667*10**-11
    print(f"time take is {timetaken:.3f} min")
    # torch.save(model, path+'/model.pt')


Epoch [1/50],
        Train:       Loss: 0.448
        Validation:  Loss: 0.364, Accuracy: 80.521,  F1 score: 0.219
        f1_score_n_class:-    
        tensor([0.0000, 0.0000, 1.0000, 1.0000, 0.8797, 0.0074, 0.8357, 0.6670, 0.0000,
        0.0556, 0.0000, 0.0000, 0.0000, 0.0000, 0.0370, 0.0000, 0.4897, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], device='cuda:1')
----------------------------------------------------------------------------------
Epoch [2/50],
        Train:       Loss: 0.365
        Validation:  Loss: 0.330, Accuracy: 80.842,  F1 score: 0.254
        f1_score_n_class:-    
        tensor([0.0000, 0.0000, 1.0000, 1.0000, 0.8811, 0.0537, 0.9070, 0.6618, 0.1877,
        0.5262, 0.0000, 0.0000, 0.0000, 0.0000, 0.0476, 0.0000, 0.5222, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], device='cuda:1')
----------------------------------------------------------------------------------
Epoch [3/50],
        Train:       Loss: 0.343
        V

In [None]:
test_loss,test_accuracy,f1,f2 = evaluate_model(model, test_data_iterator, criterion,OUTPUT_DIM)
print(f"Test Loss {test_loss:.3f} Test accuracy {test_accuracy:.3f} F1 score {f1:.3f}")

Test Loss 0.283 Test accuracy 93.363 F1 score 0.318


In [None]:
# writing the hyerparameter to the tensor board
writer.add_hparams({'learning_rate' : learning_rate, 
                    'Num_epochs': num_epochs,
                    'input_dim' : INPUT_DIM,
                    'output_dim' : OUTPUT_DIM,
                    'hidden_dim' : HIDDEN_DIM,
                    'embedding_dim' : EMBEDDING_DIM,
                    'droppout' : DROPOUT,
                    'train_data_len': len(train_data),
                    'Val_data_len': len(test_data),
                    'batch_size': BATCH_SIZE,
                  },
                  {
                      "test_loss":test_loss,
                      "test_accuracy":test_accuracy,
                      "test_f1_score":f1
                  })
writer.close()

### Rough work

In [None]:
a = torch.randn(2, 5, 4)
a

In [None]:
torch.argmax(a, dim=1).shape

In [None]:
torch.argmax(a, dim=-1)

In [None]:
y_act = torch.Tensor([[2,1],[1,3]]).int()
# batch size, tokens
print(f" output shape {y_act.shape}")
s1 = torch.Tensor([[[0,2,3,0],[5,0,1,3]],[[1,5,2,3],[1,5,6,13]]])
# batch size, tokens, emb lay

print(f" predicted shape {s1.shape}")
y_pred = torch.argmax(s1,dim=-1)


In [None]:
print(y_pred)
print(y_act)

In [None]:
# print(f1_score(y_act,s1))
print(accuracy_score(y_act,y_pred))

In [None]:
precision_recall(y_pred, y_act, average="macro",num_classes=4,mdmc_average='samplewise')

In [None]:
f1_score(y_pred,y_act,average="macro",num_classes=4,mdmc_average='global')

In [None]:
p, r = precision_recall(y_pred, y_act, average="macro",num_classes=4,mdmc_average='global')
print(p)
print(r)

In [None]:
(2*p.item()*r.item())/(p.item()+r.item())

###  THE END