### Package imports

In [1]:
import torch
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
from torchtext.legacy import datasets
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

import torch.optim as optim
import time
import random
import numpy as np
import torch.optim as optim

BATCH_SIZE = 10

#### parameters

In [2]:
BATCH_SIZE = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Data Processing

In [34]:
# step 1

removeTokens = lambda values: values[1:-1]   # function to remove [CLS] and [SEP] from the data set

tokens = Field(sequential=True,use_vocab=True,batch_first = True,preprocessing=removeTokens,lower=True,pad_token="0")
edits = Field(sequential=True,use_vocab=True,batch_first = True,preprocessing=removeTokens,lower=True,pad_token="0")

fields = {'tokens':('tokens',tokens),'labels':('edits',edits)}

train_data, test_data = TabularDataset.splits(path='data',train='ptrain.jsonl',test='val.jsonl',
                                              format='json',fields=fields)

# train_data is dataset with edits and tokens pair. in edits and tokens list of string is available

In [35]:
print(train_data)              # Tabular Data set object

print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())

<torchtext.legacy.data.dataset.TabularDataset object at 0x7fb5e27f1e50>
dict_keys(['tokens', 'edits'])
dict_values([['plus', ',', 'the', 'novelty', 'of', 'the', 'iphone', 'won', "'t", 'wear', 'off', ',', 'as', 'it', 'may', 'with', 'a', 'camcorder', ';', 'and', 'over', 'these', 'video', 'apps', 'have', 'fun', 'effects', 'that', 'a', 'camcorder', 'can', "'t", 'match', '.'], ['$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$replace_might', '$keep', '$keep', '$keep', '$replace_,', '$keep', '$delete', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep']])


In [36]:
train_data.examples    # to print tokens of any row

[<torchtext.legacy.data.example.Example at 0x7fb5e2163450>,
 <torchtext.legacy.data.example.Example at 0x7fb5e2224d90>,
 <torchtext.legacy.data.example.Example at 0x7fb5e1bc7850>,
 <torchtext.legacy.data.example.Example at 0x7fb5e21d0950>,
 <torchtext.legacy.data.example.Example at 0x7fb5e1fe7f50>,
 <torchtext.legacy.data.example.Example at 0x7fb5e27f1b10>,
 <torchtext.legacy.data.example.Example at 0x7fb5e21d0410>,
 <torchtext.legacy.data.example.Example at 0x7fb5e27f1750>,
 <torchtext.legacy.data.example.Example at 0x7fb60337c3d0>,
 <torchtext.legacy.data.example.Example at 0x7fb5e187aa90>]

In [37]:
# step 2  Build a vocab

tokens.build_vocab(train_data,min_freq=1,vectors='glove.6B.100d')
edits.build_vocab(train_data,min_freq=1,vectors='glove.6B.100d')

In [38]:
print(tokens)   # field object
print(tokens.vocab)   #vocab object
print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())
print(len(train_data[3].__dict__.values()))

# Note: even after the building vocab the train data will not change

<torchtext.legacy.data.field.Field object at 0x7fb5e27f15d0>
<torchtext.legacy.vocab.Vocab object at 0x7fb5e1aa8ed0>
dict_keys(['tokens', 'edits'])
dict_values([['plus', ',', 'the', 'novelty', 'of', 'the', 'iphone', 'won', "'t", 'wear', 'off', ',', 'as', 'it', 'may', 'with', 'a', 'camcorder', ';', 'and', 'over', 'these', 'video', 'apps', 'have', 'fun', 'effects', 'that', 'a', 'camcorder', 'can', "'t", 'match', '.'], ['$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$replace_might', '$keep', '$keep', '$keep', '$replace_,', '$keep', '$delete', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep']])
2


In [43]:
# Step 3 (Create a iterator to loop over the data. Also separate batchs with 
#         similar length and pad the extra space)

train_data_iterator, test_data_iterator = BucketIterator.splits((train_data,test_data),
                                                                batch_size=5)#,sort_within_batch=False)

In [44]:
batch_1 = next(iter(train_data_iterator))

In [45]:
print(batch_1.tokens.shape)
print(batch_1.edits.shape)

torch.Size([5, 41])
torch.Size([5, 41])


In [11]:
#string to index
print(f"tokens.vocab.stoi['$$$'] = {tokens.vocab.stoi['$$$']}")
print(f"tokens.vocab.itos[0] = {tokens.vocab.itos[0]}")
print(f"edits.vocab.stoi['$keep'] = {edits.vocab.stoi['$keep']}")
print(f"edits.vocab.itos[2] = {edits.vocab.itos[2]}")

tokens.vocab.stoi['$$$'] = 0
tokens.vocab.itos[0] = <unk>
edits.vocab.stoi['$keep'] = 2
edits.vocab.itos[2] = $keep


In [12]:
#length of vocabular create from the data set
print(f"len(tokens.vocab) = {len(tokens.vocab)}")
print(f"len(edits.vocab) = {len(edits.vocab)}")

len(tokens.vocab) = 174
len(edits.vocab) = 22


In [13]:
# no. of unique words in tokens and edits
print(f"len(tokens.vocab.freqs.keys()) = {len(tokens.vocab.freqs.keys())}")
print(f"len(edits.vocab.freqs.keys()) = {len(edits.vocab.freqs.keys())} \n")
print(f"edits.vocab.freqs = {edits.vocab.freqs}")

len(tokens.vocab.freqs.keys()) = 172
len(edits.vocab.freqs.keys()) = 20 

edits.vocab.freqs = Counter({'$keep': 259, '$delete': 10, '$replace_.': 4, '$transform_agreement_singular': 2, '$append_every': 2, '$replace_might': 1, '$replace_,': 1, '$transform_verb_vbz_vb': 1, '$append_.': 1, '$transform_case_lower': 1, '$append_had': 1, '$replace_the': 1, '$append_said': 1, '$replace_no': 1, '$transform_verb_vbn_vbg': 1, '$replace_by': 1, '$append_yesterday': 1, '$replace_so': 1, '$append_for': 1, '$replace_what': 1})


In [14]:
#shape of vocabular create from the data set
print(f"tokens.vocab.vectors.shape = {tokens.vocab.vectors.shape}")
print(f"edits.vocab.vectors.shape = {edits.vocab.vectors.shape}")
print()

tokens.vocab.vectors.shape = torch.Size([174, 100])
edits.vocab.vectors.shape = torch.Size([22, 100])



build vocab just takes unique tokens from the dataset and given a position and stores
as a dictionary. when it is applied to the dataset the result comming from the 
bucket iteartor is just a postion no. from the build vocab and the rest is padded

### Model

In [15]:
class Seq_to_edits(nn.Module):
    def __init__(self,token_vocab_size : int        # num of unique token in the entire dataset 
               ,edit_vocab_size : list             # [batch_size,max sentence size in the batch]
               ,output_dim : list            # [batch_size,max sentence size in the batch]
               ,embed_dim : int              # embedding dimension
               ,num_of_filters : int         # to num of filters
               ,filter_sizes : list          # list of sizes of each filter
               ,dropout : int                # dropout percent ()
               ,pad_index : int              # padding index from the vocab
               ,max_lenght : int = 100):     # Maximum words in a list of tokens or sentence
               
        
        
        super().__init__()
        self.device = device
        
        # Embedding layer
        self.token_embedding_layer = nn.Embedding(token_vocab_size,embed_dim)
        self.position_embedding_layer = nn.Embedding(max_lenght,embed_dim)
        
        # Multiple conv 1D layers 
        self.conv_layers = nn.ModuleList([nn.Conv1d(in_channels = embed_dim,
                                                    out_channels= embed_dim,
                                                    kernel_size = filter_size,
                                                    padding = "same") 
                                          for filter_size in filter_sizes])
        
        self.final_layer = nn.Linear(embed_dim, edit_vocab_size)
        # Dropout layers
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, tokens):
        
        # tokens size is [batch_size, max sentence size in the batch/token size]
        
        batch_size , num_of_tokens = tokens.shape
        
        position = torch.arange(0,num_of_tokens).unsqueeze(0)         # [1,num_of_tokens]
        position = position.repeat(batch_size,1).to(self.device)    # [batch_size, num_of_tokens]
        
        
        # apply embedding layer
        token_embeddings    = self.token_embedding_layer(tokens)
        position_embeddings = self.position_embedding_layer(position)
        
        # token_embeddings = position_embeddings = [batch_size, num_of_tokens,embedding dimension]
        
        print(f"embed dim = {token_embeddings.shape[2]}")
        combination = self.dropout(token_embeddings+position_embeddings)
        # combination = [batch_size, num_of_tokens,embedding dimension]
        
        #permute for convolutional layer
        conved = combination.permute(0, 2, 1) 
        
        #conv_input = [batch size, emb dim, src len]
        
        #begin convolutional blocks...
        
        for i, conv_layer in enumerate(self.conv_layers):
        
            #pass through convolutional layer
            conved = conv_layer(self.dropout(conved))
            #conved = [batch size, emd dim, token len]
            

            #pass through GLU activation function
            conved = F.relu(conved)

            #conved = [batch size, emd dim, token len]
        
        conved = conved.squeeze(1).reshape(batch_size,100,-1).permute(0, 2, 1)
        
        output = self.final_layer(conved)
         print(f"output.shape = {output.shape}") 


        return output.permute(0, 2, 1)
        
        

In [16]:
learning_rate = 0.0001
num_epochs = 10
INPUT_DIM = len(tokens.vocab)
OUTPUT_DIM = len(edits.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 3
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = len(edits.vocab)
DROPOUT = 0.5
PAD_IDX = tokens.vocab.stoi[tokens.pad_token]
UNK_IDX = tokens.vocab.stoi[tokens.unk_token]

In [17]:
# model is created

model = Seq_to_edits(INPUT_DIM,OUTPUT_DIM,OUTPUT_DIM,EMBEDDING_DIM,N_FILTERS,FILTER_SIZES,DROPOUT,PAD_IDX)

In [18]:
# trainable parameters are printed

count_parameters= lambda model:sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 149,922 trainable parameters


In [19]:
# load the pre-tained embeddings 

pretrained_embeddings = tokens.vocab.vectors

model.token_embedding_layer.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.6149,  0.9273,  0.5583,  ..., -0.0922, -0.0798, -1.2843],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.4221,  0.6307, -0.1291,  ...,  0.1045,  0.1854, -0.4767],
        [ 0.2783,  0.4995,  0.2494,  ..., -0.3274, -0.3004,  0.0930],
        [ 0.7090,  0.6810, -1.2128,  ...,  0.6917,  1.4080, -1.0227]])

In [20]:
# initialize zero weights for unknown and padding tokens.


model.token_embedding_layer.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.token_embedding_layer.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [21]:
# Initialize optimizer and loss function

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

n_total_steps = len(train_data_iterator)
model = model.to(device)
criterion = criterion.to(device)
n_total_steps = len(train_data_iterator)

In [22]:
from torchsummary import summary

In [23]:
help(summary)

Help on function summary in module torchsummary.torchsummary:

summary(model, input_size, batch_size=-1, device='cuda')



In [None]:
summary(model)

In [24]:
model

Seq_to_edits(
  (token_embedding_layer): Embedding(174, 100)
  (position_embedding_layer): Embedding(100, 100)
  (conv_layers): ModuleList(
    (0): Conv1d(100, 100, kernel_size=(3,), stride=(1,), padding=same)
    (1): Conv1d(100, 100, kernel_size=(4,), stride=(1,), padding=same)
    (2): Conv1d(100, 100, kernel_size=(5,), stride=(1,), padding=same)
  )
  (final_layer): Linear(in_features=100, out_features=22, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

### Model(rough)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)



In [25]:

for epoch in range(num_epochs):
    for i, batch in enumerate(train_data_iterator):
        optimizer.zero_grad()


        # converting to cpu or gpu variable
        tokens_list = batch.tokens.to(device)
        edits_list = batch.edits.to(device)

        # Forward pass
        outputs = model(tokens_list)
        loss = criterion(outputs, edits_list)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        break 
        if (i+1) % 2000 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')



print('Finished Training')
PATH = './cnn.pth'
torch.save(model.state_dict(), PATH)


bs x tok_len 5 34
embed dim = 100
conv_input  = torch.Size([5, 100, 34])
i = 0 
 conved = torch.Size([5, 100, 34])
i = 1 
 conved = torch.Size([5, 100, 34])
i = 2 
 conved = torch.Size([5, 100, 34])
conved.shape = torch.Size([5, 34, 100])
output.shape = torch.Size([5, 34, 22])
bs x tok_len 5 41
embed dim = 100
conv_input  = torch.Size([5, 100, 41])
i = 0 
 conved = torch.Size([5, 100, 41])
i = 1 
 conved = torch.Size([5, 100, 41])
i = 2 
 conved = torch.Size([5, 100, 41])
conved.shape = torch.Size([5, 41, 100])
output.shape = torch.Size([5, 41, 22])
bs x tok_len 5 37
embed dim = 100
conv_input  = torch.Size([5, 100, 37])
i = 0 
 conved = torch.Size([5, 100, 37])
i = 1 
 conved = torch.Size([5, 100, 37])
i = 2 
 conved = torch.Size([5, 100, 37])
conved.shape = torch.Size([5, 37, 100])
output.shape = torch.Size([5, 37, 22])
bs x tok_len 5 37
embed dim = 100
conv_input  = torch.Size([5, 100, 37])
i = 0 
 conved = torch.Size([5, 100, 37])
i = 1 
 conved = torch.Size([5, 100, 37])
i = 2 
 c

  self.padding, self.dilation, self.groups)


In [32]:
model.eval()
    
epoch_loss = 0

with torch.no_grad():

    for batch in test_data_iterator:

        src = batch.tokens
        trg = batch.edits

        output, _ = model(src, trg)

        #output = [batch size, trg len - 1, output dim]
        #trg = [batch size, trg len]

        output_dim = output.shape[-1]

        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)

        #output = [batch size * trg len - 1, output dim]
        #trg = [batch size * trg len - 1]

        loss = criterion(output, trg)

        epoch_loss += loss.item()

print( epoch_loss / len(iterator))

TypeError: '<' not supported between instances of 'Example' and 'Example'

### Rough work

In [None]:
l=['plus', ',', 'the', 'novelty', 'of', 'the', 'iphone', 'won', "'t", 'wear', 'off', ',', 'as', 'it', 'may', 'with', 'a', 'camcorder', ';', 'and', 'over', 'these', 'video', 'apps', 'have', 'fun', 'effects', 'that', 'a', 'camcorder', 'can', "'t", 'match', '.']
l1=['$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$replace_might', '$keep', '$keep', '$keep', '$replace_,', '$keep', '$delete', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep']
print(len(l))
print(len(l1))

In [None]:
n_embeddings, dim = 10,4 #10 - # of vocac size 4 - # of emdebbing dimension

In [None]:
m = nn.Conv1d(1, 2, 3)
input = torch.randn(2, 1, 5) # (batch size, no. of channel, # of words)
# x = emb_1(input)
output = m(input)

In [None]:
input.shape

In [None]:
output.shape

In [None]:
from torch.nn import Embedding as emb

emb_1 = emb(188, 50)
print(f"embedding {emb_1}")
print(f"embedding weight's shape {emb_1.weight.shape}") #requires_grad=True therefore the matrix is learnable

print(f"values of weight {emb_1.weight}")

for batch in train_data_iterator:
    x = emb_1(batch.tokens)

In [None]:
x.shape

In [None]:
x = torch.tensor([[1,2,4],[1, 2, 3]])
print(x.shape)
x=x.repeat(4, 2)
print(x.shape)


In [None]:
import torch
N = 40
C_in = 40
L_in = 100
inputs = torch.rand([N, C_in, L_in])
padding = 3
kernel_size = 3
stride = 2
C_out = 10
x = torch.nn.Conv1d(C_in, C_out, kernel_size, stride=stride, padding=padding)
y = x(inputs)
print(y)
print(y.shape)

In [None]:
x_input = torch.randn([2,4,5])

# print(x_input)
x_input = x_input.permute(0,2,1).reshape(2,-1).unsqueeze(1)
print(x_input.shape)
print(x_input)

In [None]:
a = nn.Conv1d(in_channels =1 ,out_channels= 1,kernel_size = 1,padding="same")(x_input)
b = nn.Conv1d(in_channels =1 ,out_channels= 1,kernel_size = 1,padding="same")(a)
c = nn.Conv1d(in_channels =1 ,out_channels= 1,kernel_size = 3,padding="same")(b)
d = nn.Conv1d(in_channels =1 ,out_channels= 1,kernel_size = 4,padding="same")(c)

In [None]:
print(f"a = {a.shape}")
print(f"b = {b.shape}")
print(f"c = {c.shape}")
print(f"d = {d.shape}")

In [None]:
d = d.squeeze(1).reshape(2,5,-1)
d.shape


In [None]:
print(f"f = {f.shape}")

In [None]:
g = nn.Conv1d(in_channels =1 ,out_channels= 4,kernel_size = 3)(f)

In [None]:
print(f"g = {g.shape}")

In [None]:
f

In [None]:
x_input = torch.randn([2,4,5])

# print(x_input)
x_input = x_input.reshape(2,-1).unsqueeze(1)
print(x_input.shape)
print(x_input)

In [None]:
a1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 2,padding="same")(x_input)
b1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 1,padding="same")(x_input)
c1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 3,padding="same")(x_input)
d1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 4,padding="same")(x_input)

f1 = nn.Conv1d(in_channels = e1.shape[1],out_channels= 25,kernel_size = 1)(e1)

In [None]:
print(f"a1 = {a1.shape}")
print(f"b1 = {b1.shape}")
print(f"c1 = {c1.shape}")
print(f"d1 = {d1.shape}")

In [None]:
a1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 2,padding="same")(x_input)
b1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 1,padding="same")(x_input)
c1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 3,padding="same")(x_input)
d1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 4,padding="same")(x_input)
e1 = torch.cat([a1,b1,c1,d1],dim =1)
f1 = nn.Conv1d(in_channels = e1.shape[1],out_channels= 25,kernel_size = 1)(e1)

In [None]:
pooled1 = [a1,b1,c1,d1]

In [None]:
print(f"e1 = {e1.shape}")

In [None]:
print(f1.shape)