### Package imports

In [1]:
import torch
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
from torchtext.legacy import datasets
from torch.utils.data.dataset import Subset
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import random
import torch.optim as optim
import time
import random
import numpy as np
import torch.optim as optim
import time

#### parameters

In [2]:
BATCH_SIZE = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### seed initializing

In [3]:
seed=1234

random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)
torch.cuda.manual_seed(1234)
torch.backends.cudnn.determininistic=True

### Data Processing

In [4]:
# step 1

removeTokens = lambda values: values[1:-1]   # function to remove [CLS] and [SEP] from the data set

tokens = Field(sequential=True,use_vocab=True,batch_first = True,preprocessing=removeTokens,lower=True,pad_token="0")
edits = Field(sequential=True,use_vocab=True,batch_first = True,preprocessing=removeTokens,lower=True,pad_token="0")

fields = {'tokens':('tokens',tokens),'labels':('edits',edits)}

train_data, test_data = TabularDataset.splits(path='data_filter',train='ptrain.jsonl',test='val.jsonl',
                                              format='json',fields=fields)

# train_data is dataset with edits and tokens pair. in edits and tokens list of string is available


In [5]:
# step 2  Build a vocab

tokens.build_vocab(train_data,min_freq=1,vectors='glove.6B.100d')
edits.build_vocab(train_data,min_freq=1,vectors='glove.6B.100d')

In [6]:
# Step 3 (Create a iterator to loop over the data. Also separate batchs with 
#         similar length and pad the extra space)

sort_by_len = lambda x:len(x)

train_data_iterator = BucketIterator(train_data,train=True,
                                            batch_size=BATCH_SIZE, device= device)#,sort_within_batch=False)

test_data_iterator =BucketIterator(test_data,BATCH_SIZE,train=False,sort=False, device= device)#,sort_within_batch=False)



#### data processing result

In [7]:
print(train_data)              # Tabular Data set object

print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())

<torchtext.legacy.data.dataset.TabularDataset object at 0x7fcf10084a50>
dict_keys(['tokens', 'edits'])
dict_values([['darling', 'is', 'expected', 'to', 'announce', 'details', 'of', 'tax', 'cuts', 'and', 'plans', 'to', 'increases', 'public'], ['$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$transform_verb_vbz_vb', '$keep']])


In [8]:
t,e = vars(test_data.examples[0])    # to print tokens of any row


In [9]:
batch_1 = next(iter(test_data_iterator))

In [10]:
print(batch_1.tokens)
print(batch_1.edits.shape)

tensor([[   10, 11375,  5268,  ...,     1,     1,     1],
        [ 3091,     0,  4113,  ...,     1,     1,     1],
        [  179,   125,   120,  ...,     1,     1,     1],
        ...,
        [  539,   270,    25,  ...,     1,     1,     1],
        [   28,  3302,    11,  ...,     1,     1,     1],
        [    6,     0,   581,  ...,     1,     1,     1]])
torch.Size([50, 46])


In [11]:
#string to index
print(f"tokens.vocab.stoi['$$$'] = {tokens.vocab.stoi['$$$']}")
print(f"tokens.vocab.itos[0] = {tokens.vocab.itos[0]}")
print(f"edits.vocab.stoi['$keep'] = {edits.vocab.stoi['$keep']}")
print(f"edits.vocab.itos[2] = {edits.vocab.itos[2]}")

tokens.vocab.stoi['$$$'] = 0
tokens.vocab.itos[0] = <unk>
edits.vocab.stoi['$keep'] = 2
edits.vocab.itos[2] = $keep


In [12]:
#length of vocabular create from the data set
print(f"len(tokens.vocab) = {len(tokens.vocab)}")
print(f"len(edits.vocab) = {len(edits.vocab)}")

len(tokens.vocab) = 63463
len(edits.vocab) = 22


In [13]:
# no. of unique words in tokens and edits
print(f"len(tokens.vocab.freqs.keys()) = {len(tokens.vocab.freqs.keys())}")
print(f"len(edits.vocab.freqs.keys()) = {len(edits.vocab.freqs.keys())} \n")
print(f"edits.vocab.freqs = {edits.vocab.freqs}")

len(tokens.vocab.freqs.keys()) = 63462
len(edits.vocab.freqs.keys()) = 20 

edits.vocab.freqs = Counter({'$keep': 1051580, '$delete': 39021, '$replace_,': 7424, '$transform_agreement_singular': 6413, '$append_,': 4747, '$append_the': 4689, '$replace_to': 3771, '$replace_the': 3711, '$replace_of': 3585, '$transform_verb_vbz_vb': 3344, '$replace_in': 2996, '$transform_verb_vbg_vb': 2799, '$transform_verb_vbn_vb': 2724, '$append_to': 2575, '$append_of': 2430, '$transform_agreement_plural': 2387, '$append_and': 2314, '$append_a': 2231, '$replace_.': 263, '$append_.': 144})


In [14]:
#shape of vocabular create from the data set
print(f"tokens.vocab.vectors.shape = {tokens.vocab.vectors.shape}")
print(f"edits.vocab.vectors.shape = {edits.vocab.vectors.shape}")
print()

tokens.vocab.vectors.shape = torch.Size([63463, 100])
edits.vocab.vectors.shape = torch.Size([22, 100])



build vocab just takes unique tokens from the dataset and given a position and stores
as a dictionary. when it is applied to the dataset the result comming from the 
bucket iteartor is just a postion no. from the build vocab and the rest is padded

### Model

In [15]:
class Seq_to_edits(nn.Module):
    def __init__(self,token_vocab_size : int        # num of unique token in the entire dataset 
               ,edit_vocab_size : list             # [batch_size,max sentence size in the batch]
               ,output_dim : list            # [batch_size,max sentence size in the batch]
               ,embed_dim : int              # embedding dimension
               ,num_of_filters : int         # to num of filters
               ,filter_sizes : list          # list of sizes of each filter
               ,dropout : int                # dropout percent ()
               ,pad_index : int              # padding index from the vocab
               ,max_lenght : int = 100):     # Maximum words in a list of tokens or sentence
               
        
        
        super().__init__()
        self.device = device
        
        # Embedding layer
        self.token_embedding_layer = nn.Embedding(token_vocab_size,embed_dim)
        self.position_embedding_layer = nn.Embedding(max_lenght,embed_dim)
        
        # Multiple conv 1D layers 
        self.conv_layers = nn.ModuleList([nn.Conv1d(in_channels = 1,
                                                    out_channels= 1,
                                                    kernel_size = filter_size,
                                                    padding = "same") 
                                          for filter_size in filter_sizes])
        
        self.final_layer = nn.Linear(embed_dim, edit_vocab_size)
        
        # Dropout layers
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
        
    def forward(self, tokens):
        
        # tokens size is [batch_size, max sentence size in the batch/token size]
        
        batch_size , num_of_tokens = tokens.shape
        
        position = torch.arange(0,num_of_tokens).unsqueeze(0)         # [1,num_of_tokens]
        position = position.repeat(batch_size,1).to(self.device)    # [batch_size, num_of_tokens]
        
        # apply embedding layer
        token_embeddings    = self.token_embedding_layer(tokens)
        # token_embeddings = [batch_size, num_of_tokens,embedding dimension]
        
        # elementwise sum of tokens and position 
        combination = self.dropout(token_embeddings)
        # combination = [batch_size, num_of_tokens,embedding dimension]
        #print(f"batch_size, num_of_tokens,embedding dimension = {combination.shape}")

        # Apply perute similar to transposing in multi dimension
        conved = combination.permute(0, 2, 1).reshape(batch_size,-1).unsqueeze(1)
        #conv_input = [batch size, 1, embedding dimension*num_of_tokens]
        #print(f"conv lay input = {conved.shape}")
        
        # multiple convolution and activation layers
        for i, conv_layer in enumerate(self.conv_layers):
            #pass through convolutional layer
            conved = conv_layer(self.dropout(conved))
            #conved = [batch size, 1, embedding dimension*num_of_tokens]
            
            #pass through relu activation function
            conved = F.relu(conved)*self.scale

            #conved = [batch size, 1, embedding dimension*num_of_tokens]
        
        #print(f"conv lay output = {conved.shape}")
        
        conved = conved.squeeze(1).reshape(batch_size,num_of_tokens,-1)
        #conved = [batch_size, num_of_tokens, embedding_dimension]
        
        
        output = self.final_layer(conved+combination)*self.scale
        #output = [batch size, num_of_tokens, target dimension]
        #print(f"output = {output.shape}")
        
        return output

### Hyper parameters

In [16]:
learning_rate = 0.001
num_epochs = 50
clip = 0.1
INPUT_DIM = len(tokens.vocab)
OUTPUT_DIM = len(edits.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 3
FILTER_SIZES = [1,2,4]
OUTPUT_DIM = len(edits.vocab)
DROPOUT = 0.5
PAD_IDX = tokens.vocab.stoi[tokens.pad_token]
UNK_IDX = tokens.vocab.stoi[tokens.unk_token]

### Model creation

In [17]:
# model is created

model = Seq_to_edits(INPUT_DIM,OUTPUT_DIM,OUTPUT_DIM,EMBEDDING_DIM,N_FILTERS,FILTER_SIZES,DROPOUT,PAD_IDX)

In [18]:
# trainable parameters are printed

count_parameters= lambda model:sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,358,532 trainable parameters


In [19]:
# load the pre-tained embeddings 

pretrained_embeddings = tokens.vocab.vectors

model.token_embedding_layer.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.6149,  0.9273,  0.5583,  ..., -0.0922, -0.0798, -1.2843],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [20]:
# initialize zero weights for unknown and padding tokens.


model.token_embedding_layer.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.token_embedding_layer.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [21]:
# Initialize optimizer and loss function

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

n_total_steps = len(train_data_iterator)
model = model.to(device)
criterion = criterion.to(device)
n_total_steps = len(train_data_iterator)

In [22]:
model

Seq_to_edits(
  (token_embedding_layer): Embedding(63463, 100)
  (position_embedding_layer): Embedding(100, 100)
  (conv_layers): ModuleList(
    (0): Conv1d(1, 1, kernel_size=(1,), stride=(1,), padding=same)
    (1): Conv1d(1, 1, kernel_size=(2,), stride=(1,), padding=same)
    (2): Conv1d(1, 1, kernel_size=(4,), stride=(1,), padding=same)
  )
  (final_layer): Linear(in_features=100, out_features=22, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

### Train and test functions

In [23]:
def train_model(model,data_iterator,optimizer,criterion,clip):
    
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(train_data_iterator):
        
        # Make the gradient vector to zero so 
        #So not to added with the new gradient
        optimizer.zero_grad()

        # converting to cpu or gpu variable
        tokens_list = batch.tokens.to(device)
        edits_list = batch.edits.to(device)
        
        # Forward pass
        
        # get output from the model
        outputs = model(tokens_list)
        # outputs = [batch size, num_of_tokens, embedding dimension]
        
        outputs = outputs.contiguous().view(-1, OUTPUT_DIM)
        # outputs = [batch size, num_of_tokens*embedding dimension]
        trg = batch.edits.contiguous().view(-1)
        # trg = [batch size*num_of_tokens]
        
        loss = criterion(outputs, trg)

        # Backward and optimize
        
        # to calculate gradient
        loss.backward()
        #to avoid exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # to make the updates in the parameter
        optimizer.step()
        
        epoch_loss += loss.item()
        
    
    #to return the avg loss for this epoch to train the model
    return epoch_loss / len(data_iterator)

# print('Finished Training')
# PATH = './cnn.pth'
# torch.save(model.state_dict(), PATH)


In [24]:
def evaluate_model(model, data_iterator, criterion):
    
    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        n_correct = 0
        n_samples = 0
        for i, batch in enumerate(data_iterator):
            # converting to cpu or gpu variable
            tokens_list = batch.tokens.to(device)
            edits_list = batch.edits.to(device)
            
            outputs = model(tokens_list)
            # outputs = [batch size, num_of_tokens, embedding dimension]

            outputs = outputs.contiguous().view(-1, OUTPUT_DIM)
            # outputs = [batch size, num_of_tokens*embedding dimension]
            trg = batch.edits.contiguous().view(-1)
            # trg = [batch size*num_of_tokens]
            loss = criterion(outputs, trg)
            
            _, predicted = torch.max(outputs.data, 1)
            print(f"pre = {predicted}")
            print(f"edit = {edits_list}")
            epoch_loss += loss.item()
            n_samples += edits_list.size(0)
            n_correct += (predicted == edits_list).sum().item()

    acc = 100.0 * n_correct / n_samples
    #to return the avg loss for this epoch to train the model
    return epoch_loss / len(data_iterator),acc


#### actual train and testing

In [None]:
start = time.perf_counter()
try:
    for epoch in range(num_epochs):
        train_loss = train_model(model,train_data_iterator,optimizer,criterion,clip)
        print (f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.3f}')
except KeyboardInterrupt:
    end = time.perf_counter()

  self.padding, self.dilation, self.groups)


Epoch [1/50], Train Loss: 0.391
Epoch [2/50], Train Loss: 0.192


In [None]:
end - start

In [None]:
test_loss = evaluate_model(model, test_data_iterator, criterion)
print(f"Test Loss {test_loss:.3f}")

### Rough work

In [None]:
l=['plus', ',', 'the', 'novelty', 'of', 'the', 'iphone', 'won', "'t", 'wear', 'off', ',', 'as', 'it', 'may', 'with', 'a', 'camcorder', ';', 'and', 'over', 'these', 'video', 'apps', 'have', 'fun', 'effects', 'that', 'a', 'camcorder', 'can', "'t", 'match', '.']
l1=['$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$replace_might', '$keep', '$keep', '$keep', '$replace_,', '$keep', '$delete', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep']
print(len(l))
print(len(l1))

In [None]:
n_embeddings, dim = 10,4 #10 - # of vocac size 4 - # of emdebbing dimension

In [None]:
m = nn.Conv1d(1, 2, 3)
input = torch.randn(2, 1, 5) # (batch size, no. of channel, # of words)
# x = emb_1(input)
output = m(input)

In [None]:
input.shape

In [None]:
output.shape

In [None]:
from torch.nn import Embedding as emb

emb_1 = emb(188, 50)
print(f"embedding {emb_1}")
print(f"embedding weight's shape {emb_1.weight.shape}") #requires_grad=True therefore the matrix is learnable

print(f"values of weight {emb_1.weight}")

for batch in train_data_iterator:
    x = emb_1(batch.tokens)

In [None]:
x.shape

In [None]:
x = torch.tensor([[1,2,4],[1, 2, 3]])
print(x.shape)
x=x.repeat(4, 2)
print(x.shape)


In [None]:
import torch
N = 40
C_in = 40
L_in = 100
inputs = torch.rand([N, C_in, L_in])
padding = 3
kernel_size = 3
stride = 2
C_out = 10
x = torch.nn.Conv1d(C_in, C_out, kernel_size, stride=stride, padding=padding)
y = x(inputs)
print(y)
print(y.shape)

In [None]:
x_input = torch.randn([2,4,5])

# print(x_input)
x_input = x_input.permute(0,2,1).reshape(2,-1).unsqueeze(1)
print(x_input.shape)
print(x_input)

In [None]:
a = nn.Conv1d(in_channels =1 ,out_channels= 1,kernel_size = 1,padding="same")(x_input)
b = nn.Conv1d(in_channels =1 ,out_channels= 1,kernel_size = 1,padding="same")(a)
c = nn.Conv1d(in_channels =1 ,out_channels= 1,kernel_size = 3,padding="same")(b)
d = nn.Conv1d(in_channels =1 ,out_channels= 1,kernel_size = 4,padding="same")(c)

In [None]:
print(f"a = {a.shape}")
print(f"b = {b.shape}")
print(f"c = {c.shape}")
print(f"d = {d.shape}")

In [None]:
d = d.squeeze(1).reshape(2,5,-1)
d.shape


In [None]:
print(f"f = {f.shape}")

In [None]:
g = nn.Conv1d(in_channels =1 ,out_channels= 4,kernel_size = 3)(f)

In [None]:
print(f"g = {g.shape}")

In [None]:
x_input = torch.randn([2,4,5])

# print(x_input)
x_input = x_input.reshape(2,-1).unsqueeze(1)
print(x_input.shape)
print(x_input)

In [None]:
a1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 2,padding="same")(x_input)
b1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 1,padding="same")(x_input)
c1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 3,padding="same")(x_input)
d1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 4,padding="same")(x_input)

f1 = nn.Conv1d(in_channels = e1.shape[1],out_channels= 25,kernel_size = 1)(e1)

In [None]:
print(f"a1 = {a1.shape}")
print(f"b1 = {b1.shape}")
print(f"c1 = {c1.shape}")
print(f"d1 = {d1.shape}")

In [None]:
a1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 2,padding="same")(x_input)
b1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 1,padding="same")(x_input)
c1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 3,padding="same")(x_input)
d1 = nn.Conv1d(in_channels =4 ,out_channels= 4,kernel_size = 4,padding="same")(x_input)
e1 = torch.cat([a1,b1,c1,d1],dim =1)
f1 = nn.Conv1d(in_channels = e1.shape[1],out_channels= 25,kernel_size = 1)(e1)

In [None]:
pooled1 = [a1,b1,c1,d1]

In [None]:
print(f"e1 = {e1.shape}")

In [None]:
print(f1.shape)