### Imports

In [47]:
import torch
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
from torchtext.legacy import datasets
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

import torch.optim as optim
import time
import random
import numpy as np

### HyperParameter

In [48]:
#hypa
BATCH_SIZE = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Data Processing

In [49]:
removeTokens = lambda values: values[1:-1]   # function to remove [CLS] and [SEP] from the data set

tokens = Field(sequential=True,use_vocab=True,preprocessing=removeTokens,lower=True,init_token="<sos>",eos_token="<eos>",pad_token="0")
edits = Field(sequential=True,use_vocab=True,preprocessing=removeTokens,lower=True,init_token="<sos>",eos_token="<eos>",pad_token="0")

fields = {'tokens':('tokens',tokens),'labels':('edits',edits)}

train_data, test_data = TabularDataset.splits(path='data',train='ptrain.jsonl',test='val.jsonl',
                                              format='json',fields=fields)

# train_data is dataset with edits and tokens pair. in edits and tokens list of string is available

In [70]:
print(train_data)
print(type(train_data[0]))
print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())

<torchtext.legacy.data.dataset.TabularDataset object at 0x7fa4e0037fd0>
<class 'torchtext.legacy.data.example.Example'>
dict_keys(['tokens', 'edits'])
dict_values([['plus', ',', 'the', 'novelty', 'of', 'the', 'iphone', 'won', "'t", 'wear', 'off', ',', 'as', 'it', 'may', 'with', 'a', 'camcorder', ';', 'and', 'over', 'these', 'video', 'apps', 'have', 'fun', 'effects', 'that', 'a', 'camcorder', 'can', "'t", 'match', '.'], ['$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$replace_might', '$keep', '$keep', '$keep', '$replace_,', '$keep', '$delete', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep']])


In [85]:
tokens.build_vocab(train_data,min_freq=1,vectors='glove.6B.50d')
edits.build_vocab(train_data,min_freq=1,vectors='glove.6B.50d')

In [72]:
print(tokens)
print(tokens.vocab)
# print(train_data[0].__dict__.values())

<torchtext.legacy.data.field.Field object at 0x7fa4e0037e90>
<torchtext.legacy.vocab.Vocab object at 0x7fa4da7c9450>


In [73]:
train_data_iterator, test_data_iterator = BucketIterator.splits((train_data,test_data),
                                                                batch_size=BATCH_SIZE)
   

In [74]:
tokens.vocab.stoi["<pad>"]

0

In [75]:
tokens.vocab.itos[2]

'<sos>'

In [76]:
edits.vocab.stoi["$keep"]

4

In [77]:
edits.vocab.itos[4]

'$keep'

In [83]:
tokens.vocab.vectors.shape

torch.Size([188, 50])

In [86]:
edits.vocab.vectors.shape

torch.Size([25, 50])

In [95]:
print(len(tokens.vocab.freqs.keys()))

184


In [96]:
print(len(edits.vocab.freqs.keys()))

21


In [97]:
print(edits.vocab.freqs)

Counter({'$keep': 240, '$delete': 9, '$replace_.': 4, '$transform_agreement_singular': 2, '$replace_might': 1, '$replace_,': 1, '$transform_verb_vbz_vb': 1, '$append_.': 1, '$transform_case_lower': 1, '$append_had': 1, '$replace_the': 1, '$append_said': 1, '$replace_no': 1, '$transform_verb_vbn_vbg': 1, '$replace_by': 1, '$append_yesterday': 1, '$replace_are': 1, '$replace_so': 1, '$append_for': 1, '$replace_what': 1, '$append_every': 1})


In [24]:
s = set()
for b in train_data_iterator:
    s.add(b.edits)
len(s)

1

#### Rough work

In [25]:
l=['plus', ',', 'the', 'novelty', 'of', 'the', 'iphone', 'won', "'t", 'wear', 'off', ',', 'as', 'it', 'may', 'with', 'a', 'camcorder', ';', 'and', 'over', 'these', 'video', 'apps', 'have', 'fun', 'effects', 'that', 'a', 'camcorder', 'can', "'t", 'match', '.']
l1=['$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$replace_might', '$keep', '$keep', '$keep', '$replace_,', '$keep', '$delete', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep']
print(len(l))
print(len(l1))

34
34


In [32]:
from torch.nn import Embedding as emb

In [33]:
n_embeddings, dim = 10,4 #10 - # of vocac size 4 - # of emdebbing dimension

In [41]:
emb_1 = emb(n_embeddings, dim)
print(emb_1)
print(emb_1.weight.shape) #requires_grad=True therefore the matrix is learnable

print(emb_1.weight)

Embedding(10, 4)
torch.Size([10, 4])
Parameter containing:
tensor([[ 0.4167, -0.0727,  0.2211,  0.5794],
        [-0.4527,  0.2912, -0.1159, -1.0680],
        [-0.4339, -0.0446,  1.5780, -0.8122],
        [ 0.5294,  1.0690, -0.8220, -1.6761],
        [ 0.4288,  0.7014, -1.3402,  0.1515],
        [-0.7275, -0.9264,  0.6392,  0.6486],
        [ 0.8664, -0.6618,  0.4202, -1.0387],
        [ 0.4916,  0.3138, -0.7286, -0.4734],
        [-0.3453,  0.0581,  0.1727, -1.4853],
        [ 0.4743, -0.3263, -0.3663,  0.0114]], requires_grad=True)
