### Imports

In [129]:
import torch
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
from torchtext.legacy import datasets
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

import torch.optim as optim
import time
import random
import numpy as np

### HyperParameter

In [148]:
BATCH_SIZE = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Data Processing

In [149]:
removeTokens = lambda values: values[1:-1]   # function to remove [CLS] and [SEP] from the data set

tokens = Field(sequential=True,use_vocab=True,batch_first = True,preprocessing=removeTokens,lower=True,init_token="<sos>",eos_token="<eos>",pad_token="0")
edits = Field(sequential=True,use_vocab=True,batch_first = True,preprocessing=removeTokens,lower=True,init_token="<sos>",eos_token="<eos>",pad_token="0")

fields = {'tokens':('tokens',tokens),'labels':('edits',edits)}

train_data, test_data = TabularDataset.splits(path='data',train='ptrain.jsonl',test='val.jsonl',
                                              format='json',fields=fields)

# train_data is dataset with edits and tokens pair. in edits and tokens list of string is available

In [150]:
print(train_data)
print(type(train_data[0]))
print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())

<torchtext.legacy.data.dataset.TabularDataset object at 0x7fd4481806d0>
<class 'torchtext.legacy.data.example.Example'>
dict_keys(['tokens', 'edits'])
dict_values([['plus', ',', 'the', 'novelty', 'of', 'the', 'iphone', 'won', "'t", 'wear', 'off', ',', 'as', 'it', 'may', 'with', 'a', 'camcorder', ';', 'and', 'over', 'these', 'video', 'apps', 'have', 'fun', 'effects', 'that', 'a', 'camcorder', 'can', "'t", 'match', '.'], ['$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$replace_might', '$keep', '$keep', '$keep', '$replace_,', '$keep', '$delete', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep']])


In [151]:
tokens.build_vocab(train_data,min_freq=1,vectors='glove.6B.100d')
edits.build_vocab(train_data,min_freq=1,vectors='glove.6B.100d')

In [160]:
print(tokens)
print(tokens.vocab)
print(train_data[0].__dict__.keys())
print(train_data[-1].__dict__.values())
print(len(train_data[-1].__dict__.values()))

<torchtext.legacy.data.field.Field object at 0x7fd448180c90>
<torchtext.legacy.vocab.Vocab object at 0x7fd448179e10>
dict_keys(['tokens', 'edits'])
dict_values([['so', 'even', 'now', ',', 'weekend', 'i', 'tell', 'my', 'children', 'to', 'bring', 'their', 'children', 'so', 'i', 'one', 'can', 'see', 'my', 'children', 'as', 'well', 'as', 'my', 'life', 'grandchildren', 'here', 'or', 'elsewhere', 'almost', 'once', 'a', 'week', ',', 'so', 'even', 'now', ',', 'weekend', 'i', 'tell', 'my', 'children', 'to', 'bring', 'their', 'children', 'so', 'i', 'one', 'can', 'see', 'my', 'children', 'as', 'well', 'as', 'my', 'life', 'grandchildren', 'here', 'or', 'elsewhere', 'almost', 'once', 'a', 'week', ','], ['$keep', '$keep', '$keep', '$append_every', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$delete', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$delete', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', 

In [153]:
train_data_iterator, test_data_iterator = BucketIterator.splits((train_data,test_data),
                                                                batch_size=BATCH_SIZE)

In [154]:
# max_size_in_dataset = max(len(train_data))
maxValue = max(train_data_iterator, key=lambda x: len(x))

In [155]:
maxValue


[torchtext.legacy.data.batch.Batch of size 5]
	[.tokens]:[torch.LongTensor of size 5x40]
	[.edits]:[torch.LongTensor of size 5x40]

#### rough for data processing

In [99]:
for batch in train_data_iterator:
    print(batch.tokens.shape)
    print(batch.tokens)

torch.Size([43, 10])
tensor([[  2,   2,   2,   2,   2,   2,   2,   2,   2,   2],
        [ 30, 152, 132, 167,  56,  15,  11,  42, 142, 119],
        [ 77,  19,   5,   9, 179, 138, 163,  23,  87,  19],
        [ 29, 115,   4,  58, 109,   5, 150,  27, 114, 156],
        [  5,  23, 120,  83,  35,   7, 127,  80, 160, 118],
        [177,  34,   6,  15, 154,   5, 146,  10,  37, 122],
        [ 26,  62,   4,  28,  47,  24,  15,  44, 117,  50],
        [159,  10, 103,  34,  49,   6,   7,  69,  60,   4],
        [ 16,  25, 182, 140,   5,  55, 145,   6,   7,  11],
        [ 14,  91,  18,  38, 180,  93,  17, 157, 129, 153],
        [ 10,   4, 174,  98,  82,  29,  32,  66, 170,  11],
        [ 57, 168, 121,  78, 101, 151,  84,   9,   6,   4],
        [ 32,  71,   5, 184, 137, 133,   5, 131,  81, 107],
        [ 14, 165,  12,  86,  41,  48,  11,  10, 105,   5],
        [ 30,   4, 104,   4,  20, 123,  52,  97,   9,   3],
        [ 26, 183, 111, 116,   3,   4,  94, 134,   4,   1],
        [125, 136, 

In [44]:
for batch in train_data_iterator:
    print(batch.tokens)

tensor([[  2,   2,   2,   2,   2,   2,   2,   2,   2,   2],
        [ 11, 167,  30,  42,  15, 142, 152,  56, 119, 132],
        [163,   9,  77,  23, 138,  87,  19, 179,  19,   5],
        [150,  58,  29,  27,   5, 114, 115, 109, 156,   4],
        [127,  83,   5,  80,   7, 160,  23,  35, 118, 120],
        [146,  15, 177,  10,   5,  37,  34, 154, 122,   6],
        [ 15,  28,  26,  44,  24, 117,  62,  47,  50,   4],
        [  7,  34, 159,  69,   6,  60,  10,  49,   4, 103],
        [145, 140,  16,   6,  55,   7,  25,   5,  11, 182],
        [ 17,  38,  14, 157,  93, 129,  91, 180, 153,  18],
        [ 32,  98,  10,  66,  29, 170,   4,  82,  11, 174],
        [ 84,  78,  57,   9, 151,   6, 168, 101,   4, 121],
        [  5, 184,  32, 131, 133,  81,  71, 137, 107,   5],
        [ 11,  86,  14,  10,  48, 105, 165,  41,   5,  12],
        [ 52,   4,  30,  97, 123,   9,   4,  20,   3, 104],
        [ 94, 116,  26, 134,   4,   4, 183,   3,   1, 111],
        [  8,  74, 125, 147, 175,  72, 1

In [123]:
#string to index
print(f"tokens.vocab.stoi['$$$'] = {tokens.vocab.stoi['$$$']}")
print(f"tokens.vocab.itos[0] = {tokens.vocab.itos[0]}")
print(f"edits.vocab.stoi['$keep'] = {edits.vocab.stoi['$keep']}")
print(f"edits.vocab.itos[4] = {edits.vocab.itos[4]}")
print()

#length of vocabular create from the data set
print(f"len(tokens.vocab) = {len(tokens.vocab)}")
print(f"len(edits.vocab) = {len(edits.vocab)}")
print()

#shape of vocabular create from the data set
print(f"tokens.vocab.vectors.shape = {tokens.vocab.vectors.shape}")
print(f"edits.vocab.vectors.shape = {edits.vocab.vectors.shape}")
print()

# no. of unique words in tokens and edits
print(f"len(tokens.vocab.freqs.keys()) = {len(tokens.vocab.freqs.keys())}")
print(f"len(edits.vocab.freqs.keys()) = {len(edits.vocab.freqs.keys())} \n")
print(f"edits.vocab.freqs = {edits.vocab.freqs}")

tokens.vocab.stoi['$$$'] = 0
tokens.vocab.itos[0] = <unk>
edits.vocab.stoi['$keep'] = 4
edits.vocab.itos[4] = $keep

len(tokens.vocab) = 188
len(edits.vocab) = 25

tokens.vocab.vectors.shape = torch.Size([188, 100])
edits.vocab.vectors.shape = torch.Size([25, 100])

len(tokens.vocab.freqs.keys()) = 184
len(edits.vocab.freqs.keys()) = 21 

edits.vocab.freqs = Counter({'$keep': 240, '$delete': 9, '$replace_.': 4, '$transform_agreement_singular': 2, '$replace_might': 1, '$replace_,': 1, '$transform_verb_vbz_vb': 1, '$append_.': 1, '$transform_case_lower': 1, '$append_had': 1, '$replace_the': 1, '$append_said': 1, '$replace_no': 1, '$transform_verb_vbn_vbg': 1, '$replace_by': 1, '$append_yesterday': 1, '$replace_are': 1, '$replace_so': 1, '$append_for': 1, '$replace_what': 1, '$append_every': 1})


### Model(rough)

In [None]:
class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(max_size_in_dataset, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.permute(0, 2, 1)
        
        #embedded = [batch size, emb dim, sent len]
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return cat 


In [None]:
class ConvNet(nn.Module):
    
    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv1d(1, 1, 1)
        self.pool = nn.MaxPool1d(2, 2)
        self.conv2 = nn.Conv1d(1, 1, 3)
        self.conv2 = nn.Conv1d(1, 1, 5)
        self.conv2 = nn.Conv1d(1, 1, 7)

    def forward(self, x):
        # -> n, 3, 32, 32
        x = self.pool(F.relu(self.conv1(x)))  # -> n, 6, 14, 14
        x = self.pool(F.relu(self.conv2(x)))  # -> n, 16, 5, 5
        x = x.view(-1, 16 * 5 * 5)            # -> n, 400
        return x


In [None]:
model = ConvNet().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

n_total_steps = len(train_loader)

In [None]:
for epoch in range(num_epochs):
    for i, (tokens_list, edits_list) in enumerate(train_data_iterator):
        # origin shape: [4, 3, 32, 32] = 4, 3, 1024
        tokens_list = tokens_list.to(device)
        edits_list = edits_list.to(device)

        # Forward pass
        outputs = model(tokens_list)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 2000 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

print('Finished Training')
PATH = './cnn.pth'
torch.save(model.state_dict(), PATH)


In [None]:
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    n_class_correct = [0 for i in range(10)]
    n_class_samples = [0 for i in range(10)]
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # max returns (value ,index)
        _, predicted = torch.max(outputs, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()
        
        for i in range(batch_size):
            label = labels[i]
            pred = predicted[i]
            if (label == pred):
                n_class_correct[label] += 1
            n_class_samples[label] += 1

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network: {acc} %')

    for i in range(10):
        acc = 100.0 * n_class_correct[i] / n_class_samples[i]
        print(f'Accuracy of {classes[i]}: {acc} %')
Footer

### Rough work

In [None]:
l=['plus', ',', 'the', 'novelty', 'of', 'the', 'iphone', 'won', "'t", 'wear', 'off', ',', 'as', 'it', 'may', 'with', 'a', 'camcorder', ';', 'and', 'over', 'these', 'video', 'apps', 'have', 'fun', 'effects', 'that', 'a', 'camcorder', 'can', "'t", 'match', '.']
l1=['$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$replace_might', '$keep', '$keep', '$keep', '$replace_,', '$keep', '$delete', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep', '$keep']
print(len(l))
print(len(l1))

In [None]:
n_embeddings, dim = 10,4 #10 - # of vocac size 4 - # of emdebbing dimension

In [126]:
m = nn.Conv1d(1, 2, 3)
input = torch.randn(2, 1, 5) # (batch size, no. of channel, # of words)
# x = emb_1(input)
output = m(input)

In [112]:
input.shape

torch.Size([2, 1, 5])

In [87]:
output.shape

torch.Size([1, 2, 3])

In [55]:
from torch.nn import Embedding as emb

emb_1 = emb(188, 50)
print(f"embedding {emb_1}")
print(f"embedding weight's shape {emb_1.weight.shape}") #requires_grad=True therefore the matrix is learnable

print(f"values of weight {emb_1.weight}")

for batch in train_data_iterator:
    x = emb_1(batch.tokens)

embedding Embedding(188, 50)
embedding weight's shape torch.Size([188, 50])
values of weight Parameter containing:
tensor([[-2.2020, -1.5676,  1.0959,  ..., -1.5418,  1.0299, -0.1639],
        [ 0.4356,  0.5537, -0.7752,  ..., -0.0168,  0.5911,  0.4768],
        [-0.9705, -0.3136, -0.4529,  ..., -0.1166,  0.7507, -1.0787],
        ...,
        [-0.4359, -0.2323,  0.2142,  ...,  0.9488, -1.2057, -1.7878],
        [ 1.1814, -1.2606,  0.5614,  ..., -1.7198,  0.3160, -0.1267],
        [-0.1210, -0.7379,  0.1107,  ..., -0.8445,  0.0451,  1.6728]],
       requires_grad=True)


In [103]:
x.shape

torch.Size([43, 10, 50])