In [1]:
import torch
from torchtext import data

SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)


In [3]:
from torchtext import datasets
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)


In [4]:
import random
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [10]:
print(f'Nnumber of training examples:{len(train_data)}')
print(f'Nnumber of validation examples:{len(valid_data)}')
print(f'Nnumber of testing examples:{len(test_data)}')

Nnumber of training examples:17500
Nnumber of validation examples:7500
Nnumber of testing examples:25000


In [6]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)

In [7]:
batch = next(iter(valid_iterator))
batch.text.shape

torch.Size([50, 64])

In [20]:
len(list((iter(valid_iterator))))

118

In [21]:
len(list((iter(train_iterator))))

274

In [86]:
list(iter(valid_iterator))[0].text

tensor([[ 1378,   149,   485,  ...,    82,  3612,  7092],
        [ 4326,    62,   297,  ...,   922,     3, 19403],
        [    9,  1625,  1595,  ...,    17,   380,     9],
        ...,
        [  348,  8557,   767,  ...,     1,     1,     1],
        [  672,  2337,     4,  ...,     1,     1,     1],
        [    4,     4,     1,  ...,     1,     1,     1]], device='cuda:0')

In [44]:
[TEXT.vocab.itos[i] for i in batch.text[:,0]]

['Indian',
 'Summer',
 'is',
 'a',
 'good',
 'film',
 '.',
 'It',
 'made',
 'me',
 'feel',
 'good',
 'and',
 'I',
 'thought',
 'the',
 'cast',
 'was',
 'exceptional',
 '.',
 'How',
 'about',
 'Sam',
 'Raimi',
 'playing',
 'the',
 'camp',
 'buffoon',
 '.',
 'I',
 'thought',
 'his',
 'scenes',
 'were',
 'very',
 'funny',
 'in',
 'a',
 'Buster',
 'Keaton',
 '-',
 'like',
 'performance',
 '.',
 'Solid',
 'directing',
 'and',
 'nice',
 'cinematography',
 '.']

In [5]:
TEXT.build_vocab(train_data,max_size=25000, vectors='glove.6B.100d',unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [25]:
print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [89]:
TEXT.vocab.vectors.shape


torch.Size([25002, 100])

In [31]:
[TEXT.vocab.stoi]

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x000001BE9EB34828>>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             ',': 3,
             '.': 4,
             'and': 5,
             'a': 6,
             'of': 7,
             'to': 8,
             'is': 9,
             'in': 10,
             'I': 11,
             'it': 12,
             'that': 13,
             '"': 14,
             "'s": 15,
             'this': 16,
             '-': 17,
             '/><br': 18,
             'was': 19,
             'as': 20,
             'with': 21,
             'movie': 22,
             'for': 23,
             'film': 24,
             'The': 25,
             'but': 26,
             '(': 27,
             'on': 28,
             ')': 29,
             "n't": 30,
             'you': 31,
             'are': 32,
             'not': 33,
             'have': 34,
             'his': 35,
             'be': 36,
             'he': 37,
  

In [9]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

class WordAVGModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, pad_idx):
        super(WordAVGModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, pad_idx)
        self.linear = nn.Linear(embedding_size, output_size)
    
    def forward(self, text):
        embedded = self.embed(text) # [seq_len, batch_size, embedding_size]
        embedded = embedded.permute(1,0,2)
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze() # [batch_size,embedding_size]
        return self.linear(pooled)

In [10]:
VOCAB_SIZE = len(TEXT.vocab)
EMBEDDING_SIZE = 100
OUTPUT_SIZE = 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = WordAVGModel(vocab_size=VOCAB_SIZE,
                     embedding_size=EMBEDDING_SIZE,
                     output_size=OUTPUT_SIZE,
                     pad_idx=PAD_IDX)

In [66]:
model

WordAVGModel(
  (embed): Embedding(25002, 100, padding_idx=1)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)

In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

2500301

In [12]:
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)

In [73]:
TEXT.vocab.vectors.shape

torch.Size([25002, 100])

In [13]:
optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss()

model = model.to(device)
crit = crit.to(device)

In [14]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [22]:
def train(model, iterator, optimizer, crit):
    epoch_loss, epoch_acc = 0., 0.
    model.train()
    total_len = 0.
    for batch in iterator:
        preds = model(batch.text).squeeze()
        loss = crit(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)
        
        # adam
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item() * len(batch.label)
        epoch_acc += acc.item() * len(batch.label)
        total_len  += len(batch.label)
    
    return epoch_loss / total_len, epoch_acc / total_len


In [23]:
def evaluate(model, iterator, crit):
    epoch_loss, epoch_acc = 0., 0.
    model.eval()
    total_len = 0.
    for batch in iterator:
        preds = model(batch.text).squeeze()
        loss = crit(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)
        
        
        epoch_loss += loss.item() * len(batch.label)
        epoch_acc += acc.item() * len(batch.label)
        total_len  += len(batch.label)

    
    return epoch_loss / total_len, epoch_acc / total_len


In [24]:
N_EPOCHS = 10
best_valid_acc = 0.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, crit)
    valid_loss, valid_acc = evaluate(model, valid_iterator, crit)
    
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), "wordavg-model.pth")
        
    print("Epoch",epoch,"Train Loss", train_loss, "Train Acc", train_acc)
    print("Epoch",epoch,"Valid Loss", valid_loss, "Valid Acc", valid_acc)

    

Epoch 0 Train Loss 0.2565487729651587 Train Acc 0.9196571429661342
Epoch 0 Valid Loss 0.48292096616427105 Valid Acc 0.8869333333969116
Epoch 1 Train Loss 0.23975270494052342 Train Acc 0.9250285714285714
Epoch 1 Valid Loss 0.4948615165869395 Valid Acc 0.8912000000635782
Epoch 2 Train Loss 0.22513470922538212 Train Acc 0.9288000000272478
Epoch 2 Valid Loss 0.5098448180198669 Valid Acc 0.8937333333969116
Epoch 3 Train Loss 0.2114784892678261 Train Acc 0.9341142857142857
Epoch 3 Valid Loss 0.5286603075663249 Valid Acc 0.8950666667302449
Epoch 4 Train Loss 0.19965306098461152 Train Acc 0.9379428572246007
Epoch 4 Valid Loss 0.5400861362457275 Valid Acc 0.897466666730245
Epoch 5 Train Loss 0.18931599967139107 Train Acc 0.9412000000272478
Epoch 5 Valid Loss 0.5550124594052632 Valid Acc 0.8984000000635783
Epoch 6 Train Loss 0.17731801265307834 Train Acc 0.9465142857415335
Epoch 6 Valid Loss 0.5714825155893961 Valid Acc 0.899466666730245
Epoch 7 Train Loss 0.16988706321035113 Train Acc 0.9500000