#Preparing Data


In [0]:
import torch
from torchtext import data

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [2]:
from torchtext import datasets

train_data, test_data = datasets.IMDB.splits(TEXT,LABEL)

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 26.9MB/s]


In [3]:
print(vars(train_data.examples[1])['label'])

pos


In [0]:
train_data, val_data = train_data.split()

In [5]:
len(val_data)

7500

In [0]:
TEXT.build_vocab(train_data, max_size=25000)
LABEL.build_vocab(train_data)

In [7]:
len(TEXT.vocab)

25002

In [8]:
TEXT.vocab.freqs.most_common(20)

[('the', 201739),
 (',', 191449),
 ('.', 165091),
 ('a', 109326),
 ('and', 109148),
 ('of', 100346),
 ('to', 93387),
 ('is', 76107),
 ('in', 61269),
 ('I', 54104),
 ('it', 53593),
 ('that', 49037),
 ('"', 44341),
 ("'s", 43197),
 ('this', 42192),
 ('-', 36824),
 ('/><br', 35412),
 ('was', 34469),
 ('as', 30288),
 ('with', 29902)]

In [9]:
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']

In [10]:
LABEL.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index>,
            {'neg': 1, 'pos': 0})

In [11]:
torch.cuda.is_available()

True

In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, test_iter, val_iter = data.BucketIterator.splits(
  (train_data,test_data, val_data),
  batch_size=BATCH_SIZE,
  device=device
)

#Building the Model

In [0]:
import torch.nn as nn

class RNN(nn.Module):
  def __init__(self, input_dim, output_dim, hidden_dim, embeddings_dim):
    super().__init__()
    
    self.embeddings = nn.Embedding(input_dim, embeddings_dim)
    self.rnn = nn.RNN(embeddings_dim, hidden_dim)
    self.fc = nn.Linear(hidden_dim, output_dim)
    
  def forward(self, x):
    
    #x = [sent len, batch size]
    
    embedded = self.embeddings(x)
    
    #embedded = [sent len, batch size, embedding dim]
    
    out, hidden = self.rnn(embedded)
    
    #output = [sent len, batch size, hid dim]
    #hidden = [1, batch size, hid dim]
    
    assert torch.equal(out[-1,:,:], hidden.squeeze(0))
    #print(hidden.squeeze(0).shape)
    
    return self.fc(hidden.squeeze(0))
        
 

In [36]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, OUTPUT_DIM, HIDDEN_DIM, EMBEDDING_DIM)
print(model)

RNN(
  (embeddings): Embedding(25002, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


#Training the Model

In [0]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [0]:
criterion = nn.BCEWithLogitsLoss()

In [0]:
model = model.to(device)
criterion = criterion.to(device)

In [0]:
def binary_accuracy(pred, y):
  rounded_pred = torch.round(torch.sigmoid(pred))
  correct = (rounded_pred == y).float()
  acc = correct.sum()/len(correct)
  return acc

In [0]:
def train(model, optimizer, iterator, criterion):
  
  epoch_loss = 0.0
  epoch_accu = 0.0
  
  model.train()
  
  for batch in iterator:
    
    optimizer.zero_grad()
    predictions = model(batch.text).squeeze(1)
    #print(predictions)
    loss = criterion(predictions, batch.label)
    acc = binary_accuracy(predictions, batch.label)
    loss.backward()
    optimizer.step()
    
    epoch_loss += loss.item()
    epoch_accu += acc.item()
  
  return ((epoch_loss/len(iterator)), (epoch_accu/len(iterator)))

In [0]:
def evaluate(model, optimizer, iterator, criterion):
  
  epoch_loss = 0.0
  epoch_acc = 0.0
  
  model.eval()
  
  for batch in iterator:
    
    predictions = model(batch.text).squeeze(1)
    loss = criterion(predictions, batch.label)
    acc = binary_accuracy(predictions, batch.label)
    
    epoch_loss += loss.item()
    epoch_acc += acc.item()
    
    return (epoch_loss/len(iterator)) , (epoch_acc/len(iterator))

In [39]:
epochs = 5

for epoch in range(epochs):
  train_loss, train_acc = train(model, optimizer, train_iter, criterion)
  val_loss, val_acc = evaluate(model, optimizer, val_iter, criterion)
  
  print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {val_loss:.3f} | Val. Acc: {val_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.697 | Train Acc: 50.25% | Val. Loss: 0.006 | Val. Acc: 0.54% |
| Epoch: 02 | Train Loss: 0.696 | Train Acc: 50.31% | Val. Loss: 0.006 | Val. Acc: 0.54% |
| Epoch: 03 | Train Loss: 0.697 | Train Acc: 50.24% | Val. Loss: 0.006 | Val. Acc: 0.54% |
| Epoch: 04 | Train Loss: 0.697 | Train Acc: 50.10% | Val. Loss: 0.006 | Val. Acc: 0.54% |
| Epoch: 05 | Train Loss: 0.697 | Train Acc: 50.22% | Val. Loss: 0.006 | Val. Acc: 0.54% |


In [45]:

test_loss, test_acc = evaluate(model, optimizer, test_iter, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')


| Test Loss: 0.002 | Test Acc: 0.14% |
