In [1]:
# from torch.utils.data import DataLoader

import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from torchtext.data import get_tokenizer

import torch
from torch.utils.data import DataLoader
from torch.nn import ReLU, Sigmoid, Sequential, Linear, BCELoss, Embedding, Module, Conv1d, MaxPool1d, Conv2d
from torch.nn import functional as F
from torch.optim import Adam

import nltk
from nltk.corpus import stopwords

import re
from string import punctuation
import time
from tqdm import tqdm

In [2]:
train_dataset, eval_dataset = torchtext.datasets.IMDB()

In [3]:
train_dataset, eval_dataset = to_map_style_dataset(train_dataset), to_map_style_dataset(eval_dataset)

In [4]:
for label,text in train_dataset[:10]:
  print(label, text)

1 I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [5]:
tokenizer = get_tokenizer('basic_english')
tokenizer(train_dataset[0][1])[:5]

['i', 'rented', 'i', 'am', 'curious-yellow']

In [6]:
def build_vocabulary(datasets):
  for dataset in datasets:
    for _, text in dataset:
      yield tokenizer(text)

In [7]:
min_freq = 3

vocabulary = build_vocab_from_iterator(
  build_vocabulary([train_dataset, eval_dataset]), 
  min_freq=min_freq, 
  specials=["<UNK>"]  
)

vocabulary.set_default_index(vocabulary["<UNK>"]) 

In [8]:
torch.save(vocabulary, 'data/vocab-without-filter-cnn.pth')
print(len(vocabulary))

55574


In [9]:
def encode_seq_ints(vocab, text, max_tokens):
  seq_ids = vocab(tokenizer(text))
  padded_seq = seq_ids[:max_tokens]
  if len(seq_ids) < max_tokens:
    padded_seq = seq_ids + ([0] * (max_tokens - len(seq_ids)))
  
  return torch.tensor(padded_seq, dtype=torch.int32)


In [10]:
encode_seq_ints(vocabulary, train_dataset[0][1], 128)

tensor([   12,  1529,    12,   240, 49183,    42,    62,   398,  1076,    92,
            6,    37,     1,  6428,    14,  3572,    10,    58,    10,    16,
           97,   612,    11,  7492,     2,    12,    90,   548,    14,    36,
           97,    10,    16, 23190,    39,  1169,     2,    15,     2,  9087,
           50,    10,   129,   748,     7,  2284,    13,   661,     3,  1503,
          117,     5,   329,     6,   113,  1200,  2874,    12,    70,    74,
            7,    71,    13,    19,   529,     2,     1,   122,     9,  5796,
          191,     5,   194,  3664,   476,  1389,   765,  5730,    41,   491,
            7,   833,   269,    61,    57,    49,   128,     2,    11,   845,
           61,   491,     7,  1140,    48, 14510,     7,   260,    55,   435,
            6,   651,    27,    54,     1,   860, 26165,   202,    49,   743,
         1001,  1232,   147,    17,     1,  2455,   319,     4,  1387,  1232,
           11,     1,  2231,  1598,     2,    11,   207,  2145],

In [11]:
# max_tokens = max(len(tokenizer(data[1])) for data in train_dataset)
max_tokens = 256
print(max_tokens)

256


In [12]:
def encode_batch_text(batch):
  labels, texts = list(zip(*batch))
  encoded_texts = [encode_seq_ints(vocabulary, text, max_tokens) for text in texts]
  return torch.stack(encoded_texts), torch.tensor(labels, dtype=torch.float32) - 1

In [13]:
embedding = Embedding(num_embeddings=len(vocabulary), embedding_dim=128, padding_idx=0)

In [14]:
r = embedding(encode_seq_ints(vocabulary, train_dataset[0][1], max_tokens))
r

tensor([[-0.9070, -0.3619, -1.2225,  ..., -0.4984, -0.6576, -1.4168],
        [-0.4708,  1.0404, -0.7743,  ...,  0.6995, -0.2390,  0.9164],
        [-0.9070, -0.3619, -1.2225,  ..., -0.4984, -0.6576, -1.4168],
        ...,
        [-0.1243,  1.4179, -0.4990,  ..., -1.1983, -0.9667, -2.8585],
        [-0.7959,  0.0108, -0.5008,  ..., -0.5647, -0.0343,  0.2235],
        [-0.9038, -1.1262, -2.1869,  ..., -1.0351,  1.7237,  1.2920]],
       grad_fn=<EmbeddingBackward0>)

In [15]:
train_loader = DataLoader(train_dataset, batch_size=512, collate_fn=encode_batch_text, shuffle=True, pin_memory=True)
eval_loader = DataLoader(eval_dataset, batch_size=512, collate_fn=encode_batch_text, pin_memory=True)

In [16]:
for encoded_text, encoded_labels in train_loader:
 print(encoded_text)
 print(encoded_labels) 

tensor([[   12,    70,    44,  ...,     0,     0,     0],
        [14103,    11,     5,  ...,     0,     0,     0],
        [    1,   103,     1,  ...,     0,     0,     0],
        ...,
        [   10,     9,    43,  ...,     0,     0,     0],
        [   44,   114,   135,  ...,     2,   827,    35],
        [    1,  1228,     6,  ...,     0,     0,     0]], dtype=torch.int32)
tensor([1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1., 0.,
        0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0.,
        0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0.,
        0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0.,
        0., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
        0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1.,
        1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0., 1.,

In [17]:
class CNNClassifier(Module):
  def __init__(self, num_embeddings, embedding_dim, max_tokens, padding_idx):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.max_tokens = max_tokens
    self.embedding = Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=padding_idx)
    self.conv1 = Conv1d(in_channels=embedding_dim, out_channels=16, kernel_size=4, padding="same")
    self.linear = Linear(in_features=16, out_features=1)
  
  def forward(self, x):
    x = self.embedding(x)
    # Embedding length needs to be treated as channel dimension
    # x = x.reshape(len(x), self.embedding_dim, self.max_tokens)
    x = x.permute(0, 2, 1)
    x = F.relu(self.conv1(x))
    x, _ = x.max(dim=-1)
    x = F.dropout(x, p=0.4)
    x = F.sigmoid(self.linear(x))
    return x
  
  def display(self):
    for p in self.parameters():
      print(p)

In [18]:
model = CNNClassifier(num_embeddings=len(vocabulary), embedding_dim=128, max_tokens=max_tokens, padding_idx=0)
print(model)

CNNClassifier(
  (embedding): Embedding(55574, 128, padding_idx=0)
  (conv1): Conv1d(128, 16, kernel_size=(4,), stride=(1,), padding=same)
  (linear): Linear(in_features=16, out_features=1, bias=True)
)


In [19]:
encoded_texts, encoded_labels = encode_batch_text(train_dataset[:5])

In [20]:
model(encoded_texts)

  return F.conv1d(input, weight, bias, self.stride,


tensor([[0.6433],
        [0.5014],
        [0.0975],
        [0.3241],
        [0.5048]], grad_fn=<SigmoidBackward0>)

In [21]:
bows, labels = encode_batch_text(train_dataset[:4])
print(model(bows))

for p in model.parameters():
  print(p)

tensor([[0.4340],
        [0.6804],
        [0.2695],
        [0.2927]], grad_fn=<SigmoidBackward0>)
Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.7410, -0.0905, -2.7845,  ...,  0.4097,  0.3863, -0.4990],
        [-0.9704,  1.6961, -0.6968,  ..., -0.5241, -0.0653,  1.4935],
        ...,
        [ 0.2787,  0.1321, -0.9039,  ..., -0.8992, -0.2261, -1.8474],
        [-0.4713,  1.3671,  0.2529,  ..., -1.8085, -1.5757,  0.2893],
        [-0.6175,  0.2341,  0.4702,  ..., -1.4467,  2.1893, -1.5718]],
       requires_grad=True)
Parameter containing:
tensor([[[ 0.0037,  0.0157,  0.0170, -0.0014],
         [ 0.0190, -0.0442,  0.0383,  0.0388],
         [-0.0195, -0.0232,  0.0250,  0.0300],
         ...,
         [-0.0050,  0.0361, -0.0029,  0.0381],
         [ 0.0361,  0.0319, -0.0301, -0.0248],
         [ 0.0328,  0.0418,  0.0137, -0.0370]],

        [[-0.0123,  0.0111, -0.0066,  0.0098],
         [ 0.0434,  0.0373,  0.0205, -0.0003],
  

In [22]:
learning_rate = 1e-3
optimizer = Adam(model.parameters(), lr=learning_rate)
loss_fn = BCELoss() 

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
loss_fn = loss_fn.to(device) 

In [24]:
# def train(model, iterator, optimizer, criterion):
#   epoch_loss = 0
#   epoch_acc = 0

#   model.train()

#   for bows, labels in iterator:
#     bows = bows.to(device)
#     labels = labels.to(device)

#     optimizer.zero_grad()

#     predictions = model(bows).squeeze(1)

#     loss = criterion(predictions, labels)

#     rounded_preds = torch.round(predictions)
#     correct = (rounded_preds == labels).float()
#     acc = correct.sum() / len(correct)

#     loss.backward()

#     optimizer.step()

#     epoch_loss += loss.item()
#     epoch_acc += acc.item()

#   return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [25]:
def binary_accuracy(preds, y):
  rounded_preds = torch.round(preds)
  correct = (rounded_preds == y).float()
  acc = correct.sum() / len(correct)
  return acc


def train(model, data_loader, optimizer, loss_fn):
  epoch_loss = 0
  epoch_acc = 0
  model.train()
  for text_batch, label_batch in tqdm(data_loader):
    optimizer.zero_grad()
    predictions = model(text_batch).squeeze(1)
    loss = loss_fn(predictions, label_batch)
    acc = binary_accuracy(predictions, label_batch)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    epoch_acc += acc.item()
  return epoch_loss / len(data_loader), epoch_acc / len(data_loader)


def validate(model, data_loader, loss_fn):
  epoch_loss = 0
  epoch_acc = 0
  model.eval()
  with torch.no_grad():
    for text_batch, label_batch in tqdm(data_loader):
      predictions = model(text_batch).squeeze(1)
      loss = loss_fn(predictions, label_batch)
      acc = binary_accuracy(predictions, label_batch)
      epoch_loss += loss.item()
      epoch_acc += acc.item()
  return epoch_loss / len(data_loader), epoch_acc / len(data_loader)

In [26]:
# def validate(model, iterator, criterion):
#   epoch_loss = 0
#   epoch_acc = 0

#   model.eval()

#   with torch.no_grad():
#     for bows, labels in iterator:
#       bows = bows.to(device)
#       labels = labels.to(device)

#       predictions = model(bows).squeeze(1)

#       loss = criterion(predictions, labels)

#       rounded_preds = torch.round(predictions)
#       correct = (rounded_preds == labels).float()
#       acc = correct.sum() / len(correct)

#       epoch_loss += loss.item()
#       epoch_acc += acc.item()

#   return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [27]:
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

  return elapsed_mins, elapsed_secs

In [28]:
N_EPOCHS = 50
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
  start_time = time.time()
  train_loss, train_acc = train(model, train_loader, optimizer, loss_fn)
  valid_loss, valid_acc = validate(model, eval_loader, loss_fn)
  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'data/sentiment-model-1.pt')
  
  print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}%')

100%|██████████| 49/49 [00:21<00:00,  2.25it/s]
100%|██████████| 49/49 [00:13<00:00,  3.55it/s]


Epoch: 01 | Epoch Time: 0m 35s
	Train Loss: 0.713 | Train Acc: 53.43%
	 Val. Loss: 0.677 | Val. Acc: 57.21%


100%|██████████| 49/49 [00:21<00:00,  2.27it/s]
100%|██████████| 49/49 [00:13<00:00,  3.56it/s]


Epoch: 02 | Epoch Time: 0m 35s
	Train Loss: 0.645 | Train Acc: 61.82%
	 Val. Loss: 0.626 | Val. Acc: 63.85%


100%|██████████| 49/49 [00:21<00:00,  2.23it/s]
100%|██████████| 49/49 [00:14<00:00,  3.36it/s]


Epoch: 03 | Epoch Time: 0m 36s
	Train Loss: 0.609 | Train Acc: 66.10%
	 Val. Loss: 0.601 | Val. Acc: 66.44%


100%|██████████| 49/49 [00:22<00:00,  2.19it/s]
100%|██████████| 49/49 [00:14<00:00,  3.37it/s]


Epoch: 04 | Epoch Time: 0m 36s
	Train Loss: 0.584 | Train Acc: 68.79%
	 Val. Loss: 0.588 | Val. Acc: 67.81%


100%|██████████| 49/49 [00:22<00:00,  2.18it/s]
100%|██████████| 49/49 [00:14<00:00,  3.30it/s]


Epoch: 05 | Epoch Time: 0m 37s
	Train Loss: 0.565 | Train Acc: 70.23%
	 Val. Loss: 0.569 | Val. Acc: 70.04%


100%|██████████| 49/49 [00:22<00:00,  2.15it/s]
100%|██████████| 49/49 [00:14<00:00,  3.27it/s]


Epoch: 06 | Epoch Time: 0m 37s
	Train Loss: 0.546 | Train Acc: 71.70%
	 Val. Loss: 0.557 | Val. Acc: 70.62%


100%|██████████| 49/49 [00:22<00:00,  2.14it/s]
100%|██████████| 49/49 [00:14<00:00,  3.43it/s]


Epoch: 07 | Epoch Time: 0m 37s
	Train Loss: 0.522 | Train Acc: 73.85%
	 Val. Loss: 0.540 | Val. Acc: 72.21%


100%|██████████| 49/49 [00:20<00:00,  2.34it/s]
100%|██████████| 49/49 [00:12<00:00,  4.05it/s]


Epoch: 08 | Epoch Time: 0m 33s
	Train Loss: 0.497 | Train Acc: 76.05%
	 Val. Loss: 0.519 | Val. Acc: 73.99%


100%|██████████| 49/49 [00:19<00:00,  2.47it/s]
100%|██████████| 49/49 [00:13<00:00,  3.76it/s]


Epoch: 09 | Epoch Time: 0m 32s
	Train Loss: 0.472 | Train Acc: 77.64%
	 Val. Loss: 0.508 | Val. Acc: 74.64%


100%|██████████| 49/49 [00:20<00:00,  2.38it/s]
100%|██████████| 49/49 [00:12<00:00,  3.87it/s]


Epoch: 10 | Epoch Time: 0m 33s
	Train Loss: 0.448 | Train Acc: 78.94%
	 Val. Loss: 0.496 | Val. Acc: 75.12%


100%|██████████| 49/49 [00:21<00:00,  2.27it/s]
100%|██████████| 49/49 [00:16<00:00,  3.04it/s]


Epoch: 11 | Epoch Time: 0m 37s
	Train Loss: 0.427 | Train Acc: 80.82%
	 Val. Loss: 0.487 | Val. Acc: 76.19%


100%|██████████| 49/49 [00:23<00:00,  2.12it/s]
100%|██████████| 49/49 [00:14<00:00,  3.38it/s]


Epoch: 12 | Epoch Time: 0m 37s
	Train Loss: 0.404 | Train Acc: 82.19%
	 Val. Loss: 0.478 | Val. Acc: 76.84%


100%|██████████| 49/49 [00:22<00:00,  2.22it/s]
100%|██████████| 49/49 [00:13<00:00,  3.61it/s]


Epoch: 13 | Epoch Time: 0m 35s
	Train Loss: 0.378 | Train Acc: 83.94%
	 Val. Loss: 0.478 | Val. Acc: 77.09%


100%|██████████| 49/49 [00:21<00:00,  2.29it/s]
100%|██████████| 49/49 [00:13<00:00,  3.62it/s]


Epoch: 14 | Epoch Time: 0m 34s
	Train Loss: 0.355 | Train Acc: 85.42%
	 Val. Loss: 0.477 | Val. Acc: 77.45%


100%|██████████| 49/49 [00:22<00:00,  2.21it/s]
100%|██████████| 49/49 [00:13<00:00,  3.75it/s]


Epoch: 15 | Epoch Time: 0m 35s
	Train Loss: 0.331 | Train Acc: 86.70%
	 Val. Loss: 0.477 | Val. Acc: 77.78%


100%|██████████| 49/49 [00:21<00:00,  2.25it/s]
100%|██████████| 49/49 [00:13<00:00,  3.55it/s]


Epoch: 16 | Epoch Time: 0m 35s
	Train Loss: 0.306 | Train Acc: 87.86%
	 Val. Loss: 0.484 | Val. Acc: 77.77%


100%|██████████| 49/49 [00:22<00:00,  2.16it/s]
100%|██████████| 49/49 [00:13<00:00,  3.58it/s]


Epoch: 17 | Epoch Time: 0m 36s
	Train Loss: 0.289 | Train Acc: 88.74%
	 Val. Loss: 0.489 | Val. Acc: 77.87%


100%|██████████| 49/49 [00:21<00:00,  2.26it/s]
100%|██████████| 49/49 [00:13<00:00,  3.58it/s]


Epoch: 18 | Epoch Time: 0m 35s
	Train Loss: 0.269 | Train Acc: 89.87%
	 Val. Loss: 0.498 | Val. Acc: 77.90%


100%|██████████| 49/49 [00:22<00:00,  2.16it/s]
100%|██████████| 49/49 [00:13<00:00,  3.61it/s]


Epoch: 19 | Epoch Time: 0m 36s
	Train Loss: 0.249 | Train Acc: 90.50%
	 Val. Loss: 0.501 | Val. Acc: 77.87%


100%|██████████| 49/49 [00:21<00:00,  2.32it/s]
100%|██████████| 49/49 [00:12<00:00,  3.81it/s]


Epoch: 20 | Epoch Time: 0m 33s
	Train Loss: 0.233 | Train Acc: 91.27%
	 Val. Loss: 0.512 | Val. Acc: 77.84%


100%|██████████| 49/49 [00:20<00:00,  2.36it/s]
100%|██████████| 49/49 [00:12<00:00,  3.79it/s]


Epoch: 21 | Epoch Time: 0m 33s
	Train Loss: 0.215 | Train Acc: 92.17%
	 Val. Loss: 0.525 | Val. Acc: 78.00%


100%|██████████| 49/49 [00:21<00:00,  2.32it/s]
 86%|████████▌ | 42/49 [00:12<00:02,  3.45it/s]


KeyboardInterrupt: 

In [48]:
def predict_sentiment(text, vocab, model):
  model.eval()
  with torch.no_grad():
    bow = encode_seq_ints(vocab, text, max_tokens).unsqueeze(0)
    prediction = model(bow)
    if prediction.item() > 0.5:
      return prediction.item(), "positive"
  return (1 - prediction.item()), "negative"

In [53]:
predict_sentiment("this film is cool", vocabulary, model) 

(0.635297417640686, 'positive')

In [30]:
import numpy as np
def load_embeddings_index(path):
  embeddings_index = {}
  embeddings_dim = None
  with open(path, 'r') as f:
    for line in f:
      values = line.split()
      word = values[0]
      coefs = torch.tensor(np.asarray(values[1:], dtype='float32'))
      embeddings_index[word] = coefs
      if not embeddings_dim:
        embeddings_dim = coefs.shape[0]
  return embeddings_index, embeddings_dim

In [31]:
embeddings, embeddings_dim = load_embeddings_index('data/tp02/glove.6B.50d.txt')

In [33]:
def create_embedding_matrix(vocab, embeddings, embedding_dim):
  embedding_matrix = torch.zeros((len(vocab), embedding_dim))
  for i, word in enumerate(vocab.get_itos()):
    if word in embeddings:
      embedding_matrix[i] = embeddings[word]
  return embedding_matrix

In [56]:
embeddings_matrix = create_embedding_matrix(vocabulary, embeddings, embeddings_dim)
embeddings_matrix.shape

torch.Size([55574, 50])

In [61]:
class CNNClassifierWithPretrainedEmbedding(Module):
  def __init__(self, num_embeddings, embeddings, embedding_dim, max_tokens, padding_idx):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.max_tokens = max_tokens
    self.embedding = Embedding.from_pretrained(embeddings_matrix, freeze=True) if embeddings is not None else Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=padding_idx)
    self.conv1 = Conv1d(in_channels=embedding_dim, out_channels=16, kernel_size=4, padding="same")
    self.linear = Linear(in_features=16, out_features=1)
  
  def forward(self, x):
    x = self.embedding(x)
    # Embedding length needs to be treated as channel dimension
    x = x.reshape(len(x), self.embedding_dim, self.max_tokens)
    # x = x.permute(0, 2, 1)
    x = F.relu(self.conv1(x))
    x, _ = x.max(dim=-1)
    x = F.dropout(x, p=0.4)
    x = F.sigmoid(self.linear(x))
    return x
  
  def display(self):
    for p in self.parameters():
      print(p)

In [62]:
model2 = CNNClassifierWithPretrainedEmbedding(num_embeddings=len(vocabulary), embeddings=embeddings_matrix, embedding_dim=embeddings_dim, max_tokens=max_tokens, padding_idx=0)

In [63]:
N_EPOCHS = 20
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
  start_time = time.time()
  train_loss, train_acc = train(model2, train_loader, optimizer, loss_fn)
  valid_loss, valid_acc = validate(model2, eval_loader, loss_fn)
  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model2.state_dict(), 'data/sentiment-model-2.pt')
  
  print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}%')

100%|██████████| 49/49 [00:10<00:00,  4.47it/s]
100%|██████████| 49/49 [00:08<00:00,  5.67it/s]


Epoch: 01 | Epoch Time: 0m 19s
	Train Loss: 0.748 | Train Acc: 50.29%
	 Val. Loss: 0.751 | Val. Acc: 49.73%


100%|██████████| 49/49 [00:10<00:00,  4.57it/s]
100%|██████████| 49/49 [00:08<00:00,  5.75it/s]


Epoch: 02 | Epoch Time: 0m 19s
	Train Loss: 0.751 | Train Acc: 49.91%
	 Val. Loss: 0.749 | Val. Acc: 50.44%


100%|██████████| 49/49 [00:11<00:00,  4.33it/s]
100%|██████████| 49/49 [00:08<00:00,  5.56it/s]


Epoch: 03 | Epoch Time: 0m 20s
	Train Loss: 0.751 | Train Acc: 50.35%
	 Val. Loss: 0.751 | Val. Acc: 49.72%


100%|██████████| 49/49 [00:11<00:00,  4.35it/s]
100%|██████████| 49/49 [00:08<00:00,  5.78it/s]


Epoch: 04 | Epoch Time: 0m 19s
	Train Loss: 0.750 | Train Acc: 50.26%
	 Val. Loss: 0.749 | Val. Acc: 49.97%


100%|██████████| 49/49 [00:11<00:00,  4.43it/s]
100%|██████████| 49/49 [00:08<00:00,  5.75it/s]


Epoch: 05 | Epoch Time: 0m 19s
	Train Loss: 0.748 | Train Acc: 50.46%
	 Val. Loss: 0.748 | Val. Acc: 50.33%


100%|██████████| 49/49 [00:11<00:00,  4.37it/s]
100%|██████████| 49/49 [00:08<00:00,  5.71it/s]


Epoch: 06 | Epoch Time: 0m 19s
	Train Loss: 0.748 | Train Acc: 50.17%
	 Val. Loss: 0.749 | Val. Acc: 49.94%


100%|██████████| 49/49 [00:10<00:00,  4.55it/s]
100%|██████████| 49/49 [00:08<00:00,  5.58it/s]


Epoch: 07 | Epoch Time: 0m 19s
	Train Loss: 0.748 | Train Acc: 50.42%
	 Val. Loss: 0.748 | Val. Acc: 50.08%


100%|██████████| 49/49 [00:10<00:00,  4.51it/s]
100%|██████████| 49/49 [00:08<00:00,  5.77it/s]


Epoch: 08 | Epoch Time: 0m 19s
	Train Loss: 0.750 | Train Acc: 50.08%
	 Val. Loss: 0.752 | Val. Acc: 49.87%


100%|██████████| 49/49 [00:10<00:00,  4.56it/s]
100%|██████████| 49/49 [00:08<00:00,  5.80it/s]


Epoch: 09 | Epoch Time: 0m 19s
	Train Loss: 0.750 | Train Acc: 49.71%
	 Val. Loss: 0.749 | Val. Acc: 49.92%


100%|██████████| 49/49 [00:10<00:00,  4.59it/s]
100%|██████████| 49/49 [00:08<00:00,  5.81it/s]


Epoch: 10 | Epoch Time: 0m 19s
	Train Loss: 0.750 | Train Acc: 49.90%
	 Val. Loss: 0.747 | Val. Acc: 50.32%


100%|██████████| 49/49 [00:10<00:00,  4.53it/s]
100%|██████████| 49/49 [00:08<00:00,  5.80it/s]


Epoch: 11 | Epoch Time: 0m 19s
	Train Loss: 0.749 | Train Acc: 50.22%
	 Val. Loss: 0.753 | Val. Acc: 49.42%


100%|██████████| 49/49 [00:10<00:00,  4.58it/s]
100%|██████████| 49/49 [00:08<00:00,  5.79it/s]


Epoch: 12 | Epoch Time: 0m 19s
	Train Loss: 0.751 | Train Acc: 50.09%
	 Val. Loss: 0.748 | Val. Acc: 50.21%


100%|██████████| 49/49 [00:11<00:00,  4.20it/s]
100%|██████████| 49/49 [00:09<00:00,  5.43it/s]


Epoch: 13 | Epoch Time: 0m 20s
	Train Loss: 0.751 | Train Acc: 50.15%
	 Val. Loss: 0.750 | Val. Acc: 49.68%


100%|██████████| 49/49 [00:11<00:00,  4.36it/s]
100%|██████████| 49/49 [00:09<00:00,  5.43it/s]


Epoch: 14 | Epoch Time: 0m 20s
	Train Loss: 0.748 | Train Acc: 50.45%
	 Val. Loss: 0.747 | Val. Acc: 50.21%


100%|██████████| 49/49 [00:11<00:00,  4.42it/s]
100%|██████████| 49/49 [00:08<00:00,  5.67it/s]


Epoch: 15 | Epoch Time: 0m 19s
	Train Loss: 0.751 | Train Acc: 50.00%
	 Val. Loss: 0.749 | Val. Acc: 50.51%


100%|██████████| 49/49 [00:11<00:00,  4.45it/s]
100%|██████████| 49/49 [00:09<00:00,  5.22it/s]


Epoch: 16 | Epoch Time: 0m 20s
	Train Loss: 0.750 | Train Acc: 49.88%
	 Val. Loss: 0.750 | Val. Acc: 49.80%


100%|██████████| 49/49 [00:12<00:00,  4.00it/s]
 33%|███▎      | 16/49 [00:02<00:06,  5.40it/s]

Source code: https://colab.research.google.com/drive/1NoInlyj5dp5CeL-9_3Ei9X-K3R_Xt1_M?usp=sharing

In [None]:
# model(torch.stack([encode_bow(vocabulary, "This movie is amazing")])).item()

In [None]:
# # classify a review as negative or positive
# def predict_sentiment(text, vocab, model):
#   encoded_text = encode_bow(vocab, text)
#   with torch.no_grad():
#     yhat = model(encoded_text)
#     percent_pos = yhat[0]
#     if torch.round(percent_pos) == 1:
#       return (percent_pos, 'POSITIVE')
#     else:
#       return (1 - percent_pos, 'NEGATIVE')
  

In [None]:
# text = 'Best movie ever! It was great, I recommend it.'
# percent, sentiment = predict_sentiment(text, vocabulary, model) 
# print(f'Review: [{text}]\nSentiment: {sentiment} ({percent*100:.2f}%)')

In [None]:
# filter_tokens(tokenizer(text))

In [None]:
# text = 'Wow. This movie is amazing, isnt it? To not say the opposite. '
# percent, sentiment = predict_sentiment(text, vocabulary, model)
# print(f'Review: [{text}]\nSentiment: {sentiment} ({percent*100:.2f}%)')