In [1]:
# from torch.utils.data import DataLoader

import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
from torchtext.data import get_tokenizer

import torch
from torch.utils.data import DataLoader
from torch.nn import ReLU, Sigmoid, Sequential, Linear, BCELoss
from torch.optim import Adam

import nltk
from nltk.corpus import stopwords

import re
from string import punctuation
import time
from tqdm import tqdm

In [2]:
train_dataset, eval_dataset = torchtext.datasets.IMDB()

In [3]:
train_dataset, eval_dataset = to_map_style_dataset(train_dataset), to_map_style_dataset(eval_dataset)

In [4]:
for label,text in train_dataset[:10]:
  print(label, text)

1 I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [5]:
tokenizer = get_tokenizer('basic_english')
tokenizer(train_dataset[0][1])[:5]

['i', 'rented', 'i', 'am', 'curious-yellow']

In [6]:
try:
  stopwords.words('english')[:5]
except LookupError:
  nltk.download('stopwords')
finally:
  en_stop_words = set(stopwords.words('english'))

In [7]:
re_punc = re.compile('[%s]' % re.escape(punctuation))

In [8]:
def filter_tokens(tokens):
  tokens = [token.lower() for token in tokens]
  tokens = [re_punc.sub('', token) for token in tokens]
  tokens = [token for token in tokens if token.isalpha()]
  tokens = [token for token in tokens if token not in en_stop_words]
  tokens = [token for token in tokens if len(token) > 1]
  return tokens

In [9]:
for _, text in train_dataset[:10]:
  print(filter_tokens(tokenizer(text)))

['rented', 'curiousyellow', 'video', 'store', 'controversy', 'surrounded', 'first', 'released', 'also', 'heard', 'first', 'seized', 'customs', 'ever', 'tried', 'enter', 'country', 'therefore', 'fan', 'films', 'considered', 'controversial', 'really', 'see', 'plot', 'centered', 'around', 'young', 'swedish', 'drama', 'student', 'named', 'lena', 'wants', 'learn', 'everything', 'life', 'particular', 'wants', 'focus', 'attentions', 'making', 'sort', 'documentary', 'average', 'swede', 'thought', 'certain', 'political', 'issues', 'vietnam', 'war', 'race', 'issues', 'united', 'states', 'asking', 'politicians', 'ordinary', 'denizens', 'stockholm', 'opinions', 'politics', 'sex', 'drama', 'teacher', 'classmates', 'married', 'men', 'kills', 'curiousyellow', 'years', 'ago', 'considered', 'pornographic', 'really', 'sex', 'nudity', 'scenes', 'far', 'even', 'shot', 'like', 'cheaply', 'made', 'porno', 'countrymen', 'mind', 'find', 'shocking', 'reality', 'sex', 'nudity', 'major', 'staple', 'swedish', 'ci

In [10]:
def build_vocabulary(datasets):
  for dataset in datasets:
    for _, text in dataset:
      yield filter_tokens(tokenizer(text))

In [11]:
min_freq = 3

vocabulary = build_vocab_from_iterator(
  build_vocabulary([train_dataset, eval_dataset]), 
  min_freq=min_freq, 
  specials=["<UNK>"]  
)

vocabulary.set_default_index(vocabulary["<UNK>"]) 

In [12]:
len(vocabulary)

53176

In [13]:
# get the word of index 100
vocabulary["henrique"]

0

In [14]:
torch.save(vocabulary, 'data/vocab-cnn.pth')

In [15]:
def encode_bow(vocab, text):
  encoded_text = torch.zeros(len(vocab), dtype=torch.float32)
  text = filter_tokens(tokenizer(text))
  token_ids = vocab(text)

  for token_id in token_ids:
    if token_id:
      encoded_text[token_id] += 1.0

  return encoded_text

In [16]:
encode_bow(vocabulary, train_dataset[10][1])

tensor([0., 2., 2.,  ..., 0., 0., 0.])

In [17]:
train_dataset[10][1]

'It was great to see some of my favorite stars of 30 years ago including John Ritter, Ben Gazarra and Audrey Hepburn. They looked quite wonderful. But that was it. They were not given any characters or good lines to work with. I neither understood or cared what the characters were doing.<br /><br />Some of the smaller female roles were fine, Patty Henson and Colleen Camp were quite competent and confident in their small sidekick parts. They showed some talent and it is sad they didn\'t go on to star in more and better films. Sadly, I didn\'t think Dorothy Stratten got a chance to act in this her only important film role.<br /><br />The film appears to have some fans, and I was very open-minded when I started watching it. I am a big Peter Bogdanovich fan and I enjoyed his last movie, "Cat\'s Meow" and all his early ones from "Targets" to "Nickleodeon". So, it really surprised me that I was barely able to keep awake watching this one.<br /><br />It is ironic that this movie is about a de

In [18]:
vocabulary.get_stoi()['films']

29

In [19]:
vocabulary.get_itos()[1], vocabulary.get_itos()[2]

('movie', 'film')

In [20]:
for i in range(0, 10):
  print(encode_bow(vocabulary, train_dataset[i][1]))

tensor([0., 0., 3.,  ..., 0., 0., 0.])
tensor([0., 0., 2.,  ..., 0., 0., 0.])
tensor([0., 0., 3.,  ..., 0., 0., 0.])
tensor([0., 1., 5.,  ..., 0., 0., 0.])
tensor([0., 0., 7.,  ..., 0., 0., 0.])
tensor([0., 0., 1.,  ..., 0., 0., 0.])
tensor([0., 2., 1.,  ..., 0., 0., 0.])
tensor([0., 2., 0.,  ..., 0., 0., 0.])
tensor([0., 6., 1.,  ..., 0., 0., 0.])
tensor([0., 1., 2.,  ..., 0., 0., 0.])


In [21]:
def encode_batch_text(batch):
  labels, texts = list(zip(*batch))
  encoded_texts = [encode_bow(vocabulary, text) for text in texts]
  return torch.stack(encoded_texts), torch.tensor(labels, dtype=torch.float32) - 1

In [27]:
# encode_batch_text(train_dataset[:4], vocabulary)

In [25]:
train_loader = DataLoader(train_dataset, batch_size=512, collate_fn=encode_batch_text, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=512, collate_fn=encode_batch_text)

In [26]:
for encoded_text, encoded_labels in train_loader:
 print(encoded_text)
 print(encoded_labels) 

tensor([[ 0.,  2.,  1.,  ...,  0.,  0.,  0.],
        [ 0.,  0.,  3.,  ...,  0.,  0.,  0.],
        [ 0.,  0.,  1.,  ...,  0.,  0.,  0.],
        ...,
        [ 0.,  9.,  0.,  ...,  0.,  0.,  0.],
        [ 0.,  0.,  2.,  ...,  0.,  0.,  0.],
        [ 0.,  1., 14.,  ...,  0.,  0.,  0.]])
tensor([1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
        0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0.,
        0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0.,
        1., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
        1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1.,
        0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1.,
        1., 1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
        0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
        0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 0.

In [28]:

def define_model(n_words):
  model = Sequential(
    Linear(in_features=n_words, out_features=1024),
    ReLU(),
    Linear(in_features=1024, out_features=50),
    ReLU(),
    Linear(in_features=50, out_features=1),
    Sigmoid()
  )

  return model

In [29]:
model = define_model(len(vocabulary))
print(model)

Sequential(
  (0): Linear(in_features=53176, out_features=1024, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1024, out_features=50, bias=True)
  (3): ReLU()
  (4): Linear(in_features=50, out_features=1, bias=True)
  (5): Sigmoid()
)


In [30]:
bows, labels = encode_batch_text(train_dataset[:4])
print(model(bows))

for p in model.parameters():
  print(p)

tensor([[0.5055],
        [0.5037],
        [0.5051],
        [0.5042]], grad_fn=<SigmoidBackward0>)
Parameter containing:
tensor([[-1.2393e-03,  1.5825e-03, -9.0924e-04,  ..., -5.6649e-04,
         -4.1211e-03, -3.3034e-03],
        [ 2.9380e-03, -9.8922e-04,  1.6145e-04,  ..., -7.8456e-04,
         -2.8580e-04,  2.6369e-03],
        [-1.6261e-03, -3.0577e-03,  3.1283e-03,  ...,  2.5686e-03,
          2.7975e-03,  3.9193e-03],
        ...,
        [-7.8620e-04,  1.0026e-03,  2.0553e-03,  ...,  2.6288e-03,
          2.1337e-03, -9.8134e-04],
        [ 1.9326e-03, -3.1915e-05, -4.1968e-04,  ...,  1.5720e-04,
          4.2641e-03,  2.7773e-03],
        [ 2.3354e-03,  2.6687e-03,  2.6666e-03,  ...,  9.6398e-04,
         -1.3010e-03,  3.5472e-03]], requires_grad=True)
Parameter containing:
tensor([-0.0032, -0.0025, -0.0002,  ...,  0.0034,  0.0039,  0.0013],
       requires_grad=True)
Parameter containing:
tensor([[ 2.3569e-02, -8.3567e-03,  4.5732e-04,  ...,  1.6090e-02,
          1.4724e-

In [31]:
learning_rate = 1e-3
optimizer = Adam(model.parameters(), lr=learning_rate)
loss_fn = BCELoss() 

In [None]:
from tqdm import tqdm

In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
loss_fn = loss_fn.to(device) 

In [None]:
# def train(model, iterator, optimizer, criterion):
#   epoch_loss = 0
#   epoch_acc = 0

#   model.train()

#   for bows, labels in iterator:
#     bows = bows.to(device)
#     labels = labels.to(device)

#     optimizer.zero_grad()

#     predictions = model(bows).squeeze(1)

#     loss = criterion(predictions, labels)

#     rounded_preds = torch.round(predictions)
#     correct = (rounded_preds == labels).float()
#     acc = correct.sum() / len(correct)

#     loss.backward()

#     optimizer.step()

#     epoch_loss += loss.item()
#     epoch_acc += acc.item()

#   return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [33]:
def binary_accuracy(preds, y):
  rounded_preds = torch.round(preds)
  correct = (rounded_preds == y).float()
  acc = correct.sum() / len(correct)
  return acc


def train(model, data_loader, optimizer, loss_fn):
  epoch_loss = 0
  epoch_acc = 0
  model.train()
  for text_batch, label_batch in tqdm(data_loader):
    optimizer.zero_grad()
    predictions = model(text_batch).squeeze(1)
    loss = loss_fn(predictions, label_batch)
    acc = binary_accuracy(predictions, label_batch)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    epoch_acc += acc.item()
  return epoch_loss / len(data_loader), epoch_acc / len(data_loader)


def validate(model, data_loader, loss_fn):
  epoch_loss = 0
  epoch_acc = 0
  model.eval()
  with torch.no_grad():
    for text_batch, label_batch in tqdm(data_loader):
      predictions = model(text_batch).squeeze(1)
      loss = loss_fn(predictions, label_batch)
      acc = binary_accuracy(predictions, label_batch)
      epoch_loss += loss.item()
      epoch_acc += acc.item()
  return epoch_loss / len(data_loader), epoch_acc / len(data_loader)

In [34]:
# def validate(model, iterator, criterion):
#   epoch_loss = 0
#   epoch_acc = 0

#   model.eval()

#   with torch.no_grad():
#     for bows, labels in iterator:
#       bows = bows.to(device)
#       labels = labels.to(device)

#       predictions = model(bows).squeeze(1)

#       loss = criterion(predictions, labels)

#       rounded_preds = torch.round(predictions)
#       correct = (rounded_preds == labels).float()
#       acc = correct.sum() / len(correct)

#       epoch_loss += loss.item()
#       epoch_acc += acc.item()

#   return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [35]:
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

  return elapsed_mins, elapsed_secs

In [None]:
from tqdm.auto import tqdm
num_training_steps = 1000
progress_bar = tqdm(range(num_training_steps))

In [36]:
N_EPOCHS = 10
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
  start_time = time.time()
  train_loss, train_acc = train(model, train_loader, optimizer, loss_fn)
  valid_loss, valid_acc = validate(model, eval_loader, loss_fn)
  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'data/sentiment-model-1.pt')
  
  print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}%')

100%|██████████| 49/49 [00:27<00:00,  1.77it/s]
100%|██████████| 49/49 [00:19<00:00,  2.48it/s]


Epoch: 01 | Epoch Time: 0m 47s
	Train Loss: 0.361 | Train Acc: 86.36%
	 Val. Loss: 0.344 | Val. Acc: 86.55%


100%|██████████| 49/49 [00:28<00:00,  1.71it/s]
100%|██████████| 49/49 [00:20<00:00,  2.42it/s]


Epoch: 02 | Epoch Time: 0m 49s
	Train Loss: 0.118 | Train Acc: 96.14%
	 Val. Loss: 0.439 | Val. Acc: 86.32%


100%|██████████| 49/49 [00:29<00:00,  1.64it/s]
100%|██████████| 49/49 [00:20<00:00,  2.42it/s]


Epoch: 03 | Epoch Time: 0m 50s
	Train Loss: 0.046 | Train Acc: 98.63%
	 Val. Loss: 0.633 | Val. Acc: 85.30%


100%|██████████| 49/49 [00:28<00:00,  1.74it/s]
100%|██████████| 49/49 [00:20<00:00,  2.42it/s]


Epoch: 04 | Epoch Time: 0m 48s
	Train Loss: 0.016 | Train Acc: 99.60%
	 Val. Loss: 0.825 | Val. Acc: 85.11%


100%|██████████| 49/49 [00:27<00:00,  1.79it/s]
100%|██████████| 49/49 [00:20<00:00,  2.44it/s]


Epoch: 05 | Epoch Time: 0m 47s
	Train Loss: 0.005 | Train Acc: 99.86%
	 Val. Loss: 1.113 | Val. Acc: 84.97%


100%|██████████| 49/49 [00:26<00:00,  1.85it/s]
100%|██████████| 49/49 [00:20<00:00,  2.41it/s]


Epoch: 06 | Epoch Time: 0m 46s
	Train Loss: 0.004 | Train Acc: 99.95%
	 Val. Loss: 1.254 | Val. Acc: 84.44%


100%|██████████| 49/49 [00:27<00:00,  1.79it/s]
100%|██████████| 49/49 [00:20<00:00,  2.42it/s]


Epoch: 07 | Epoch Time: 0m 47s
	Train Loss: 0.003 | Train Acc: 99.98%
	 Val. Loss: 1.394 | Val. Acc: 84.78%


100%|██████████| 49/49 [00:26<00:00,  1.83it/s]
100%|██████████| 49/49 [00:20<00:00,  2.43it/s]


Epoch: 08 | Epoch Time: 0m 46s
	Train Loss: 0.002 | Train Acc: 100.00%
	 Val. Loss: 1.574 | Val. Acc: 85.08%


100%|██████████| 49/49 [00:26<00:00,  1.84it/s]
100%|██████████| 49/49 [00:19<00:00,  2.47it/s]


Epoch: 09 | Epoch Time: 0m 46s
	Train Loss: 0.000 | Train Acc: 100.00%
	 Val. Loss: 1.617 | Val. Acc: 85.03%


100%|██████████| 49/49 [00:26<00:00,  1.83it/s]
100%|██████████| 49/49 [00:19<00:00,  2.46it/s]

Epoch: 10 | Epoch Time: 0m 46s
	Train Loss: 0.000 | Train Acc: 100.00%
	 Val. Loss: 1.664 | Val. Acc: 85.06%





In [None]:
model(torch.stack([encode_bow(vocabulary, "This movie is amazing")])).item()

In [47]:
# classify a review as negative or positive
def predict_sentiment(text, vocab, model):
  encoded_text = encode_bow(vocab, text)
  with torch.no_grad():
    yhat = model(encoded_text)
    percent_pos = yhat[0]
    if torch.round(percent_pos) == 1:
      return (percent_pos, 'POSITIVE')
    else:
      return (1 - percent_pos, 'NEGATIVE')
  

In [54]:
text = 'Best movie ever! It was great, I recommend it.'
percent, sentiment = predict_sentiment(text, vocabulary, model) 
print(f'Review: [{text}]\nSentiment: {sentiment} ({percent*100:.2f}%)')

Review: [Best movie ever! It was great, I recommend it.]
Sentiment: POSITIVE (97.03%)


In [58]:
filter_tokens(tokenizer(text))

['wow', 'movie', 'amazing', 'isnt', 'say', 'opposite']

In [56]:
text = 'Wow. This movie is amazing, isnt it? To not say the opposite. '
percent, sentiment = predict_sentiment(text, vocabulary, model)
print(f'Review: [{text}]\nSentiment: {sentiment} ({percent*100:.2f}%)')

Review: [Wow. This movie is amazing, isnt it? To not say the opposite.]
Sentiment: POSITIVE (98.82%)
