Loading PyTorch
==

In [None]:
# Imports PyTorch.
import torch

Downloading the dataset
==
The dataset we are going to use is the Large Movie Review Dataset (https://ai.stanford.edu/~amaas/data/sentiment/).

In [None]:
# Downloads the dataset.
import urllib.request # `import urllib` alone does not guarantee that the `request` submodule is loaded.

# `urlretrieve` returns a pair (local_path, headers). As no target path is given, the archive is saved to a temporary file.
tmp = urllib.request.urlretrieve("https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz")
filename = tmp[0] # Path of the downloaded archive.

In [None]:
filename

'/tmp/tmptf2y_u1p'

In [None]:
# Extracts the dataset (into the current working directory).
import tarfile
# The context manager closes the archive even if the extraction fails midway.
# NOTE(review): `extractall` trusts the member paths stored in the archive; only use it on archives from a trusted source.
with tarfile.open(filename) as tar:
  tar.extractall()

In [None]:
import os # Useful library to read files and inspect directories.

In [None]:
# Shows which files and directories are present in the current working directory (".").
# NOTE: this loop rebinds `filename`, which held the path of the downloaded archive until now.
for filename in os.listdir("."):
  print(filename)

.config
aclImdb
sample_data


In [None]:
dataset_root = "aclImdb" # Directory of the dataset, relative to the current working directory.
# Shows which files and directories are present at the root of the dataset directory.
# (`os.listdir` returns the entries in arbitrary order.)
for filename in os.listdir(dataset_root):
  print(filename)

test
imdb.vocab
README
imdbEr.txt
train


In [None]:
# Shows several reviews.
dirname = os.path.join(dataset_root, "train", "neg") # "aclImdb/{train|test}/{neg|pos}"
for idx, filename in enumerate(os.listdir(dirname)):
  if(idx >= 5): break # Stops after the 5th file.

  print(filename)
  # The encoding is given explicitly: otherwise Python falls back on a platform-dependent default, and reviews containing non-ASCII characters may be garbled or fail to load.
  with open(os.path.join(dirname, filename), encoding="utf-8") as f:
    review = f.read()
  print(review) # The file is already closed at this point; only the text is needed.
  print()

766_4.txt
Whenever a Columbo story deviates from the familiar plot (colorful killer commits crime, Columbo smokes out killer, Columbo becomes a pest in the process), the writers somehow are never able to match the quality and interest of most traditional episodes. This episode deviates in the extreme, and the result is a major flop.<br /><br /> Would you believe: Columbo never faces the villain till the very end?!!<br /><br />Frankly, I was tempted to turn it off about two-thirds through.<br /><br /> Oh, the sacrifices we self-appointed reviewers make!!!

6373_2.txt
Oh this was a really bad movie. The girl who plays Jennifer is OK, but I think she acts bitchy through the movie, not because she is having her organs ripped out by a raven at night, but because she is thinking of firing her agent for putting her in this piece of crap. Faye Dunaway acts like she is remaking Mommy Dearest and the ending is completely silly. I really can't recommend this movie at all even though as a fan of E

Preprocessing the dataset
==

In [None]:
import nltk # Imports NLTK, an NLP library.
nltk.download('punkt') # Loads a module required for tokenization.
import collections # This library defines useful data structures.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
newline = "<br />" # The reviews sometimes contain this HTML tag to indicate a line break.
def preprocess(text):
  """Turns a raw review (string) into a list of lowercased tokens (strings).

  The HTML line-break tag is replaced with a space before the text is tokenized with NLTK.
  """
  without_line_breaks = text.replace(newline, " ")
  return [token.lower() for token in nltk.word_tokenize(without_line_breaks)]

# Reads and pre-processes the reviews.
# Each part of the dataset is a list of pairs (tokens, polarity), with all negative reviews first, followed by all positive ones.
dataset = {"train": [], "test": []}
binary_classes = {"neg": 0, "pos": 1}
for part_name, examples in dataset.items():
  for class_name, value in binary_classes.items():
    path = os.path.join(dataset_root, part_name, class_name)
    print("Processing %s..." % path, end='')
    # `os.listdir` returns the files in arbitrary order: sorting makes the content of the lists (and any later split that relies on positions) reproducible.
    for filename in sorted(os.listdir(path)):
        # Explicit encoding: the default one depends on the platform.
        with open(os.path.join(path, filename), encoding="utf-8") as f:
          review_text = f.read()
        review_tokens = preprocess(review_text)

        examples.append((review_tokens, value))
    print(" done")

Processing aclImdb/train/neg... done
Processing aclImdb/train/pos... done
Processing aclImdb/test/neg... done
Processing aclImdb/test/pos... done


In [None]:
# Carves a development/validation set out of the original train set.
# 'dataset["train"]' is a list of negative examples followed by the same number of positive examples.
# A quarter of it becomes the development set, the remaining three quarters stay in the train set.
# Both sets stay balanced: the development set takes as many negative examples (from the start of the list) as positive ones (from the end).
if("dev" not in dataset):
  original_train = dataset["train"]
  dev_set_half_size = len(original_train) // 8 # Half of a quarter of the training set size.

  first_negatives = original_train[:dev_set_half_size]
  last_positives = original_train[-dev_set_half_size:]
  dataset["dev"] = first_negatives + last_positives
  dataset["train"] = original_train[dev_set_half_size:-dev_set_half_size] # What is left once the development examples are removed.

  # Prints the number of examples per class for each part, as a sanity check.
  for (part, data) in dataset.items():
    class_counts = collections.defaultdict(int)
    for (_, polarity) in data:
      class_counts[polarity] += 1
    print(f"{part}: {class_counts}")
  print("Train set split into train/dev.")
else:
  print("This should only be run once.")

train: defaultdict(<class 'int'>, {0: 9375, 1: 9375})
test: defaultdict(<class 'int'>, {0: 12500, 1: 12500})
dev: defaultdict(<class 'int'>, {0: 3125, 1: 3125})
Train set split into train/dev.


Loading the word embeddings
==
We are going to use GloVe embeddings.

All word forms whose frequency in the train set does not exceed a given threshold are going to be treated as unknown forms.

In [None]:
# Counts how many times each word form occurs in the train set.
word_counts = collections.defaultdict(int)
for token in (t for review_tokens, _ in dataset["train"] for t in review_tokens):
  word_counts[token] += 1

# print(word_counts)

In [None]:
# The vocabulary is the set of word forms that occur strictly more than `count_threshold` times in the train set.
count_threshold = 4
vocabulary = {word for word, count in word_counts.items() if count > count_threshold}

# print(vocabulary)
# print(len(vocabulary))  #26317

In [None]:
import zipfile
import numpy as np

In [None]:
# Returns a dictionary {word[String]: id[Integer]} and a list of Numpy arrays.
# `data_path` is the path of the directory containing the GloVe files (if None, 'glove.6B' is used)
# `max_size` is the number of word embeddings read (starting from the most frequent; in the GloVe files, the words are sorted)
# If `vocabulary` is specified (as a set of strings, or a dictionary from strings to integers), the output vocabulary contains the intersection of `vocabulary` and the words with a defined embedding. Otherwise, all words with a defined embedding are used.
def get_glove(dim=50, vocabulary=None, max_size=-1, data_path=None):
  """Loads GloVe 6B embeddings, downloading and unzipping them first if they are missing.

  Args:
    dim: dimension of the embeddings; must be one of 50, 100, 200, 300.
    vocabulary: optional container of words (anything supporting `in`); words outside of it are skipped.
    max_size: maximum number of embeddings kept; with the default (-1) there is no limit.
    data_path: directory containing (or that will receive) the GloVe files; 'glove.6B' if None.

  Returns:
    A pair (new_vocabulary, embeddings): a dictionary {word: id} and the list of float32 Numpy vectors,
    such that `embeddings[new_vocabulary[word]]` is the vector of `word`.

  Raises:
    AssertionError: if `dim` is not available or if the GloVe file cannot be found after download/extraction.
  """
  import urllib.request # `import urllib` alone does not guarantee that the `request` submodule is loaded.

  dimensions = set([50, 100, 200, 300]) # Available dimensions for GloVe 6B
  fallback_url = 'http://nlp.stanford.edu/data/glove.6B.zip' # (Remember that in GloVe 6B, words are lowercased.)

  assert (dim in dimensions), (f'Unavailable GloVe 6B dimension: {dim}.')

  if(data_path is None): data_path = 'glove.6B'

  # Makes sure that the data directory exists.
  os.makedirs(data_path, exist_ok=True)

  glove_weights_file_path = os.path.join(data_path, f'glove.6B.{dim}d.txt')

  # Checks that the data is here, otherwise downloads (if needed) and extracts it.
  if(not os.path.isfile(glove_weights_file_path)):
    local_zip_file_path = os.path.join(data_path, os.path.basename(fallback_url))

    if(not os.path.isfile(local_zip_file_path)):
      print(f'Retreiving GloVe embeddings from {fallback_url}.')
      urllib.request.urlretrieve(fallback_url, local_zip_file_path)

    with zipfile.ZipFile(local_zip_file_path, 'r') as z:
      print(f'Extracting GloVe embeddings from {local_zip_file_path}.')
      z.extractall(path=data_path)

  assert os.path.isfile(glove_weights_file_path), (f"GloVe file {glove_weights_file_path} not found.")

  # Reads GloVe data.
  print('Reading GloVe embeddings.')
  new_vocabulary = {} # A dictionary {word[String]: id[Integer]}
  embeddings = [] # The list of embeddings (Numpy arrays)
  # The encoding is explicit because the words are not all ASCII and the default encoding depends on the platform.
  with open(glove_weights_file_path, 'r', encoding='utf-8') as f:
    for line in f: # Each line consists of the word followed by a space and all of the coefficients of the vector separated by a space.
      # Splits from the right, exactly `dim` times: the last `dim` fields are the coefficients and whatever remains on the left is the word.
      # This works even if the word contains spaces or looks like the beginning of the vector.
      fields = line.rstrip('\n').rsplit(' ', dim)
      if(len(fields) != (dim + 1)): continue # Blank or truncated line: nothing to read.

      word = fields[0]

      if((vocabulary is not None) and (word not in vocabulary)): # If a vocabulary was specified and if the word is not in it…
        continue # …this word is skipped.

      new_vocabulary[word] = len(new_vocabulary)
      embeddings.append(np.asarray(fields[1:], dtype=np.float32))

      if(len(new_vocabulary) == max_size): break # Never true with the default value (-1).
  print('(GloVe embeddings loaded.)')
  print()

  return (new_vocabulary, embeddings)

In [None]:
%%time
# Loads the 50-dimensional GloVe vectors, keeping only the words that belong to `vocabulary`.
(new_vocabulary, embeddings) = get_glove(dim=50, vocabulary=vocabulary)

Retreiving GloVe embeddings from http://nlp.stanford.edu/data/glove.6B.zip.
Extracting GloVe embeddings from glove.6B/glove.6B.zip.
Reading GloVe embeddings.
(GloVe embeddings loaded.)

CPU times: user 22.4 s, sys: 5.94 s, total: 28.3 s
Wall time: 3min 7s


In [None]:
# print(len(new_vocabulary)) # 25532
# print(new_vocabulary) # Shows each word and its id.

Batch generator
==

In [None]:
# Defines a class of objects that produce batches from the dataset.
class BatchGenerator:
  """Turns tokenized reviews into padded batches of word ids.

  `dataset` maps a part name (e.g. "train") to a list of pairs (tokens, polarity).
  `vocabulary` maps a word to its id; two extra ids are reserved after it:
  `len(vocabulary)` for unknown forms and `len(vocabulary) + 1` for padding.
  """

  def __init__(self, dataset, vocabulary):
    # Every part is shuffled (in place) so that positive and negative examples are mixed.
    for examples in dataset.values():
      np.random.shuffle(examples)
    self.dataset = dataset

    vocabulary_size = len(vocabulary)
    self.vocabulary = vocabulary # Dictionary {word[String]: id[Integer]}
    self.unknown_word_id = vocabulary_size # Id for unknown forms
    self.padding_idx = vocabulary_size + 1 # Id of the fake token appended to the shorter reviews of a batch, so that the batch can be stored as a matrix.

  # Number of instances in the given part of the dataset.
  def length(self, data_type='train'):
    return len(self.dataset[data_type])

  # Maps the tokens of a text to their ids (the unknown id is used for out-of-vocabulary forms).
  def _encode(self, text):
    return [self.vocabulary.get(token, self.unknown_word_id) for token in text]

  # Returns a batch of `batch_size` instances drawn at random (with replacement).
  # A batch is a triple (word_ids, polarity, texts).
  # If `subset` is an integer, instances are only drawn among the first `subset` ones. This can be useful to debug the system.
  def get_batch(self, batch_size, data_type, subset=None):
    examples = self.dataset[data_type]

    upper_bound = len(examples)
    if(subset is not None): upper_bound = min(subset, upper_bound)

    drawn_ids = np.random.randint(upper_bound, size=batch_size)
    return self._ids_to_batch(examples, drawn_ids)

  # Builds the triple (word_ids, polarity, texts) for the instances of `data` whose positions are listed in `instance_ids`.
  def _ids_to_batch(self, data, instance_ids):
    selected = [data[instance_id] for instance_id in instance_ids]

    texts = [text for (text, _) in selected] # List of lists of words (String)
    polarity = [p for (_, p) in selected] # List of review polarities
    word_ids = [self._encode(text) for text in texts] # List of lists of word ids (Integer)

    self.pad(word_ids) # All rows now have the same length.

    word_ids_tensor = torch.tensor(word_ids, dtype=torch.long)
    polarity_tensor = torch.tensor(polarity, dtype=torch.bool)

    # `texts` is not needed by the model but it can help when debugging.
    return (word_ids_tensor, polarity_tensor, texts)

  # Pads a list of lists in place: the padding id is appended to every sequence until it is as long as the longest one.
  def pad(self, word_ids):
    longest = max([len(sequence) for sequence in word_ids])
    for sequence in word_ids:
      missing = longest - len(sequence)
      sequence.extend([self.padding_idx] * missing)

  # Returns a generator of batches covering a full epoch, in the order of the (shuffled) dataset.
  # All batches contain `batch_size` instances, except possibly the last one.
  # If `subset` is an integer, only the first `subset` instances are used. This can be useful to debug the system.
  def all_batches(self, batch_size, data_type="train", subset=None):
    examples = self.dataset[data_type]
    limit = len(examples) if(subset is None) else min(subset, len(examples))

    start = 0
    while(start < limit):
      stop = min(start + batch_size, limit) # The last batch may be shorter.
      yield self._ids_to_batch(examples, np.arange(start, stop))
      start += batch_size

  # Turns a list of arbitrary pre-processed texts into a matrix of word ids.
  # This function is meant to infer the polarity of unannotated reviews.
  def turn_into_batch(self, texts):
    encoded = [self._encode(text) for text in texts]
    self.pad(encoded)
    return torch.tensor(encoded, dtype=torch.long)

# Creates the batch generator used in the rest of the notebook. Word ids follow `new_vocabulary` (the words that have a GloVe embedding).
batch_generator = BatchGenerator(dataset=dataset, vocabulary=new_vocabulary)
print(batch_generator.length('train')) # Prints the number of instances in the train set.

18750


In [None]:
# Draws a random batch of 3 training reviews to inspect what a batch looks like.
tmp = batch_generator.get_batch(3, data_type="train")
print(tmp[0]) # Prints the matrix of token ids. This matrix is what will be fed as input to the model (defined below).
print(tmp[1]) # Prints the vector of polarities. This vector will be used to compute the loss when training the model.
print(tmp[2]) # Prints the list of reviews.

tensor([[   40,  5112,   571,  ..., 25533, 25533, 25533],
        [   36, 19306,    13,  ...,    30,  1043,     2],
        [ 1918,   797,    87,  ..., 25533, 25533, 25533]])
tensor([ True, False, False])


In [None]:
# NOTE: this builds every batch of the epoch (tensors included) only to count them.
len(list(batch_generator.all_batches(batch_size=3, data_type="train"))) # Number of batches in the training set for batches of size 3

6250

The model
==

In [None]:
import torch
import torch.nn.functional as F

class SentimentClassifier(torch.nn.Module):
    """Feed-forward polarity classifier over the average of the word embeddings of a review.

    Args:
        embeddings: non-empty sequence of equally-sized vectors (e.g. Numpy arrays), one per vocabulary word,
            ordered by word id. The sequence is left untouched.
        hidden_sizes: sizes of the hidden layers (each one is followed by a ReLU).
        freeze_embeddings: if True, the embeddings are not updated during training.
        device: device on which the parameters are stored.

    Two rows are added after the pre-trained embeddings: the vector of unknown forms (the average of all
    embeddings) at index `len(embeddings)` and the padding vector (zeros) at index `len(embeddings) + 1`.
    """
    def __init__(self, embeddings, hidden_sizes, freeze_embeddings=True, device='cpu'):
        super().__init__()

        # Stacks the pre-trained vectors into a new matrix. The caller's list must NOT be modified:
        # appending to it would shift the unknown/padding indices each time a model is created.
        pretrained = torch.stack([torch.as_tensor(e) for e in embeddings])  # shape: (vocabulary_size, embedding_dim)

        # Unknown token vector (average of all embeddings) and padding token vector (zeros)
        unknown_vector = pretrained.mean(dim=0, keepdim=True)
        padding_vector = torch.zeros_like(unknown_vector)

        embedding_matrix = torch.cat([pretrained, unknown_vector, padding_vector], dim=0)
        self.unknown_idx = pretrained.shape[0]      # Unknown token index is second to last
        self.padding_idx = pretrained.shape[0] + 1  # Padding index is the last element

        self.embeddings = torch.nn.Embedding.from_pretrained(embedding_matrix, freeze=freeze_embeddings, padding_idx=self.padding_idx)
        self.embeddings = self.embeddings.to(device)  # Send to device

        # Define the main part of the network (sequence of linear layers)
        layers = []
        input_size = embedding_matrix.shape[1]  # Input size is the embedding dimension
        for hidden_size in hidden_sizes:
            layers.append(torch.nn.Linear(input_size, hidden_size))
            layers.append(torch.nn.ReLU())
            input_size = hidden_size

        # Output layer (final linear layer to predict sentiment score)
        layers.append(torch.nn.Linear(input_size, 1))

        self.main_part = torch.nn.Sequential(*layers)
        self.main_part = self.main_part.to(device)  # Send to device

        self.device = device

    def forward(self, batch):
        """Returns one raw score (logit) per review.

        `batch` is a matrix of word ids of shape (batch_size, seq_length); the output has shape (batch_size,).
        """
        # Turn batch into embeddings
        embeds = self.embeddings(batch)  # shape: (batch_size, seq_length, embedding_dim)

        # Create mask to ignore padding embeddings
        mask = (batch != self.padding_idx).unsqueeze(-1).to(embeds.dtype)  # shape: (batch_size, seq_length, 1)

        # Compute the average of the embeddings (ignoring padding tokens)
        sum_embeddings = torch.sum(embeds * mask, dim=1)
        # Number of non-padding tokens per review, clamped to 1 so that an empty (all-padding) review yields a zero vector instead of NaN.
        token_counts = torch.sum(mask, dim=1).clamp(min=1)

        avg_embeddings = sum_embeddings / token_counts

        # Pass the averaged embeddings through the network
        output = self.main_part(avg_embeddings).squeeze(1)  # Squeeze to make the shape (batch_size,)

        return output


In [None]:
# Builds a classifier with a single hidden layer of 100 units on top of the frozen GloVe embeddings,
# then runs it on a random batch of 3 training reviews as a smoke test.
model = SentimentClassifier(embeddings, hidden_sizes=[100], freeze_embeddings=True)
batch = batch_generator.get_batch(3, data_type="train")
print(model(batch[0])) # This output (its shape) should be checked: one score per review is expected, i.e. shape (3,).

tensor([-0.0635, -0.0522, -0.0715], grad_fn=<SqueezeBackward1>)


  embedding_matrix = torch.tensor(embeddings)


In [None]:
# Function that computes the accuracy of the model on a given part of the dataset.
evaluation_batch_size = 256
def evaluation(model, data_type, subset=None):
  """Returns the accuracy (between 0 and 1) of `model` on the part `data_type` of the dataset.

  `model` must output one raw score (logit) per review and expose a `device` attribute.
  If `subset` is an integer, only the first `subset` instances are used.
  """
  nb_correct = 0
  total = 0
  with torch.no_grad(): # No gradient is needed to evaluate the model.
    for (word_ids, polarity, _) in batch_generator.all_batches(evaluation_batch_size, data_type=data_type, subset=subset):
      logits = model(word_ids.to(model.device)) # Forward pass
      # The model outputs logits, not probabilities: sigmoid(logit) > 0.5 if and only if logit > 0.
      answer = (logits > 0) # Shape: (number of reviews in the batch,)
      nb_correct += (answer == polarity.to(model.device)).sum().item()
      total += word_ids.shape[0]

  accuracy = (nb_correct / total)
  return accuracy

Training
==

In [None]:
# Initialize variables before the training loop
nb_epoch = 20  # Number of epochs
epoch_id = 0   # Id of the current epoch
instances_processed = 0  # Number of instances trained on in the current epoch
epoch_loss = []  # Will contain the loss for each batch of the current epoch
batch_size = 16 # Batch size
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with a learning rate of 0.001
loss_function = torch.nn.BCEWithLogitsLoss()  # Binary classification loss that takes logits as input (created once, outside of the loop)
# Number of instances in the training set. It must be expressed in the same unit as `instances_processed`:
# counting batches here would end each "epoch" after only 1/batch_size of the data.
epoch_size = batch_generator.length("train")

while epoch_id < nb_epoch:
    model.train()  # Switch to training mode (the end-of-epoch evaluation switches to evaluation mode)

    optimizer.zero_grad()  # Clear the gradients

    # Get a random batch of training data (the texts are not needed, hence the underscore '_')
    batch, labels, _ = batch_generator.get_batch(batch_size, data_type="train", subset=None)
    batch, labels = batch.to(model.device), labels.to(model.device)  # Move batch and labels to the device

    # (i) Compute model predictions (logits)
    predictions = model(batch)

    # (ii) Compute the loss (the labels are booleans and the loss expects floats)
    loss = loss_function(predictions, labels.float())

    # (iii) Backpropagate the loss
    loss.backward()

    # (iv) Store the loss
    epoch_loss.append(loss.item())

    # Update model parameters
    optimizer.step()

    # Track the number of instances processed
    instances_processed += batch_size

    # End of epoch: Print statistics and evaluate the model
    if instances_processed >= epoch_size:
        print(f"-- END OF EPOCH {epoch_id}.")
        print(f"Average loss: {sum(epoch_loss) / len(epoch_loss)}.")

        # Evaluation: Switch to evaluation mode
        model.eval()
        with torch.no_grad():
            # Compute accuracy on training set
            train_accuracy = evaluation(model, "train")
            print(f"Accuracy on the train set: {train_accuracy}.")

            # Compute accuracy on validation set (dev set)
            dev_accuracy = evaluation(model, "dev")
            print(f"Accuracy on the dev set: {dev_accuracy}.")

        # Increment epoch counter and reset the loss for the next epoch
        epoch_id += 1
        instances_processed -= epoch_size  # The instances in excess count towards the next epoch
        epoch_loss = []

-- END OF EPOCH 0.
Average loss: 0.6867781909736427.
Accuracy on the train set: 0.5.
Accuracy on the dev set: 0.5.
-- END OF EPOCH 1.
Average loss: 0.6732716478713571.
Accuracy on the train set: 0.52368.
Accuracy on the dev set: 0.52528.
-- END OF EPOCH 2.
Average loss: 0.6564809949430701.
Accuracy on the train set: 0.5415466666666666.
Accuracy on the dev set: 0.54448.
-- END OF EPOCH 3.
Average loss: 0.6407832601299025.
Accuracy on the train set: 0.5788266666666667.
Accuracy on the dev set: 0.57888.
-- END OF EPOCH 4.
Average loss: 0.6160501716910182.
Accuracy on the train set: 0.65568.
Accuracy on the dev set: 0.65456.
-- END OF EPOCH 5.
Average loss: 0.6075445585054894.
Accuracy on the train set: 0.62064.
Accuracy on the dev set: 0.62448.
-- END OF EPOCH 6.
Average loss: 0.5917632363430442.
Accuracy on the train set: 0.64544.
Accuracy on the dev set: 0.64736.
-- END OF EPOCH 7.
Average loss: 0.5751003202510206.
Accuracy on the train set: 0.7.
Accuracy on the dev set: 0.7016.
-- END 

In [None]:
# Scores a few hand-written reviews.
# The outputs are raw scores (logits), as the model is trained with BCEWithLogitsLoss and label 1 stands for "pos": a positive score means that the predicted polarity is positive.
model.eval() # Tells PyTorch that we are in evaluation/inference mode (can be useful if dropout is used, for instance).
model(batch_generator.turn_into_batch([preprocess(text) for text in ["This movie was terrible!!", "Pure gold!", "Bad.", "Not bad!"]]).to(model.device))

tensor([-3.4477,  4.1759, -7.2726, -7.3947], grad_fn=<SqueezeBackward1>)