In [8]:
#load the necessary libraries
import torch
import torchtext
from torchtext import data
from torchtext import datasets

import random
from tensorboardX import SummaryWriter

In [9]:
#define the SEED for reproducebility
SEED = 101
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

We will create our TEXT field and the LABEL field, which will hold our actual TEXT and the LABELS correspondingly.

In [10]:
#load the Field and the Labels
TEXT = data.Field(sequential = True, lower = True)
LABEL = data.LabelField(dtype = torch.float32)

In [4]:
#now let's load our data
#the following code will load our IMDB data and split it into training and testing set
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [5]:
train_data.examples

[<torchtext.data.example.Example at 0x269e44fa320>,
 <torchtext.data.example.Example at 0x269e44fa2b0>,
 <torchtext.data.example.Example at 0x269e44fa4a8>,
 <torchtext.data.example.Example at 0x269e44fa4e0>,
 <torchtext.data.example.Example at 0x269e44fa518>,
 <torchtext.data.example.Example at 0x269e44fa550>,
 <torchtext.data.example.Example at 0x269e44fa588>,
 <torchtext.data.example.Example at 0x269e44fa5c0>,
 <torchtext.data.example.Example at 0x269e44fa5f8>,
 <torchtext.data.example.Example at 0x269e44fa630>,
 <torchtext.data.example.Example at 0x269e44fa668>,
 <torchtext.data.example.Example at 0x269e44fa6a0>,
 <torchtext.data.example.Example at 0x269e44fa6d8>,
 <torchtext.data.example.Example at 0x269e44fa710>,
 <torchtext.data.example.Example at 0x269e44fa748>,
 <torchtext.data.example.Example at 0x269e44fa780>,
 <torchtext.data.example.Example at 0x269e44fa7b8>,
 <torchtext.data.example.Example at 0x269e44fa7f0>,
 <torchtext.data.example.Example at 0x269e44fa828>,
 <torchtext.

In [6]:
#let's look at the sample example of the train_data
print(vars(train_data.examples[0]))

{'text': ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"teachers".', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"teachers".', 'the', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'i', 'immediately', 'recalled', '.........', 'at', '..........', 'high.', 'a', 'classic', 'line:', 'inspector:', "i'm", 'here', 'to', 'sack', 'one', 'of', '

In [7]:
#let's check the length of the train and test data
print(f"THE LENGTH OF THE TRAIN DATA IS : {len(train_data)}")
print(f"THE LENGTH OF THE TEST DATA IS : {len(test_data)}")

THE LENGTH OF THE TRAIN DATA IS : 25000
THE LENGTH OF THE TEST DATA IS : 19267


In [14]:
#we will spit our train data further into training and validation
train_data, val_data = train_data.split(split_ratio = 0.7, random_state = random.seed(SEED))

In [15]:
#let's check the length of the train and test data and val data
print(f"THE LENGTH OF THE TRAIN DATA IS : {len(train_data)}")
print(f"THE LENGTH OF THE VAL DATA IS : {len(val_data)}")
print(f"THE LENGTH OF THE TEST DATA IS : {len(test_data)}")

THE LENGTH OF THE TRAIN DATA IS : 17500
THE LENGTH OF THE VAL DATA IS : 7500
THE LENGTH OF THE TEST DATA IS : 25000




Next, we have to build a vocabulary. This is a effectively a look up table where every unique word in your data set has a corresponding index (an integer).

We do this as our machine learning model cannot operate on strings, only numbers. Each index is used to construct a one-hot vector for each word. A one-hot vector is a vector where all of the elements are 0, except one, which is 1, and dimensionality is the total number of unique words in your vocabulary, commonly denoted by $V$.



In [16]:
#example of how the index for each word can be used to obatin the OHE of the corresponding word
from IPython.display import Image
Image('table.png')

TypeError: a bytes-like object is required, not 'str'

TypeError: a bytes-like object is required, not 'str'

<IPython.core.display.Image object>

The number of unique words in our training set is over 100,000, which means that our one-hot vectors will have over 100,000 dimensions! This will make training slow and possibly won't fit onto your GPU (if you're using one).

There are two ways effectively cut down our vocabulary, we can either only take the top $n$ most common words or ignore words that appear less than $m$ times. We'll do the former, only keeping the top 25,000 words.

What do we do with words that appear in examples but we have cut from the vocabulary? We replace them with a special unknown or $<unk>$ token. For example, if the sentence was "This film is great and I love it" but the word "love" was not in the vocabulary, it would become "This film is great and I $<unk>$ it".

The following builds the vocabulary, only keeping the most common max_size tokens.


In [17]:
#let's build the vcabulary by keeping the max_size = 25,000
TEXT.build_vocab(train_data, max_size = 25000)
LABEL.build_vocab(test_data)

In [18]:
#let's check the length of our vocabulary
print(f"THE LENGTH OF THE TEXT VOCAB IS : {len(TEXT.vocab)}")
print(f"THE LENGTH OF THE LABEL VOCAB IS : {len(LABEL.vocab)}")

THE LENGTH OF THE TEXT VOCAB IS : 25002
THE LENGTH OF THE LABEL VOCAB IS : 2


As you can see, the size of our TEXT vocabulary is 25002 (25000 because we mentioned that we want the max size of the vocabulary to be 25000 and additional 2 because of the fake tokens of $<unk>$ token and $<pad>$ token.

The length of the LABEL vocab is 2 because we have two different classes of $pos$ and $neg$.

In [19]:
#let's also view the most common words and their frequency in the vocabulary
print(TEXT.vocab.freqs.most_common(20))

[('the', 224887), ('a', 111557), ('and', 110593), ('of', 100616), ('to', 93640), ('is', 72251), ('in', 63402), ('i', 49253), ('this', 48412), ('that', 46215), ('it', 45621), ('/><br', 35559), ('was', 32892), ('as', 31290), ('for', 29951), ('with', 29874), ('but', 27735), ('on', 21968), ('movie', 21391), ('his', 20331)]


In [20]:
#also we can see the total number of examples belonging to each of the 'pos' and 'neg' sentiments
print(LABEL.vocab.freqs.most_common())

[('pos', 12500), ('neg', 12500)]


We can also see the vocabulary directly using either the stoi (string to int) or itos (int to string) method.

In [21]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 'i']


The final step of preparing the data is creating the iterators. We iterate over these in the training/evaluation loop, and they return a batch of examples (indexed and converted into tensors) at each iteration.

We'll use a BucketIterator which is a special type of iterator that will return a batch of examples where each example is of a similar length, minimizing the amount of padding per example.

We also want to place the tensors returned by the iterator on the GPU (if you're using one). PyTorch handles this using torch.device, we then pass this device to the iterator.

In [22]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, val_data, test_data),
                                                                           batch_size = BATCH_SIZE,
                                                                           device = device)

In [23]:
print(vars(train_iterator.dataset.examples[0]))

{'text': ['blonde', 'and', 'blonder', 'was', 'unfunny.basically,', 'it', 'was', 'a', 'rip-off', 'girl', 'version', 'of', 'dumb', 'and', 'dumber,', 'but', 'less', 'funny,', 'and', 'they', 'used', 'too', 'much', 'background', 'noises', 'and', 'music.way', 'too', 'much', 'background', 'noises', 'and', 'music', 'if', 'you', 'ask', 'me!!!!it', 'starts', 'out', 'immensely', 'boring,', 'and', 'totally', 'inane.it', "doesn't", 'pick', 'up', 'pace', 'anywhere', 'soon,', 'and', 'i', 'was', 'feeling', 'more', 'frustrated', 'as', 'this', 'nonsense', 'carried', 'on.maybe,', 'the', 'only', 'thing', 'that', 'saved', 'me', 'from', 'giving', 'this', 'movie', 'a', '1', 'was', 'the', 'last', '30', 'minutes.i', 'found', 'it', 'somewhat', 'entertaining', 'and', 'interesting', 'as', 'it', 'neared', 'the', 'end,', 'but', 'that', 'was', 'the', 'only', 'part.also,', 'i', "couldn't", 'help', 'but', 'like', 'pamela', 'anderson', 'and', 'denise', "richard's", 'characters', 'a', 'little.even', 'though', 'this', 'm

## BUILD THE MODEL

Our three layers are an embedding layer, our RNN, and a linear layer. All layers have their parameters initialized to random values, unless explicitly specified.

The embedding layer is used to transform our sparse one-hot vector (sparse as most of the elements are 0) into a dense embedding vector (dense as the dimensionality is a lot smaller and all the elements are real numbers). This embedding layer is simply a single fully connected layer. As well as reducing the dimensionality of the input to the RNN, there is the theory that words which have similar impact on the sentiment of the review are mapped close together in this dense vector space. For more information about word embeddings, see here.

The RNN layer is our RNN which takes in our dense vector and the previous hidden state $h_{t-1}$, which it uses to calculate the next hidden state, $h_t$. (see below pic)

In [24]:
from IPython.display import Image
Image('RNN.png')

TypeError: a bytes-like object is required, not 'str'

TypeError: a bytes-like object is required, not 'str'

<IPython.core.display.Image object>

Finally, the linear layer takes the final hidden state and feeds it through a fully connected layer, $f(h_T)$, transforming it to the correct output dimension.

The forward method is called when we feed examples into our model.

Each batch, text, is a tensor of size [sentence length, batch size]. That is a batch of sentences, each having each word converted into a one-hot vector.

You may notice that this tensor should have another dimension due to the one-hot vectors, however PyTorch conveniently stores a one-hot vector as it's index value, i.e. the tensor representing a sentence is just a tensor of the indexes for each token in that sentence. The act of converting a list of tokens into a list of indexes is commonly called numericalizing.

The input batch is then passed through the embedding layer to get embedded, which gives us a dense vector representation of our sentences. embedded is a tensor of size [sentence length, batch size, embedding dim].

embedded is then fed into the RNN. In some frameworks you must feed the initial hidden state, $h_0$, into the RNN, however in PyTorch, if no initial hidden state is passed as an argument it defaults to a tensor of all zeros.

The RNN returns 2 tensors, output of size [sentence length, batch size, hidden dim] and hidden of size [1, batch size, hidden dim]. output is the concatenation of the hidden state from every time step, whereas hidden is simply the final hidden state. We verify this using the assert statement. Note the squeeze method, which is used to remove a dimension of size 1.

Finally, we feed the last hidden state, hidden, through the linear layer, fc, to produce a prediction.

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [26]:
#let's start building the model
class RNN(nn.Module):
    def __init__(self, input_dim, output_dim, embedding_dim, hidden_dim):
        super(RNN, self).__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        #size of the input text ---> text = [sent len, batch size]
        embedded = self.embedding(text)
        #size of the output of the embedding layer ---> embedded = [sent len, batch size, emb dim]
        output, hidden = self.rnn(embedded)
        #output = [sent len, batch size, hidden dim]
        #hidden = [1, batch size, hidden dim]
        out = self.fc(hidden.squeeze(0))
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        return out

We now create an instance of our RNN class.

The input dimension is the dimension of the one-hot vectors, which is equal to the vocabulary size.

The embedding dimension is the size of the dense word vectors. This is usually around 50-250 dimensions, but depends on the size of the vocabulary.

The hidden dimension is the size of the hidden states. This is usually around 100-500 dimensions, but also depends on factors such as on the vocabulary size, the size of the dense vectors and the complexity of the task.

The output dimension is usually the number of classes, however in the case of only 2 classes the output value is between 0 and 1 and thus can be 1-dimensional, i.e. a single scalar real number.

In [35]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(input_dim = INPUT_DIM, output_dim = OUTPUT_DIM, embedding_dim = EMBEDDING_DIM, hidden_dim = HIDDEN_DIM).to(device)
print(model)

RNN(
  (embedding): Embedding(25002, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


## TRAIN THE MODEL

Now we'll set up the training and then train the model.

First, we'll create an optimizer. This is the algorithm we use to update the parameters of the module. Here, we'll use stochastic gradient descent (SGD). The first argument is the parameters will be updated by the optimizer, the second is the learning rate, i.e. how much we'll change the parameters by when we do a parameter update.

In [36]:
#define our optimizer
optimizer = optim.Adam(model.parameters(), lr = 0.001)

Next, we'll define our loss function. In PyTorch this is commonly called a criterion.

The loss function here is binary cross entropy with logits.

Our model currently outputs an unbound real number. As our labels are either 0 or 1, we want to restrict the predictions to a number between 0 and 1. We do this using the sigmoid or logit functions.

We then use this this bound scalar to calculate the loss using binary cross entropy.

The BCEWithLogitsLoss criterion carries out both the sigmoid and the binary cross entropy steps.

In [37]:
#define our loss function
criterion = nn.BCEWithLogitsLoss()

Our criterion function calculates the loss, however we have to write our function to calculate the accuracy.

This function first feeds the predictions through a sigmoid layer, squashing the values between 0 and 1, we then round them to the nearest integer. This rounds any value greater than 0.5 to 1 (a positive sentiment) and the rest to 0 (a negative sentiment).

We then calculate how many rounded predictions equal the actual labels and average it across the batch.

In [38]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [43]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

evaluate is similar to train, with a few modifications as you don't want to update the parameters when evaluating.

model.eval() puts the model in "evaluation mode", this turns off dropout and batch normalization. Again, we are not using them in this model, but it is good practice to include them.

No gradients are calculated on PyTorch operations inside the with no_grad() block. This causes less memory to be used and speeds up computation.

The rest of the function is the same as train, with the removal of optimizer.zero_grad(), loss.backward() and optimizer.step(), as we do not update the model's parameters when evaluating.

In [47]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            
            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

We'll also create a function to tell us how long an epoch takes to compare training times between models.

In [48]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

We then train the model through multiple epochs, an epoch being a complete pass through all examples in the training and validation sets.

At each epoch, if the validation loss is the best we have seen so far, we'll save the parameters of the model and then after training has finished we'll use that model on the test set.

In [49]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model = model, iterator = train_iterator, optimizer = optimizer, criterion = criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 56s
	Train Loss: 0.697 | Train Acc: 50.41%
	 Val. Loss: 0.697 |  Val. Acc: 49.81%
Epoch: 02 | Epoch Time: 0m 55s
	Train Loss: 0.695 | Train Acc: 50.42%
	 Val. Loss: 0.699 |  Val. Acc: 50.41%
Epoch: 03 | Epoch Time: 0m 59s
	Train Loss: 0.697 | Train Acc: 50.01%
	 Val. Loss: 0.704 |  Val. Acc: 49.54%
Epoch: 04 | Epoch Time: 0m 55s
	Train Loss: 0.697 | Train Acc: 49.22%
	 Val. Loss: 0.699 |  Val. Acc: 50.72%
Epoch: 05 | Epoch Time: 0m 56s
	Train Loss: 0.696 | Train Acc: 50.23%
	 Val. Loss: 0.705 |  Val. Acc: 48.99%
Epoch: 06 | Epoch Time: 0m 55s
	Train Loss: 0.695 | Train Acc: 50.12%
	 Val. Loss: 0.703 |  Val. Acc: 50.95%
Epoch: 07 | Epoch Time: 0m 56s
	Train Loss: 0.695 | Train Acc: 50.39%
	 Val. Loss: 0.697 |  Val. Acc: 50.55%
Epoch: 08 | Epoch Time: 0m 58s
	Train Loss: 0.697 | Train Acc: 49.61%
	 Val. Loss: 0.704 |  Val. Acc: 50.45%
Epoch: 09 | Epoch Time: 0m 56s
	Train Loss: 0.696 | Train Acc: 49.58%
	 Val. Loss: 0.705 |  Val. Acc: 50.56%
Epoch: 10 | Epoch T

Finally, the metric we actually care about, the test loss and accuracy, which we get from our parameters that gave us the best validation loss.

In [50]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.699 | Test Acc: 49.87%


### NEXT STEPS

In the next notebook, the improvements we will make are:

1. Different optimizer
2. Use pre-trained word embeddings
3. Different RNN architecture
4. Bidirectional RNN
5. Multi-layer RNN
6. Regularization

This will allow us to achieve ~85% accuracy.