# RNN Basic

# Setup

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from google.colab import drive
drive.mount('/content/drive')
import os
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'Colab Notebooks'
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
#print(os.listdir(GOOGLE_DRIVE_PATH))
import sys
sys.path.append(GOOGLE_DRIVE_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Implementation of RNN

In [5]:
import torch
from torch import nn

import numpy as np

## Creating Vocabulary Dictionaries

In [6]:
text = ['hey how are you','good i am fine','have a nice day']

# Join all the sentences together and extract the unique characters from the combined sentences
chars = set(''.join(text))
print('unique chars',chars)

# Creating a dictionary that maps integers to the characters
int2char = dict(enumerate(chars))
print('integer mappnig', int2char)

# Creating another dictionary that maps characters to integers
char2int = {char: ind for ind, char in int2char.items()}
print('dictionary',char2int)

unique chars {'a', 'u', 'h', 'o', 'y', 'n', 'm', 'd', 'v', 'c', 'i', 'f', 'e', 'w', ' ', 'r', 'g'}
integer mappnig {0: 'a', 1: 'u', 2: 'h', 3: 'o', 4: 'y', 5: 'n', 6: 'm', 7: 'd', 8: 'v', 9: 'c', 10: 'i', 11: 'f', 12: 'e', 13: 'w', 14: ' ', 15: 'r', 16: 'g'}
dictionary {'a': 0, 'u': 1, 'h': 2, 'o': 3, 'y': 4, 'n': 5, 'm': 6, 'd': 7, 'v': 8, 'c': 9, 'i': 10, 'f': 11, 'e': 12, 'w': 13, ' ': 14, 'r': 15, 'g': 16}


We can see the space(' ') is also included as an element of the dictionary.

# Padding and splitting into input/labels
Next, we'll be padding our input sentences to ensure that all the sentences are of the sample length. While RNNs are typically able to take in variably sized inputs, we will usually want to feed training data in batches to speed up the training process. In order to used batches to train on our data, we'll need to ensure that each sequence within the input data are of equal size.

Therefore, in most cases, padding can be done by filling up sequences that are too short with 0 values and trimming sequences that are too long. In our case, we'll be finding the length of the longest sequence and padding the rest of the sentences with blank spaces to match that length.

In [7]:
print(max(text,key=len))
maxlen = len(max(text, key=len))
print("The longest string has {} characters".format(maxlen))

hey how are you
The longest string has 15 characters


In [8]:
# Padding

# A simple loop that loops through the list of sentences and adds a ' ' whitespace until the length of the sentence matches
# the length of the longest sentence
for i in range(len(text)):
    while len(text[i])<maxlen:
        text[i] += ' '

print(text)

['hey how are you', 'good i am fine ', 'have a nice day']


As we're going to predict the next character in the sequence at each time step, we'll have to divide each sentence into

- Input data
    - The last input character should be excluded as it does not need to be fed into the model
- Target/Ground Truth Label
    - One time-step ahead of the Input data as this will be the "correct answer" for the model at each time step corresponding to the input data

In [9]:
# Creating lists that will hold our input and target sequences
input_seq = []
target_seq = []

for i in range(len(text)):
    # Remove last character for input sequence
    input_seq.append(text[i][:-1])
    
    # Remove firsts character for target sequence
    target_seq.append(text[i][1:])
    print("Input Sequence: {}\nTarget Sequence: {}".format(input_seq[i], target_seq[i]))

Input Sequence: hey how are yo
Target Sequence: ey how are you
Input Sequence: good i am fine
Target Sequence: ood i am fine 
Input Sequence: have a nice da
Target Sequence: ave a nice day


Now we can convert our input and target sequences to sequences of integers instead of characters by mapping them using the dictionaries we created above. This will allow us to one-hot-encode our input sequence subsequently.

In [10]:
for i in range(len(text)):
    input_seq[i] = [char2int[character] for character in input_seq[i]]
    target_seq[i] = [char2int[character] for character in target_seq[i]]

In [11]:
print(input_seq)

[[2, 12, 4, 14, 2, 3, 13, 14, 0, 15, 12, 14, 4, 3], [16, 3, 3, 7, 14, 10, 14, 0, 6, 14, 11, 10, 5, 12], [2, 0, 8, 12, 14, 0, 14, 5, 10, 9, 12, 14, 7, 0]]


Before encoding our input sequence into one-hot vectors, we'll define 3 key variables:

- ``dict_size``: The number of unique characters that we have in our text
    - This will determine the one-hot vector size as each character will have an assigned index in that vector
- ``seq_len``: The length of the sequences that we're feeding into the model
    - As we standardised the length of all our sentences to be equal to the longest sentences, this value will be the ``max length - 1`` as we removed the last character input as well
- ``batch_size``: The number of sentences that we defined and are going to feed into the model as a batch

In [12]:
print(char2int)
dict_size = len(char2int)
seq_len = maxlen - 1
batch_size = len(text)

def one_hot_encode(sequence, dict_size, seq_len, batch_size):
    # Creating a multi-dimensional array of zeros with the desired output shape
    features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)
    
    # Replacing the 0 at the relevant character index with a 1 to represent that character
    for i in range(batch_size):
        for u in range(seq_len):
            features[i, u, sequence[i][u]] = 1
    return features

{'a': 0, 'u': 1, 'h': 2, 'o': 3, 'y': 4, 'n': 5, 'm': 6, 'd': 7, 'v': 8, 'c': 9, 'i': 10, 'f': 11, 'e': 12, 'w': 13, ' ': 14, 'r': 15, 'g': 16}


In [13]:
input_seq = one_hot_encode(input_seq, dict_size, seq_len, batch_size)
print("Input shape: {} --> (Batch Size, Sequence Length, One-Hot Encoding Size)".format(input_seq.shape))

Input shape: (3, 14, 17) --> (Batch Size, Sequence Length, One-Hot Encoding Size)


## Move to PyTorch

In [14]:
input_seq = torch.from_numpy(input_seq)
target_seq = torch.Tensor(target_seq)

In [15]:
print('target',target_seq)

target tensor([[12.,  4., 14.,  2.,  3., 13., 14.,  0., 15., 12., 14.,  4.,  3.,  1.],
        [ 3.,  3.,  7., 14., 10., 14.,  0.,  6., 14., 11., 10.,  5., 12., 14.],
        [ 0.,  8., 12., 14.,  0., 14.,  5., 10.,  9., 12., 14.,  7.,  0.,  4.]])


You may run this on GPU if you have much larger dataset size. But in this notebook, we will just use CPU since it's a very small task and doesn't need GPU.

In [16]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")


GPU is available


# Defining the model

We will use a single-layer RNN followed by a fully connected layer. 

## ``nn.RNN``

Let's see the structure of ``nn.RNN`` first. For full details, see https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

In [17]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        #Defining the layers
        # RNN Layer
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)   
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
    
    def forward(self, x):
        
        batch_size = x.size(0)

        #Initializing hidden state for first input using method defined below
        hidden = self.init_hidden(batch_size)

        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, hidden)
        
        # Reshaping the outputs such that it can be fit into the fully connected layer
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        # This method generates the first hidden state of zeros which we'll use in the forward pass
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)
         # We'll send the tensor holding the hidden state to the device we specified earlier as well
        return hidden

We will train the model witih ``adam`` optimizer and the cross entropy loss since it's a classification task.

In [18]:
# Instantiate the model with hyperparameters
model = Model(input_size=dict_size, output_size=dict_size, hidden_dim=12, n_layers=1)
print(model)
# We'll also set the model to the device that we defined earlier (default is CPU)
model = model.to(device)

# Define hyperparameters
n_epochs = 1000
lr=0.01

# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

Model(
  (rnn): RNN(17, 12, batch_first=True)
  (fc): Linear(in_features=12, out_features=17, bias=True)
)


In [19]:
print('input 1\n',input_seq[0])
print('output 1\n',target_seq[0])

input 1
 tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.

# Train the model

In [20]:
# Training Run
input_seq = input_seq.to(device)
for epoch in range(1, n_epochs + 1):
    optimizer.zero_grad() # Clears existing gradients from previous epoch
    #input_seq = input_seq.to(device)
    output, hidden = model(input_seq)
    output = output.to(device)
    target_seq = target_seq.to(device)
    loss = criterion(output, target_seq.view(-1).long())
    loss.backward() # Does backpropagation and calculates gradients
    optimizer.step() # Updates the weights accordingly
    
    if epoch%10 == 0:
        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.4f}".format(loss.item()))

Epoch: 10/1000............. Loss: 2.4414
Epoch: 20/1000............. Loss: 2.1487
Epoch: 30/1000............. Loss: 1.7714
Epoch: 40/1000............. Loss: 1.3261
Epoch: 50/1000............. Loss: 0.9323
Epoch: 60/1000............. Loss: 0.6424
Epoch: 70/1000............. Loss: 0.4501
Epoch: 80/1000............. Loss: 0.3192
Epoch: 90/1000............. Loss: 0.2321
Epoch: 100/1000............. Loss: 0.1762
Epoch: 110/1000............. Loss: 0.1406
Epoch: 120/1000............. Loss: 0.1171
Epoch: 130/1000............. Loss: 0.1010
Epoch: 140/1000............. Loss: 0.0896
Epoch: 150/1000............. Loss: 0.0811
Epoch: 160/1000............. Loss: 0.0747
Epoch: 170/1000............. Loss: 0.0696
Epoch: 180/1000............. Loss: 0.0655
Epoch: 190/1000............. Loss: 0.0621
Epoch: 200/1000............. Loss: 0.0593
Epoch: 210/1000............. Loss: 0.0569
Epoch: 220/1000............. Loss: 0.0548
Epoch: 230/1000............. Loss: 0.0530
Epoch: 240/1000............. Loss: 0.0515
E

# Predict

In [21]:
def predict(model, character):
    # One-hot encoding our input to fit into the model
    character = np.array([[char2int[c] for c in character]])
    character = one_hot_encode(character, dict_size, character.shape[1], 1)
    character = torch.from_numpy(character)
    character = character.to(device)
    
    out, hidden = model(character)

    prob = nn.functional.softmax(out[-1], dim=0).data
    # Taking the class with the highest probability score from the output
    char_ind = torch.max(prob, dim=0)[1].item()

    return int2char[char_ind], hidden

In [22]:
predict(model, 'good')

(' ', tensor([[[ 0.8923,  0.9730, -0.9979,  0.7101, -0.9843, -0.9024,  0.9480,
           -0.9996,  0.9988, -0.9873,  0.9513,  0.9984]]], device='cuda:0',
        grad_fn=<CudnnRnnBackward0>))

Next character is ' '. Let's generate the sentence in a sequential way.

In [23]:
predict(model, 'g')

('o', tensor([[[ 0.3956,  0.7752, -0.9768,  0.7956, -0.9193, -0.2155,  0.7788,
            0.7800, -0.7440, -0.9594, -0.3289, -0.5743]]], device='cuda:0',
        grad_fn=<CudnnRnnBackward0>))

In [24]:
predict(model, 'good ')

('i', tensor([[[ 0.9998,  0.9995, -0.8725,  0.9944,  0.8340,  0.9014,  0.9879,
           -0.9575,  0.9524,  0.9971,  0.9796,  0.9998]]], device='cuda:0',
        grad_fn=<CudnnRnnBackward0>))

In [25]:
predict(model,'good i')

(' ', tensor([[[-0.1528, -0.9916,  0.9044,  0.9478,  0.1106, -0.9992,  0.9846,
           -0.5940,  1.0000, -0.8741,  0.9997, -0.7217]]], device='cuda:0',
        grad_fn=<CudnnRnnBackward0>))

In [26]:
predict(model, 'good i ')

('a', tensor([[[-0.9758,  0.9766, -0.9974, -0.9284, -0.9935,  0.4962, -0.9774,
           -0.4553, -0.2701,  0.9987, -0.6804,  0.9999]]], device='cuda:0',
        grad_fn=<CudnnRnnBackward0>))

In [27]:
def sample(model, out_len, start='hey'):
    model.eval() # eval mode
    start = start.lower()
    # First off, run through the starting characters
    chars = [ch for ch in start]  # ['h', 'e', 'y'] for example
    size = out_len - len(chars)
    # Now pass in the previous characters and get a new one
    for ii in range(size):
        char, h = predict(model, chars)
        chars.append(char)

    return ''.join(chars)

In [28]:
sample(model, 15, 'good')

'good i am fine '

In [29]:
sample(model, 30, 'g')

'good i am fine youm fine youm '

# LSTM Basic

Before we jump into a project with a full dataset, let's just take a look at how the PyTorch LSTM layer really works in practice by visualizing the outputs. We don't need to instantiate a model to see how the layer works. See https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html

In [30]:
import torch
import torch.nn as nn

input_dim = 5
hidden_dim = 10
n_layers = 1

lstm_layer = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True)

Let's create some dummy data to see how the layer takes in the input. As our input dimension is 5, we have to create a tensor of the shape (1, 1, 5) which represents (batch size, sequence length, input dimension).

Additionally, we'll have to initialize a hidden state and cell state for the LSTM as this is the first cell. The hidden state and cell state is stored in a tuple with the format (hidden_state, cell_state)

In [31]:
batch_size = 1
seq_len = 3

inp = torch.randn(batch_size, seq_len, input_dim)
hidden_state = torch.randn(n_layers, batch_size, hidden_dim)
cell_state = torch.randn(n_layers, batch_size, hidden_dim)
hidden = (hidden_state, cell_state)

print("Input shape: {}".format(inp.shape))
print("Hidden shape: ({}, {})".format(hidden[0].shape, hidden[1].shape))

Input shape: torch.Size([1, 3, 5])
Hidden shape: (torch.Size([1, 1, 10]), torch.Size([1, 1, 10]))


Next, we’ll feed the input and hidden states and see what we’ll get back from it

In [32]:
out, hidden = lstm_layer(inp, hidden)
print("Output shape: ", out.shape)
print("Hidden: ", hidden)

Output shape:  torch.Size([1, 3, 10])
Hidden:  (tensor([[[-0.1780, -0.1436, -0.1630, -0.1567,  0.0924, -0.0373,  0.1726,
          -0.0755, -0.1744,  0.1407]]], grad_fn=<StackBackward0>), tensor([[[-0.3908, -0.2441, -0.2622, -0.2477,  0.1494, -0.0828,  0.3305,
          -0.1123, -0.3849,  0.4508]]], grad_fn=<StackBackward0>))


we'll need an output at every time step, such as Text Generation, the output of each time step can be extracted directly from the 2nd dimension and fed into a fully connected layer. For text classification tasks, such as Sentiment Analysis, the last output can be taken to be fed into a classifier.

In [33]:
# Obtaining the last output in the sequence
out = out.squeeze()[-1, :]
print(out.shape)

torch.Size([10])


# Sentiment Analysis on Amazon Reviews with LSTM
we’ll be using the Amazon customer reviews dataset which can be found on Kaggle. The dataset contains a total of 4 million reviews with each review labeled to be of either positive or negative sentiment. 

For our data pre-processing steps, we'll be using regex, ``numpy`` and the ``nltk`` (Natural Language Toolkit) library for some simple NLP helper functions. As the data is compressed in the ``bz2`` format, we'll use the Python bz2 module to read the data.

In [34]:
import bz2
from collections import Counter
import re
import nltk
import numpy as np
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Download the data using the URL below
Download the two files and store them on your working directory.
- https://storage.googleapis.com/amazonreviews/test.ft.txt.bz2
- https://storage.googleapis.com/amazonreviews/train.ft.txt.bz2

In [35]:
train_file = bz2.BZ2File(GOOGLE_DRIVE_PATH+'/train.ft.txt.bz2')
test_file = bz2.BZ2File(GOOGLE_DRIVE_PATH+'/test.ft.txt.bz2')

In [36]:
train_file = train_file.readlines()
test_file = test_file.readlines()

In [37]:
print("Number of training reivews: " + str(len(train_file)))
print("Number of test reviews: " + str(len(test_file)))

Number of training reivews: 3600000
Number of test reviews: 400000


## Choose subsets

In [38]:
num_train = 300000 #We're training on the first 500,000 reviews in the dataset
num_test = 100000 #Using 100,000 reviews from test set

train_file = [x.decode('utf-8') for x in train_file[:num_train]]
test_file = [x.decode('utf-8') for x in test_file[:num_test]]

In [39]:
print(train_file[0])


__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^



## Preprocessing

Next, we'll have to extract out the labels from the sentences. The data is the format __label__1/2 <sentence>, therefore we can easily split it accordingly. Positive sentiment labels are stored as 1 and negative are stored as 0.

We will also change all urls to a standard "<url>" as the exact url is irrelevant to the sentiment in most cases.
    

In [40]:
# Extracting labels from sentences

train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file]
train_sentences = [x.split(' ', 1)[1][:-1].lower() for x in train_file]

    
test_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_file]
test_sentences = [x.split(' ', 1)[1][:-1].lower() for x in test_file]

# Some simple cleaning of data

for i in range(len(train_sentences)):
    train_sentences[i] = re.sub('\d','0',train_sentences[i])

for i in range(len(test_sentences)):
    test_sentences[i] = re.sub('\d','0',test_sentences[i])

# Modify URLs to <url>

for i in range(len(train_sentences)):
    if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
        train_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", train_sentences[i])
        
for i in range(len(test_sentences)):
    if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
        test_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", test_sentences[i])

### Tokenization
After quickly cleaning the data, we will do tokenization of the sentences, which is a standard NLP task. Tokenization is the task of splitting a sentence into the individual tokens, which can be words or punctuation, etc. There are many NLP libraries that are able to do this, such as spaCy or Scikit-learn, but we will be using NLTK here as it has one of the faster tokenizers.

The words will then be stored in a dictionary mapping the word to its number of appearances. These words will become our vocabulary.

In [41]:
words = Counter() #Dictionary that will map a word to the number of times it appeared in all the training sentences
for i, sentence in enumerate(train_sentences):
    #The sentences will be stored as a list of words/tokens
    train_sentences[i] = []
    for word in nltk.word_tokenize(sentence): #Tokenizing the words
        words.update([word.lower()]) #Converting all the words to lower case
        train_sentences[i].append(word)
    if i%20000 == 0:
        print(str((i*100)/num_train) + "% done")
print("100% done")

0.0% done
6.666666666666667% done
13.333333333333334% done
20.0% done
26.666666666666668% done
33.333333333333336% done
40.0% done
46.666666666666664% done
53.333333333333336% done
60.0% done
66.66666666666667% done
73.33333333333333% done
80.0% done
86.66666666666667% done
93.33333333333333% done
100% done


To remove typos and words that likely don't exist, we'll remove all words from the vocab that only appear once throughout. To account for unknown words and padding, we'll have to add them to our vocabulary as well. Each word in the vocabulary will then be assigned an integer index and thereafter mapped to this integer.

In [42]:
# Removing the words that only appear once
words = {k:v for k,v in words.items() if v>1}
# Sorting the words according to the number of appearances, with the most common word being first
words = sorted(words, key=words.get, reverse=True)
# Adding padding and unknown to our vocabulary so that they will be assigned an index
words = ['_PAD','_UNK'] + words
# Dictionaries to store the word to index mappings and vice versa
word2idx = {o:i for i,o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

With the mappings, we'll convert the words in the sentences to their corresponding indexes.

In [43]:
for i, sentence in enumerate(train_sentences):
    # Looking up the mapping dictionary and assigning the index to the respective words
    train_sentences[i] = [word2idx[word] if word in word2idx else word2idx['_UNK'] for word in sentence]

for i, sentence in enumerate(test_sentences):
    # For test sentences, we have to tokenize the sentences as well
    test_sentences[i] = [word2idx[word.lower()] if word.lower() in word2idx else word2idx['_UNK'] for word in nltk.word_tokenize(sentence)]


In the last pre-processing step, we'll be padding the sentences with 0s and shortening the lengthy sentences so that the data can be trained in batches to speed things up.

In [44]:
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length

def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

In [45]:
seq_len = 200 #The length that the sentences will be padded/shortened to

train_sentences = pad_input(train_sentences, seq_len)
test_sentences = pad_input(test_sentences, seq_len)

In [46]:
# Converting our labels into numpy arrays
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)


A padded sentence will look something like this, where 0 represents the padding:

In [47]:
print(test_sentences[0])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0    40   106    13    28  1486  3728    57    31    10     3
    40  1811    10    84  1620     2     5    26   925     8    11   106
    17   151     6     5   141    89     9     2    69     5   122    14
     7    42  1848     9   202    59   241   108     2     7   139  1848
    47 35127    38  3195    14     3  2583     2    11   106    47 17011
   157     2   967    30     1     1     6   603    47  1280     2    31
    10   157    21  2416  4254     2    11    12     7  3461 16717   106
    14    28    22     2   182   101   128   146   

In [48]:
# split the training into train/validation
split_frac = 0.5
split_id = int(split_frac * len(test_sentences))
val_sentences, test_sentences = test_sentences[:split_id], test_sentences[split_id:]
val_labels, test_labels = test_labels[:split_id], test_labels[split_id:]

Next, this is the point where we’ll start working with the PyTorch library. We’ll first define the datasets from the sentences and labels, followed by loading them into a data loader. We set the batch size to 256. This can be tweaked according to your needs.

### Use Dataset

In [49]:
import torch
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
val_data = TensorDataset(torch.from_numpy(val_sentences), torch.from_numpy(val_labels))
test_data = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))

batch_size = 400

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)


In [50]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


## LSTM

At this point, we will be defining the architecture of the model. At this stage, we can create Neural Networks that have deep layers or and large number of LSTM layers stacked on top of each other. However, a simple model such as the one below works quite well and requires much less training time. We will be training our own word embeddings in the first layer before the sentences are fed into the LSTM layer.

The final layer is a fully connected layer with a sigmoid function to classify whether the review is of positive/negative sentiment.



In [51]:
import torch.nn as nn

class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

Take note that we can actually load pre-trained word embeddings such as GloVe or fastText which can increase the model’s accuracy and decrease training time.

With this, we can instantiate our model after defining the arguments. The output dimension will only be 1 as it only needs to output 1 or 0. The learning rate, loss function and optimizer are defined as well.

In [52]:
vocab_size = len(word2idx) + 1
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)
print(model)

SentimentNet(
  (embedding): Embedding(121325, 400)
  (lstm): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [53]:
lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

Finally, we can start training the model. For every 1000 steps, we’ll be checking the output of our model against the validation dataset and saving the model if it performed better than the previous time. The state_dict is the model’s weights in PyTorch and can be loaded into a model with the same architecture at a separate time or script altogether.

In [54]:
epochs = 2
counter = 0
print_every = 1000
clip = 5
valid_loss_min = np.Inf

model.train()
for i in range(epochs):
    h = model.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        counter += 1
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        if counter%print_every == 0:
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for inp, lab in val_loader:
                val_h = tuple([each.data for each in val_h])
                inp, lab = inp.to(device), lab.to(device)
                out, val_h = model(inp, val_h)
                val_loss = criterion(out.squeeze(), lab.float())
                val_losses.append(val_loss.item())
                
            model.train()
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

Epoch: 2/2... Step: 1000... Loss: 0.129837... Val Loss: 0.187997
Validation loss decreased (inf --> 0.187997).  Saving model ...


In [55]:
#Loading the best model
model.load_state_dict(torch.load('./state_dict.pt'))

<All keys matched successfully>

In [56]:
test_losses = []
num_correct = 0
h = model.init_hidden(batch_size)

model.eval()
for inputs, labels in test_loader:
    h = tuple([each.data for each in h])
    inputs, labels = inputs.to(device), labels.to(device)
    output, h = model(inputs, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze()) #rounds the output to 0/1
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)
        
print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 0.186
Test accuracy: 93.166%


This result was achieved with just a few simple layers and without any hyperparameter tuning. There are so many other improvements that can be made to increase the model’s effectiveness, and you are free to attempt to beat this accuracy by implementing these improvements!



## Test your review

In [99]:
test_sen1 = 'it is so amazing.'
test_sen_word1 = [word2idx[word.lower()] if word.lower() in word2idx else word2idx['_UNK'] for word in nltk.word_tokenize(test_sen1)]
print(test_sen1)
print(test_sen_word1)

test_sen2 = 'you should be very careful.'
test_sen_word2 = [word2idx[word.lower()] if word.lower() in word2idx else word2idx['_UNK'] for word in nltk.word_tokenize(test_sen2)]

test_sen3 = 'you should be very careful. I have returned'
test_sen_word3 = [word2idx[word.lower()] if word.lower() in word2idx else word2idx['_UNK'] for word in nltk.word_tokenize(test_sen3)]


test_sen_word = [test_sen_word1, test_sen_word2, test_sen_word3]
test_sen_final =  pad_input(test_sen_word, seq_len)
print(test_sen_final)


it is so amazing.
[9, 12, 36, 326, 2]
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0

In [103]:
test_sen_final_torch = torch.from_numpy(test_sen_final)
inp_test = test_sen_final_torch.to(device)
h = model.init_hidden(len(test_sen_word))
output, h = model(inp_test, h)
print('1', test_sen1, '\n2 ', test_sen2,'\n3 ',test_sen3)
print(output)

1 it is so amazing. 
2  you should be very careful. 
3  you should be very careful. I have returned
tensor([0.9917, 0.4249, 0.1015], device='cuda:0', grad_fn=<SelectBackward0>)
