In [1]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Set default to run on the GPU if available (for the speed up)
if torch.cuda.is_available():
    torch.cuda.set_device(device)
torch.set_default_tensor_type('torch.cuda.FloatTensor')


from torch.utils.data import Dataset, DataLoader, random_split

### Imdb sentiment analysis.

This time we are going to look at imdb sentiment analysis. The imdb dataset is a dataset containing movie reviews and a label for each review wheter it is positive or negative. We are going to create a model that will predict if a review is positive or negative. 

We will make use of torchtext for loading and proprocessing the dataset.

Read and run the following cell:

In [2]:
# First we setup the imdb dataset
from torchtext import data, datasets
# set up fields, one for the text in the review and one for the label. 
# We will make each review of length 100 (Set to a smaller number for faster training)
TEXT = data.Field(lower=True, batch_first=True, fix_length=100)
LABEL = data.Field(sequential=False)

# load the dataset. This prompts a download, which will take a minute or 2
train, test = datasets.IMDB.splits(TEXT, LABEL) 

# build the vocabulary. We will only use the 1000 most common words. All the other words will be mapped to the <unk> token
TEXT.build_vocab(train, max_size=1000)
LABEL.build_vocab(train)

# make iterator for splits
train_iter, test_iter = data.Iterator.splits((train, test), batch_size=64, shuffle=True)

Let's eplore the things that we have created.

First the vocabulairy. This can be seen as a dictonairy that maps each possible word to a number.

In [8]:
# A word that it does know
print(TEXT.vocab['good'])
print(TEXT.vocab['bad'])
# An unknown word
print(TEXT.vocab["notaword"])

56
97
0


Next up we have the iterator. We will print one batch. 

In [10]:
#### explore the output of the iterator

for i, batch in enumerate(train_iter):
    print(batch)
    print(batch.text[0])
    print(batch.label)
    break


[torchtext.data.batch.Batch of size 64 from IMDB]
	[.text]:[torch.cuda.LongTensor of size 64x100 (GPU 0)]
	[.label]:[torch.cuda.LongTensor of size 64 (GPU 0)]
tensor([  0, 552, 139,   9, 273, 182,   5,  10,  19,   0,   0,  57,   0,   0,
         15,   9, 273,   3,   0,   0, 131,   4,   0,   6,   0,   0,  23,   3,
        615, 431,  34,   2,   0,   5,   0,  90, 208, 547,   6,   0, 237,   0,
          3,   0,  12,  14,  38,  97,  12,  95,  87, 171,   6,   0,   0,   0,
          0,   0, 101, 197,   4,   0,   0,  90,   0,   9, 118, 100, 129,  22,
        247,   8,   0,   0,   6,   0,   8,  19,   2,   0,   5,   3, 312, 342,
         40,   0,  18,   0,   0, 134,   3, 105,   0,  16, 111,   0,  44, 753,
          4, 860])
tensor([1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 1,
        1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1,
        2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1])


As you can see the text is now a tensor containing a number for each word.
The label are 1 -> negative and 2 -> positive. We will substract 1 during training to make sure it becomes 0 and 1. 

Now that we have the data al set up we can build a model. 

### The model

Build a simple lstm model, using an embedding layer as the first layer, then a recurrent layer (lstm or gru) and to finish a fully connected layer that maps to 1 value, and finally use a sigmoid to make sure that this value is between 0 and 1. 

In [11]:
import torch.nn as nn 
from torch.nn import functional as F

In [16]:


class LSTMModel(nn.Module):

    ### your code here ###
    def __init__(self, num_words, n_hidden):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(num_words, n_hidden)
        self.LSTM = nn.GRU(n_hidden, n_hidden, batch_first=True, dropout=0.2)
        
        self.fc = nn.Linear(n_hidden , 1)

    def forward(self, review):
        
        x = self.embedding(review)
        
        rnn_out, _ = self.LSTM(x)
     
        fc_out = self.fc(rnn_out[:, -1 ]) # Only need the last output of the rnn
        
        return torch.sigmoid(fc_out)
    ### your code here end ###
    



In [17]:
# Initialize the model

### your code here ###
num_words  = len(TEXT.vocab)
n_hidden = 128
model = LSTMModel(num_words + 2, n_hidden)
model.to(device)
### your code here end ###

LSTMModel(
  (embedding): Embedding(1004, 128)
  (LSTM): GRU(128, 128, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [18]:

# A train and test function made for you. 
def train(train_iterator, model, criterion, optimizer, n_epochs, test_callback=None):
    loss_history = []
    test_history = []
    for epoch in range(1, n_epochs + 1):
        epoch_total_loss = 0
        
        total = 0
        correct = 0
        
        for i, batch in enumerate(train_iterator):
            optimizer.zero_grad() # Clears existing gradients from previous epoch

            output = model(batch.text)

            loss = criterion(output, (batch.label).float().view(-1, 1) - 1) ### Make sure the labels are either 0 or 1. 

            loss.backward() # Does backpropagation and calculates gradients
            optimizer.step() # Updates the weights accordingly

            epoch_total_loss += loss.item() # Keep track of the total loss
            
            #Caclucate the accuracy
            total += len(batch)
            predicted = torch.round(output.data)
            correct += (predicted == (batch.label -1).view(-1, 1)).sum().item()
            
        loss_history.append(epoch_total_loss/len(train_iter))
        
        if test_callback != None:
            test_history.append(test_callback(model, criterion))

        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.4f}".format(epoch_total_loss/ len(train_iter)))
        print("accuracy: {:.4f}".format(correct/total))
    return loss_history, test_history


def test(model, criterion, dataloader):
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        total_loss = 0
        
        for i, batch in enumerate(dataloader):
            output = model(batch.text)

            loss = criterion(output, (batch.label - 1).float().view(-1, 1))
            total_loss += loss.item()
            predicted = torch.round(output.data)
    
            total += len(batch)
            
            correct += (predicted == (batch.label -1).view(-1, 1)).sum().item()
        print("Test loss: {:.4f}".format(total_loss/len(dataloader)))
        print("test accuracy: {:.4f}".format(correct/total))
    model.train()
    return total_loss/len(dataloader)
            


    

In [19]:
# Call the train and test functions with the appropiate inputs.

### your code here ###
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters() , 1e-4)
test_callback = lambda model, criterion: test(model, criterion, test_iter) # Creates a callback that can be used to test the network on the given dataloader

train(train_iter, model, criterion, optimizer, 20, test_callback=test_callback)

### your code here end ###



Test loss: 0.6913
test accuracy: 0.5263
Epoch: 1/20............. Loss: 0.6928
accuracy: 0.5152
Test loss: 0.6892
test accuracy: 0.5400
Epoch: 2/20............. Loss: 0.6881
accuracy: 0.5404
Test loss: 0.6759
test accuracy: 0.5757
Epoch: 3/20............. Loss: 0.6816
accuracy: 0.5640
Test loss: 0.6171
test accuracy: 0.6737
Epoch: 4/20............. Loss: 0.6428
accuracy: 0.6336
Test loss: 0.5958
test accuracy: 0.6854
Epoch: 5/20............. Loss: 0.5962
accuracy: 0.6898
Test loss: 0.5556
test accuracy: 0.7212
Epoch: 6/20............. Loss: 0.5573
accuracy: 0.7190
Test loss: 0.5254
test accuracy: 0.7406
Epoch: 7/20............. Loss: 0.5259
accuracy: 0.7416
Test loss: 0.5095
test accuracy: 0.7485
Epoch: 8/20............. Loss: 0.5022
accuracy: 0.7570
Test loss: 0.4946
test accuracy: 0.7584
Epoch: 9/20............. Loss: 0.4857
accuracy: 0.7676
Test loss: 0.4889
test accuracy: 0.7623
Epoch: 10/20............. Loss: 0.4693
accuracy: 0.7770
Test loss: 0.4785
test accuracy: 0.7674
Epoch: 11

([0.6927843439914382,
  0.6880668511476054,
  0.6816020373188322,
  0.6427847087535712,
  0.5961577418210257,
  0.5573450424482146,
  0.5259499501084428,
  0.5022134029346964,
  0.48571262647733665,
  0.4693160699608991,
  0.4597640252479202,
  0.44735416480342444,
  0.4385412686773578,
  0.4297178965395369,
  0.42292078491062157,
  0.4152733393184974,
  0.40824065145934024,
  0.40643808443832885,
  0.39659374982804596,
  0.3902866379607974],
 [0.6913130013534176,
  0.689219329973011,
  0.6759096562405071,
  0.6171248371491347,
  0.5957660841210114,
  0.5556196610793434,
  0.5253527847397358,
  0.5095439605090929,
  0.4945890554000654,
  0.4888618900190534,
  0.4784829840635705,
  0.48088316402167003,
  0.48147571117371857,
  0.47148097819074647,
  0.46796259691800607,
  0.474194520536591,
  0.473863754316669,
  0.46879534392862976,
  0.4797502360151857,
  0.46634570484423576])

Lastly we look at the result of the model. 


Make a function that given a review and your trained model, gives back a score on how positive it was. 0 being negative and 1 being positive.

Make sure to call model.eval() and model.train()

In [20]:

import numpy as np
def predict(review, model):
    ### your code here ###
    model.eval()
    text = torch.tensor([[TEXT.vocab[token] for token in TEXT.tokenize(review)]])
    result = model(text)
    model.train()
    return result

    ### your code here end ###

In [22]:
predict("bad", model)

tensor([[0.2686]], grad_fn=<SigmoidBackward>)

In [29]:
reviews = [
    "excellent movie",
    "i really liked this movie ",
    "this movie really sucked !",
    "bad movie!",
]

In [30]:
for review in reviews:
    print(review, predict(review, model) , '\n')

excellent movie tensor([[0.7247]], grad_fn=<SigmoidBackward>) 

i really liked this movie  tensor([[0.6467]], grad_fn=<SigmoidBackward>) 

this movie really sucked ! tensor([[0.4783]], grad_fn=<SigmoidBackward>) 

bad movie! tensor([[0.2956]], grad_fn=<SigmoidBackward>) 



### Improving the model

Next up you will be improving this model

For the embedding layer we will use the a pretrained word embedding called glove. See: https://nlp.stanford.edu/projects/glove/

We will first load glove into word vectors and then preprocess the dataset with these vectors. This is needed because each word must map to the same index that is used in the glove word embedding. 

In [31]:
from torchtext.vocab import Vectors
glove = Vectors(name="./glove/glove.6B.100d.txt")

In [32]:
from torchtext import data, datasets

# set up fields again.
TEXT = data.Field(lower=True, batch_first=True, fix_length=100)
LABEL = data.Field(sequential=False)

# make splits for data
train_set, test_set = datasets.IMDB.splits(TEXT, LABEL)

# build the vocabulary, this time with help of the glove vectors. 
#We provide the vectors, such that we only add words that we have a vector for in our vocab
TEXT.build_vocab(train_set, vectors=glove, max_size=1000)
LABEL.build_vocab(train_set)

# make iterator for splits
train_iter_glove, test_iter_glove = data.Iterator.splits(
    (train_set, test_set), batch_size=64)

Now we look at the output of the vocab. Notice that the indexes are still the same, that is because they are sorted on the frequency of occurences. 

In [41]:
# A word that it does know
print(TEXT.vocab['good'])
print(TEXT.vocab['bad'])
# An unknown word
print(TEXT.vocab["notaword"])

56
97
0


Now we look at the output of the iterator. This has changed.

In [43]:
for i, batch in enumerate(train_iter):
    print(batch)
    print(batch.text)
    print(batch.text.shape)
    print(batch.label)
    break


[torchtext.data.batch.Batch of size 64 from IMDB]
	[.text]:[torch.cuda.LongTensor of size 64x100 (GPU 0)]
	[.label]:[torch.cuda.LongTensor of size 64 (GPU 0)]
tensor([[ 10,  20,   0,  ...,   1,   1,   1],
        [ 46,  25,  22,  ..., 127,   0, 123],
        [  9, 120, 560,  ..., 773,   0, 521],
        ...,
        [  9,   0,  10,  ...,   0,   0,  44],
        [  2,   0,   0,  ...,   0, 992,   0],
        [ 10,  24,  14,  ...,   0,   0,   0]])
torch.Size([64, 100])
tensor([2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1,
        2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2,
        1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2])


#### Building the improved model

Build the improved model. Use the same architecture as before, but instead of a standard word embedding vector use the glove word embedding vector.

See https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding for a hint on how to use the pretrained glove. 

In [44]:



class GloveModel(nn.Module):
    ### your code here ###
    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out, pretrained_vec):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_hidden = n_hidden
        self.n_out = n_out
        
        
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim, )
        self.emb.weight.data.copy_(pretrained_vec) # load pretrained vectors
        self.emb.weight.requires_grad = True # make embedding non trainable
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden, batch_first=True)
        self.out = nn.Linear(self.n_hidden, 1)
        
    def forward(self, seq):
        embs = self.emb(seq)
        gru_out, self.h = self.gru(embs)
        outp = self.out(gru_out[:, -1])
        return torch.sigmoid(outp)
    ### your code here end ###


In [45]:
# Create your model

### your code here ###
vocab_size = len(TEXT.vocab)
embedding_dim = 100
n_hidden = 128
m = GloveModel(vocab_size, embedding_dim, n_hidden, 1, 
                             train_set.fields['text'].vocab.vectors)
m.to(device)

### your code here end ###

GloveModel(
  (emb): Embedding(1002, 100)
  (gru): GRU(100, 128, batch_first=True)
  (out): Linear(in_features=128, out_features=1, bias=True)
)

In [46]:
#Finaly train the model

### your code here ###

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(m.parameters() , 1e-2)
test_callback = lambda model, criterion: test(model, criterion, test_iter_glove) # Creates a callback that can be used to test the network on the given dataloader

train(train_iter_glove, m, criterion, optimizer, 20, test_callback=test_callback)

### your code here end ###

Test loss: 0.4794
test accuracy: 0.7646
Epoch: 1/20............. Loss: 0.5642
accuracy: 0.6866
Test loss: 0.4990
test accuracy: 0.7572
Epoch: 2/20............. Loss: 0.4718
accuracy: 0.7735
Test loss: 0.4861
test accuracy: 0.7608
Epoch: 3/20............. Loss: 0.4713
accuracy: 0.7724
Test loss: 0.4828
test accuracy: 0.7626
Epoch: 4/20............. Loss: 0.4615
accuracy: 0.7765
Test loss: 0.4861
test accuracy: 0.7592
Epoch: 5/20............. Loss: 0.4520
accuracy: 0.7860
Test loss: 0.4897
test accuracy: 0.7616
Epoch: 6/20............. Loss: 0.4438
accuracy: 0.7918
Test loss: 0.4939
test accuracy: 0.7560
Epoch: 7/20............. Loss: 0.4482
accuracy: 0.7872
Test loss: 0.4931
test accuracy: 0.7585
Epoch: 8/20............. Loss: 0.4404
accuracy: 0.7927
Test loss: 0.5036
test accuracy: 0.7533
Epoch: 9/20............. Loss: 0.4411
accuracy: 0.7899
Test loss: 0.5027
test accuracy: 0.7539
Epoch: 10/20............. Loss: 0.4483
accuracy: 0.7849
Test loss: 0.5015
test accuracy: 0.7546
Epoch: 11

([0.5641851957191897,
  0.47183393586017286,
  0.47126758472084085,
  0.4614529875690675,
  0.452008411250151,
  0.44377186452336326,
  0.44821373657192415,
  0.44036970655326646,
  0.4410761331810671,
  0.4483019495406724,
  0.4479513313916638,
  0.459264839472978,
  0.45289395273189104,
  0.45318664896213795,
  0.45171852657557143,
  0.4495169467785779,
  0.4609440773954172,
  0.4652392120312547,
  0.4594708702448384,
  0.4550710610873864],
 [0.479434857283102,
  0.49895469508969875,
  0.4861139400535837,
  0.4827610689127232,
  0.48609102008592747,
  0.4896632599861116,
  0.4938630996762639,
  0.49308759492376575,
  0.5036234739415176,
  0.5026691366568246,
  0.5015409544605733,
  0.5049723195450385,
  0.5145088083603803,
  0.511075034242152,
  0.5076713148895127,
  0.5096205844141334,
  0.5195085641063387,
  0.5194622777459567,
  0.5360191105043187,
  0.5187718299648646])

Lastly we will evualate this model. 

In [49]:
# Create a function that takes in a review and your glove model and gives back the sentiment score. 
def predict_glove(review, model):
    ### Your code here ###
    model.eval()
    text = torch.tensor([[TEXT.vocab[token] for token in TEXT.tokenize(review)]])
    result = model(text)
    model.train()
    return result
    ### Your code here end ###

In [50]:
reviews = [
    "excellent movie",
    "I really liked this movie !",
    "This movie is really bad !",
    "best movie ever!",
    "A Eggsellent movie"
]

In [51]:
for review in reviews:
    print(review, predict_glove(review, m) , '\n')

excellent movie tensor([[0.9945]], grad_fn=<SigmoidBackward>) 

I really liked this movie ! tensor([[0.8516]], grad_fn=<SigmoidBackward>) 

This movie is really bad ! tensor([[0.0107]], grad_fn=<SigmoidBackward>) 

best movie ever! tensor([[0.9756]], grad_fn=<SigmoidBackward>) 

A Eggsellent movie tensor([[0.6233]], grad_fn=<SigmoidBackward>) 

