In [1]:
import torch
# Pick the first CUDA GPU when one is present; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Set default to run on the GPU if available (for the speed up).
# Both calls are guarded: asking for CUDA tensors as the default on a machine
# without CUDA fails, so on CPU-only machines the PyTorch defaults are kept.
if torch.cuda.is_available():
    torch.cuda.set_device(device)
    torch.set_default_tensor_type('torch.cuda.FloatTensor')


from torch.utils.data import Dataset, DataLoader, random_split

### Imdb sentiment analysis.

This time we are going to look at IMDB sentiment analysis. The IMDB dataset contains movie reviews, each with a label that says whether the review is positive or negative. We are going to create a model that predicts whether a review is positive or negative.

We will make use of torchtext for loading and preprocessing the dataset.

Read and run the following cell (this can take a while):

In [2]:
# First we set up the IMDB dataset.
from torchtext import data, datasets
# Set up the fields: one for the text of the review and one for the label.
# fix_length=100 makes every review exactly 100 tokens long (set it to a smaller
# number for faster training). batch_first=True puts the batch dimension first,
# which matches the 64x100 text tensor in the batch printout further down.
TEXT = data.Field(lower=True, batch_first=True, fix_length=100)
LABEL = data.Field(sequential=False)

# Load the dataset. This prompts a download, which can take a few minutes.
# NOTE(review): the names `train` and `test` are rebound later in this notebook
# by the `train(...)` and `test(...)` helper functions; after that cell runs,
# these two dataset objects are only reachable through the iterators below.
train, test = datasets.IMDB.splits(TEXT, LABEL) 

# Build the vocabulary from the training split only. We keep the 1000 most
# common words; every other word is mapped to the <unk> token (index 0 in the
# lookups printed below).
TEXT.build_vocab(train, max_size=1000)
LABEL.build_vocab(train)

# Make one iterator per split, each yielding batches of 64 reviews.
train_iter, test_iter = data.Iterator.splits((train, test), batch_size=64, shuffle=True)

downloading aclImdb_v1.tar.gz


.data\imdb\aclImdb_v1.tar.gz: 100%|████████████████████████████████████████████████| 84.1M/84.1M [03:06<00:00, 452kB/s]


Let's explore the things that we have created.

First the vocabulary. This can be seen as a dictionary that maps each possible word to a number.

In [3]:
# Look up two words the vocabulary knows, followed by one it does not know
# (unknown words fall back to the <unk> index).
for word in ("good", "bad", "notaword"):
    print(TEXT.vocab[word])

56
97
0


Next up we have the iterator. We will print one batch. 

In [4]:
#### Explore the output of the iterator.
# Only the first batch is inspected: the loop breaks after one iteration.

for i, batch in enumerate(train_iter):
    print(batch)          # summary of the batch: its size and the fields it holds
    print(batch.text[0])  # word indices of the first review in the batch
    print(batch.label)    # one label per review in the batch
    break


[torchtext.data.batch.Batch of size 64 from IMDB]
	[.text]:[torch.cuda.LongTensor of size 64x100 (GPU 0)]
	[.label]:[torch.cuda.LongTensor of size 64 (GPU 0)]
tensor([   9,  226,    0,    5,   48,    0,    0,   14,  695,    8,    2,  155,
           4,   81,    2,    0,    5,    2,    0,   11,   27,   14,    0,    6,
          28,    0,   46,   10,    7,    0,  427,   19,    0,    0,    0,    9,
         226,    0,   11,   12,    7,    3,    0,    0,    9,  226,   81,    0,
           5,    2,    0,    0,   27,   81,   95,    3, 1000,    8,    2,  435,
           0,   20,   43,    2,    0,   11,    2,    0,  160,  347,  101,  316,
          11,    0,    6,   21,    0,    0,    0,   11,   27,  199,   12,   14,
           2,  116,    0,    0,  125,    0,    9,   54,   37,    6,  118,   63,
           0,   11,    7,    0])
tensor([2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1,
        1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2,
       

As you can see the text is now a tensor containing a number for each word.
The labels are 1 -> negative and 2 -> positive. We will subtract 1 during training so that they become 0 and 1.

Now that we have the data all set up, we can build a model.

### The model

Build a simple recurrent model: use an embedding layer as the first layer, then a recurrent layer (LSTM or GRU), and finish with a fully connected layer that maps to 1 value. Finally, apply a sigmoid to make sure that this value is between 0 and 1.

In [5]:
import torch.nn as nn 
from torch.nn import functional as F

In [6]:


class LSTMModel(nn.Module):
    # Exercise placeholder: define __init__ and forward between the markers.
    # Architecture asked for in the text above: embedding layer -> recurrent
    # layer (LSTM or GRU) -> fully connected layer to 1 value -> sigmoid.
    # The reference printout under the next cell shows Embedding(1004, 128),
    # GRU(128, 128, batch_first=True, dropout=0.2) and Linear(128 -> 1).
    # The provided train/test helpers compare the output with an (N, 1) target,
    # so forward should return one value per review with shape (N, 1).

    ### your code here ###
   
    ### your code here end ###
    



In [7]:
# Initialize the model

### your code here ###

### your code here end ###



LSTMModel(
  (embedding): Embedding(1004, 128)
  (LSTM): GRU(128, 128, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [8]:

# A train and test function made for you. 
def train(train_iterator, model, criterion, optimizer, n_epochs, test_callback=None):
    """Train ``model`` for ``n_epochs`` passes over ``train_iterator``.

    Args:
        train_iterator: Iterable of batches that supports ``len()`` (number of
            batches per epoch). Each batch must expose ``.text`` (passed to the
            model) and ``.label`` (tensor with values 1 = negative,
            2 = positive).
        model: Module mapping ``batch.text`` to one value per example; the
            output is compared against a float target of shape ``(N, 1)``.
        criterion: Loss called as ``criterion(output, target)``.
        optimizer: Optimizer that updates the model's parameters.
        n_epochs: Number of epochs to run.
        test_callback: Optional callable invoked after every epoch as
            ``test_callback(model, criterion)``; its return value is recorded.

    Returns:
        ``(loss_history, test_history)``: the mean training loss per epoch and
        the value returned by ``test_callback`` per epoch (empty when no
        callback was given).
    """
    loss_history = []
    test_history = []
    for epoch in range(1, n_epochs + 1):
        epoch_total_loss = 0.0

        total = 0
        correct = 0

        for batch in train_iterator:
            optimizer.zero_grad()  # Clear gradients left over from the previous step

            output = model(batch.text)

            # Shift the labels from {1, 2} to {0, 1} and shape them like the output.
            target = (batch.label - 1).float().view(-1, 1)

            loss = criterion(output, target)

            loss.backward()   # Backpropagation: compute the gradients
            optimizer.step()  # Update the weights accordingly

            epoch_total_loss += loss.item()  # Keep track of the total loss

            # Calculate the accuracy: round the score to 0/1 and compare.
            total += target.size(0)
            predicted = torch.round(output.detach())
            correct += (predicted == target).sum().item()

        # Average over the iterator that was passed in, not a notebook global.
        mean_loss = epoch_total_loss / len(train_iterator)
        loss_history.append(mean_loss)

        if test_callback is not None:
            test_history.append(test_callback(model, criterion))

        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.4f}".format(mean_loss))
        print("accuracy: {:.4f}".format(correct/total))
    return loss_history, test_history


def test(model, criterion, dataloader):
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        total_loss = 0
        
        for i, batch in enumerate(dataloader):
            output = model(batch.text)

            loss = criterion(output, (batch.label - 1).float().view(-1, 1))
            total_loss += loss.item()
            predicted = torch.round(output.data)
    
            total += len(batch)
            
            correct += (predicted == (batch.label -1).view(-1, 1)).sum().item()
        print("Test loss: {:.4f}".format(total_loss/len(dataloader)))
        print("test accuracy: {:.4f}".format(correct/total))
    model.train()
    return total_loss/len(dataloader)
            


    

In [9]:
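# Call the train and test functions with the appropriate inputs.
# Note: train invokes test_callback(model, criterion), while test also needs a
# data iterator, so wrap test (e.g. in a lambda) to supply the iterator.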

### your code here ###

### your code here end ###



Test loss: 0.6915
test accuracy: 0.5267
Epoch: 1/20............. Loss: 0.6925
accuracy: 0.5207
Test loss: 0.6872
test accuracy: 0.5448
Epoch: 2/20............. Loss: 0.6873
accuracy: 0.5472
Test loss: 0.6656
test accuracy: 0.6028
Epoch: 3/20............. Loss: 0.6745
accuracy: 0.5822
Test loss: 0.6170
test accuracy: 0.6700
Epoch: 4/20............. Loss: 0.6309
accuracy: 0.6516
Test loss: 0.5669
test accuracy: 0.7129
Epoch: 5/20............. Loss: 0.5782
accuracy: 0.7017
Test loss: 0.5389
test accuracy: 0.7304
Epoch: 6/20............. Loss: 0.5347
accuracy: 0.7342
Test loss: 0.5250
test accuracy: 0.7404
Epoch: 7/20............. Loss: 0.5075
accuracy: 0.7525
Test loss: 0.5108
test accuracy: 0.7496
Epoch: 8/20............. Loss: 0.4854
accuracy: 0.7672
Test loss: 0.4969
test accuracy: 0.7560
Epoch: 9/20............. Loss: 0.4688
accuracy: 0.7758
Test loss: 0.4995
test accuracy: 0.7580
Epoch: 10/20............. Loss: 0.4562
accuracy: 0.7838
Test loss: 0.4831
test accuracy: 0.7652
Epoch: 11

([0.692466125006566,
  0.6872996508008073,
  0.6745399485158798,
  0.630903163529418,
  0.5781750868805839,
  0.5347197629759074,
  0.5075281070321417,
  0.4854333762012784,
  0.46878297554562465,
  0.4562239401480731,
  0.44645765355176026,
  0.43265173349843916,
  0.42788938907406215,
  0.4162208127319965,
  0.40872080006715283,
  0.4033765041690958,
  0.395789208924374,
  0.39031493568511877,
  0.38187781795668785,
  0.3728023030416435],
 [0.6914828742861443,
  0.6872391164150384,
  0.6656061054190712,
  0.6169612239236417,
  0.566850199571351,
  0.5389149401651319,
  0.5249890650782134,
  0.5108112409672774,
  0.49685966328281883,
  0.4994553518493462,
  0.4830516484539832,
  0.48829825919912295,
  0.47860247617030083,
  0.47432800590077323,
  0.49444912225389115,
  0.4909029400257198,
  0.4746713745395851,
  0.4758910871756351,
  0.47896208985687216,
  0.4849302517559827])

Lastly we look at the result of the model. 


Make a function that given a review and your trained model, gives back a score on how positive it was. 0 being negative and 1 being positive.

Make sure to call model.eval() and model.train()

In [14]:

import numpy as np
def predict(review, model):
    # Exercise placeholder: return the model's sentiment score for one review
    # given as a string (0 = negative, 1 = positive).
    # The model consumes word indices, so the text has to be converted with the
    # vocabulary first (TEXT.vocab[word] gives 0 for unknown words).
    # Call model.eval() before predicting and model.train() afterwards.
    ### your code here ###

    ### your code here end ###

In [15]:
# Quick sanity check on a single word: scores near 0 mean negative, near 1 positive.
predict("bad", model)

tensor([[0.3583]], grad_fn=<SigmoidBackward>)

In [16]:
# Hand-written sample reviews used to sanity-check the trained model.
# NOTE(review): "bad movie!" has the punctuation attached to the word; if
# predict splits on whitespace, "movie!" is presumably looked up as one
# (unknown) token — confirm against your tokenisation.
reviews = [
    "excellent movie",
    "i really liked this movie ",
    "this movie really sucked !",
    "bad movie!",
]

In [17]:
# Score every sample review with the first model and print it next to its text.
for sample in reviews:
    score = predict(sample, model)
    print(sample, score, '\n')

excellent movie tensor([[0.7028]], grad_fn=<SigmoidBackward>) 

i really liked this movie  tensor([[0.6523]], grad_fn=<SigmoidBackward>) 

this movie really sucked ! tensor([[0.4710]], grad_fn=<SigmoidBackward>) 

bad movie! tensor([[0.3076]], grad_fn=<SigmoidBackward>) 



### Improving the model

Next up you will be improving this model

For the embedding layer we will use a pretrained word embedding called GloVe. See: https://nlp.stanford.edu/projects/glove/

We will first load glove into word vectors and then preprocess the dataset with these vectors. This is needed because each word must map to the same index that is used in the glove word embedding. 

In [19]:
from torchtext.vocab import Vectors
# Load the pretrained GloVe vectors (the "100d" file, i.e. 100 values per word)
# from a local text file. The path is relative to the notebook's working
# directory, so presumably the file must be downloaded there beforehand.
glove = Vectors(name="../glove/glove.6B.100d.txt")

100%|███████████████████████████████████████████████████████████████████████▉| 399999/400000 [00:40<00:00, 9998.15it/s]


In [20]:
from torchtext import data, datasets

# Set up the fields again. This rebinds TEXT and LABEL, replacing the fields
# (and the vocabulary) of the first experiment.
TEXT = data.Field(lower=True, batch_first=True, fix_length=100)
LABEL = data.Field(sequential=False)

# Make the train/test splits of the data.
train_set, test_set = datasets.IMDB.splits(TEXT, LABEL)

# Build the vocabulary, this time with the help of the GloVe vectors.
# We provide the vectors, such that we only add words that we have a vector for in our vocab
# NOTE(review): the indices printed in the next cell are identical to those of
# the first vocabulary, so passing `vectors` does not appear to change which
# words are kept — confirm the filtering claim against the torchtext docs.
TEXT.build_vocab(train_set, vectors=glove, max_size=1000)
LABEL.build_vocab(train_set)

# Make one iterator per split, each yielding batches of 64 reviews.
train_iter_glove, test_iter_glove = data.Iterator.splits(
    (train_set, test_set), batch_size=64)

Now we look at the output of the vocab. Notice that the indexes are still the same; that is because the words are sorted by their frequency of occurrence.

In [21]:
# Look up two words the vocabulary knows, followed by one it does not know
# (unknown words fall back to the <unk> index).
for word in ("good", "bad", "notaword"):
    print(TEXT.vocab[word])

56
97
0


Now we look at the output of the iterator. This has changed.

In [22]:
# Inspect the first batch of the iterator built on the GloVe-aligned vocabulary
# (train_iter_glove), not the iterator from the first experiment.
for i, batch in enumerate(train_iter_glove):
    print(batch)
    print(batch.text)        # word indices for every review in the batch
    print(batch.text.shape)  # (batch size, review length)
    print(batch.label)
    break


[torchtext.data.batch.Batch of size 64 from IMDB]
	[.text]:[torch.cuda.LongTensor of size 64x100 (GPU 0)]
	[.label]:[torch.cuda.LongTensor of size 64 (GPU 0)]
tensor([[ 10,   7, 382,  ...,   4,  41,  45],
        [ 10,   0,   5,  ...,   0,   0,   5],
        [  9,  37, 123,  ...,   7,   0,   0],
        ...,
        [  9, 178, 250,  ..., 614,   8,   0],
        [ 50,   9,  62,  ...,   0,   0,  46],
        [  9,  98,  11,  ...,   1,   1,   1]])
torch.Size([64, 100])
tensor([2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2,
        1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 2,
        1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2])


#### Building the improved model

Build the improved model. Use the same architecture as before, but instead of a standard word embedding vector use the glove word embedding vector.

See https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding for a hint on how to use the pretrained glove. 

In [23]:



class GloveModel(nn.Module):
    # Exercise placeholder: same architecture as the first model, but the
    # embedding layer should use the pretrained GloVe vectors instead of a
    # standard embedding (see the nn.Embedding documentation linked above).
    # The reference printout under the next cell shows Embedding(1002, 100),
    # GRU(100, 128, batch_first=True) and Linear(128 -> 1).
    # NOTE(review): the pretrained weights are presumably available as
    # TEXT.vocab.vectors after build_vocab(..., vectors=glove) — confirm.
    ### your code here ###

    ### your code here end ###


In [24]:
# Create your model

### your code here ###

### your code here end ###

GloveModel(
  (emb): Embedding(1002, 100)
  (gru): GRU(100, 128, batch_first=True)
  (out): Linear(in_features=128, out_features=1, bias=True)
)

In [25]:
# Finally, train the model

### your code here ###

### your code here end ###

Test loss: 0.4788
test accuracy: 0.7664
Epoch: 1/20............. Loss: 0.5547
accuracy: 0.6992
Test loss: 0.4784
test accuracy: 0.7718
Epoch: 2/20............. Loss: 0.4679
accuracy: 0.7747
Test loss: 0.5008
test accuracy: 0.7505
Epoch: 3/20............. Loss: 0.4635
accuracy: 0.7797
Test loss: 0.5022
test accuracy: 0.7533
Epoch: 4/20............. Loss: 0.4749
accuracy: 0.7715
Test loss: 0.4897
test accuracy: 0.7624
Epoch: 5/20............. Loss: 0.4655
accuracy: 0.7751
Test loss: 0.5074
test accuracy: 0.7470
Epoch: 6/20............. Loss: 0.4726
accuracy: 0.7715
Test loss: 0.5083
test accuracy: 0.7490
Epoch: 7/20............. Loss: 0.4814
accuracy: 0.7626
Test loss: 0.5055
test accuracy: 0.7501
Epoch: 8/20............. Loss: 0.4796
accuracy: 0.7661
Test loss: 0.4985
test accuracy: 0.7548
Epoch: 9/20............. Loss: 0.4696
accuracy: 0.7752
Test loss: 0.4921
test accuracy: 0.7585
Epoch: 10/20............. Loss: 0.4642
accuracy: 0.7792
Test loss: 0.5081
test accuracy: 0.7490
Epoch: 11

([0.5546873286557015,
  0.4679495672435712,
  0.4634969122422016,
  0.47491328437310043,
  0.46554286515011506,
  0.4725953018878732,
  0.4814195406558873,
  0.4795529649538152,
  0.4695868889998902,
  0.4642304591358165,
  0.46360078926586434,
  0.47200196303065173,
  0.47655868568383825,
  0.4839686896185131,
  0.5036174357699617,
  0.4970815979763675,
  0.48870639628766444,
  0.4784226937367178,
  0.4735247378459062,
  0.4783393055429239],
 [0.47880952913895286,
  0.4784197878197331,
  0.5007757043747036,
  0.5022006344307414,
  0.4896800354161226,
  0.5073573013095904,
  0.5082996977717066,
  0.5055322560203045,
  0.4984945202117686,
  0.49206945902246346,
  0.5080820442465566,
  0.5070649440712331,
  0.5209832135063913,
  0.5187466839314117,
  0.5376601321313083,
  0.5384633570071071,
  0.5372882766644363,
  0.5157905725566932,
  0.5235775001061237,
  0.5171376735048221])

Lastly we will evaluate this model.

In [29]:
# Create a function that takes in a review and your glove model and gives back the sentiment score. 
def predict_glove(review, model):
    # Exercise placeholder: return the GloVe model's sentiment score for one
    # review given as a string (0 = negative, 1 = positive).
    # Convert the words with the vocabulary rebuilt for the GloVe experiment
    # (TEXT.vocab[word] gives 0 for unknown words) before calling the model.
    ### Your code here ###

    ### Your code here end ###

In [30]:
# Sample reviews for the GloVe model; this rebinds the earlier `reviews` list.
# NOTE(review): "Eggsellent" is presumably a deliberately misspelled word to
# show how an unknown token is scored (the recorded output is about 0.5).
reviews = [
    "excellent movie",
    "I really liked this movie !",
    "This movie is really bad !",
    "best movie ever!",
    "A Eggsellent movie"
]

In [31]:
# Print every sample review next to the score the GloVe model assigns to it.
# NOTE(review): `m` is not defined in any visible cell; it must be the name
# given to the GloVe model instance in the "Create your model" cell — confirm.
for review in reviews:
    print(review, predict_glove(review, m) , '\n')

excellent movie tensor([[0.9883]], grad_fn=<SigmoidBackward>) 

I really liked this movie ! tensor([[0.8884]], grad_fn=<SigmoidBackward>) 

This movie is really bad ! tensor([[0.0405]], grad_fn=<SigmoidBackward>) 

best movie ever! tensor([[0.9424]], grad_fn=<SigmoidBackward>) 

A Eggsellent movie tensor([[0.5038]], grad_fn=<SigmoidBackward>) 



What do you notice about the output? Compare it to the previous model. 