# Lab 02: LSTM - exercise

In [1]:
# For Google Colaboratory: mount Google Drive and make the folder that
# contains this notebook the working directory. Does nothing outside Colab.
import sys, os
if 'google.colab' in sys.modules:
    import subprocess
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    # find automatically the path of the folder containing "file_name"
    file_name = 'lstm_exercise.ipynb'
    # Arguments are passed as a list (no shell), so the file name is never
    # interpreted by a shell.
    find_output = subprocess.check_output(
        ['find', '.', '-type', 'f', '-name', file_name]
    ).decode("utf-8")
    matches = find_output.splitlines()
    if not matches:
        raise FileNotFoundError(
            "could not locate " + file_name + "; set path_to_file manually"
        )
    # Several copies of the notebook may exist; use the first hit instead of
    # concatenating every match into one invalid path.
    path_to_file = os.path.dirname(matches[0])
    # if the search fails or takes too long, skip it and simply write down the path manually below :
    #path_to_file = '/content/gdrive/My Drive/AI6103_2020_codes/codes/labs_lecture10/lab02_lstm'
    print(path_to_file)
    # change current path to the folder containing "file_name"
    os.chdir(path_to_file)
    print(os.getcwd())
    

In [2]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import utils

### With or without GPU?

It is recommended to run this code on GPU:<br> 
* Time for 1 epoch on CPU : 274 sec ( 4.56 min)<br> 
* Time for 1 epoch on GPU : 10.1 sec w/ GeForce GTX 1080 Ti <br>

In [3]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# To force CPU execution, use instead: device = torch.device("cpu")
print(device)

cuda


### Download Penn Tree Bank (the tensor train_data should consist of 20 columns of ~50,000 words each)

In [4]:
from utils import check_ptb_dataset_exists

# check_ptb_dataset_exists comes from the project's utils module (not shown
# here); its return value is used as the path prefix of the 'ptb/' folder.
data_path = check_ptb_dataset_exists()

train_data = torch.load(data_path + 'ptb/train_data.pt')
test_data = torch.load(data_path + 'ptb/test_data.pt')

# Show the shape of each split: rows are consecutive positions in the text,
# columns are the parallel streams processed as one mini-batch.
for split in (train_data, test_data):
    print(split.size())

torch.Size([46479, 20])
torch.Size([4121, 20])


### Some constants associated with the data set

In [5]:
# bs: number of sequences processed in parallel (one per column of the data tensors).
# vocab_size: number of distinct word indices the model embeds and scores.
bs, vocab_size = 20, 10_000


### Make a recurrent net class

In [6]:
class three_layer_recurrent_net(nn.Module):
    """Word-level language model: embedding -> single-layer LSTM -> linear scores.

    The embedding width and the LSTM hidden width are both ``hidden_size``.
    """

    def __init__(self, hidden_size, num_tokens=None):
        """Build the three layers.

        Args:
            hidden_size: width of the word embeddings and of the LSTM state.
            num_tokens: number of distinct word indices. When omitted, the
                notebook-level ``vocab_size`` is used, which keeps existing
                ``three_layer_recurrent_net(hidden_size)`` calls working.
        """
        super().__init__()

        if num_tokens is None:
            # Fall back to the notebook global for backward compatibility.
            num_tokens = vocab_size

        self.layer1 = nn.Embedding(num_tokens, hidden_size)
        self.layer2 = nn.LSTM(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, num_tokens)

    def forward(self, word_seq, h_init, c_init):
        """Score the next word at every position of ``word_seq``.

        Args:
            word_seq: integer tensor of word indices, laid out as
                (seq_length, batch) since the LSTM uses its default
                sequence-first layout.
            h_init: initial hidden state, shape (1, batch, hidden_size).
            c_init: initial cell state, shape (1, batch, hidden_size).

        Returns:
            Tuple ``(score_seq, h_final, c_final)``: the scores over the
            vocabulary for every position, and the LSTM hidden and cell states
            after the last position.
        """
        g_seq = self.layer1(word_seq)
        # nn.LSTM takes the initial state as a single (h, c) tuple.
        h_seq, (h_final, c_final) = self.layer2(g_seq, (h_init, c_init))
        score_seq = self.layer3(h_seq)

        return score_seq, h_final, c_final


### Build the net. Choose the hidden size to be 300. How many parameters in total?

In [7]:
hidden_size= 300 # width of both the word embeddings and the LSTM state

# Instantiate the embedding -> LSTM -> linear model defined above.
net = three_layer_recurrent_net( hidden_size )

# Show the layer structure.
print(net)

# Helper from the project's utils module; per the recorded output it reports
# the total number of parameters.
utils.display_num_param(net)

three_layer_recurrent_net(
  (layer1): Embedding(10000, 300)
  (layer2): LSTM(300, 300)
  (layer3): Linear(in_features=300, out_features=10000, bias=True)
)
There are 6732400 (6.73 million) parameters in this neural network


### Send the weights of the network to the selected device (the GPU when available)

In [8]:
# Move all parameters of the model onto `device` so they live where the data will be sent.
net = net.to(device)

### Manually initialize the weights of the Embedding module and the Linear module

In [9]:
# Re-draw the embedding and output-projection weights uniformly from
# [-0.1, 0.1]. nn.init.uniform_ fills the tensor in place without recording
# the operation in autograd, so there is no need to reach through `.data`.
# The LSTM weights and the Linear bias are left at their default values.
nn.init.uniform_(net.layer1.weight, -0.1, 0.1)

nn.init.uniform_(net.layer3.weight, -0.1, 0.1)

# Ending on a print keeps the cell from echoing the weight tensor.
print('')




### Choose the criterion, as well as the following important hyperparameters: 
* initial learning rate = 5
* sequence length = 35

In [10]:
# Cross-entropy between the vocabulary scores and the true next word
# (averaged over all entries of the batch with the default reduction).
criterion = nn.CrossEntropyLoss()

my_lr = 5 # initial learning rate for SGD

seq_length = 35 # number of consecutive rows (time steps) fed to the LSTM per mini-batch

### Function to evaluate the network on the test set

In [11]:
def eval_on_test_set():
    """Print the perplexity, exp(mean loss), of ``net`` on ``test_data``.

    The test tensor is walked in chunks of ``seq_length`` rows; the label of
    every row is the row that follows it. The LSTM state is carried from one
    chunk to the next. Reads the notebook globals ``net``, ``test_data``,
    ``criterion``, ``bs``, ``hidden_size``, ``vocab_size``, ``seq_length`` and
    ``device``. Returns nothing.
    """
    running_loss = 0
    num_batches = 0

    # Start from zero hidden and cell states, created directly on the device.
    h = torch.zeros(1, bs, hidden_size, device=device)
    c = torch.zeros(1, bs, hidden_size, device=device)

    # The last row can only serve as a label, so inputs stop one row earlier.
    # Derived from the tensor so the function works for any test-set length.
    last_row = test_data.size(0) - 1

    # No gradients are needed for evaluation: skip building the autograd graph.
    with torch.no_grad():
        for count in range(0, last_row - seq_length, seq_length):

            minibatch_data = test_data[count:count + seq_length].to(device)
            minibatch_label = test_data[count + 1:count + seq_length + 1].to(device)

            scores, h, c = net(minibatch_data, h, c)

            # Flatten time and batch into one big batch of bs*seq_length predictions.
            minibatch_label = minibatch_label.view(bs * seq_length)
            scores = scores.view(bs * seq_length, vocab_size)

            loss = criterion(scores, minibatch_label)

            running_loss += loss.item()
            num_batches += 1

    total_loss = running_loss/num_batches
    print('test: exp(loss) = ', math.exp(total_loss)  )
        

### Do 8 passes through the training set.

In [12]:
start=time.time()

# Index of the last row of train_data. That row can only serve as a label
# (the label of row i is row i+1), so inputs must stop before it.
last_row = train_data.size(0) - 1

# Work on a copy of the initial learning rate so that re-running this cell
# restarts the schedule from my_lr instead of continuing to shrink it.
current_lr = my_lr

for epoch in range(8):

    # keep the initial learning rate during the first 2 epochs, then divide it by 3 at every epoch
    if epoch >= 2:
        current_lr = current_lr / 3

    # create a new optimizer at the beginning of each epoch: give the current learning rate.
    optimizer=torch.optim.SGD( net.parameters() , lr=current_lr )

    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0

    # set the initial h and c to be the zero vector, directly on the device
    h = torch.zeros(1, bs, hidden_size, device=device)
    c = torch.zeros(1, bs, hidden_size, device=device)

    for count in range( 0 , last_row-seq_length ,  seq_length):

        # Set the gradients to zeros
        optimizer.zero_grad()

        # create a minibatch (labels are the inputs shifted by one row) and send it to the device
        minibatch_data = train_data[count:count+seq_length].to(device)
        minibatch_label = train_data[count+1:count+seq_length+1].to(device)

        # Detach to prevent from backpropagating all the way to the beginning
        # Then tell Pytorch to start tracking all operations that will be done on h and c
        h = h.detach().requires_grad_()
        c = c.detach().requires_grad_()

        # forward the minibatch through the net
        scores, h, c  = net(minibatch_data, h, c)

        # reshape the scores and labels to huge batch of size bs*seq_length
        scores          = scores.view(bs*seq_length, vocab_size)
        minibatch_label = minibatch_label.view(bs*seq_length)

        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(scores,minibatch_label)

        # backward pass to compute the gradient of the loss w.r.t. every parameter
        loss.backward()

        # utils.normalize_gradient is defined outside this notebook; presumably it
        # rescales the gradients before the update -- confirm in utils.
        utils.normalize_gradient(net)

        # do one step of stochastic gradient descent
        optimizer.step()

        # update the running loss
        running_loss += loss.item()
        num_batches += 1

    # compute stats for the full training set
    total_loss = running_loss/num_batches
    elapsed = time.time()-start

    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', current_lr, '\t exp(loss)=',  math.exp(total_loss))
    eval_on_test_set()



epoch= 0 	 time= 14.196570873260498 	 lr= 5 	 exp(loss)= 280.59900173036544
test: exp(loss) =  175.7411789767482

epoch= 1 	 time= 28.7273907661438 	 lr= 5 	 exp(loss)= 127.33319904730308
test: exp(loss) =  135.64098634514667

epoch= 2 	 time= 43.38536834716797 	 lr= 1.6666666666666667 	 exp(loss)= 81.62260358425358
test: exp(loss) =  114.42534445620049

epoch= 3 	 time= 58.07672071456909 	 lr= 0.5555555555555556 	 exp(loss)= 67.44378954866842
test: exp(loss) =  110.6723867754826

epoch= 4 	 time= 72.99125504493713 	 lr= 0.1851851851851852 	 exp(loss)= 62.601272326595904
test: exp(loss) =  109.0023871171007

epoch= 5 	 time= 87.76577830314636 	 lr= 0.0617283950617284 	 exp(loss)= 60.84959695732308
test: exp(loss) =  108.08457436029224

epoch= 6 	 time= 102.34269523620605 	 lr= 0.0205761316872428 	 exp(loss)= 60.20247731012772
test: exp(loss) =  107.6307383326605

epoch= 7 	 time= 117.21051836013794 	 lr= 0.006858710562414266 	 exp(loss)= 59.966557787689254
test: exp(loss) =  107.39581

### Choose one sentence (taken from the test set)

In [13]:
# Candidate prompts: the network is asked to predict the word that follows.
# Words are lower-case and space-separated; "N" and "$" appear as their own tokens.
sentence1 = "some analysts expect oil prices to remain relatively"

sentence2 = "over the next days and weeks they say investors should look for stocks to"

sentence3 = "prices averaging roughly $ N a barrel higher in the third"

sentence4 = "i think my line has been very consistent mrs. hills said at a news"

sentence5 = "this appears particularly true at gm which had strong sales in"

# Or make your own sentence. No capital letters or punctuation allowed. Each word must be in the allowed vocabulary.
sentence6= "he was very"

# SELECT THE SENTENCE HERE: the cells below use `mysentence`
mysentence = sentence1

### Convert the sentence into a vector, then send to GPU

In [14]:
# Encode the chosen sentence with the project's utils.sentence2vector helper
# (defined outside this notebook). Per the recorded output, the result is a
# column of word indices, one row per word.
minibatch_data=utils.sentence2vector(mysentence)
      
# Send it to the same device as the network.
minibatch_data=minibatch_data.to(device)

print(minibatch_data)

tensor([[ 307],
        [1140],
        [ 334],
        [1486],
        [1786],
        [  64],
        [ 719],
        [ 377]], device='cuda:0')


### Set the initial hidden state to zero, then run the LSTM.

In [15]:
# Zero hidden and cell states for a batch holding a single sentence,
# allocated directly on the device the network lives on.
h = torch.zeros(1, 1, hidden_size, device=device)
c = torch.zeros(1, 1, hidden_size, device=device)

# Run the whole sentence through the net; `scores` holds the vocabulary
# scores for every position, `h` and `c` the final LSTM states.
scores, h, c = net(minibatch_data, h, c)

### Display the network prediction for the next word

In [16]:
# Echo the prompt, then list the predicted next words.
print(mysentence, '... \n')

# utils.show_next_word is defined outside this notebook; per the recorded
# output it prints candidate next words with their probabilities.
utils.show_next_word(scores)

some analysts expect oil prices to remain relatively ... 

11.5%	 high
8.1%	 strong
6.0%	 low
5.3%	 small
4.1%	 flat
4.1%	 <unk>
2.1%	 good
1.6%	 thin
1.4%	 higher
1.3%	 greater
1.2%	 complicated
1.2%	 adjusted
0.9%	 positive
0.8%	 slow
0.8%	 volatile
0.7%	 well
0.7%	 stable
0.7%	 different
0.7%	 weak
0.7%	 significant
0.7%	 few
0.6%	 profitable
0.6%	 bullish
0.6%	 large
0.6%	 heavy
0.6%	 minor
0.6%	 available
0.6%	 modest
0.5%	 active
0.5%	 lower
