### Train a 2 layer LSTM on a PTB dataset

#### Learning schedule
- 1.42 initial learning rate
- Divided by 1.4 every epoch
- Additional division by 1.4 on epochs 4,5,8,11,15

#### Hyperparameters
- Initial batch size 20
- Hidden layer size 560

# ----------------------------------------------------------------------------------

In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import utils

In [2]:
device= torch.device("cuda")
print(device)

cuda


In [3]:
train_data  =  torch.load('../data/ptb/train_data.pt')
test_data   =  torch.load('../data/ptb/test_data.pt')

print(  train_data.size()  )
print(  test_data.size()   )

torch.Size([46479, 20])
torch.Size([4121, 20])


In [39]:
bs = 20

vocab_size = 10000

In [40]:
class three_layer_recurrent_net(nn.Module):

    def __init__(self, hidden_size):
        super().__init__()
        
        self.layer1 = nn.Embedding( vocab_size  , hidden_size  )
        self.layer2 = nn.LSTM(      hidden_size , hidden_size, num_layers=2, bidirectional=True  )
        self.layer3 = nn.Linear(    hidden_size*2 , vocab_size   )

        
    def forward(self, word_seq ):
        
        g_seq                     =   self.layer1( word_seq )  
        h_seq , (h_fin,c_fin)     =   self.layer2( g_seq )
        
        h_direc_1  = h_fin[2,:,:]
        h_direc_2  = h_fin[3,:,:]
        h_direc_12 = torch.cat( (h_direc_1, h_direc_2)  , dim=1) 
        
        score_seq                 =   self.layer3( h_seq )
        
        return score_seq,  (h_fin,c_fin)

In [41]:
hidden_size= 560

net = three_layer_recurrent_net( hidden_size )

print(net)

utils.display_num_param(net)

three_layer_recurrent_net(
  (layer1): Embedding(10000, 560)
  (layer2): LSTM(560, 560, num_layers=2, bidirectional=True)
  (layer3): Linear(in_features=1120, out_features=10000, bias=True)
)
There are 29371920 (29.37 million) parameters in this neural network


In [42]:
net = net.to(device)

In [43]:
net.layer1.weight.data.uniform_(-0.1, 0.1)

net.layer3.weight.data.uniform_(-0.1, 0.1)

tensor([[ 0.0608, -0.0990,  0.0603,  ...,  0.0214, -0.0137,  0.0947],
        [-0.0317,  0.0665,  0.0983,  ...,  0.0804,  0.0502,  0.0894],
        [-0.0230,  0.0614, -0.0191,  ...,  0.0858, -0.0075, -0.0081],
        ...,
        [-0.0021, -0.0790,  0.0028,  ...,  0.0892,  0.0578, -0.0505],
        [-0.0770,  0.0289,  0.0382,  ...,  0.0613, -0.0875,  0.0640],
        [ 0.0192,  0.0602, -0.0744,  ...,  0.0487, -0.0022,  0.0673]],
       device='cuda:0')

In [44]:
criterion = nn.CrossEntropyLoss()

my_lr = 1.42

seq_length = 20

In [45]:
def eval_on_test_set():

    running_loss=0
    num_batches=0    
    
    with torch.no_grad():
       
        h = torch.zeros(1, bs, hidden_size)

        h=h.to(device)


        for count in range( 0 , 4120-seq_length ,  seq_length) :

            minibatch_data =  test_data[ count   : count+seq_length   ]
            minibatch_label = test_data[ count+1 : count+seq_length+1 ]

            minibatch_data=minibatch_data.to(device)
            minibatch_label=minibatch_label.to(device)

            scores, h  = net( minibatch_data)
            
            minibatch_label =   minibatch_label.view(  bs*seq_length ) 
            scores          =            scores.view(  bs*seq_length , vocab_size)

            loss = criterion(  scores ,  minibatch_label )    

            running_loss += loss.item()
            num_batches += 1        
    
    total_loss = running_loss/num_batches 
    print('test: exp(loss) = ', math.exp(total_loss)  )
        

In [46]:
start=time.time()

for epoch in range(30):
    
    # keep the learning rate to 1 during the first 4 epochs, then divide by 1.1 at every epoch
    if epoch == 4 or epoch == 5 or epoch == 8 or epoch == 11 or epoch == 15:
        my_lr = my_lr / 1.4
        
    if epoch >= 4:
        my_lr = my_lr / 1.4
        
    
    # create a new optimizer and give the current learning rate.   
    optimizer=torch.optim.SGD( net.parameters() , lr=my_lr )
        
    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0    
    
    for count in range( 0 , 46478-seq_length ,  seq_length):
             
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # create a minibatch
        minibatch_data =  train_data[ count   : count+seq_length   ]
        minibatch_label = train_data[ count+1 : count+seq_length+1 ]        
        
        # send them to the gpu
        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)
                       
        # forward the minibatch through the net        
        scores, h  = net( minibatch_data )
        
        # reshape the scores and labels to huge batch of size bs*seq_length
        scores          =            scores.view(  bs*seq_length , vocab_size)  
        minibatch_label =   minibatch_label.view(  bs*seq_length )       
        
        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(  scores ,  minibatch_label )
        
        # backward pass to compute dL/dR, dL/dV and dL/dW
        loss.backward()

        # do one step of stochastic gradient descent: R=R-lr(dL/dR), V=V-lr(dL/dV), ...
        utils.normalize_gradient(net)
        optimizer.step()
        
            
        # update the running loss  
        running_loss += loss.item()
        num_batches += 1
        
        
        
    # compute stats for the full training set
    total_loss = running_loss/num_batches
    elapsed = time.time()-start
    
    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))
    eval_on_test_set() 




epoch= 0 	 time= 26.305980443954468 	 lr= 1.42 	 exp(loss)= 16.727946023250478
test: exp(loss) =  3.7215441238313307

epoch= 1 	 time= 53.18167996406555 	 lr= 1.42 	 exp(loss)= 2.5461094258427304
test: exp(loss) =  2.0729269419200125

epoch= 2 	 time= 80.21796131134033 	 lr= 1.42 	 exp(loss)= 1.7046225477939347
test: exp(loss) =  1.6875763200311193

epoch= 3 	 time= 107.12412643432617 	 lr= 1.42 	 exp(loss)= 1.4449222336263272
test: exp(loss) =  1.5516426766441127

epoch= 4 	 time= 134.14594197273254 	 lr= 0.7244897959183674 	 exp(loss)= 1.2896318065707295
test: exp(loss) =  1.4684482268082386

epoch= 5 	 time= 161.09005284309387 	 lr= 0.3696376509787589 	 exp(loss)= 1.222270736930623
test: exp(loss) =  1.454693853759221

epoch= 6 	 time= 188.0818543434143 	 lr= 0.26402689355625636 	 exp(loss)= 1.1880529560529531
test: exp(loss) =  1.4568458502268484

epoch= 7 	 time= 214.9742579460144 	 lr= 0.18859063825446884 	 exp(loss)= 1.164550359211816
test: exp(loss) =  1.4602903752917837

epoc

# ----------------------------------------------------------------------------------

#### Inference
* In each cell is a sentence taken from ptb test set. Use utils.text2tensor() to convert this sentence into a LongTensor. 
* Feed the sentence to the network
* The network should compute a probability vector over the full vocabulary of 10,000 words. This vector tells you which words are likely to come next. Display the 30 most likely words according to the network.

In [101]:
sentence = "prices averaging roughly $ N a barrel higher in the third"

x = utils.text2tensor(sentence)
x = x.unsqueeze(0)

scores, (h,c) = net( x.to(device) )

print(sentence, '... \n')

p = F.softmax(scores[0][-1],dim=0)

utils.show_most_likely_words(p)

prices averaging roughly $ N a barrel higher in the third ... 

4.9%	 to
2.6%	 issue
2.5%	 <eos>
1.8%	 <unk>
1.7%	 new
1.5%	 parties
1.1%	 it
1.0%	 the
1.0%	 stock
0.9%	 world
0.9%	 that
0.9%	 assets
0.9%	 creditors
0.8%	 prices
0.8%	 commission
0.7%	 big
0.7%	 rates
0.7%	 issues
0.7%	 court
0.6%	 center
0.6%	 market
0.6%	 supply
0.5%	 street
0.5%	 guard
0.5%	 investors
0.5%	 fund
0.5%	 actions
0.5%	 treasury
0.5%	 house
0.5%	 bonds


In [102]:

sentence = "i think my line has been very consistent mrs. hills said at a news"

x = utils.text2tensor(sentence)
x = x.unsqueeze(0)

scores, (h,c) = net( x.to(device) )

print(sentence, '... \n')

p = F.softmax(scores[0][-1],dim=0)

utils.show_most_likely_words(p)  

i think my line has been very consistent mrs. hills said at a news ... 

38.2%	 of
19.3%	 <eos>
5.6%	 to
4.4%	 the
2.7%	 and
2.4%	 that
2.2%	 's
1.4%	 they
0.9%	 he
0.7%	 with
0.7%	 for
0.7%	 it
0.6%	 from
0.6%	 as
0.5%	 but
0.5%	 a
0.5%	 there
0.5%	 is
0.4%	 in
0.4%	 during
0.3%	 will
0.3%	 on
0.3%	 because
0.2%	 might
0.2%	 expects
0.2%	 <unk>
0.2%	 since
0.2%	 i
0.2%	 fell
0.2%	 three


In [103]:
sentence = "this appears particularly true at gm which had strong sales in the"

x = utils.text2tensor(sentence)
x = x.unsqueeze(0)

scores, (h,c) = net( x.to(device) )

print(sentence, '... \n')

p = F.softmax(scores[0][-1],dim=0)

utils.show_most_likely_words(p)

this appears particularly true at gm which had strong sales in the ... 

4.3%	 company
4.1%	 federal
3.9%	 <unk>
2.1%	 market
1.7%	 board
1.4%	 stock
1.2%	 new
1.2%	 first
1.1%	 u.s.
1.0%	 same
0.9%	 government
0.9%	 american
0.9%	 most
0.9%	 big
0.8%	 firm
0.7%	 state
0.6%	 sale
0.6%	 biggest
0.6%	 economy
0.5%	 issue
0.5%	 city
0.5%	 house
0.5%	 price
0.5%	 fed
0.5%	 past
0.5%	 next
0.5%	 gain
0.5%	 fact
0.5%	 number
0.5%	 current


In [105]:
sentence = "some analysts expect oil prices to remain relatively"

x = utils.text2tensor(sentence)
x = x.unsqueeze(0)

scores, (h,c) = net( x.to(device) )

print(sentence, '... \n')

p = F.softmax(scores[0][-1],dim=0)

utils.show_most_likely_words(p)

some analysts expect oil prices to remain relatively ... 

5.1%	 high
4.0%	 <unk>
3.2%	 sales
2.0%	 analyst
1.4%	 tax
0.9%	 manager
0.9%	 buying
0.8%	 with
0.8%	 three
0.8%	 at
0.8%	 few
0.6%	 small
0.5%	 civil
0.5%	 parts
0.5%	 special
0.5%	 law
0.4%	 widespread
0.4%	 trader
0.4%	 combined
0.4%	 liability
0.4%	 months
0.4%	 large
0.4%	 ahead
0.4%	 joint
0.4%	 night
0.3%	 filed
0.3%	 hours
0.3%	 friday
0.3%	 offering
0.3%	 representatives


### Making 3 original sentences and see what the network predicts

In [108]:
sentence= "the trade war is expected to continue to have an effect on the"

x = utils.text2tensor(sentence)
x = x.unsqueeze(0)

scores, (h,c) = net( x.to(device) )

print(sentence, '... \n')

p = F.softmax(scores[0][-1],dim=0)

utils.show_most_likely_words(p)

the trade war is expected to continue to have an effect on the ... 

4.3%	 company
4.1%	 federal
3.9%	 <unk>
2.1%	 market
1.7%	 board
1.4%	 stock
1.2%	 new
1.2%	 first
1.1%	 u.s.
1.0%	 same
0.9%	 government
0.9%	 american
0.9%	 most
0.9%	 big
0.8%	 firm
0.7%	 state
0.6%	 sale
0.6%	 biggest
0.6%	 economy
0.5%	 issue
0.5%	 city
0.5%	 house
0.5%	 price
0.5%	 fed
0.5%	 past
0.5%	 next
0.5%	 gain
0.5%	 fact
0.5%	 number
0.5%	 current


In [115]:
sentence= "the economy is currently facing a "

x = utils.text2tensor(sentence)
x = x.unsqueeze(0)

scores, (h,c) = net( x.to(device) )

print(sentence, '... \n')

p = F.softmax(scores[0][-1],dim=0)

utils.show_most_likely_words(p)   

the economy is currently facing a  ... 

6.1%	 <unk>
4.5%	 year
2.6%	 new
1.9%	 stock
1.4%	 series
1.4%	 company
1.3%	 week
1.3%	 deal
1.2%	 report
1.1%	 recent
1.1%	 market
1.0%	 federal
1.0%	 lot
1.0%	 firm
0.9%	 month
0.9%	 big
0.9%	 small
0.9%	 time
0.8%	 major
0.7%	 house
0.7%	 large
0.7%	 share
0.7%	 potential
0.6%	 unit
0.6%	 law
0.6%	 high
0.5%	 public
0.5%	 day
0.5%	 number
0.5%	 sale


In [117]:
sentence= "the company has been increasing prices in order to drive"

x = utils.text2tensor(sentence)
x = x.unsqueeze(0)

scores, (h,c) = net( x.to(device) )

print(sentence, '... \n')

p = F.softmax(scores[0][-1],dim=0)

utils.show_most_likely_words(p)


the company has been increasing prices in order to drive ... 

5.7%	 at
4.4%	 <unk>
3.6%	 sales
3.0%	 before
2.2%	 down
1.9%	 to
1.9%	 claims
1.7%	 than
1.4%	 a
1.3%	 as
1.1%	 year
1.1%	 profit
1.0%	 off
0.9%	 up
0.9%	 was
0.9%	 about
0.9%	 after
0.8%	 lost
0.8%	 five
0.8%	 modest
0.8%	 office
0.8%	 nearly
0.7%	 within
0.7%	 director
0.7%	 due
0.7%	 with
0.7%	 analyst
0.7%	 almost
0.7%	 two
0.6%	 in
