In [1]:
# Some part of the code was referenced from below.
# https://github.com/pytorch/examples/tree/master/word_language_model 

import torch
import torch.nn as nn
import numpy as np
from data_utils import Dictionary, Corpus
from torch.nn.utils import clip_grad_norm_


# import EarlyStopping
from pytorchtools import EarlyStopping

ModuleNotFoundError: No module named 'pytorchtools'

In [4]:
# Device config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Hyper-parameters

num_epochs = 5
learning_rate = 0.001
batch_size = 20
embedding_dim = 128
hidden_size = 1024
seq_length = 30
num_layers = 1
num_samples = 1000 # number of words to be sampled, For testing the model purpose

In [6]:
# Load Dataset

corpus = Corpus()
ids = corpus.get_data(path="train.txt", batch_size=batch_size) # Size = (100, 248708)
vocab_size = len(corpus.dictionary) # size = 10000  ???
num_batches = ids.size(0) // seq_length # = 10
print("vocab_size:", vocab_size)
print("num_batches:", num_batches)
print("Final ids size:", ids.size())

Len of tokens: 24870824
Len of token: 929590
ids size: torch.Size([24870824])
ids ex: tensor(0)
length of dict: 10000
vocab_size: 10000
num_batches: 0
Final ids size: torch.Size([20, 1243541])


In [7]:
# Illustration of torch.view() function

x = torch.LongTensor(100)
print(x.size())
print(x.view(20, -1).size())

torch.Size([100])
torch.Size([20, 5])


In [8]:
# RNN based Language maodel

class RNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        print("vocab_size:", vocab_size)
        print("embedding_dim:", embedding_dim)
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim) # Args = (Num of unique words, Num of features)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x, h):
        
        # Embed word ids to vectors
        x = self.embed(x)
        
        # Forward Propagate
        out, (h, c) = self.lstm(x, h)
        
        # Reshape output to (batch_size*sequence_length, hidden_size)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))
        
        out = self.linear(out)
        return out, (h, c)

In [9]:
# Initilize model, optimizer
model = RNNLM(vocab_size, embedding_dim, hidden_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

vocab_size: 10000
embedding_dim: 128


In [8]:
# Truncated backpropagation
def detach(states):
    return ([state.detach() for state in states]) # Detach is used on torch tensors to stop gradient computation on the tensors

In [9]:
# Train the model
for epoch in range(num_epochs):
    # Set initial hidden and cell states
    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))
               
    for i in range(0, ids.size(1) - seq_length, seq_length): # Get 10 ids at a time (seq_length = 10)
        
        # Get mini-batch inputs and targets
        inputs = ids[:, i:i+seq_length].to(device)
        targets = ids[:, (i+1):(i+1)+seq_length].to(device)
        
        # Forward pass
        states = detach(states)
        predictions, states = model(inputs, states)
        loss = criterion(predictions, targets.reshape(-1)) # Flatten targets to 1D array
        
        # Backward and optimize
        model.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5) # Similar to decreasing the learning rate, it clips the resulting gradient right before performing one step.
        # Clipping grads is for forcing the gradients to be reasonably small, which means that the parameter updates will not push the parameters too far from their previous values.
        optimizer.step()
               
        step = (i+1) // seq_length
        if step % 100 == 0:
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'.format(epoch+1, num_epochs, step, num_batches, loss.item(), np.exp(loss.item())))
    

Epoch [1/5], Step[0/0], Loss: 9.1786, Perplexity: 9687.74


KeyboardInterrupt: 

In [None]:
# Test the model
with torch.no_grad():
    with open('sample.txt', 'w') as f:
        # Set initial and hidden cell states
        states = (torch.zeros(num_layers, 1, hidden_size).to(device),
                 torch.zeros(num_layers, 1, hidden_size).to(device))
        
        # Select one word id randomly
        prob = torch.ones(vocab_size)
        input = torch.multinomial(prob, num_samples=1).unsqueeze(1).to(device) # Multinomial function is used to 
        # Unsqueeze just breaks up the tensor into 1 row or n rows of n values (see example below)
        
        for i in range(num_samples):
            # Forward prop
            output, states = model(input, states)
            
        

In [35]:
# Unsqueeze eample
print("squeezed", torch.multinomial(prob, num_samples=3))
print("unsqueezed(0)", torch.multinomial(prob, num_samples=3).unsqueeze(0))
print("unsqueezed(1)", torch.multinomial(prob, num_samples=3).unsqueeze(1))

squeezed tensor([6979,  416, 9343])
unsqueezed(0) tensor([[7874, 1951,  709]])
unsqueezed(1) tensor([[2766],
        [4362],
        [4025]])


In [10]:
range(0, ids.size(1) - seq_length, seq_length)

range(0, 1243511, 30)