# LSTM based language model

The task is to develop an LSTM-based language model. We will do it in the following steps:

In [1]:
import nltk

In [2]:
from nltk.corpus import gutenberg

In [3]:
# Load the sentences of Hamlet from the NLTK Gutenberg corpus (each sentence is a sequence of tokens).
sents = gutenberg.sents('shakespeare-hamlet.txt')

In [75]:
# Lower-case every token; sentences that contain only a single token are dropped.
sents = [[token.lower() for token in sentence] for sentence in sents if len(sentence) > 1]

For each unique word (including punctuation) in the vocabulary, create a unique index.

In [17]:
from collections import Counter

In [76]:
def word2index(sents):
    """Build forward and reverse vocabulary lookups for tokenised sentences.

    Indices are assigned consecutively from 0, in order of first appearance.

    Args:
        sents: iterable of sentences, each an iterable of hashable tokens.

    Returns:
        A ``(w2i, i2w)`` tuple where ``w2i`` maps each unique token to its
        index and ``i2w`` maps each index back to its token (needed when
        decoding generated indices).
    """
    w2i = {}  # token -> index
    i2w = {}  # index -> token
    for sentence in sents:
        for token in sentence:
            if token not in w2i:
                position = len(w2i)  # next free index
                w2i[token] = position
                i2w[position] = token
    return w2i, i2w
        

In [77]:
# Build the vocabulary lookups: w2i (word -> index) and i2w (index -> word).
w2i, i2w = word2index(sents)

In [78]:
len(w2i)

4715

Create an LSTM based language model. The model should include an embedding module for mapping from the word to the embedding. 

In [79]:
import torch
import torch.nn as nn

In [80]:
class LSTMLM(nn.Module):
    """Single-layer LSTM language model: embedding -> LSTM -> linear projection.

    Args:
        embedding_size: width of the word embeddings fed to the LSTM.
        hidden_size: width of the LSTM hidden and cell states.
        output_size: number of output classes. It is the vocabulary size and
            is also used as the number of rows of the embedding table.
    """

    def __init__(self, embedding_size, hidden_size, output_size): # note output_size is same as vocabulary size
        super(LSTMLM,self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        # The embedding width and the LSTM input width must both be
        # embedding_size so that the constructor argument actually takes effect.
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        # Not applied in forward(): the model returns raw scores. Kept as an
        # attribute for callers that want to normalise the scores themselves.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inp):
        """Score the next token for every position of a single sequence.

        Args:
            inp: 1-D tensor of token indices, shape ``(seq_len,)``.

        Returns:
            Tensor of shape ``(seq_len, 1, output_size)`` holding unnormalised
            scores over the vocabulary for each position.
        """
        seq_len = inp.shape[0]
        # nn.LSTM (batch_first=False) expects (seq_len, batch, features); batch is 1.
        embedded = self.embedding(inp).view(seq_len, 1, -1)
        # The state is reset to zeros on every call, so sequences are independent.
        out, _ = self.lstm(embedded, self.initHidden())
        return self.h2o(out)

    def initHidden(self):
        """Return zero-filled ``(hidden, cell)`` states of shape ``(1, 1, hidden_size)``.

        The tensors are created with the dtype and device of the module's
        parameters so the model keeps working after ``.to(device)`` / ``.double()``.
        """
        reference = self.h2o.weight
        hidden = reference.new_zeros(1, 1, self.hidden_size)
        cell_state = reference.new_zeros(1, 1, self.hidden_size)
        return hidden, cell_state

Write a function that takes a sentence and creates a tensor with the words/tokens replaced by their corresponding indices.

In [81]:
def encodeSentence(sent, w2i):
    """Convert a tokenised sentence into a 1-D tensor of vocabulary indices.

    Args:
        sent: iterable of tokens.
        w2i: mapping from token to integer index.

    Returns:
        ``torch.long`` tensor with one index per token, in sentence order.

    Raises:
        KeyError: if a token is missing from ``w2i``.
    """
    indices = []
    for token in sent:
        indices.append(w2i[token])
    return torch.tensor(indices, dtype=torch.long)

In [82]:
sents[15]

['ophe', '.']

In [83]:
encodeSentence(sents[15], w2i)

tensor([130,  11])

In [84]:
# Hyper-parameters passed to the LSTMLM constructor.
embedding_size = 100
hidden_size = 200
output_size = len(w2i)  # one output class per vocabulary word
model = LSTMLM(embedding_size,hidden_size,output_size)

Train the model, treating each sentence as a sequence and predicting the next word at every position. You can follow the training procedure discussed in class.

<div>
<img src="model.png" width="500"/>
</div>

In [23]:
from torch.optim import Adam

In [24]:
import random

In [108]:
def train(model, sents, w2i, epochs=1, learning_rate=0.0001):
    """Train ``model`` for next-token prediction, one sentence per optimiser step.

    For each sentence the input is every token except the last and the target
    is every token except the first, so position ``t`` predicts token ``t + 1``.

    Args:
        model: module that maps a 1-D index tensor of shape ``(seq_len,)`` to
            scores of shape ``(seq_len, 1, vocab_size)``.
        sents: sequence of tokenised sentences. It is not modified.
        w2i: mapping from token to vocabulary index.
        epochs: number of passes over ``sents``.
        learning_rate: learning rate for the Adam optimiser.
    """
    optimizer = Adam(model.parameters(),lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Shuffle a private copy: shuffling `sents` itself would reorder the
    # caller's data as a side effect (and fail for tuples / lazy corpus views).
    ordering = list(sents)

    for _ in range(epochs):
        random.shuffle(ordering)
        for sent in ordering:
            # A sentence needs at least two tokens to form one (input, target)
            # pair; shorter ones would hand the LSTM an empty sequence.
            if len(sent) < 2:
                continue
            e_sent = encodeSentence(sent, w2i)
            inp = e_sent[:-1]
            labels = e_sent[1:]
            out = model(inp)
            # Drop the batch dimension of size 1: (seq_len, 1, V) -> (seq_len, V).
            loss = criterion(out.squeeze(1), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [109]:
# Train with the defaults declared by train(): epochs=1, learning_rate=0.0001.
train(model, sents, w2i)

Write a function that takes the trained model and generates sentences starting from a random word. You can continue generating until '.' is generated or a predefined length is reached.

In [116]:
def generate(model,w2i,i2w,length, start_word = None):
    """Greedily generate a sentence from ``model``.

    Starting from ``start_word`` (or a uniformly random vocabulary index when
    it is ``None``), the whole sequence generated so far is fed to the model
    and the highest-scoring token at the LAST position is appended. Generation
    stops once ``'.'`` has been appended or the sequence holds ``length`` tokens.

    Args:
        model: callable mapping a 1-D index tensor of shape ``(seq_len,)`` to
            scores of shape ``(seq_len, 1, vocab_size)``.
        w2i: mapping from token to index (indices are assumed to be
            ``0 .. len(w2i) - 1`` when a random start word is drawn).
        i2w: mapping from index to token.
        length: maximum number of tokens in the result, start word included.
        start_word: optional first token.

    Returns:
        The generated tokens joined by single spaces.

    Raises:
        KeyError: if ``start_word`` is not in ``w2i``.
    """
    if start_word is None:
        first_index = random.randint(0,len(w2i)-1) # initialize sequence with a random word
    else:
        first_index = w2i[start_word]

    indices = [first_index]
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        while len(indices) < length:
            sequence = torch.tensor(indices, dtype=torch.long)
            out = model(sequence)
            # The model scores every position; only the last row predicts the
            # token that follows the current sequence. Using all rows would
            # append one token per position and break scalar conversion.
            next_index = out.squeeze(1)[-1].argmax().item()
            indices.append(next_index)
            if i2w[next_index] == '.':
                break
    return ' '.join(i2w[index] for index in indices)

In [117]:
# Generate from a randomly chosen start word (start_word is left as None) with a length limit of 10.
generate(model, w2i, i2w, 10)

'sworne .'