## Implementation of the paper
# A Neural Probabilistic Language Model Bengio et al, Journal of Machine Learning Research, 2003

## Architecture from paper

![NN architecture](neural_prob_model_pic_1.png "Image Credit Figure 1 in paper")  

Image credit - Figure 1 from paper

## All imports

In [1]:
import numpy as np
import torch
import pandas 
import torch.nn as nn
import torch.optim as optim



# The Model

## Defining model class

In [2]:
class NPM(nn.Module):
    def __init__(self, architecture_params):
        '''
        architecture_params is a dictionary containing all parameters for defining the architecture - 
        Vocab size V, embedding size m, no of hidden units in layer 1 h and no of previous words to be used as an input to predict
        current word n
        
        '''
        super(NPM,self).__init__()
        self.V = architecture_params['V'] ## V is the vocab size
        self.m = architecture_params['m']  ## m is the embedding size desired
        self.n = architecture_params['n'] ## No of prev words used as input to predict current word
        self.h = architecture_params['h'] ##  no of hidden units in layer 1
        
        self.embedding = nn.Embedding(self.V, self.m)  ##  to create an embedding representation where each word in vocab
                                                       ## is reprented by a vector of dimension m
        self.f1 = nn.Linear(self.n*self.m, self.h) ## n prev words , each of dimension m flattened to create n*m inputs, output of this layer is hidden size h
        self.f2 = nn.Linear(self.h, self.V)  ## output is of size of vocab as we are predicting next word
        
    def forward(self, inputs):
        '''
        assumes a batched input
        input : tensor of dimension batch_size * 
        output : tensor of probabilities :  dimension batch_size * V
        '''
        batch_size = inputs.shape[0]
        embeds = self.embedding(inputs).reshape(batch_size,1,-1) ## to flatten - before reshaping, (after embedding) of  dimension batch_size * no of words * embedding size; after flattening, of dimension batch_size * (no of words * embedding size)
        h1 = torch.tanh(self.f1(embeds))  ## The tanh transformation in the first hidden layer
        outp = torch.log_softmax(self.f2(h1), dim=-1) ## final output - use log_softmax which is  log of softmax as implementation is more stable and efficient
                                                    ## dim=-1 because we want softmax applied across the last dimension of the tensor
        return outp
        
        

        
        

## Defining architecture parameters

In [45]:
context_length = 2 ## use 2 words before to predict current word
num_hidden_units = 5  ## in the hidden layer
embedding_size = 10  


# The Data
 

## Function to Convert text to tuple of input words and target for LM

In [4]:
def generate_tuples_for_embedding(text_list, use_both_sides = False, CONTEXT_SIZE = 5):
    '''Aim : generate list of tuples of form ([xi-k,xi-(k-1)]...xi-1],xi) . Used as input for different NN models'''
    list_of_tup = []
    for text in text_list:
        text = text.split()
        
        if use_both_sides :
            start_index_target_word = CONTEXT_SIZE
            end_index_target_word = len(text)-CONTEXT_SIZE      

        else: ## use only words before context word
            start_index_target_word = CONTEXT_SIZE
            end_index_target_word = len(text)
        for i in range(start_index_target_word,end_index_target_word):
            feature_list = []
            if use_both_sides :
                max_response_index = i + CONTEXT_SIZE
            else:
                max_response_index = i


            for j in range( (i-CONTEXT_SIZE),max_response_index):
                if j==i:
                    pass
                else:
                    feature_list.append(text[j])
            list_of_tup.append((feature_list,text[i]))
    return list_of_tup


In [5]:
generate_tuples_for_embedding(["I went to school yesterday"], False, 2)

[(['I', 'went'], 'to'),
 (['went', 'to'], 'school'),
 (['to', 'school'], 'yesterday')]

## Function to get w2index and index2word from input corpus (list of sentences)


In [6]:
def get_w2index_index2word(sentence_list):
    '''Given a list of sentences, create word to index and index to word from it'''
    list_of_words =  " ".join(sentences).split()

    list_of_words = list(set(list_of_words))
    word_to_index_dict = {w : i for i,w in enumerate(list_of_words)}
    index_to_word_dict = {i : w for i,w in enumerate(list_of_words)}
    
    return word_to_index_dict, index_to_word_dict
    
    

## Function to batch input data, expected as tuples

In [41]:
def generate_batch(TUP = None, batch_size = None):
    curr_index = 0
    print(curr_index)
    while curr_index <= max(0,len(TUP)-batch_size):
        last_index = min(curr_index + batch_size,len(TUP))        
        curr_tup = TUP[curr_index : last_index]
        print(curr_tup)
        curr_index = curr_index + batch_size
        yield curr_tup

## Function to convert tuple to tensor

In [8]:
def convert_tup_to_tensor(TUP, word_to_index):
    context_list = []
    response_list = []
    for tup in TUP:
        curr_context = tup[0]
        curr_response = tup[1]
        context_as_index = [word_to_index[word] for word in curr_context]
        context_as_index = torch.tensor(context_as_index,dtype = torch.long)
        target_as_index = torch.tensor([word_to_index[curr_response]],dtype = torch.long)
        
        context_list.append(context_as_index)
        response_list.append(target_as_index)
    return torch.stack(context_list), torch.stack(response_list)

In [9]:
sentences = ["I went to school today", "I will not go to the shop tomorrow", "Hi, How are you ? "]
word_to_index_dict, index_to_word_dict = get_w2index_index2word(sentences)
lm_tuples = generate_tuples_for_embedding(text_list=sentences, use_both_sides=False, CONTEXT_SIZE=context_length)

In [10]:
lm_tuples

[(['I', 'went'], 'to'),
 (['went', 'to'], 'school'),
 (['to', 'school'], 'today'),
 (['I', 'will'], 'not'),
 (['will', 'not'], 'go'),
 (['not', 'go'], 'to'),
 (['go', 'to'], 'the'),
 (['to', 'the'], 'shop'),
 (['the', 'shop'], 'tomorrow'),
 (['Hi,', 'How'], 'are'),
 (['How', 'are'], 'you'),
 (['are', 'you'], '?')]

In [11]:
input_tensor, target_tensor = convert_tup_to_tensor(lm_tuples, word_to_index_dict)

In [13]:
target_tensor


tensor([[13],
        [ 4],
        [ 7],
        [ 8],
        [ 6],
        [13],
        [ 0],
        [11],
        [10],
        [ 2],
        [ 5],
        [ 1]])

## Instantiating model class

In [12]:
vocab_size = len(word_to_index_dict.keys())
architecture_params = {}
architecture_params['V'] = vocab_size
architecture_params['n'] = context_length
architecture_params['m'] = embedding_size
architecture_params['h'] = num_hidden_units
##
model = NPM(architecture_params)

In [14]:
architecture_params

{'V': 16, 'n': 2, 'm': 10, 'h': 5}

In [15]:
model

NPM(
  (embedding): Embedding(16, 10)
  (f1): Linear(in_features=20, out_features=5, bias=True)
  (f2): Linear(in_features=5, out_features=16, bias=True)
)

## Defining the loss

In [16]:
loss = nn.NLLLoss()

## Defining the optimizer

In [17]:
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [18]:
optimizer

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    momentum: 0
    nesterov: False
    weight_decay: 0
)

# The training

In [43]:
num_epochs = 1
batch_size = 32

def train():
    all_losses = []
    for epoch in range(0, num_epochs):
        print(epoch)
        epoch_loss = 0
        for tup in generate_batch(TUP=lm_tuples, batch_size=batch_size):
            context_as_tensor_batch, target_as_tensor_batch = convert_tup_to_tensor(tup, word_to_index_dict)

            model.zero_grad()
            outp_prob = model(context_as_tensor_batch)
            outp_prob_reshape = outp_prob.reshape(len(outp_prob),outp_prob.shape[2])
            target_as_tensor_batch_reshape = target_as_tensor_batch.reshape(len(target_as_tensor_batch))
            curr_loss = loss(outp_prob_reshape,target_as_tensor_batch_reshape)
            print('%%%% curr loss %f'%(curr_loss.item()))
            epoch_loss = epoch_loss + curr_loss.item()
            curr_loss.backward()
            optimizer.step()
        print('epoch %d : loss %f'%(epoch,epoch_loss))
    all_losses.append(epoch_loss)
    print(all_losses)
         
    

In [44]:
train()

0
0
[(['I', 'went'], 'to'), (['went', 'to'], 'school'), (['to', 'school'], 'today'), (['I', 'will'], 'not'), (['will', 'not'], 'go'), (['not', 'go'], 'to'), (['go', 'to'], 'the'), (['to', 'the'], 'shop'), (['the', 'shop'], 'tomorrow'), (['Hi,', 'How'], 'are'), (['How', 'are'], 'you'), (['are', 'you'], '?')]
tensor([[ 3,  9],
        [ 9, 13],
        [13,  4],
        [ 3, 12],
        [12,  8],
        [ 8,  6],
        [ 6, 13],
        [13,  0],
        [ 0, 11],
        [14, 15],
        [15,  2],
        [ 2,  5]]) tensor([[13],
        [ 4],
        [ 7],
        [ 8],
        [ 6],
        [13],
        [ 0],
        [11],
        [10],
        [ 2],
        [ 5],
        [ 1]])
hi
tensor([[-2.1360, -2.5079, -2.9242, -2.9128, -2.6829, -3.2109, -2.2670, -3.0448,
         -2.8601, -3.0864, -2.7498, -3.3964, -2.9572, -3.2118, -2.3863, -3.0384],
        [-2.2113, -2.8212, -2.5103, -2.9681, -2.6770, -2.7890, -2.7073, -2.8605,
         -3.0057, -2.8274, -2.8515, -2.8250, -3.0172, -3.0