# N-Gram Language Model

Part of **#30DaysOfBasics**: let's train an N-gram language model using deep learning.

In [158]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Path of the text corpus that is passed to create_vocab.
# NOTE(review): absolute path on one machine -- adjust before running elsewhere.
HI_FILE_PATH = '/Users/impyadav/Desktop/data/data/hi/hi_sample.txt'

In [None]:
# Number of context tokens per example; passed to NGramLM as `context_size`.
CONTEXT_SIZE = 3

# Size of each token embedding; passed to NGramLM as `embed_dim`.
EMBD_DIM = 128

In [None]:
#Prepare the vocabulary

def create_vocab(text_file):
    """Read a whitespace-tokenised text file and build its vocabulary.

    Args:
        text_file: Path of a UTF-8 text file. Bytes that cannot be decoded are
            dropped (``errors='ignore'``).

    Returns:
        A 4-tuple ``(vocab, index_to_word, word_to_index, tokens)``:

        * ``vocab`` -- set of distinct tokens.
        * ``index_to_word`` -- dict mapping integer index -> token.
        * ``word_to_index`` -- dict mapping token -> integer index.
        * ``tokens`` -- every token of the file, in reading order.

        The position of the two dicts in the tuple is kept exactly as it was,
        because the callers in this file look words up in the third element.
    """
    with open(text_file, 'r', encoding='utf-8', errors='ignore') as f:
        tokens = f.read().split()
    vocab = set(tokens)
    # Sort so that the same corpus always yields the same word/index pairing;
    # raw set iteration order for strings differs between interpreter runs.
    # Both dicts are derived from this single ordering so they cannot disagree.
    ordered_words = sorted(vocab)
    index_to_word = dict(enumerate(ordered_words))
    word_to_index = {word: index for index, word in index_to_word.items()}
    return vocab, index_to_word, word_to_index, tokens

In [None]:
#N gram generation from content

def generate_n_grams(list_of_tokens, n):
    """Pair every token with the ``n`` tokens that precede it.

    Args:
        list_of_tokens: Indexable sequence of tokens.
        n: Number of preceding tokens to collect for each target.

    Returns:
        A list of ``(context, target)`` tuples, one per position from ``n`` to
        the end of the sequence. ``context`` is a list ordered nearest-first:
        the token immediately before the target comes first, the token ``n``
        positions back comes last.
    """
    pairs = []
    for target_pos in range(n, len(list_of_tokens)):
        # Walk backwards from the target: 1 step back, 2 steps back, ... n.
        context = [list_of_tokens[target_pos - back] for back in range(1, n + 1)]
        pairs.append((context, list_of_tokens[target_pos]))
    return pairs

In [None]:
# Load the corpus: token set, two lookup dicts, and the full token list.
# NOTE(review): as create_vocab builds them, the dict bound to hi_word_to_idx is
# keyed by integer index and the dict bound to hi_idx_to_word is keyed by word --
# the names are the reverse of the contents. The training loop and the embedding
# lookup further down index hi_idx_to_word by word, so they depend on this.
hi_vocab, hi_word_to_idx, hi_idx_to_word, hi_content = create_vocab(HI_FILE_PATH)

In [None]:
# Sanity check: number of distinct tokens and the first 50 tokens of the corpus.
print('Lenght of Vocab: ', len(hi_vocab))

print('Hi Content snippet: ', hi_content[:50])

In [None]:
# Use CONTEXT_SIZE rather than a literal so the n-grams always have the same
# context length the model's first linear layer is sized for.
hi_ngram = generate_n_grams(hi_content, CONTEXT_SIZE)

In [None]:
# Show the first five (context, target) pairs.
print('N-gram snippet: ', hi_ngram[:5])

In [None]:
class NGramLM(nn.Module):
    """Feed-forward n-gram language model.

    Embeds each context token, concatenates the embeddings, and passes them
    through one hidden ReLU layer to produce log-probabilities over the
    vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and number of output
            classes.
        embed_dim: Size of each token embedding.
        context_size: Number of context tokens per example.
        hidden_dim: Width of the hidden layer. Defaults to 128, the value that
            used to be hard-coded.
    """

    def __init__(self, vocab_size, embed_dim, context_size, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(context_size * embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return next-token log-probabilities.

        Args:
            inputs: Long tensor of token indices, of shape ``(context_size,)``
                for a single example or ``(batch, context_size)`` for a batch.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` holding log-softmax scores;
            ``batch`` is 1 for a 1-D input.
        """
        # One row per example: flatten that example's context embeddings to the
        # width fc1 expects. The previous view((1, -1)) forced a batch of one,
        # so any batched input failed at fc1; a single example still yields
        # exactly the same (1, context_size * embed_dim) row.
        embeddings = self.embedding(inputs).view(-1, self.fc1.in_features)
        out = F.relu(self.fc1(embeddings))

        out = self.fc2(out)
        log_probs = F.log_softmax(out, dim=1)

        return log_probs

In [None]:
# Model sized from the corpus vocabulary and the notebook-level hyperparameters.
net = NGramLM(
    vocab_size=len(hi_vocab),
    embed_dim=EMBD_DIM,
    context_size=CONTEXT_SIZE,
)

In [None]:
# NLLLoss takes log-probabilities, which NGramLM.forward produces via log_softmax.
loss_fn = nn.NLLLoss()
# Plain SGD over all of the model's parameters with a fixed learning rate.
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)

In [None]:
#Model Training

# Number of full passes over the n-gram list; named once so the loop bound and
# the progress message cannot drift apart.
NUM_EPOCHS = 20

# Summed per-example loss of each epoch, in epoch order.
losses = []

for epoch in range(NUM_EPOCHS):

    total_loss = 0

    for context, target in hi_ngram:

        # NOTE: hi_idx_to_word is indexed by the word itself and yields its
        # integer index (that is how create_vocab builds the dict bound to this
        # name), so despite the name it is the word -> index lookup here.
        context_idxs = torch.tensor([hi_idx_to_word[word] for word in context], dtype=torch.long)

        # Gradients accumulate across backward() calls, so clear them first.
        net.zero_grad()

        words_prob = net(context_idxs)

        loss = loss_fn(words_prob, torch.tensor([hi_idx_to_word[target]], dtype=torch.long))

        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    # Report 1-based epoch numbers so the last line reads NUM_EPOCHS/NUM_EPOCHS
    # (it previously ran from 0/20 to 19/20).
    print('Epoch {}/{} and loss: {}'.format(epoch + 1, NUM_EPOCHS, total_loss))

    losses.append(total_loss)

# print('Losses: ', losses)

In [None]:
# Look up the learned embedding of one word from the trained model.
# hi_idx_to_word is indexed by the word here and its value selects the row of
# the embedding table; a word that is not in the corpus raises KeyError.

exmaple_embedd = net.embedding.weight[hi_idx_to_word['प्रतिबद्ध']]
print(exmaple_embedd.shape)

print(exmaple_embedd)