In [16]:
import sys, re
import numpy as np
import math

###############################################################################

def preprocess(s):
    """Tokenise a line"""
    o = re.sub('([^a-zA-Z0-9\']+)', ' \g<1> ', s.strip())
    return ['<BOS>'] + re.sub('  *', ' ', o).strip().split(' ')

###############################################################################

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

EMBEDDING_DIM = 4
CONTEXT_SIZE = 2 #!!!#
HIDDEN_DIM = 6

# Trigram Neural Network Model
class TrigramNNmodel(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(TrigramNNmodel, self).__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size, bias = False)

    def forward(self, inputs):
        # compute x': concatenation of x1 and x2 embeddings
        embeds = self.embeddings(inputs).view(
                (-1,self.context_size * self.embedding_dim))
        # compute h: tanh(W_1.x' + b)
        out = torch.tanh(self.linear1(embeds))
        # compute W_2.h
        out = self.linear2(out)
        # compute y: log_softmax(W_2.h)
        log_probs = F.log_softmax(out, dim=1)
        # return log probabilities
        # BATCH_SIZE x len(vocab)
        return log_probs

In [17]:
import sys, re
import numpy as np
import math


###############################################################################

training_samples = []
vocabulary = set(['<UNK>'])
training_file = open('train.txt', 'r')
lines = training_file.readlines()

for line in lines:
    tokens = preprocess(line)
    for i in tokens: vocabulary.add(i) 
    training_samples.append(tokens)

word2idx = {k: v for v, k in enumerate(vocabulary)}
idx2word = {v: k for k, v in word2idx.items()}

x_train = []
y_train = []
for tokens in training_samples:
    for i in range(len(tokens) - 2): #!!!#
        x_train.append([word2idx[tokens[i]],word2idx[tokens[i+1]]]) #!!!#
        y_train.append([word2idx[tokens[i+2]]]) #!!!#

x_train = np.array(x_train)
y_train = np.array(y_train)

###############################################################################

BATCH_SIZE = 1
NUM_EPOCHS = 10

train_set = np.concatenate((x_train, y_train), axis=1)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE)

loss_function = nn.NLLLoss()
model = TrigramNNmodel(len(vocabulary), EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_DIM)
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(NUM_EPOCHS):
    for i, data_tensor in enumerate(train_loader):
        context_tensor = data_tensor[:,0:2] #!!!#
        target_tensor = data_tensor[:,2] #!!!#

        model.zero_grad()

        log_probs = model(context_tensor)
        loss = loss_function(log_probs, target_tensor)

        loss.backward()
        optimiser.step()    

    print('Epoch:', epoch, 'loss:', float(loss))

torch.save({'model': model.state_dict(), 'vocab': idx2word}, 'model.lm')

print('Model saved.')

Epoch: 0 loss: 2.7779366970062256
Epoch: 1 loss: 2.3805863857269287
Epoch: 2 loss: 1.9992704391479492
Epoch: 3 loss: 1.7564469575881958
Epoch: 4 loss: 1.6033422946929932
Epoch: 5 loss: 1.4789083003997803
Epoch: 6 loss: 1.3662440776824951
Epoch: 7 loss: 1.2539446353912354
Epoch: 8 loss: 1.1312713623046875
Epoch: 9 loss: 1.0010528564453125
Model saved.


In [20]:
import sys, re
import numpy as np
import math


###############################################################################

blob = torch.load('model.lm')
idx2word = blob['vocab']
word2idx = {k: v for v, k in idx2word.items()}
vocabulary = set(idx2word.values())

model = TrigramNNmodel(len(vocabulary), EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_DIM)
model.load_state_dict(blob['model'])

###############################################################################

BATCH_SIZE = 1

testing_file = open('test.txt', 'r')
lines = testing_file.readlines()

for line in lines:
    tokens = preprocess(line)
    
    x_test = []
    y_test = []
    for i in range(len(tokens) - 2): #!!!#
        x_test.append([word2idx[tokens[i]],word2idx[tokens[i+1]]]) #!!!#
        y_test.append([word2idx[tokens[i+2]]]) #!!!#
    
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    
    print(x_test)
    print(y_test)
    
    test_set = np.concatenate((x_test, y_test), axis=1)
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)
    
    total_prob = 1.0
    for i, data_tensor in enumerate(test_loader):
        context_tensor = data_tensor[:,0:2] #!!!#
        target_tensor = data_tensor[:,2] #!!!#
        log_probs = model(context_tensor)
        probs = torch.exp(log_probs)
        predicted_label = int(torch.argmax(probs, dim=1)[0])
    
        true_label = y_test[i][0]
        true_word = idx2word[true_label]
    
        prob_true = float(probs[0][true_label])
        total_prob *= prob_true
    
    print('%.6f\t%.6f\t' % (total_prob, math.log(total_prob)), tokens)
    
    # line = sys.stdin.readline()

[[ 2 10]
 [10 12]
 [12  3]]
[[12]
 [ 3]
 [ 5]]
0.019606	-3.931920	 ['<BOS>', 'where', 'are', 'you', '?']
[[ 2 11]
 [11  3]
 [ 3  0]
 [ 0 14]]
[[ 3]
 [ 0]
 [14]
 [ 5]]
0.022908	-3.776251	 ['<BOS>', 'were', 'you', 'in', 'england', '?']
[[ 2 12]
 [12  3]
 [ 3  0]
 [ 0  1]]
[[3]
 [0]
 [1]
 [5]]
0.024515	-3.708455	 ['<BOS>', 'are', 'you', 'in', 'mexico', '?']
[[ 2 15]
 [15  7]
 [ 7  0]
 [ 0  1]]
[[7]
 [0]
 [1]
 [8]]
0.000025	-10.586203	 ['<BOS>', 'i', 'am', 'in', 'mexico', '.']
[[ 2 12]
 [12  3]
 [ 3  4]
 [ 4  0]
 [ 0  1]]
[[3]
 [4]
 [0]
 [1]
 [5]]
0.000015	-11.103380	 ['<BOS>', 'are', 'you', 'still', 'in', 'mexico', '?']
