In [1]:
'''
First, we will import everything we need. We will also define a couple of useful functions.
'''
import torch
from torch import nn
from torch import optim

import random

# This is a function that prints the number of trainable parameters 
# of a model.
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# This functions prints all parameters (and their gradients) of a model.
def print_parameters(model):
    for name, param in model.named_parameters():
        print(name)
        print(param.data)
        print(param.grad)

In [2]:
'''
Then, we need to define our model. Remember, we want to build a bigram language model.
'''
class FeedforwardLM(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_layer = nn.Linear(embedding_dim, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, vocab_size)
        
        self.relu = nn.ReLU()
        
    def forward(self, input):

        embedding = self.embedding(input)
        hidden_rep = self.relu(self.hidden_layer(embedding))
        output = self.output_layer(hidden_rep)
        
        return output

In [3]:
def load_data(filename, old_vocab=False):
    
    # TODO: Load data, convert text into tensors, construct vocabulary, return data and vocab
    if not old_vocab:
        vocab = {'<UNK>': 0}
    else:
        vocab = old_vocab
    data = list()
    file = open(filename)
    for line in file:
        line_text = line.split()
        line_text = ['<s>'] + line_text + ['</s>']
        
        if not old_vocab:
            # form vocabulary
            for word in line_text:
                if word not in vocab:
                    vocab[word] = len(vocab)
        
        # add words to data
        for i, word in enumerate(line_text):
            if i < len(line_text) - 1:
                if word in vocab:
                    idx1 = vocab[word]
                else: 
                    idx1 = vocab['<UNK>']
                if line_text[i + 1] in vocab:
                    idx2 = vocab[line_text[i + 1]]
                else: 
                    idx2 = vocab['<UNK>']
                data.append((torch.tensor(idx1), torch.tensor(idx2)))
                
    return vocab, data

In [14]:
# Let's put it all together. 

# 1) Load the data, and shuffle the training data.
# TODO
vocab, train_data = load_data('bible.train.txt')
print(len(vocab))
_, dev_data = load_data('bible.dev.txt', vocab)
print(len(vocab))
_, test_data = load_data('bible.test.txt', vocab)
print(len(vocab))

print(train_data[0])
random.shuffle(train_data)
print(train_data[0])

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 2086: character maps to <undefined>

In [None]:
# 2) Initialize our model.
# TODO

our_lm = FeedforwardLM(len(vocab), 10, 15)
count_parameters(our_lm)
#print_parameters(our_lm)

In [None]:
# 3) Now we train our model. 
# TODO

epochs = 10
ce = nn.CrossEntropyLoss()
softmax = nn.Softmax(dim=0)
optimizer = optim.SGD(our_lm.parameters(), lr=0.1)

for i in range(epochs):
    print('### Epoch: ' + str(i+1) + ' ###')
    av_loss = 0
    our_lm.train()
    for (x, y) in train_data[:10000]:
        optimizer.zero_grad()
        
        # a) calculate probs / get an output
        y_raw = our_lm(x)
        y_hat = softmax(y_raw)
        
        # b) compute loss
        loss = ce(y_raw.unsqueeze(0),y.unsqueeze(0))
        av_loss += loss
        
        # c) get the gradient
        loss.backward()

        # d) update the weights
        optimizer.step()
    validate(our_lm, dev_data)
    print(av_loss/len(train_data[:10000]))

In [None]:
# Use the model to predict some words!
# TODO

words = ['walk', 'to']

for i in range(10):
    word = words[i]
    idx = vocab[word]
    tensor_idx = torch.tensor(idx)
    
    raw_output = our_lm(tensor_idx)
    probs = softmax(raw_output)
    
    pred = torch.argmax(probs)
    
    # Print prediction.
    for w, v in vocab.items():
        if v == pred:
            print(word + ' ' + w)
            words.append(w)
    

In [None]:
# Note: Perplexity is just exp(2, cross-entropy). So we just use the loss here.
def validate(model, data):
    
    model.eval()
    
    # TODO: Implement validation function
    av_loss = 0
    for (x, y) in data[:1000]:
        
        # a) calculate probs / get an output
        y_raw = model(x)
        
        # b) compute loss
        loss = ce(y_raw.unsqueeze(0),y.unsqueeze(0))
        av_loss += loss

    av_loss = av_loss/len(data[:1000])
    
    print("Average loss: " + str(av_loss))
    