In [None]:
import nltk
import csv
import random
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The value is a plain string and is passed to `.to(device)` further down.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Fetch the 'punkt' tokenizer data (a no-op when it is already present).
# Presumably required by the nltk.word_tokenize call used during preprocessing.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Show which device was selected above.
device

'cuda'

# Text Extraction and Preprocessing

In [None]:
def get_examples(filename):
    """Load (meaning representation, reference) pairs from a tab-separated file.

    The first row is treated as a header and discarded. Completely blank
    rows are skipped instead of aborting the load.

    Args:
        filename: Path to a tab-delimited text file with two columns per row.

    Returns:
        A list of ``(mr, ref)`` string tuples, one per data row, in file order.
        An empty file yields an empty list.

    Raises:
        ValueError: If a non-blank row does not have exactly two columns.
    """
    # The csv module asks for newline='' so it can handle line endings itself;
    # the explicit encoding keeps decoding independent of the platform locale.
    with open(filename, encoding='utf-8', newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter='\t')
        # Drop the header row; the default keeps an empty file from raising.
        next(csv_reader, None)
        # csv.reader yields [] for a blank line, which cannot be unpacked.
        example_list = [(mr, ref) for (mr, ref) in (row for row in csv_reader if row)]
    return example_list

In [None]:
MAX_LEN = 50     # longest reference (in tokens) that is kept
START = '<s>'    # start-of-sentence marker
END = '</s>'     # end-of-sentence marker
UNK = '<unk>'    # placeholder for out-of-vocabulary tokens
PAD = '<pad>'    # filler used to bring every reference to the same length


def extract_mr_ref(example_list, max_len=MAX_LEN):
    """Tokenise (mr, ref) pairs and pad the references to a fixed length.

    Each reference is lower-cased and tokenised with ``nltk.word_tokenize``.
    References longer than ``max_len`` tokens are dropped together with their
    MR. Every kept reference becomes

        [START] + tokens + [END] + [PAD] * (max_len - len(tokens))

    so its length is always ``max_len + 2`` and the end marker directly
    follows the last real token (padding comes after it, not before).

    Each MR is lower-cased, split on commas, and each piece is stripped of
    surrounding whitespace.

    Args:
        example_list: Iterable of ``(mr, ref)`` string pairs.
        max_len: Maximum number of reference tokens (before START/END are
            added). Defaults to ``MAX_LEN``.

    Returns:
        A 4-tuple ``(examples, ref_tokens_list, mr_tokens_list, lengths_list)``
        of parallel lists: the kept ``(mr, ref)`` pairs, the padded reference
        token lists, the MR token lists, and the un-padded reference token
        counts (START/END not included).
    """
    example_list_ = []
    ref_tokens_list = []
    mr_tokens_list = []
    lengths_list = []
    for mr, ref in example_list:
        # Reference tokens
        ref_tokens = nltk.word_tokenize(ref.lower(), language='english')
        n_tokens = len(ref_tokens)
        if n_tokens > max_len:
            continue
        lengths_list.append(n_tokens)
        # END has to close the sentence before the padding starts; otherwise
        # the end marker only ever appears at the final position.
        ref_tokens = [START] + ref_tokens + [END] + [PAD] * (max_len - n_tokens)

        # Meaning representation tokens
        mr_tokens = [token.strip() for token in mr.lower().split(',')]

        example_list_.append((mr, ref))
        ref_tokens_list.append(ref_tokens)
        mr_tokens_list.append(mr_tokens)
    return example_list_, ref_tokens_list, mr_tokens_list, lengths_list

Getting our examples

In [None]:
# Read the raw training pairs and tokenise them in one go. extract_mr_ref drops
# over-long references, so `train_examples` holds only the pairs that were kept.
train_examples, train_ref_tokens, train_mr_tokens, train_lengths = extract_mr_ref(
    get_examples('train.txt')
)

Building our vocabulary

In [None]:
# Now build vocabulary
from torchtext.vocab import build_vocab_from_iterator

# Reference-side vocabulary. The <s>, </s> and <pad> markers already occur in
# the tokenised references, so only <unk> is registered as a special token.
# TODO: with min_freq=1 presumably no training token ever maps to <unk>, so its
# embedding would never be trained -- consider raising min_freq.
ref_vocab = build_vocab_from_iterator(train_ref_tokens, min_freq=1,
                                      max_tokens=50000, specials=[UNK,])
ref_vocab.set_default_index(ref_vocab[UNK])

# MR-side vocabulary. It needs <unk> and a default index as well: without a
# default index, looking up a slot/value token that was not seen in training
# raises instead of returning an id.
mr_vocab = build_vocab_from_iterator(train_mr_tokens, min_freq=1,
                                     max_tokens=50000, specials=[UNK,])
mr_vocab.set_default_index(mr_vocab[UNK])

# Replace the token lists with 1-D integer id tensors on the target device.
# Building them as torch.long directly avoids a float32 round trip.
train_ref_tokens = [torch.tensor([ref_vocab[token] for token in sentence], dtype=torch.long, device=device)
                    for sentence in train_ref_tokens]
train_mr_tokens = [torch.tensor([mr_vocab[token] for token in sentence], dtype=torch.long, device=device)
                   for sentence in train_mr_tokens]



# Neural Network

## Encoder

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    """Embedding + single-layer GRU that summarises a token-id sequence.

    Args:
        vocab_size: Number of rows in the embedding table.
        hidden_size: Width of both the embeddings and the GRU state.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=hidden_size)
        self.recurrent = nn.GRU(input_size=hidden_size,
                                hidden_size=self.hidden_size)

    def initialize_hidden_state(self, batch_size=1):
        """Return an all-zero initial GRU state.

        The tensor is created on the same device and with the same dtype as
        the module's own parameters, so it does not rely on any global
        ``device`` variable.

        Args:
            batch_size: Size of the batch dimension. Defaults to 1.

        Returns:
            Tensor of shape ``(1, batch_size, hidden_size)`` filled with zeros.
        """
        return self.embedding.weight.new_zeros(1, batch_size, self.hidden_size)

    def forward(self, input_seq, hidden_state=None):
        """Run the GRU over ``input_seq`` and return its final hidden state.

        Args:
            input_seq: Integer tensor of token ids; the GRU is not
                batch-first, so the first dimension is time.
            hidden_state: Initial GRU state. When ``None`` the GRU starts
                from zeros.

        Returns:
            The final hidden state produced by the GRU. The per-step outputs
            are discarded.
        """
        embeddings = self.embedding(input_seq)
        _, new_hidden_state = self.recurrent(embeddings, hidden_state)
        return new_hidden_state

# Width shared by the embeddings and the GRU state.
HIDDEN_SIZE = 128

# Re-derive the device here so this cell does not rely on the earlier one.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# One embedding row per MR vocabulary entry.
encoder = Encoder(vocab_size=len(mr_vocab), hidden_size=HIDDEN_SIZE)
encoder = encoder.to(device)
print(encoder)

Encoder(
  (embedding): Embedding(79, 128)
  (recurrent): GRU(128, 128)
)


## Decoder

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Decoder(nn.Module):
    """Embedding + single-layer GRU + linear projection with log-softmax.

    Args:
        hidden_size: Width of both the embeddings and the GRU state.
        vocab_size: Number of embedding rows and of output classes.
    """

    def __init__(self, hidden_size, vocab_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Layers are created in the same order as before so that seeded
        # initialisation and state_dict key order stay the same.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.recurrent = nn.GRU(hidden_size, self.hidden_size)
        self.output = nn.Linear(hidden_size, vocab_size)
        # Normalises over dim 2, i.e. the vocabulary axis of the projection.
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, input_seq, hidden_state):
        """Advance the GRU and score the vocabulary from its new state.

        Args:
            input_seq: Integer tensor of token ids fed to the embedding.
            hidden_state: Current GRU hidden state.

        Returns:
            A pair ``(log_probs, new_hidden_state)``: the log-softmax of the
            linear projection of the GRU's final hidden state, and that
            hidden state itself.
        """
        embedded = self.embedding(input_seq)
        # Only the final hidden state is used; per-step outputs are ignored.
        new_hidden_state = self.recurrent(embedded, hidden_state)[1]
        log_probs = self.softmax(self.output(new_hidden_state))
        return log_probs, new_hidden_state

# The output layer gets one class per reference-vocabulary entry.
decoder = Decoder(hidden_size=HIDDEN_SIZE, vocab_size=len(ref_vocab))
decoder = decoder.to(device)
print(decoder)

Decoder(
  (embedding): Embedding(2750, 128)
  (recurrent): GRU(128, 128)
  (output): Linear(in_features=128, out_features=2750, bias=True)
  (softmax): LogSoftmax(dim=2)
)


# Training

## Train Function

In [None]:
# One training example

def train(input_seq, target_seq, target_length, target_vocab, encoder, decoder,
          encoder_optim, decoder_optim, loss_fn, teacher_forcing_ratio=0.5):
    """Run one optimisation step on a single (input, target) pair.

    The encoder consumes ``input_seq``; its final hidden state seeds the
    decoder, which is then unrolled one token at a time starting from the
    START token. At each step the next decoder input is the gold token with
    probability ``teacher_forcing_ratio`` and the decoder's own argmax
    otherwise. Unrolling stops once the token fed forward is END or when
    ``target_length - 1`` steps have been taken.

    Args:
        input_seq: Token-id tensor passed to the encoder.
        target_seq: Token-id tensor indexed along its first dimension;
            position 0 is expected to hold START and is never scored.
        target_length: Unrolling stops before this position.
        target_vocab: Mapping from token string to id for the target side;
            used to look up START and END.
        encoder: Module exposing ``initialize_hidden_state()`` and callable
            as ``encoder(input_seq, hidden)``.
        decoder: Module callable as ``decoder(prev_word, hidden)`` returning
            ``(log_probs, new_hidden)``.
        encoder_optim: Optimiser for the encoder parameters.
        decoder_optim: Optimiser for the decoder parameters.
        loss_fn: Criterion applied to ``(1, vocab)`` log-probabilities and a
            ``(1,)`` target.
        teacher_forcing_ratio: Probability of feeding the gold token.

    Returns:
        The summed loss divided by the number of decoding steps actually
        taken, as a float. ``0.0`` (with no parameter update) when
        ``target_length`` leaves no step to run.
    """
    encoder.zero_grad()
    decoder.zero_grad()

    # Look END up in the vocabulary that was passed in, not in a global.
    end_index = target_vocab[END]

    encoder_hidden_state = encoder(input_seq, encoder.initialize_hidden_state())
    decoder_hidden_state = encoder_hidden_state
    # Shape (1, 1); placed on the target's device instead of a global one.
    prev_word = torch.tensor([[target_vocab[START]]], dtype=torch.long,
                             device=target_seq.device)

    loss = 0
    steps = 0
    for idx in range(1, target_length):
        decoder_output, decoder_hidden_state = decoder(prev_word, decoder_hidden_state)
        loss = loss + loss_fn(decoder_output.reshape(1, -1),
                              target_seq[idx].reshape(1))
        steps += 1
        if teacher_forcing_ratio > random.random():
            prev_word = target_seq[idx].reshape(1, 1)
        else:
            # detach: the sampled id must not carry gradient into the next step
            prev_word = decoder_output.argmax(dim=-1).detach()
        if prev_word.item() == end_index:
            break

    if steps == 0:
        # Nothing was scored, so `loss` is still the int 0 and has no graph.
        return 0.0

    loss.backward()
    encoder_optim.step()
    decoder_optim.step()
    # Average over the steps that contributed, not the nominal length, so an
    # early stop does not understate the per-token loss.
    return loss.item() / steps

## Optimisers and Loss Function

In [None]:
# One Adam optimiser per network, both with the same learning rate.
encoder_optim, decoder_optim = (
    torch.optim.Adam(network.parameters(), lr=0.001)
    for network in (encoder, decoder)
)
# The decoder emits log-probabilities, which is what NLLLoss expects.
loss_fn = nn.NLLLoss()

## Main Training Loop

In [None]:
# One pass over the training set: each example is its own optimisation step.
idx = 0
epoch_loss = 0
for idx, (mr_ids, ref_ids) in enumerate(zip(train_mr_tokens, train_ref_tokens), start=1):
    # Add a trailing dimension so each sequence is (time, 1).
    input_seq = mr_ids.unsqueeze(dim=-1)
    target_seq = ref_ids.unsqueeze(dim=-1)
    # Take the decode length from the tensor itself rather than hard-coding
    # 52, so it cannot drift from the padding length used in preprocessing.
    epoch_loss += train(input_seq, target_seq, target_seq.shape[0], ref_vocab,
                        encoder, decoder, encoder_optim, decoder_optim, loss_fn, 0.5)
    if idx % 200 == 0:
        print(f"{idx} / {len(train_examples)} examples")

# Report the accumulated loss; previously it was summed but never shown.
if idx:
    print(f"mean loss per example: {epoch_loss / idx:.4f}")

200 / 42022 examples
400 / 42022 examples
600 / 42022 examples
800 / 42022 examples
1000 / 42022 examples
1200 / 42022 examples
1400 / 42022 examples
1600 / 42022 examples
1800 / 42022 examples
2000 / 42022 examples
2200 / 42022 examples
2400 / 42022 examples
2600 / 42022 examples
2800 / 42022 examples
3000 / 42022 examples
3200 / 42022 examples
3400 / 42022 examples
3600 / 42022 examples
3800 / 42022 examples
4000 / 42022 examples
4200 / 42022 examples
4400 / 42022 examples
4600 / 42022 examples
4800 / 42022 examples
5000 / 42022 examples
5200 / 42022 examples
5400 / 42022 examples
5600 / 42022 examples
5800 / 42022 examples
6000 / 42022 examples
6200 / 42022 examples
6400 / 42022 examples
6600 / 42022 examples
6800 / 42022 examples
7000 / 42022 examples
7200 / 42022 examples
7400 / 42022 examples
7600 / 42022 examples
7800 / 42022 examples
8000 / 42022 examples
8200 / 42022 examples
8400 / 42022 examples
8600 / 42022 examples
8800 / 42022 examples
9000 / 42022 examples
9200 / 42022 e

## Saving our model

In [None]:
# Save only the learned parameters (state_dict) of each network, not the
# module objects. Both files are written to the current working directory.
torch.save(encoder.state_dict(), 'encoder.pth')
torch.save(decoder.state_dict(), 'decoder.pth')

In [None]:
# Download the two checkpoint files saved above through the browser.
# NOTE(review): google.colab is presumably only importable inside Colab, so
# this cell is expected to fail elsewhere -- confirm before running locally.
from google.colab import files
files.download('encoder.pth')
files.download('decoder.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Testing

## Loading the test examples

In [None]:
# Load and tokenise the test pairs exactly like the training data.
test_examples, test_ref_tokens, test_mr_tokens, test_lengths = extract_mr_ref(
    get_examples('test.txt')
)

# Map tokens to ids with the vocabularies built from the training set and
# move the resulting id tensors to the active device.
test_ref_tokens = [
    torch.Tensor([ref_vocab[token] for token in sentence]).long().to(device)
    for sentence in test_ref_tokens
]
test_mr_tokens = [
    torch.Tensor([mr_vocab[token] for token in sentence]).long().to(device)
    for sentence in test_mr_tokens
]

## Text Generation Function

In [None]:
def generate(input_seq, target_length, target_vocab, encoder, decoder):
    """Greedily decode a sentence for ``input_seq`` and print it.

    The encoder's final hidden state seeds the decoder, which starts from
    the START token and repeatedly feeds back its own argmax prediction.
    Decoding stops as soon as END is predicted, or after
    ``target_length - 1`` steps. PAD predictions are not printed.

    Args:
        input_seq: Token-id tensor passed to the encoder.
        target_length: Upper bound on decoding; at most
            ``target_length - 1`` tokens are produced.
        target_vocab: Target-side vocabulary supporting ``vocab[token]`` and
            ``vocab.lookup_token(index)``.
        encoder: Module exposing ``initialize_hidden_state()`` and callable
            as ``encoder(input_seq, hidden)``.
        decoder: Module callable as ``decoder(prev_word, hidden)`` returning
            ``(log_probs, new_hidden)``.

    Returns:
        None. Each printed token is followed by a single space and no
        trailing newline is written.
    """
    # Resolve the special ids from the vocabulary that was passed in.
    end_index = target_vocab[END]
    pad_index = target_vocab[PAD]

    with torch.no_grad():
        decoder_hidden_state = encoder(input_seq, encoder.initialize_hidden_state())
        # Shape (1, 1); placed on the input's device instead of a global one.
        prev_word = torch.tensor([[target_vocab[START]]], dtype=torch.long,
                                 device=input_seq.device)

        for _ in range(1, target_length):
            decoder_output, decoder_hidden_state = decoder(prev_word, decoder_hidden_state)
            prev_word = decoder_output.argmax(dim=-1)
            token_index = prev_word.item()
            if token_index == end_index:
                # The sentence is finished; nothing after END is meaningful.
                break
            if token_index == pad_index:
                # Padding is filler, not text: keep decoding but do not print it.
                continue
            print(f"{target_vocab.lookup_token(token_index)}", end=" ")

In [None]:
# Decode the second test MR. unsqueeze adds a trailing dimension to the 1-D id
# tensor; 52 equals MAX_LEN + 2, the length of a padded reference sequence.
generate(test_mr_tokens[1].unsqueeze(dim=-1), 52, ref_vocab, encoder, decoder)

alimentum is a family-friendly restaurant in the city centre . it is not family-friendly . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

In [None]:
# Spot-check 20 randomly drawn test examples: print the gold (mr, ref) pair,
# then the model's generation for the same MR, followed by two blank lines.
for i in range(20):
    idx = random.randrange(len(test_examples))
    print(test_examples[idx])
    input_seq = test_mr_tokens[idx].unsqueeze(dim=-1)
    generate(input_seq, 50, ref_vocab, encoder, decoder)
    print("\n")

('name[Cotto], eatType[coffee shop], food[English], priceRange[Â£20-25], customer rating[high], area[riverside], near[The Portland Arms]', 'Located on the river near The Portland Arms, The Cotto offers a classy place to grab a bite with its five star rating.')
the portland arms , the portland arms , the wrestlers is a high customer rating and a price range of â£20-25 . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

('name[Aromi], eatType[coffee shop], food[Chinese], customer rating[average], area[riverside], familyFriendly[yes]', 'Aromi is a family friendly Chinese food coffee shop in the riverside area.')
aromi is a coffee shop that serves indian food in the riverside area . it is a friendly . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>