In [1]:
pip install torchdata==0.3.0 torchvision==0.12.0 torchtext==0.12.0 torch

Defaulting to user installation because normal site-packages is not writeable
Collecting torchdata==0.3.0
  Downloading torchdata-0.3.0-py3-none-any.whl (47 kB)
[K     |████████████████████████████████| 47 kB 2.6 MB/s eta 0:00:011
[?25hCollecting torchvision==0.12.0
  Downloading torchvision-0.12.0-cp37-cp37m-manylinux1_x86_64.whl (21.0 MB)
[K     |████████████████████████████████| 21.0 MB 8.3 MB/s eta 0:00:01
Installing collected packages: torchdata, torchvision
Successfully installed torchdata-0.3.0 torchvision-0.12.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import gensim
import nltk
import numpy as np
import pandas as pd
import gzip
import torch
from nltk.corpus import brown

nltk.download('brown')
nltk.download('punkt')

# Output, save, and load brown embeddings

model = gensim.models.Word2Vec(brown.sents())
model.save('brown.embedding')

w2v = gensim.models.Word2Vec.load('brown.embedding')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from torchtext import datasets

In [3]:
def loadDF():
    data = {"question": [], "answer": []}
    index = 0
    train_iter, dev_iter = datasets.SQuAD2()
    for context, question, answers, indices in train_iter:
        if answers[0]:
            data["question"].append(question)
            data["answer"].append(answers[0])
        index += 1
    df =  pd.DataFrame.from_dict(data)
    return df
#### note: this function is from a comment on the forum here - https://knowledge.udacity.com/questions/888774

In [4]:
data = loadDF()

In [5]:
import nltk
from nltk.tokenize import RegexpTokenizer

def prepare_text(sentence):
    '''

    Our text needs to be cleaned with a tokenizer. This function will perform that task.
    https://www.nltk.org/api/nltk.tokenize.html

    '''
    #tokens = word_tokenize(sentence)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    tokens = [token.lower() for token in tokens]
    return tokens

In [6]:
data['question_tokens'] = data['question'].apply(prepare_text)
data['answer_tokens'] = data['answer'].apply(prepare_text)

In [7]:
data

Unnamed: 0,question,answer,question_tokens,answer_tokens
0,When did Beyonce start becoming popular?,in the late 1990s,"[when, did, beyonce, start, becoming, popular]","[in, the, late, 1990s]"
1,What areas did Beyonce compete in when she was...,singing and dancing,"[what, areas, did, beyonce, compete, in, when,...","[singing, and, dancing]"
2,When did Beyonce leave Destiny's Child and bec...,2003,"[when, did, beyonce, leave, destiny, s, child,...",[2003]
3,In what city and state did Beyonce grow up?,"Houston, Texas","[in, what, city, and, state, did, beyonce, gro...","[houston, texas]"
4,In which decade did Beyonce become famous?,late 1990s,"[in, which, decade, did, beyonce, become, famous]","[late, 1990s]"
...,...,...,...,...
86816,In what US state did Kathmandu first establish...,Oregon,"[in, what, us, state, did, kathmandu, first, e...",[oregon]
86817,What was Yangon previously known as?,Rangoon,"[what, was, yangon, previously, known, as]",[rangoon]
86818,With what Belorussian city does Kathmandu have...,Minsk,"[with, what, belorussian, city, does, kathmand...",[minsk]
86819,In what year did Kathmandu create its initial ...,1975,"[in, what, year, did, kathmandu, create, its, ...",[1975]


In [8]:
from sklearn.model_selection import train_test_split
def split(SRC, TRG):
    
    '''
    Input: SRC, our list of questions from the dataset
            TRG, our list of responses from the dataset

    Output: Training and test datasets for SRC & TRG

    '''
    
    SRC_train_dataset, SRC_test_dataset, TRG_train_dataset, TRG_test_dataset = train_test_split(SRC, TRG, test_size=0.2, random_state=42)
    
    return SRC_train_dataset, SRC_test_dataset, TRG_train_dataset, TRG_test_dataset

In [9]:
SRC_train_dataset, SRC_test_dataset, TRG_train_dataset, TRG_test_dataset = split(data['question_tokens'], data['answer_tokens'])

In [10]:
SRC_train_dataset

34614                         [who, shot, queen, victoria]
84775    [how, many, people, are, on, the, territorial,...
75487            [where, does, sr, 94, merge, with, i, 15]
24088    [when, did, natural, bronze, start, to, be, us...
36068    [who, was, the, final, king, of, the, attalid,...
                               ...                        
6265                  [what, is, the, biggest, known, dog]
54886             [what, encoding, does, charis, sil, use]
76820            [who, made, the, demonstration, in, 1943]
860      [at, what, age, did, frédéric, start, giving, ...
15795       [where, is, corruption, even, more, prevalent]
Name: question_tokens, Length: 69456, dtype: object

In [11]:
TRG_train_dataset

34614                            [roderick, maclean]
84775                                     [nineteen]
75487                                  [at, miramar]
24088                                     [5500, bc]
36068                                 [attalus, iii]
                            ...                     
6265                              [english, mastiff]
54886    [graphite, opentype, or, aat, technologies]
76820                              [luria, delbrück]
860                                              [7]
15795                     [non, privatized, sectors]
Name: answer_tokens, Length: 69456, dtype: object

In [12]:
class Vocabulary:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.num_words = 0
        
        self.add_token('<UNK>')

    def add_token(self, token):
        if token not in self.word2index:
            self.word2index[token] = self.num_words
            self.word2count[token] = 1
            self.index2word[self.num_words] = token
            self.num_words += 1
        else:
            self.word2count[token] += 1

    def add_tokens(self, tokens):
        for token in tokens:
            self.add_token(token)
            
    def discard_rare_words(self, min_count):
        tokens_to_remove = []
        for token in self.word2count:
            if self.word2count[token] < min_count:
                tokens_to_remove.append(token)

        for token in tokens_to_remove:
            del self.word2index[token]
            del self.word2count[token]

        self.index2word = {index: token for token, index in self.word2index.items()}
        self.num_words = len(self.word2index)

    def __len__(self):
        return self.num_words

    def __str__(self):
        return f"Vocabulary size: {self.num_words}"

    def token_to_index(self, token):
        return self.word2index.get(token, self.word2index['<UNK>'])

    def index_to_token(self, index):
        return self.index2word.get(index, '<UNK>')

    def get_token_count(self, token):
        return self.word2count.get(token, 0)


In [13]:
vocabulary = Vocabulary()
vocabulary_src = Vocabulary()
vocabulary_trg = Vocabulary()

In [14]:
for row in SRC_train_dataset:
    vocabulary.add_tokens(row)
    vocabulary_src.add_tokens(row)
for row in SRC_test_dataset:
    vocabulary.add_tokens(row)
    vocabulary_src.add_tokens(row)

In [15]:
for row in TRG_train_dataset:
    vocabulary.add_tokens(row)
    vocabulary_trg.add_tokens(row)
for row in TRG_test_dataset:
    vocabulary.add_tokens(row)
    vocabulary_trg.add_tokens(row)

In [16]:
print(vocabulary)
print(vocabulary.token_to_index('how'))
print(vocabulary.index_to_token(3))
print(vocabulary.get_token_count('how'))

Vocabulary size: 52519
5
queen
9500


In [17]:
vocabulary.discard_rare_words(2)
print(vocabulary)

Vocabulary size: 30359


In [18]:
vocabulary_src.discard_rare_words(1)
print(vocabulary_src)

Vocabulary size: 36743


In [19]:
vocabulary_trg.discard_rare_words(1)
print(vocabulary_trg)

Vocabulary size: 37896


In [20]:
vocabulary.word2index['<UNK>'] = len(vocabulary.word2index)
vocabulary_src.word2index['<UNK>'] = len(vocabulary_src.word2index)
vocabulary_trg.word2index['<UNK>'] = len(vocabulary_trg.word2index)
def turn_to_indices_src(dataset):    
    # Create an empty list to store the indices
    dataset_indices = []

    # Iterate through each row in the dataset
    for row in dataset:
        # Create an empty list to store the indices of tokens in the row
        row_indices = []

        # Iterate through each token in the row
        for token in row:
        # Convert the token to its index using the token_to_index function
            if token in vocabulary_src.word2index:
                index = vocabulary_src.word2index[token]
            else:
                index = vocabulary_src.word2index['<UNK>']

            # Append the index to the row_indices list
            row_indices.append(index)

        # Append the row_indices list to the dataset_indices list
        dataset_indices.append(row_indices)
    return dataset_indices

def turn_to_indices_trg(dataset):    
    # Create an empty list to store the indices
    dataset_indices = []

    # Iterate through each row in the dataset
    for row in dataset:
        # Create an empty list to store the indices of tokens in the row
        row_indices = []

        # Iterate through each token in the row
        for token in row:
        # Convert the token to its index using the token_to_index function
            if token in vocabulary_trg.word2index:
                index = vocabulary_trg.word2index[token]
            else:
                index = vocabulary_trg.word2index['<UNK>']

            # Append the index to the row_indices list
            row_indices.append(index)

        # Append the row_indices list to the dataset_indices list
        dataset_indices.append(row_indices)
    return dataset_indices

In [21]:
questions_list_train = turn_to_indices_src(SRC_train_dataset)
questions_list_test = turn_to_indices_src(SRC_test_dataset)
answers_list_train = turn_to_indices_trg(TRG_train_dataset)
answers_list_test = turn_to_indices_trg(TRG_test_dataset)

In [22]:
def get_median(list_of_lists):

    # Calculate the lengths of the inner lists
    lengths = [len(inner_list) for inner_list in list_of_lists]

    # Sort the lengths
    sorted_lengths = sorted(lengths)
    
    # Find the median length
    if len(sorted_lengths) % 2 == 0:
        median_length = (sorted_lengths[len(sorted_lengths) // 2] + sorted_lengths[len(sorted_lengths) // 2 - 1]) / 2
    else:
        median_length = sorted_lengths[len(sorted_lengths) // 2]
        
    return median_length

In [23]:
get_median(questions_list_train)

10.0

In [24]:
get_median(answers_list_train)

2.0

In [25]:
def trim_inner_lists(list_of_lists, max_length):
    truncated_list_of_lists = [inner_list[:max_length] for inner_list in list_of_lists]
    return truncated_list_of_lists

In [26]:
max_length_question = 10
max_length_answer = 4

In [27]:
questions_list_train = trim_inner_lists(questions_list_train, max_length_question)
questions_list_test = trim_inner_lists(questions_list_test, max_length_question)
answers_list_train = trim_inner_lists(answers_list_train, max_length_answer)
answers_list_test = trim_inner_lists(answers_list_test, max_length_answer)

In [28]:
padded_questions_list_train = [seq + [0] * (max_length_question - len(seq)) for seq in questions_list_train]
padded_questions_list_test = [seq + [0] * (max_length_question - len(seq)) for seq in questions_list_test]
padded_answers_list_train = [seq + [0] * (max_length_answer - len(seq)) for seq in answers_list_train]
padded_answers_list_test = [seq + [0] * (max_length_answer - len(seq)) for seq in answers_list_test]

In [29]:
train_data = list(zip(padded_questions_list_train, padded_answers_list_train))
test_data = list(zip(padded_questions_list_test, padded_answers_list_test))

In [30]:
len(train_data)

69456

In [31]:
train_data_subset = train_data[0:9600]
test_data_subset = test_data[0:9600]

In [32]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer

import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sequence, label = self.data[index]
        return torch.tensor(sequence), torch.tensor(label)

In [33]:
train_dataset = MyDataset(train_data)
test_dataset = MyDataset(test_data)

In [34]:
train_dataset_subset = MyDataset(train_data_subset)
test_dataset_subset = MyDataset(test_data_subset)

In [35]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [36]:
train_dataloader_subset = DataLoader(train_dataset_subset, batch_size=32, shuffle=True)
test_dataloader_subset = DataLoader(test_dataset_subset, batch_size=32, shuffle=True)

In [37]:
BATCH_SIZE = 32

# pytorch boilerplate that determines whether a GPU is present or not,
# this determines whether our dataset or model can to moved to a GPU
device = torch.device('cpu')

In [38]:
# pretend that we're iterating over the iterator and print out the print element
test_batch = next(iter(train_dataloader))
src,trg = test_batch
src = src.T
trg = trg.T

In [39]:
src.shape

torch.Size([10, 32])

In [40]:
trg.shape

torch.Size([4, 32])

Please note that the code for the Encoder, Decoder and Seq2Seq model was from this source - http://ethen8181.github.io/machine-learning/deep_learning/seq2seq/1_torch_seq2seq_intro.html

Use of this code was suggested by this mentor in this query - https://knowledge.udacity.com/questions/908515
Small adjustments were made to it for my specific code and dataset

In [41]:
# adjustable parameters
INPUT_DIM = len(vocabulary_src)
OUTPUT_DIM = len(vocabulary_trg)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

In [42]:
import torch.nn as nn

class Encoder(nn.Module):
    """
    Input :
        - source batch
    Layer : 
        source batch -> Embedding -> LSTM
    Output :
        - LSTM hidden state
        - LSTM cell state

    Parmeters
    ---------
    input_dim : int
        Input dimension, should equal to the source vocab size.
    
    emb_dim : int
        Embedding layer's dimension.
        
    hid_dim : int
        LSTM Hidden/Cell state's dimension.
        
    n_layers : int
        Number of LSTM layers.
        
    dropout : float
        Dropout for the LSTM layer.
    """

    def __init__(self, input_dim: int, emb_dim: int, hid_dim: int, n_layers: int, dropout: float):
        super().__init__()
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)

    def forward(self, src_batch: torch.LongTensor):
        """

        Parameters
        ----------
        src_batch : 2d torch.LongTensor
            Batched tokenized source sentence of shape [sent len, batch size].

        Returns
        -------
        hidden, cell : 3d torch.LongTensor
            Hidden and cell state of the LSTM layer. Each state's shape
            [n layers * n directions, batch size, hidden dim]
        """
        embedded = self.embedding(src_batch) # [sent len, batch size, emb dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs -> [sent len, batch size, hidden dim * n directions]
        return hidden, cell

In [43]:
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
hidden, cell = encoder(src)
hidden.shape, cell.shape

(torch.Size([2, 32, 512]), torch.Size([2, 32, 512]))

In [44]:
class Decoder(nn.Module):
    """
    Input :
        - first token in the target batch
        - LSTM hidden state from the encoder
        - LSTM cell state from the encoder
    Layer :
        target batch -> Embedding -- 
                                   |
        encoder hidden state ------|--> LSTM -> Linear
                                   |
        encoder cell state   -------
        
    Output :
        - prediction
        - LSTM hidden state
        - LSTM cell state

    Parmeters
    ---------
    output : int
        Output dimension, should equal to the target vocab size.
    
    emb_dim : int
        Embedding layer's dimension.
        
    hid_dim : int
        LSTM Hidden/Cell state's dimension.
        
    n_layers : int
        Number of LSTM layers.
        
    dropout : float
        Dropout for the LSTM layer.
    """

    def __init__(self, output_dim: int, emb_dim: int, hid_dim: int, n_layers: int, dropout: float):
        super().__init__()
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.out = nn.Linear(hid_dim, output_dim)

    def forward(self, trg: torch.LongTensor, hidden: torch.FloatTensor, cell: torch.FloatTensor):
        """

        Parameters
        ----------
        trg : 1d torch.LongTensor
            Batched tokenized source sentence of shape [batch size].
            
        hidden, cell : 3d torch.FloatTensor
            Hidden and cell state of the LSTM layer. Each state's shape
            [n layers * n directions, batch size, hidden dim]

        Returns
        -------
        prediction : 2d torch.LongTensor
            For each token in the batch, the predicted target vobulary.
            Shape [batch size, output dim]

        hidden, cell : 3d torch.FloatTensor
            Hidden and cell state of the LSTM layer. Each state's shape
            [n layers * n directions, batch size, hidden dim]
        """
        # [1, batch size, emb dim], the 1 serves as sent len
        embedded = self.embedding(trg.unsqueeze(0))
        outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.out(outputs.squeeze(0))
        return prediction, hidden, cell

In [45]:
trg.max()

tensor(32154)

In [46]:
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT).to(device)

# notice that we are not passing the entire the .trg
prediction, hidden, cell = decoder(trg[0], hidden, cell)
prediction.shape, hidden.shape, cell.shape

(torch.Size([32, 37896]), torch.Size([2, 32, 512]), torch.Size([2, 32, 512]))

In [47]:
import random

In [48]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, device: torch.device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, \
            'Hidden dimensions of encoder and decoder must be equal!'
        assert encoder.n_layers == decoder.n_layers, \
            'Encoder and decoder must have equal number of layers!'

    def forward(self, src_batch: torch.LongTensor, trg_batch: torch.LongTensor,
                teacher_forcing_ratio: float=0.5):

        max_len, batch_size = trg_batch.shape
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder's output
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        # last hidden & cell state of the encoder is used as the decoder's initial hidden state
        hidden, cell = self.encoder(src_batch)

        trg = trg_batch[0]
        for i in range(1, max_len):
            prediction, hidden, cell = self.decoder(trg, hidden, cell)
            outputs[i] = prediction

            if random.random() < teacher_forcing_ratio:
                trg = trg_batch[i]
            else:
                trg = prediction.argmax(1)

        return outputs

In [49]:
# note that this implementation assumes that the size of the hidden layer,
# and the number of layer are the same between the encoder and decoder
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)
seq2seq

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(36743, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(37896, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (out): Linear(in_features=512, out_features=37896, bias=True)
  )
)

In [50]:
outputs = seq2seq(src, trg)
outputs.shape

torch.Size([4, 32, 37896])

In [214]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(seq2seq):,} trainable parameters')

The model has 45,904,648 trainable parameters


In [55]:
import torch.optim as optim
optimizer = optim.Adam(seq2seq.parameters())
criterion = nn.CrossEntropyLoss()
log_interval = 10

Please note that only a subset was used to train as with the full set of data, the model was training for over 4 hours and was still not close to being finished.

In [216]:
seq2seq.train()
num_epochs = 3
log_interval = 10  # Define the log interval
best_valid_loss = float('inf')

for epoch in range(num_epochs):
    for batch_idx, (src, trg)  in enumerate(train_dataloader_subset):
        src = src.to(device)
        src = src.T
        trg = trg.to(device)
        trg = trg.T

        optimizer.zero_grad()
        outputs = seq2seq(src, trg)
        
        outputs = outputs.view(-1, outputs.size(2))  # Reshape outputs tensor
        trg = trg.reshape(-1)  # Reshape trg tensor
        
        loss = criterion(outputs, trg)
        
        if loss < best_valid_loss:
            best_valid_loss = loss
            torch.save(seq2seq.state_dict(), 'tut1-model.pt')
        
        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            print(f"Epoch: {epoch+1}, Batch: {batch_idx+1}/{len(train_dataloader_subset)}, Loss: {loss.item()}")

Epoch: 1, Batch: 1/300, Loss: 10.529274940490723
Epoch: 1, Batch: 11/300, Loss: 5.48054313659668
Epoch: 1, Batch: 21/300, Loss: 6.142960548400879
Epoch: 1, Batch: 31/300, Loss: 6.650991439819336
Epoch: 1, Batch: 41/300, Loss: 6.264608383178711
Epoch: 1, Batch: 51/300, Loss: 6.003070831298828
Epoch: 1, Batch: 61/300, Loss: 6.554052829742432
Epoch: 1, Batch: 71/300, Loss: 5.9720377922058105
Epoch: 1, Batch: 81/300, Loss: 6.671511650085449
Epoch: 1, Batch: 91/300, Loss: 6.475656032562256
Epoch: 1, Batch: 101/300, Loss: 5.888495445251465
Epoch: 1, Batch: 111/300, Loss: 4.94251823425293
Epoch: 1, Batch: 121/300, Loss: 6.292538642883301
Epoch: 1, Batch: 131/300, Loss: 5.973886489868164
Epoch: 1, Batch: 141/300, Loss: 6.54754638671875
Epoch: 1, Batch: 151/300, Loss: 6.553583145141602
Epoch: 1, Batch: 161/300, Loss: 6.183943271636963
Epoch: 1, Batch: 171/300, Loss: 5.77127742767334
Epoch: 1, Batch: 181/300, Loss: 6.275703430175781
Epoch: 1, Batch: 191/300, Loss: 5.576815605163574
Epoch: 1, Bat

In [51]:
seq2seq.load_state_dict(torch.load('tut1-model.pt'))

<All keys matched successfully>

In [56]:
seq2seq.eval()

epoch_loss = 0
with torch.no_grad():
    for batch_idx, (src, trg)  in enumerate(test_dataloader_subset):
        src = src.to(device)
        src = src.T
        trg = trg.to(device)
        trg = trg.T
        
        # turn off teacher forcing
        outputs = seq2seq(src, trg)

        # trg = [trg sent len, batch size]
        # output = [trg sent len, batch size, output dim]
        outputs = outputs.view(-1, outputs.size(2))  # Reshape outputs tensor
        trg = trg.reshape(-1)  # Reshape trg tensor
        
        loss = criterion(outputs, trg)
        epoch_loss += loss.item()
        
        if batch_idx % log_interval == 0:
            print(f'Batch: {batch_idx+1}/{len(test_dataloader_subset)}')

Batch: 1/300
Batch: 11/300
Batch: 21/300
Batch: 31/300
Batch: 41/300
Batch: 51/300
Batch: 61/300
Batch: 71/300
Batch: 81/300
Batch: 91/300
Batch: 101/300
Batch: 111/300
Batch: 121/300
Batch: 131/300
Batch: 141/300
Batch: 151/300
Batch: 161/300
Batch: 171/300
Batch: 181/300
Batch: 191/300
Batch: 201/300
Batch: 211/300
Batch: 221/300
Batch: 231/300
Batch: 241/300
Batch: 251/300
Batch: 261/300
Batch: 271/300
Batch: 281/300
Batch: 291/300


In [57]:
print(f'| Test Loss: {epoch_loss / len(test_dataloader_subset)}')

| Test Loss: 6.262801626523336


As a bonus, here are my chatbot interaction functions. My chatbot was only trained on a subset of the data, so most responses are that it does not know.

In [58]:
def turn_to_indices(question):
    indexed_question = []
    for elem in question:
        indexed_question.append(vocabulary_src.token_to_index(elem))
    return indexed_question

In [59]:
def turn_to_indices_answer(answer):
    indexed_answer = []
    for elem in answer:
        indexed_answer.append(vocabulary_trg.token_to_index(elem))
    return indexed_answer

In [60]:
def size_question(question):
    if len(question)>10:
        return question[0:10]
    elif len(question)==10:
        return question
    else:
        padding = 10 - len(question)
        return question + [0] * padding

In [61]:
def size_answer(question):
    if len(question)>4:
        return question[0:4]
    elif len(question)==4:
        return question
    else:
        padding = 4 - len(question)
        return question + [0] * padding

In [62]:
def tensor_question(question):
    question = torch.tensor(question)
    return question.unsqueeze(0).T

In [63]:
def prepare_question(question):
    return tensor_question(size_question(turn_to_indices(question)))

In [64]:
def prepare_answer(answer):
    return tensor_question(size_answer(turn_to_indices_answer(answer)))

In [82]:
def ask_a_question(question, predicted_answer):
    question = prepare_text(question)
    answer = prepare_text(predicted_answer)
    
    question = prepare_question(question)
    answer = prepare_answer(answer)
    
    seq2seq.eval()
    outputs = seq2seq(question, answer)
    
    batch_size, num_examples, vocab_size = outputs.shape
    reshaped_output = outputs.reshape(batch_size * num_examples, vocab_size)
    #print(reshaped_output)
    predicted_indices = np.argmax(reshaped_output.detach().numpy(), axis=1)
    #print(predicted_indices)
    predicted_tokens = [vocabulary_trg.index_to_token(index) for index in predicted_indices]
    reshaped_predicted_tokens = np.array(predicted_tokens).reshape(num_examples, batch_size)
    
    response = list(reshaped_predicted_tokens[0]).remove('<UNK>')
    # Remove the <UNK> token from the list

    if response:
        r = ' '.join(response)
    else:
        r = 'I do not know, sorry.'
        
    return r

In [83]:
ask_a_question('what country are you from?', 'england')

'I do not know, sorry.'