In [33]:
#libraries
import os
import tqdm
import json
import zipfile
import tarfile
import pickle
import numpy as np
import urllib.request
import torch
import torch.nn as nn
import utils
from utils import tokenizer, clean_text, word_tokenize, build_vocab, build_embeddings, convert_idx
import layers
# Accelerator switch: keep False to run everything on the CPU.
cuda = False
# Torch device derived from the switch above.
device = torch.device("cuda") if cuda else torch.device("cpu")

In [34]:
# experiment ID (also used as the name of the output folder)
exp = "exp-1"

# Base URL of the SQuAD 2.0 dataset; the file name is appended when downloading
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset"

# data directories
# NOTE(review): these are absolute, machine-specific paths -- adapt them before
# running the notebook on another machine.
data_dir = "/home/cheeta/nsr/squad/Data/squad/"
train_dir = data_dir + "train/"
dev_dir = data_dir + "dev/"

# model paths
spacy_en = "/home/cheeta/nsr/squad/Data/spacy/en_core_web_sm-2.0.0/en_core_web_sm/en_core_web_sm-2.0.0"
# "{}" is a placeholder left in the GloVe file name (e.g. 100 -> glove.6B.100d.txt)
glove = "/home/cheeta/nsr/squad/Data/glove.6B/" + "glove.6B.{}d.txt"
squad_models = "output/" + exp

# preprocessing values
# presumably -1 means "no cap on the vocabulary size" -- confirm in utils.build_vocab
max_words = -1
word_embedding_size = 100
char_embedding_size = 8
# examples whose context / question / word length reaches these limits are dropped
max_len_context = 400
max_len_question = 50
max_len_word = 25

# training hyper-parameters
num_epochs = 15
batch_size = 64
learning_rate = 0.5
drop_prob = 0.2
hidden_size = 100
char_channel_width = 5
char_channel_size = 100
cuda = False
# Resume from the checkpoint saved in the experiment folder (True) or train
# from scratch (False). The training cell reads this name, which used to be
# undefined and made the cell fail with a NameError on a fresh kernel.
pretrained = False

# Utils



In [35]:
# MAKE DATASET

def download_squad(url, filename, out_dir):
    """Download ``filename`` from ``url`` into ``out_dir`` and unpack archives.

    Args:
        url: Base URL under which the file is published.
        filename: Name of the remote file; also used as the local file name.
        out_dir: Local directory receiving the file (created when missing).

    The download is skipped when ``out_dir/filename`` already exists. Files
    ending in ``.zip``, ``.tar.gz`` or ``.tgz`` are extracted into ``out_dir``.

    Raises:
        urllib.error.URLError: If the download fails.
    """
    # path for local file.
    save_path = os.path.join(out_dir, filename)

    if os.path.exists(save_path):
        print("File already present, skipping download:", save_path)
    else:
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        print("Downloading", filename, "...")

        # URLs always use "/" as separator, so they must not be assembled with
        # os.path.join (which uses the platform's path separator).
        file_url = url.rstrip("/") + "/" + filename
        # Download to a temporary name first: an interrupted transfer must not
        # leave a truncated file that later runs would mistake for a complete one.
        partial_path = save_path + ".part"
        urllib.request.urlretrieve(url=file_url, filename=partial_path)
        os.replace(partial_path, save_path)
        print("File downloaded successfully!")

    # Archives are opened through save_path (not the bare file name) so that
    # extraction works whatever the current working directory is.
    if filename.endswith(".zip"):
        print("Extracting ZIP file...")
        with zipfile.ZipFile(file=save_path, mode="r") as archive:
            archive.extractall(out_dir)
        print("File extracted successfully!")
    elif filename.endswith((".tar.gz", ".tgz")):
        print("Extracting TAR file...")
        with tarfile.open(name=save_path, mode="r:gz") as archive:
            archive.extractall(out_dir)
        print("File extracted successfully!")





In [37]:
# SQUAD PRE PROCESSING


class SquadPreprocessor:
    """Prepare the SQuAD JSON files for training a question-answering model.

    The work happens in two stages:
      * ``split_data`` / ``preprocess`` write one tokenized context, question,
        answer and label line per answerable question into
        ``<data_dir>/<split>/<split>.{context,question,answer,labels}``;
      * ``extract_features`` turns those text files into fixed-size arrays of
        word and character ids saved as ``<split>_features.npz``.

    Args:
        data_dir: Directory holding the SQuAD JSON files; outputs go below it.
        train_filename: Name of the training JSON file.
        dev_filename: Name of the development JSON file.
        tokenizer: Tokenizer object, stored as ``self.tokenizer``.
    """

    def __init__(self, data_dir, train_filename, dev_filename, tokenizer):
        self.data_dir = data_dir
        self.train_filename = train_filename
        self.dev_filename = dev_filename
        self.data = None
        # Keep the tokenizer handed in by the caller (the argument used to be
        # ignored in favour of the module-level utils.tokenizer).
        self.tokenizer = tokenizer

    def load_data(self, filename="train-v2.0.json"):
        """Read ``filename`` from ``data_dir`` and keep the parsed JSON in ``self.data``."""
        filepath = os.path.join(self.data_dir, filename)
        # explicit encoding: do not depend on the platform default when reading JSON
        with open(filepath, encoding="utf-8") as f:
            self.data = json.load(f)

    def split_data(self, filename):
        """Write the context/question/answer/label text files for one JSON file.

        The split name is the part of ``filename`` before the first ``-``
        (``train-v2.0.json`` -> ``train``). Each label is written as
        ``"<first token index> <last token index>"``; several labels of the
        same question are joined with ``|``. For the ``train`` split only the
        first answer is used, otherwise every answer is. Questions without any
        answer that can be aligned with the context tokens are skipped.
        """
        self.load_data(filename)
        sub_dir = filename.split('-')[0]

        # create a subdirectory for Train and Dev data
        target_dir = os.path.join(self.data_dir, sub_dir)
        os.makedirs(target_dir, exist_ok=True)
        prefix = os.path.join(target_dir, sub_dir)

        with open(prefix + '.context', 'w', encoding="utf-8") as context_file,\
             open(prefix + '.question', 'w', encoding="utf-8") as question_file,\
             open(prefix + '.answer', 'w', encoding="utf-8") as answer_file,\
             open(prefix + '.labels', 'w', encoding="utf-8") as labels_file:

            # loop over the articles
            for article in tqdm.tqdm(self.data['data']):
                # loop over the paragraphs
                for paragraph in article['paragraphs']:
                    context = clean_text(paragraph['context'])
                    context_tokens = [w for w in word_tokenize(context) if w]
                    # one span per context token; span[0] / span[1] are compared
                    # with the character offsets of the answer below
                    spans = convert_idx(context, context_tokens)
                    # loop over Q/A
                    for qa in paragraph['qas']:
                        question = clean_text(qa['question'])
                        question_tokens = [w for w in word_tokenize(question) if w]

                        answers = qa['answers']
                        if sub_dir == "train":
                            # select only one ground truth, the top answer, if any
                            answers = answers[:1]

                        labels = []
                        answer_tokens = []
                        for answer in answers:
                            answer_text = clean_text(answer['text'])
                            answer_tokens = [w for w in word_tokenize(answer_text) if w]
                            answer_start = answer['answer_start']
                            answer_stop = answer_start + len(answer_text)
                            # indexes of the tokens overlapping [answer_start, answer_stop)
                            answer_span = [idx for idx, span in enumerate(spans)
                                           if not (answer_stop <= span[0] or answer_start >= span[1])]
                            if answer_span:
                                labels.append(str(answer_span[0]) + ' ' + str(answer_span[-1]))

                        # Nothing to learn from / evaluate on: the question has no
                        # answer, or none could be aligned with the tokens. Writing
                        # an empty label line would break label parsing later on.
                        if not labels:
                            continue

                        # write to file (the .answer line holds the last answer processed)
                        context_file.write(' '.join(context_tokens) + '\n')
                        question_file.write(' '.join(question_tokens) + '\n')
                        answer_file.write(' '.join(answer_tokens) + '\n')
                        labels_file.write("|".join(labels) + "\n")

    def preprocess(self):
        """Run ``split_data`` on the training file, then on the development file."""
        # use the file names given to the constructor rather than same-named globals
        self.split_data(self.train_filename)
        self.split_data(self.dev_filename)

    @staticmethod
    def _encode_tokens(tokens, word2idx, char2idx, max_len, max_len_word):
        """Map ``tokens`` to zero-initialised word-id and char-id arrays (id 1 when unknown)."""
        word_ids = np.zeros([max_len], dtype=np.int32)
        char_ids = np.zeros([max_len, max_len_word], dtype=np.int32)
        for j, word in enumerate(tokens):
            word_ids[j] = word2idx[word] if word in word2idx else 1
            for k, char in enumerate(word):
                char_ids[j, k] = char2idx[char] if char in char2idx else 1
        return word_ids, char_ids

    def extract_features(self, max_len_context=max_len_context, max_len_question=max_len_question,
                         max_len_word=max_len_word, is_train=True):
        """Convert the text files of one split into arrays of ids and save them.

        Args:
            max_len_context: Contexts must have fewer tokens than this.
            max_len_question: Questions must have fewer tokens than this
                (and more than three).
            max_len_word: Every token must have fewer characters than this.
            is_train: ``True`` for the ``train`` split (also builds vocabulary
                and embeddings), ``False`` for ``dev`` (re-uses the pickled
                ``word2idx`` / ``char2idx`` from the ``train`` folder).

        Writes ``<data_dir>/<split>/<split>_features.npz`` with the arrays
        ``context_idxs``, ``context_char_idxs``, ``question_idxs``,
        ``question_char_idxs`` and ``label``.

        Raises:
            ValueError: If no example satisfies the length constraints.
        """
        # choose the right directory
        directory = "train" if is_train else "dev"
        prefix = os.path.join(self.data_dir, directory, directory)

        # load context
        with open(prefix + ".context", "r", encoding="utf-8") as c:
            context = c.readlines()
        # load questions
        with open(prefix + ".question", "r", encoding="utf-8") as q:
            question = q.readlines()
        # load labels
        with open(prefix + ".labels", "r", encoding="utf-8") as l:
            labels = l.readlines()

        # clean and tokenize context and question
        context = [list(word_tokenize(clean_text(doc.strip('\n')))) for doc in context]
        question = [list(word_tokenize(clean_text(doc.strip('\n')))) for doc in question]

        if is_train:
            # one "start end" pair per line -> integer array
            labels = [np.array(l.strip("\n").split(), dtype=np.int32) for l in labels]

            # NOTE(review): file names are passed without a directory; presumably
            # build_vocab / build_embeddings resolve them themselves -- confirm in utils.
            word_vocab, word2idx, char_vocab, char2idx = build_vocab(directory + ".context", directory + ".question",
                                                                     "word_vocab.pkl", "word2idx.pkl", "char_vocab.pkl",
                                                                     "char2idx.pkl", is_train, max_words)
            # create an embedding matrix from the vocabulary with pretrained vectors (GloVe) for words
            build_embeddings(word_vocab, embedding_path=glove, output_path="word_embeddings.pkl",
                             vec_size=word_embedding_size)
            build_embeddings(char_vocab, embedding_path="", output_path="char_embeddings.pkl",
                             vec_size=char_embedding_size)
        else:
            # dev labels stay strings: a line may hold several "|"-separated answers
            labels = np.array([l.strip("\n") for l in labels])

            with open(os.path.join(self.data_dir, "train", "word2idx.pkl"), "rb") as wi,\
                 open(os.path.join(self.data_dir, "train", "char2idx.pkl"), "rb") as ci:
                word2idx = pickle.load(wi)
                char2idx = pickle.load(ci)

        def longest_word(tokens):
            # default=0 keeps an empty token list from crashing max()
            return max((len(w) for w in tokens), default=0)

        print("Number of questions before filtering:", len(question))
        keep = [0 < len(c) < max_len_context and longest_word(c) < max_len_word and
                3 < len(q) < max_len_question and longest_word(q) < max_len_word
                for c, q in zip(context, question)]
        kept = [(c, q, l) for c, q, l, k in zip(context, question, labels, keep) if k]
        if not kept:
            raise ValueError("No example satisfies the length constraints for the '{}' split."
                             .format(directory))
        context, question, labels = zip(*kept)
        print("Number of questions after filtering ", len(question))

        # replace the tokenized words with their associated ID in the vocabulary
        context_idxs = []
        context_char_idxs = []
        question_idxs = []
        question_char_idxs = []
        for c, q in tqdm.tqdm(zip(context, question), total=len(context)):
            context_idx, context_char_idx = self._encode_tokens(
                c, word2idx, char2idx, max_len_context, max_len_word)
            question_idx, question_char_idx = self._encode_tokens(
                q, word2idx, char2idx, max_len_question, max_len_word)
            context_idxs.append(context_idx)
            context_char_idxs.append(context_char_idx)
            question_idxs.append(question_idx)
            question_char_idxs.append(question_char_idx)

        # save features as numpy arrays
        np.savez(prefix + "_features",
                 context_idxs=np.array(context_idxs),
                 context_char_idxs=np.array(context_char_idxs),
                 question_idxs=np.array(question_idxs),
                 question_char_idxs=np.array(question_char_idxs),
                 label=np.array(labels))



In [None]:
# Download SQuAD, then run the preprocessing and feature extraction

# names of the two SQuAD 2.0 files to fetch and process
train_filename, dev_filename = "train-v2.0.json", "dev-v2.0.json"

# step 1: fetch both splits into the data directory
download_squad(url=url, filename=train_filename, out_dir=data_dir)
download_squad(url=url, filename=dev_filename, out_dir=data_dir)

# step 2: write the tokenized text files for both splits
p = SquadPreprocessor(data_dir=data_dir,
                      train_filename=train_filename,
                      dev_filename=dev_filename,
                      tokenizer=utils.tokenizer)
p.preprocess()

# step 3: build the padded id features, training split first
p.extract_features(max_len_context=max_len_context,
                   max_len_question=max_len_question,
                   max_len_word=max_len_word,
                   is_train=True)
p.extract_features(max_len_context=max_len_context,
                   max_len_question=max_len_question,
                   max_len_word=max_len_word,
                   is_train=False)

In [None]:
# Data Loader  -- pytorch


# The base class is referenced through the already imported ``torch`` package:
# the bare name ``data`` (torch.utils.data) is never imported in this notebook.
class SquadDataset(torch.utils.data.Dataset):
    """Custom Dataset for SQuAD data compatible with torch.utils.data.DataLoader.

    Args:
        w_context: Indexable collection of word-level context features.
        c_context: Indexable collection of character-level context features.
        w_question: Indexable collection of word-level question features.
        c_question: Indexable collection of character-level question features.
        labels: Indexable collection of labels.

    All five collections are indexed with the same position; the dataset
    length is the length of ``w_context``.
    """

    def __init__(self, w_context, c_context, w_question, c_question, labels):
        """Store the feature collections and the labels."""
        self.w_context = w_context
        self.c_context = c_context
        self.w_question = w_question
        self.c_question = c_question
        self.labels = labels

    def __getitem__(self, index):
        """Return the tuple (word context, character context, word question,
        character question, label) stored at position ``index``."""
        return (self.w_context[index], self.c_context[index],
                self.w_question[index], self.c_question[index],
                self.labels[index])

    def __len__(self):
        """Number of examples, taken from the word-level contexts."""
        return len(self.w_context)

In [None]:
# Model

class BiDAF(nn.Module):
    """Baseline BiDAF reader for SQuAD.

    Implements the architecture of "Bidirectional Attention Flow for Machine
    Comprehension" (Seo, Kembhavi, Farhadi, Hajishirzi;
    https://arxiv.org/abs/1611.01603) as a stack of five stages taken from the
    project's ``layers`` module:

        1. ``emb``: embeds word and character indices.
        2. ``enc``: RNN encoder shared by context and question.
        3. ``att``: bidirectional attention between context and question.
        4. ``mod``: two-layer RNN encoder over the attention output.
        5. ``out``: output layer producing the two predictions.

    Args:
        word_vectors (torch.Tensor): Pre-trained word vectors.
        char_vectors (torch.Tensor): Character vectors.
        hidden_size (int): Number of features in the hidden state at each layer.
        drop_prob (float): Dropout probability.
    """
    def __init__(self, word_vectors, char_vectors, hidden_size, drop_prob=0.):
        super().__init__()

        # stage 1: word + character embedding
        self.emb = layers.Embedding(
            word_vectors=word_vectors,
            char_vectors=char_vectors,
            hidden_size=hidden_size,
            drop_prob=drop_prob,
        )

        # stage 2: single-layer encoder applied to both context and question
        self.enc = layers.RNNEncoder(
            input_size=2 * hidden_size,
            hidden_size=hidden_size,
            num_layers=1,
            drop_prob=drop_prob,
        )

        # stage 3: context <-> question attention
        self.att = layers.BiDAFAttention(
            hidden_size=hidden_size * 2,
            drop_prob=drop_prob,
        )

        # stage 4: two-layer modelling encoder over the attention output
        self.mod = layers.RNNEncoder(
            input_size=hidden_size * 8,
            hidden_size=hidden_size,
            num_layers=2,
            drop_prob=drop_prob,
        )

        # stage 5: output layer
        self.out = layers.BiDAFOutput(
            hidden_size=hidden_size,
            drop_prob=drop_prob,
        )

    def forward(self, cw_idxs, cc_idxs, qw_idxs, qc_idxs):
        """Run the five stages on one batch of context/question indices.

        Args:
            cw_idxs: Word indices of the contexts; index 0 is treated as padding.
            cc_idxs: Character indices of the contexts.
            qw_idxs: Word indices of the questions; index 0 is treated as padding.
            qc_idxs: Character indices of the questions.

        Returns:
            Whatever the output layer returns; callers unpack it into two
            predictions (presumably start and end scores, each of shape
            (batch_size, c_len) -- confirm in layers.BiDAFOutput).
        """
        # positions holding a real token (index different from 0)
        c_mask = cw_idxs != 0
        q_mask = qw_idxs != 0
        # number of real tokens per example
        c_len = c_mask.sum(-1)
        q_len = q_mask.sum(-1)

        # stage 1: embed context and question with the shared embedding layer
        c_emb = self.emb(cw_idxs, cc_idxs)
        q_emb = self.emb(qw_idxs, qc_idxs)

        # stage 2: encode both sequences with the shared encoder
        c_enc = self.enc(c_emb, c_len)
        q_enc = self.enc(q_emb, q_len)

        # stage 3: attention, masked to the real tokens of each sequence
        att = self.att(c_enc, q_enc, c_mask, q_mask)

        # stage 4: model the attended context
        mod = self.mod(att, c_len)

        # stage 5: predictions computed from attention and modelling outputs
        return self.out(att, mod, c_mask)

In [None]:
# TRAIN

# preprocessing values used for training
prepro_params = {
    "max_words": max_words,
    "word_embedding_size": word_embedding_size,
    "char_embedding_size": char_embedding_size,
    "max_len_context": max_len_context,
    "max_len_question": max_len_question,
    "max_len_word": max_len_word
}

# hyper-parameters setup
# NOTE(review): `pretrained` is read here but not assigned in this cell; it has to
# be defined beforehand (True to resume from a checkpoint, False to start fresh).
hyper_params = {
    "num_epochs": num_epochs,
    "batch_size": batch_size,
    "learning_rate": learning_rate,
    "hidden_size": hidden_size,
    "char_channel_width": char_channel_width,
    "char_channel_size": char_channel_size,
    "drop_prob": drop_prob,
    "cuda": cuda,
    "pretrained": pretrained
}

experiment_params = {"preprocessing": prepro_params, "model": hyper_params}

# train on GPU if CUDA variable is set to True (a GPU with CUDA is needed to do so)
device = torch.device("cuda" if hyper_params["cuda"] else "cpu")
torch.manual_seed(42)

# define a path to save experiment logs; makedirs also creates the parent
# "output" folder, which os.mkdir would not do
experiment_path = "output/{}".format(exp)
os.makedirs(experiment_path, exist_ok=True)

# save the preprocessing and model parameters used for this training experiment
with open(os.path.join(experiment_path, "config_{}.json".format(exp)), "w") as f:
    json.dump(experiment_params, f)

# start TensorboardX writer
# NOTE(review): SummaryWriter, compute_batch_metrics and save_checkpoint are not
# imported in the visible cells -- presumably they come from tensorboardX and the
# project's utils module; confirm and add the imports to the first cell.
writer = SummaryWriter(experiment_path)

# open features file and store them in individual variables (train + dev)
train_features = np.load(os.path.join(train_dir, "train_features.npz"))
t_w_context = train_features["context_idxs"]
t_c_context = train_features["context_char_idxs"]
t_w_question = train_features["question_idxs"]
t_c_question = train_features["question_char_idxs"]
t_labels = train_features["label"]

dev_features = np.load(os.path.join(dev_dir, "dev_features.npz"))
d_w_context = dev_features["context_idxs"]
d_c_context = dev_features["context_char_idxs"]
d_w_question = dev_features["question_idxs"]
d_c_question = dev_features["question_char_idxs"]
d_labels = dev_features["label"]

# load the embedding matrix created for our word vocabulary
with open(os.path.join(train_dir, "word_embeddings.pkl"), "rb") as e:
    word_embedding_matrix = pickle.load(e)
with open(os.path.join(train_dir, "char_embeddings.pkl"), "rb") as e:
    char_embedding_matrix = pickle.load(e)

# load mapping between words and idxs
with open(os.path.join(train_dir, "word2idx.pkl"), "rb") as f:
    word2idx = pickle.load(f)

idx2word = {idx: word for word, idx in word2idx.items()}

# transform them into Tensors
word_embedding_matrix = torch.from_numpy(np.array(word_embedding_matrix)).type(torch.float32)
char_embedding_matrix = torch.from_numpy(np.array(char_embedding_matrix)).type(torch.float32)

# load datasets
train_dataset = SquadDataset(t_w_context, t_c_context, t_w_question, t_c_question, t_labels)
valid_dataset = SquadDataset(d_w_context, d_c_context, d_w_question, d_c_question, d_labels)

# load data generators (DataLoader is reached through the imported torch package,
# the bare name is not imported in this notebook)
train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                               shuffle=True,
                                               batch_size=hyper_params["batch_size"],
                                               num_workers=4)

valid_dataloader = torch.utils.data.DataLoader(valid_dataset,
                                               shuffle=True,
                                               batch_size=hyper_params["batch_size"],
                                               num_workers=4)

print("Length of training data loader is:", len(train_dataloader))
print("Length of valid data loader is:", len(valid_dataloader))

# load the model
model = BiDAF(word_vectors=word_embedding_matrix,
              char_vectors=char_embedding_matrix,
              hidden_size=hyper_params["hidden_size"],
              drop_prob=hyper_params["drop_prob"])
if hyper_params["pretrained"]:
    # read the best checkpoint once; map_location lets a checkpoint saved on GPU
    # be restored on a CPU-only machine
    best_checkpoint = torch.load(os.path.join(experiment_path, "model.pkl"), map_location=device)
    model.load_state_dict(best_checkpoint["state_dict"])
model.to(device)

# define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adadelta(model.parameters(), hyper_params["learning_rate"], weight_decay=1e-4)

# best loss so far
if hyper_params["pretrained"]:
    best_valid_loss = best_checkpoint["best_valid_loss"]
    epoch_checkpoint = torch.load(os.path.join(experiment_path, "model_last_checkpoint.pkl"),
                                  map_location=device)["epoch"]
    print("Best validation loss obtained after {} epochs is: {}".format(epoch_checkpoint, best_valid_loss))
else:
    # any finite loss improves on this starting value
    best_valid_loss = float("inf")
    epoch_checkpoint = 0

# train the Model
print("Starting training...")
for epoch in range(hyper_params["num_epochs"]):
    epoch_number = epoch + 1
    print("##### epoch {:2d}".format(epoch_number))
    model.train()
    train_losses = 0
    for batch in train_dataloader:
        w_context, c_context, w_question, c_question = (t.long().to(device) for t in batch[:4])
        # training labels hold one (start, end) pair per example
        label1 = batch[4][:, 0].long().to(device)
        label2 = batch[4][:, 1].long().to(device)

        optimizer.zero_grad()
        pred1, pred2 = model(w_context, c_context, w_question, c_question)
        loss = criterion(pred1, label1) + criterion(pred2, label2)
        train_losses += loss.item()

        loss.backward()
        optimizer.step()

    # mean loss per batch, rounded once and stored as a plain Python float
    train_loss = float(np.round(train_losses / len(train_dataloader), 2))
    # global_step places each epoch at its own position on the logged curve
    writer.add_scalars("train", {"loss": train_loss,
                                 "epoch": epoch_number}, global_step=epoch_number)
    print("Train loss of the model at epoch {} is: {}".format(epoch_number, train_loss))

    model.eval()
    valid_losses = 0
    valid_em = 0
    valid_f1 = 0
    n_samples = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            w_context, c_context, w_question, c_question = (t.long().to(device) for t in batch[:4])
            # validation labels are strings: "start end" pairs separated by "|"
            labels = batch[4]

            # the loss is computed against the first answer of every question only
            first_labels = torch.tensor([[int(a) for a in l.split("|")[0].split(" ")]
                                         for l in labels], dtype=torch.int64).to(device)
            pred1, pred2 = model(w_context, c_context, w_question, c_question)
            loss = criterion(pred1, first_labels[:, 0]) + criterion(pred2, first_labels[:, 1])
            valid_losses += loss.item()
            em, f1 = compute_batch_metrics(w_context, idx2word, pred1, pred2, labels)
            valid_em += em
            valid_f1 += f1
            n_samples += w_context.size(0)

        valid_loss = float(np.round(valid_losses / len(valid_dataloader), 2))
        valid_em_score = np.round(valid_em / n_samples, 2)
        valid_f1_score = np.round(valid_f1 / n_samples, 2)
        writer.add_scalars("valid", {"loss": valid_loss,
                                     "EM": valid_em_score,
                                     "F1": valid_f1_score,
                                     "epoch": epoch_number}, global_step=epoch_number)
        print("Valid loss of the model at epoch {} is: {}".format(epoch_number, valid_loss))
        print("Valid EM of the model at epoch {} is: {}".format(epoch_number, valid_em_score))
        print("Valid F1 of the model at epoch {} is: {}".format(epoch_number, valid_f1_score))

    # save last model weights (here "best_valid_loss" holds this epoch's validation loss)
    save_checkpoint({
        "epoch": epoch_number + epoch_checkpoint,
        "state_dict": model.state_dict(),
        "best_valid_loss": valid_loss
    }, True, os.path.join(experiment_path, "model_last_checkpoint.pkl"))

    # save model with best validation error
    is_best = bool(valid_loss < best_valid_loss)
    best_valid_loss = min(valid_loss, best_valid_loss)
    save_checkpoint({
        "epoch": epoch_number + epoch_checkpoint,
        "state_dict": model.state_dict(),
        "best_valid_loss": best_valid_loss
    }, is_best, os.path.join(experiment_path, "model.pkl"))

# export scalar data to JSON for external processing
writer.export_scalars_to_json(os.path.join(experiment_path, "all_scalars.json"))
writer.close()

In [None]:

def eval(context, question):
    """Answer ``question`` about ``context`` with the BiDAF model.

    Note: the function name shadows Python's built-in ``eval`` inside this notebook.

    Args:
        context (str): Passage in which the answer is searched.
        question (str): Question about the passage.

    Returns:
        str: The context tokens from the predicted start to the predicted end
        position (inclusive), joined with spaces.

    Raises:
        ValueError: If the context or the question is empty after tokenization,
            or does not fit the padded input sizes (``max_len_context``,
            ``max_len_question``, ``max_len_word``).
    """
    train_path = os.path.join(data_dir, "train")
    with open(os.path.join(train_path, "word2idx.pkl"), "rb") as wi, \
         open(os.path.join(train_path, "char2idx.pkl"), "rb") as ci, \
         open(os.path.join(train_path, "word_embeddings.pkl"), "rb") as wb, \
         open(os.path.join(train_path, "char_embeddings.pkl"), "rb") as cb:
        word2idx = pickle.load(wi)
        char2idx = pickle.load(ci)
        word_embedding_matrix = pickle.load(wb)
        char_embedding_matrix = pickle.load(cb)

    # transform them into Tensors
    word_embedding_matrix = torch.from_numpy(np.array(word_embedding_matrix)).type(torch.float32)
    char_embedding_matrix = torch.from_numpy(np.array(char_embedding_matrix)).type(torch.float32)

    context = [w for w in word_tokenize(clean_text(context)) if w]
    question = [w for w in word_tokenize(clean_text(question)) if w]

    if not context or not question:
        raise ValueError("The context and the question must not be empty.")

    # Inputs larger than the padded arrays cannot be encoded: writing them would
    # run past the end of the arrays below, so fail early with a clear message.
    if len(context) > max_len_context:
        raise ValueError("The context is too long. Maximum accepted length is {} words."
                         .format(max_len_context))
    if max(len(w) for w in context) > max_len_word:
        raise ValueError("Some words in the context are longer than {} characters."
                         .format(max_len_word))
    if len(question) > max_len_question:
        raise ValueError("The question is too long. Maximum accepted length is {} words."
                         .format(max_len_question))
    if max(len(w) for w in question) > max_len_word:
        raise ValueError("Some words in the question are longer than {} characters."
                         .format(max_len_word))
    if len(question) < 3:
        # a short question can still be encoded, so this stays a warning
        print("The question is too short. It needs to be at least a three words question.")

    def encode(tokens, max_len):
        # zero-initialised word/char id arrays; id 1 is used for unknown entries
        word_ids = np.zeros([max_len], dtype=np.int32)
        char_ids = np.zeros([max_len, max_len_word], dtype=np.int32)
        for j, word in enumerate(tokens):
            word_ids[j] = word2idx[word] if word in word2idx else 1
            for k, char in enumerate(word):
                char_ids[j, k] = char2idx[char] if char in char2idx else 1
        return word_ids, char_ids

    context_idx, context_char_idx = encode(context, max_len_context)
    question_idx, question_char_idx = encode(question, max_len_question)

    model = BiDAF(word_vectors=word_embedding_matrix,
                  char_vectors=char_embedding_matrix,
                  hidden_size=hidden_size,
                  drop_prob=drop_prob)

    # NOTE(review): the training cell saves "model.pkl" / "model_last_checkpoint.pkl"
    # while "model_final.pkl" is loaded here -- confirm the expected file name.
    weights_path = os.path.join(squad_models, "model_final.pkl")
    try:
        checkpoint = torch.load(weights_path, map_location=device)
        model.load_state_dict(checkpoint["state_dict"])
        print("Model weights successfully loaded.")
    except FileNotFoundError:
        # best effort: keep answering with an untrained model; any other
        # problem (corrupt file, mismatching weights) is not hidden
        print("Model weights not found, initialized model with random weights.")
    model.to(device)
    model.eval()

    with torch.no_grad():
        # add a batch dimension of size 1 to every input
        batch = [torch.tensor(arr, dtype=torch.int64).unsqueeze(0).to(device)
                 for arr in (context_idx, context_char_idx, question_idx, question_char_idx)]
        pred1, pred2 = model(*batch)
        # NOTE(review): discretize is not imported in the visible cells (presumably
        # from utils); the meaning of the arguments 15 and False should be confirmed there.
        starts, ends = discretize(pred1.exp(), pred2.exp(), 15, False)
        prediction = " ".join(context[starts.item(): ends.item() + 1])

    return prediction


if __name__ == "__main__":
    # Demo: ask a few questions about one short biography paragraph.
    context = (
        "Rafael Nadal was born in Manacor, a town on the island of Mallorca in the Balearic Islands,"
        " Spain to parents Ana María Parera and Sebastián Nadal. His father is a businessman, owner of an"
        " insurance company, glass and window company Vidres Mallorca, and a restaurant, Sa Punta. Rafael has a"
        " younger sister, María Isabel. His uncle, Miguel Ángel Nadal, is a retired professional footballer,"
        " who played for RCD Mallorca, FC Barcelona, and the Spanish national team. He idolized Barcelona striker"
        " Ronaldo as a child, and via his uncle got access to the Barcelona dressing room to have a photo with"
        " the Brazilian. Nadal supports football clubs Real Madrid and RCD Mallorca. Recognizing in Rafael a"
        " natural talent, another uncle, Toni Nadal, a former professional tennis player, introduced him to"
        " tennis when he was three years old."
    )

    questions = [
        "Where was born Rafael Nadal?",
        "Who is Rafael Nadal's sister?",
        "Who introduced Rafael Nadal to tennis?",
        "When was Rafael Nadal introduced to tennis?",
        "What striker was Rafael Nadal's idol?",
    ]

    # show the passage once, then every question followed by the predicted answer
    print("C:", context, "\n")
    for q in questions:
        print("Q:", q)
        answer = eval(context, q)
        print("A:", answer, "\n")