In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import unicodedata
import copy
import spacy
from collections import Counter
import sqlite3
import pandas as pd
import numpy as np

In [2]:
class StackedBRNN(nn.Module):
    """Stacked bi-directional RNNs.

    Each layer is its own single-layer bidirectional RNN, which makes it
    possible to keep the output of every layer and concatenate them. With
    ``concat_layers=True`` the encoding size per position is
    ``num_layers * 2 * hidden_size``; otherwise it is ``2 * hidden_size``.

    Args:
        input_size: feature size of the input vectors.
        hidden_size: hidden size of each direction of each layer.
        num_layers: number of stacked bidirectional layers.
        dropout_rate: dropout probability applied to the input of every
            layer (and to the output when ``dropout_output`` is set).
        dropout_output: also apply dropout to the final encoding.
        rnn_type: RNN class to instantiate for each layer (e.g. ``nn.LSTM``).
        concat_layers: concatenate the outputs of all layers instead of
            returning only the last layer's output.
        padding: use the padding-aware code path during training as well
            (it is always used in eval mode when the batch has padding).
    """

    def __init__(self, input_size, hidden_size, num_layers,
                 dropout_rate=0, dropout_output=False, rnn_type=nn.LSTM,
                 concat_layers=False, padding=False):
        super(StackedBRNN, self).__init__()
        self.padding = padding
        self.dropout_output = dropout_output
        self.dropout_rate = dropout_rate
        self.num_layers = num_layers
        self.concat_layers = concat_layers
        self.rnns = nn.ModuleList()
        for i in range(num_layers):
            # Layers after the first consume the previous layer's
            # bidirectional output (forward + backward states).
            input_size = input_size if i == 0 else 2 * hidden_size
            self.rnns.append(rnn_type(input_size, hidden_size,
                                      num_layers=1,
                                      bidirectional=True))

    def forward(self, x, x_mask):
        """Encode either padded or non-padded sequences.

        Padding is handled explicitly when the batch contains padding and
        either ``self.padding`` is set or the module is in eval mode;
        otherwise the faster path that ignores padding is used.

        Args:
            x: batch * len * hdim
            x_mask: batch * len (1 for padding, 0 for true)
        Output:
            x_encoded: batch * len * hdim_encoded
        """
        if x_mask.sum() == 0:
            # No padding in this batch: nothing to account for.
            output = self._forward_unpadded(x, x_mask)
        elif self.padding or not self.training:
            # Pad if we care or if it is during eval.
            output = self._forward_padded(x, x_mask)
        else:
            # Training without explicit padding support: ignore the mask.
            output = self._forward_unpadded(x, x_mask)

        return output.contiguous()

    def _forward_unpadded(self, x, x_mask):
        """Faster encoding that ignores any padding (``x_mask`` is unused)."""
        # The RNN layers are sequence-first: (len, batch, hdim).
        x = x.transpose(0, 1)

        # Encode all layers, feeding each layer the previous layer's output.
        outputs = [x]
        for i in range(self.num_layers):
            rnn_input = outputs[-1]

            # Apply dropout to the layer input
            if self.dropout_rate > 0:
                rnn_input = F.dropout(rnn_input,
                                      p=self.dropout_rate,
                                      training=self.training)
            # [0] keeps the per-step outputs and drops the final states.
            outputs.append(self.rnns[i](rnn_input)[0])

        # Concat hidden layers (outputs[0] is the raw input, so skip it)
        if self.concat_layers:
            output = torch.cat(outputs[1:], 2)
        else:
            output = outputs[-1]

        # Back to batch-first
        output = output.transpose(0, 1)

        # Dropout on output layer
        if self.dropout_output and self.dropout_rate > 0:
            output = F.dropout(output,
                               p=self.dropout_rate,
                               training=self.training)
        return output

    def _forward_padded(self, x, x_mask):
        """Slower (significantly), but more precise, encoding that handles
        padding by packing the sequences before running the RNN layers.
        """
        # True length of each sequence = number of non-padding positions.
        # sum(1) already returns a 1-D tensor; an extra squeeze() would turn
        # a batch of one into a 0-d tensor that cannot be indexed below.
        lengths = x_mask.eq(0).long().sum(1)
        _, idx_sort = torch.sort(lengths, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        # pack_padded_sequence wants lengths as plain ints (or a CPU
        # tensor), sorted in decreasing order.
        lengths = lengths[idx_sort].tolist()

        # Sort x by decreasing length
        x = x.index_select(0, idx_sort)

        # Transpose batch and sequence dims
        x = x.transpose(0, 1)

        # Pack it up
        rnn_input = nn.utils.rnn.pack_padded_sequence(x, lengths)

        # Encode all layers
        outputs = [rnn_input]
        for i in range(self.num_layers):
            rnn_input = outputs[-1]

            # Dropout operates on the flat packed data; re-wrap the result
            # with the original batch sizes.
            if self.dropout_rate > 0:
                dropout_input = F.dropout(rnn_input.data,
                                          p=self.dropout_rate,
                                          training=self.training)
                rnn_input = nn.utils.rnn.PackedSequence(dropout_input,
                                                        rnn_input.batch_sizes)
            outputs.append(self.rnns[i](rnn_input)[0])

        # Unpack every layer output (index 0 is the packed input)
        for i, o in enumerate(outputs[1:], 1):
            outputs[i] = nn.utils.rnn.pad_packed_sequence(o)[0]

        # Concat hidden layers or take final
        if self.concat_layers:
            output = torch.cat(outputs[1:], 2)
        else:
            output = outputs[-1]

        # Transpose and restore the original batch order
        output = output.transpose(0, 1)
        output = output.index_select(0, idx_unsort)

        # Unpacking only pads up to the longest real sequence; pad the rest
        # with zeros so the output length matches the input length.
        if output.size(1) != x_mask.size(1):
            padding = output.new_zeros(output.size(0),
                                       x_mask.size(1) - output.size(1),
                                       output.size(2))
            output = torch.cat([output, padding], 1)

        # Dropout on output layer
        if self.dropout_output and self.dropout_rate > 0:
            output = F.dropout(output,
                               p=self.dropout_rate,
                               training=self.training)
        return output

In [3]:
class SeqAttnMatch(nn.Module):
    """Given sequences X and Y, match sequence Y to each element in X.

    * o_i = sum(alpha_j * y_j) for i in X
    * alpha_j = softmax(y_j * x_i)

    Args:
        input_size: feature size of the vectors in X and Y.
        identity: if True, score with a plain dot product; otherwise both
            sequences go through a shared linear layer + ReLU first.
    """

    def __init__(self, input_size, identity=False):
        super(SeqAttnMatch, self).__init__()
        if not identity:
            self.linear = nn.Linear(input_size, input_size)
        else:
            self.linear = None

    def forward(self, x, y, y_mask):
        """
        Args:
            x: batch * len1 * hdim
            y: batch * len2 * hdim
            y_mask: batch * len2 (1 for padding, 0 for true)
        Output:
            matched_seq: batch * len1 * hdim
        """
        # Project vectors. nn.Linear acts on the last dimension, so no
        # flattening via .view() is needed (and non-contiguous inputs work).
        if self.linear is not None:
            x_proj = F.relu(self.linear(x))
            y_proj = F.relu(self.linear(y))
        else:
            x_proj = x
            y_proj = y

        # Compute scores: batch * len1 * len2
        scores = x_proj.bmm(y_proj.transpose(2, 1))

        # Mask padding in Y. The mask broadcasts over len1; the out-of-place
        # fill keeps the operation visible to autograd.
        scores = scores.masked_fill(y_mask.unsqueeze(1), float('-inf'))

        # Normalize over the positions of Y
        alpha = F.softmax(scores, dim=-1)

        # Take weighted average of the (unprojected) Y vectors
        matched_seq = alpha.bmm(y)
        return matched_seq

In [4]:
class BilinearSeqAttn(nn.Module):
    """Bilinear attention of a single vector y over a sequence X:

    * o_i = softmax(x_i'Wy) for x_i in X.

    With ``normalize=False`` the exponentiated raw scores are returned
    instead of a (log-)softmax.
    """

    def __init__(self, x_size, y_size, identity=False, normalize=True):
        super(BilinearSeqAttn, self).__init__()
        self.normalize = normalize

        # identity=True skips the learned map W and scores with a plain
        # dot product between x_i and y.
        self.linear = None if identity else nn.Linear(y_size, x_size)

    def forward(self, x, y, x_mask):
        """
        Args:
            x: batch * len * hdim1
            y: batch * hdim2
            x_mask: batch * len (1 for padding, 0 for true)
        Output:
            alpha = batch * len
        """
        projected = y if self.linear is None else self.linear(y)

        # (batch, len, hdim1) x (batch, hdim1, 1) -> (batch, len)
        logits = torch.bmm(x, projected.unsqueeze(2)).squeeze(2)

        # Padding positions get -inf so they end up with zero weight.
        logits.data.masked_fill_(x_mask.data, -float('inf'))

        if not self.normalize:
            return logits.exp()
        if self.training:
            # Log-probabilities during training (for an NLL loss)
            return F.log_softmax(logits, dim=-1)
        # Plain probabilities otherwise
        return F.softmax(logits, dim=-1)

In [5]:
class LinearSeqAttn(nn.Module):
    """Self attention over a sequence:

    * o_i = softmax(Wx_i) for x_i in X.

    Args:
        input_size: feature size of the vectors in X.
    """

    def __init__(self, input_size):
        super(LinearSeqAttn, self).__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x, x_mask):
        """
        Args:
            x: batch * len * hdim
            x_mask: batch * len (1 for padding, 0 for true)
        Output:
            alpha: batch * len
        """
        # nn.Linear acts on the last dimension, so the sequence does not
        # have to be flattened with .view() (which fails on non-contiguous
        # input); squeeze(-1) drops the size-1 output feature.
        scores = self.linear(x).squeeze(-1)

        # Padding positions get -inf so softmax gives them zero weight.
        # The out-of-place fill keeps the operation visible to autograd.
        scores = scores.masked_fill(x_mask, float('-inf'))
        alpha = F.softmax(scores, dim=-1)
        return alpha

In [6]:
class RnnDocReader(nn.Module):
    """Document reader that scores answer-span start and end positions.

    The document and the question are embedded with a shared embedding
    table, encoded with separate stacked bidirectional LSTMs, and the
    merged question vector is matched against every document position with
    two bilinear attention layers (one for the span start, one for the end).

    Args:
        vocab_size: number of rows of the embedding table (index 0 is the
            padding index).
        num_features: number of manual per-token document features that are
            concatenated to the document RNN input.
        embedding_dim: word embedding size.
        normalize: forwarded to the start/end ``BilinearSeqAttn`` layers.
        hidden_size: hidden size per direction of every RNN layer.
        doc_layers: number of stacked layers of the document encoder.
        question_layers: number of stacked layers of the question encoder.
        dropout_rnn: dropout rate used inside both RNN encoders.

    The defaults of the last four arguments reproduce the previously
    hard-coded architecture (128 hidden units, 3 layers, dropout 0.4).
    """

    def __init__(self, vocab_size, num_features, embedding_dim=300, normalize=True,
                 hidden_size=128, doc_layers=3, question_layers=3,
                 dropout_rnn=0.4):
        super(RnnDocReader, self).__init__()
        # Word embeddings; index 0 is reserved for padding.
        self.embedding = nn.Embedding(vocab_size,
                                      embedding_dim,
                                      padding_idx=0)

        # Projection for attention weighted question
        self.qemb_match = SeqAttnMatch(embedding_dim)

        # Input size to RNN: word emb + question emb + manual features
        doc_input_size = embedding_dim * 2 + num_features

        # RNN document encoder
        self.doc_rnn = StackedBRNN(
            input_size=doc_input_size,
            hidden_size=hidden_size,
            num_layers=doc_layers,
            dropout_rate=dropout_rnn,
            dropout_output=True,
            concat_layers=True,
            rnn_type=nn.LSTM,
            padding=False,  # padding is still handled explicitly in eval mode
        )

        # RNN question encoder
        self.question_rnn = StackedBRNN(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=question_layers,
            dropout_rate=dropout_rnn,
            dropout_output=True,
            concat_layers=True,
            rnn_type=nn.LSTM,
            padding=False,
        )

        # Output sizes of the encoders: 2 directions * hidden_size per
        # layer, times the number of layers because the layer outputs are
        # concatenated (concat_layers=True above).
        doc_hidden_size = 2 * hidden_size * doc_layers
        question_hidden_size = 2 * hidden_size * question_layers

        # Question merging
        self.self_attn = LinearSeqAttn(question_hidden_size)

        # Bilinear attention for span start/end
        self.start_attn = BilinearSeqAttn(
            doc_hidden_size,
            question_hidden_size,
            normalize=normalize,
        )
        self.end_attn = BilinearSeqAttn(
            doc_hidden_size,
            question_hidden_size,
            normalize=normalize,
        )

    def _weighted_avg(self, x, weights):
        """Return a weighted average of x (a sequence of vectors).

        Args:
            x: batch * len * hdim
            weights: batch * len, sum(dim = 1) = 1
        Output:
            x_avg: batch * hdim
        """
        return weights.unsqueeze(1).bmm(x).squeeze(1)

    def forward(self, x1, x1_f, x1_mask, x2, x2_mask, dropout_emb=0.3):
        """Score every document position as answer start and answer end.

        Inputs:
        x1 = document word indices             [batch * len_d]
        x1_f = document word features indices  [batch * len_d * nfeat]
               (may be None when no manual features are used)
        x1_mask = document padding mask        [batch * len_d]
        x2 = question word indices             [batch * len_q]
        x2_mask = question padding mask        [batch * len_q]
        dropout_emb = dropout rate applied to both embeddings

        Returns:
            (start_scores, end_scores), each [batch * len_d].
        """
        # Embed both document and question
        x1_emb = self.embedding(x1)
        x2_emb = self.embedding(x2)

        # Dropout on embeddings
        if dropout_emb > 0:
            x1_emb = nn.functional.dropout(x1_emb, p=dropout_emb,
                                           training=self.training)
            x2_emb = nn.functional.dropout(x2_emb, p=dropout_emb,
                                           training=self.training)

        # Form document encoding inputs
        drnn_input = [x1_emb]

        # Add attention-weighted question representation
        x2_weighted_emb = self.qemb_match(x1_emb, x2_emb, x2_mask)
        drnn_input.append(x2_weighted_emb)

        # Add manual features. They are None when the feature dictionary is
        # empty, and torch.cat cannot concatenate None.
        if x1_f is not None:
            drnn_input.append(x1_f)

        # Encode document with RNN
        doc_hiddens = self.doc_rnn(torch.cat(drnn_input, 2), x1_mask)

        # Encode question with RNN + merge hiddens
        question_hiddens = self.question_rnn(x2_emb, x2_mask)
        q_merge_weights = self.self_attn(question_hiddens, x2_mask)
        question_hidden = self._weighted_avg(question_hiddens, q_merge_weights)

        # Predict start and end positions
        start_scores = self.start_attn(doc_hiddens, question_hidden, x1_mask)
        end_scores = self.end_attn(doc_hiddens, question_hidden, x1_mask)
        return start_scores, end_scores

In [7]:
class Dictionary(object):
    """Two-way mapping between tokens and integer indices.

    Index 0 is the ``<NULL>`` token and index 1 is the ``<UNK>`` token.
    Tokens are NFD-normalized on lookup and when added through ``add``.
    """
    NULL = '<NULL>'
    UNK = '<UNK>'
    START = 2

    @staticmethod
    def normalize(token):
        """Return the NFD unicode normalization of ``token``."""
        return unicodedata.normalize('NFD', token)

    def __init__(self):
        specials = (self.NULL, self.UNK)
        self.tok2ind = {tok: idx for idx, tok in enumerate(specials)}
        self.ind2tok = dict(enumerate(specials))

    def __len__(self):
        """Number of known tokens (special tokens included)."""
        return len(self.tok2ind)

    def __iter__(self):
        """Iterate over the known tokens."""
        return iter(self.tok2ind)

    def __contains__(self, key):
        """Membership test for an index (int) or a token (str)."""
        kind = type(key)
        if kind is int:
            return key in self.ind2tok
        if kind is str:
            return self.normalize(key) in self.tok2ind
        # Any other key type falls through (treated as "not contained").
        return None

    def __getitem__(self, key):
        """Map index -> token or token -> index, falling back to UNK.

        Keys that are neither int nor str yield None.
        """
        kind = type(key)
        if kind is int:
            return self.ind2tok.get(key, self.UNK)
        if kind is str:
            unk_index = self.tok2ind.get(self.UNK)
            return self.tok2ind.get(self.normalize(key), unk_index)
        return None

    def __setitem__(self, key, item):
        """Set one direction of the mapping: (int -> str) or (str -> int).

        Raises:
            RuntimeError: for any other combination of key/item types.
        """
        key_kind, item_kind = type(key), type(item)
        if key_kind is int and item_kind is str:
            self.ind2tok[key] = item
            return
        if key_kind is str and item_kind is int:
            self.tok2ind[key] = item
            return
        raise RuntimeError('Invalid (key, item) types.')

    def add(self, token):
        """Register a token under the next free index (no-op if known)."""
        normalized = self.normalize(token)
        if normalized in self.tok2ind:
            return
        next_index = len(self.tok2ind)
        self.tok2ind[normalized] = next_index
        self.ind2tok[next_index] = normalized

    def tokens(self):
        """Get dictionary tokens.

        Return all the words indexed by this dictionary, except for the
        special tokens.
        """
        specials = {'<NULL>', '<UNK>'}
        return [tok for tok in self.tok2ind if tok not in specials]

In [8]:
class Tokens(object):
    """A list of tokenized text; each token is a tuple of annotations."""
    # Positions of the individual annotations inside a token tuple.
    TEXT = 0
    TEXT_WS = 1
    SPAN = 2
    POS = 3
    LEMMA = 4
    NER = 5

    def __init__(self, data, annotators, opts=None):
        self.data = data
        self.annotators = annotators
        self.opts = opts or {}

    def __len__(self):
        """The number of tokens."""
        return len(self.data)

    def _field(self, position):
        """Collect one annotation field from every token."""
        return [tok[position] for tok in self.data]

    def _annotation(self, name, position):
        """Return an optional annotation column, or None if not annotated."""
        if name not in self.annotators:
            return None
        return self._field(position)

    def slice(self, i=None, j=None):
        """Return a shallow copy restricted to the tokens in [i, j)."""
        view = copy.copy(self)
        view.data = self.data[i: j]
        return view

    def untokenize(self):
        """Returns the original text (with whitespace reinserted)."""
        pieces = self._field(self.TEXT_WS)
        return ''.join(pieces).strip()

    def words(self, uncased=False):
        """Returns a list of the text of each token

        Args:
            uncased: lower cases text
        """
        texts = self._field(self.TEXT)
        if uncased:
            texts = [text.lower() for text in texts]
        return texts

    def offsets(self):
        """Returns a list of [start, end) character offsets of each token."""
        return self._field(self.SPAN)

    def pos(self):
        """Returns a list of part-of-speech tags of each token.
        Returns None if this annotation was not included.
        """
        return self._annotation('pos', self.POS)

    def lemmas(self):
        """Returns a list of the lemmatized text of each token.
        Returns None if this annotation was not included.
        """
        return self._annotation('lemma', self.LEMMA)

    def entities(self):
        """Returns a list of named-entity-recognition tags of each token.
        Returns None if this annotation was not included.
        """
        return self._annotation('ner', self.NER)

    def ngrams(self, n=1, uncased=False, filter_fn=None, as_strings=True):
        """Returns a list of all ngrams from length 1 to n.

        Args:
            n: upper limit of ngram length
            uncased: lower cases text
            filter_fn: user function that takes in an ngram list and returns
              True to drop the ngram, False to keep it
            as_strings: return the ngrams as space-joined strings instead of
              (start, end) index pairs
        """
        words = self.words(uncased)
        total = len(words)

        spans = []
        for start in range(total):
            last_stop = min(start + n, total)
            for stop in range(start + 1, last_stop + 1):
                if filter_fn and filter_fn(words[start:stop]):
                    continue
                spans.append((start, stop))

        if not as_strings:
            return spans
        # Concatenate into strings
        return [' '.join(words[start:stop]) for (start, stop) in spans]

    def entity_groups(self):
        """Group consecutive entity tokens with the same NER tag."""
        tags = self.entities()
        if not tags:
            return None

        outside = self.opts.get('non_ent', 'O')
        total = len(tags)
        groups = []
        cursor = 0
        while cursor < total:
            tag = tags[cursor]
            if tag == outside:
                cursor += 1
                continue
            # Consume the whole run of tokens carrying this tag.
            begin = cursor
            while cursor < total and tags[cursor] == tag:
                cursor += 1
            groups.append((self.slice(begin, cursor).untokenize(), tag))
        return groups

In [9]:
class Tokenizer(object):
    """Tokenizer backed by the spaCy ``en_core_web_sm`` pipeline."""

    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')

    def tokenize(self, text, return_raw_data=False):
        """Tokenize ``text`` and collect per-token annotations.

        Newlines are replaced by spaces before the text is handed to the
        pipeline; the whitespace-preserving token text is still sliced from
        the original ``text``.

        Args:
            text: string to tokenize.
            return_raw_data: return the list of token tuples instead of
                wrapping it in a ``Tokens`` object.
        Returns:
            A list of (text, text with trailing whitespace, (start, end),
            tag, lemma, entity type) tuples, or a ``Tokens`` wrapping it.
        """
        doc = self.nlp(text.replace('\n', ' '))
        count = len(doc)

        data = []
        for position in range(count):
            token = doc[position]
            begin = token.idx
            finish = begin + len(token.text)
            # The token "owns" the text up to the start of the next token;
            # the last token simply ends at its own final character.
            if position + 1 < count:
                ws_finish = doc[position + 1].idx
            else:
                ws_finish = finish

            data.append((
                token.text,
                text[begin: ws_finish],
                (begin, finish),
                token.tag_,
                token.lemma_,
                token.ent_type_,
            ))

        if return_raw_data:
            return data
        return Tokens(data=data, annotators=('lemma', 'pos', 'ner'), opts={'non_ent': ''})

In [14]:
class Model():
    """Inference wrapper around a pretrained ``RnnDocReader`` checkpoint.

    The checkpoint file must contain the entries ``word_dict``,
    ``feature_dict`` and ``state_dict``. ``predict_batch`` is the public
    entry point: it tokenizes (document, question) pairs, builds the model
    inputs and returns the best scoring answer span(s) per pair.
    """

    # Default upper bound (in tokens) on the length of a predicted span.
    MAX_SPAN_LEN = 15

    def __init__(self, model_file):
        """Load the checkpoint at ``model_file`` and build the reader.

        Args:
            model_file: path of the checkpoint saved with ``torch.save``.
        """
        # SECURITY NOTE: torch.load unpickles the file, which can execute
        # arbitrary code; only load checkpoints from a trusted source.
        # NOTE(review): recent PyTorch releases default to
        # weights_only=True, which may reject non-tensor objects stored in
        # the checkpoint — confirm against the installed version.
        # map_location='cpu' lets a checkpoint saved on a GPU machine load
        # on a CPU-only host; the model is moved to the device below.
        saved_params = torch.load(model_file, map_location='cpu')
        self.word_dict = saved_params['word_dict']
        self.feature_dict = saved_params['feature_dict']
        self.state_dict = saved_params['state_dict']

        self.vocab_size = len(self.word_dict)
        self.num_features = len(self.feature_dict)
        self.model = RnnDocReader(self.vocab_size, self.num_features)
        self.model.load_state_dict(self.state_dict)

        self.device = self._get_device()
        # Move model to gpu (or keep it on the cpu)
        self.model = self.model.to(self.device)

        # Created lazily on the first prediction, then reused.
        self._tokenizer = None

    def _get_device(self, show_info = False):
        """Return the CUDA device when available, otherwise the CPU."""
        if torch.cuda.is_available():
            device = torch.device("cuda")

            if show_info:
                print('There are %d GPU(s) available.' % torch.cuda.device_count())
                print('We will use the GPU:', torch.cuda.get_device_name(0))

        else:
            device = torch.device("cpu")

            if show_info:
                print('No GPU available, using the CPU instead.')

        return device

    def _get_tokenizer(self):
        """Return the shared tokenizer, creating it on first use.

        Building a ``Tokenizer`` loads the spaCy pipeline, so it is done
        once per model instead of once per ``predict_batch`` call.
        """
        tokenizer = getattr(self, '_tokenizer', None)
        if tokenizer is None:
            tokenizer = Tokenizer()
            self._tokenizer = tokenizer
        return tokenizer

    def _tokenize(self, tokenizer_class, text):
        """Tokenize ``text`` and return its annotations as a dict.

        Args:
            tokenizer_class: a tokenizer *instance* exposing ``tokenize``.
            text: string to tokenize.
        """
        tokens = tokenizer_class.tokenize(text)
        output = {
            'words': tokens.words(),
            'offsets': tokens.offsets(),
            'pos': tokens.pos(),
            'lemma': tokens.lemmas(),
            'ner': tokens.entities(),
        }
        return output

    @staticmethod
    def _fill_features(features, ex, feature_dict):
        """Fill the manual feature matrix of one example in place.

        Only features that are present in ``feature_dict`` are written, so a
        model trained with a subset of the features still works.
        """
        def column(name):
            # Only `in` and `[]` are used so that any mapping-like feature
            # dictionary is supported.
            return feature_dict[name] if name in feature_dict else None

        doc_words = ex['document']

        # f_{exact_match}
        col_cased = column('in_question')
        col_uncased = column('in_question_uncased')
        col_lemma = column('in_question_lemma')
        q_words_cased = set(ex['question'])
        q_words_uncased = {w.lower() for w in ex['question']}
        q_lemma = set(ex.get('qlemma') or ())
        for i, word in enumerate(doc_words):
            if col_cased is not None and word in q_words_cased:
                features[i, col_cased] = 1.0
            if col_uncased is not None and word.lower() in q_words_uncased:
                features[i, col_uncased] = 1.0
            if col_lemma is not None and q_lemma and ex['lemma'][i] in q_lemma:
                features[i, col_lemma] = 1.0

        # f_{token} (POS)
        for i, tag in enumerate(ex.get('pos') or ()):
            name = 'pos=%s' % tag
            if name in feature_dict:
                features[i, feature_dict[name]] = 1.0

        # f_{token} (NER)
        for i, tag in enumerate(ex.get('ner') or ()):
            name = 'ner=%s' % tag
            if name in feature_dict:
                features[i, feature_dict[name]] = 1.0

        # f_{token} (TF): case-insensitive relative frequency in the document
        col_tf = column('tf')
        if col_tf is not None:
            counter = Counter(w.lower() for w in doc_words)
            doc_len = len(doc_words)
            for i, word in enumerate(doc_words):
                features[i, col_tf] = counter[word.lower()] * 1.0 / doc_len

    def _vectorize(self,ex, word_dict, feature_dict, single_answer=False):
        """Torchify a single example.

        Args:
            ex: dict with the keys 'id', 'document', 'question', 'qlemma',
                'lemma', 'pos', 'ner' and optionally 'answers'.
            word_dict: mapping from token to word index.
            feature_dict: mapping from feature name to feature column.
            single_answer: return only the first answer, as tensors.
        Returns:
            (document, features, question, id) without targets, or
            (document, features, question, start, end, id) when the example
            has 'answers'. ``features`` is None for an empty feature dict.
        """
        # Index words
        document = torch.LongTensor([word_dict[w] for w in ex['document']])
        question = torch.LongTensor([word_dict[w] for w in ex['question']])

        # Create extra features vector
        if len(feature_dict) > 0:
            features = torch.zeros(len(ex['document']), len(feature_dict))
            self._fill_features(features, ex, feature_dict)
        else:
            features = None

        # Maybe return without target
        if 'answers' not in ex:
            return document, features, question, ex['id']

        # ...or with target(s) (might still be empty if answers is empty)
        if single_answer:
            assert(len(ex['answers']) > 0)
            start = torch.LongTensor(1).fill_(ex['answers'][0][0])
            end = torch.LongTensor(1).fill_(ex['answers'][0][1])
        else:
            start = [a[0] for a in ex['answers']]
            end = [a[1] for a in ex['answers']]

        return document, features, question, start, end, ex['id']

    def _batchify(self, batch):
        """Gather a batch of individual examples into one batch.

        Documents and questions are right-padded with index 0; the masks are
        True at padding positions and False at real tokens.

        Raises:
            RuntimeError: if the examples have an unexpected arity.
        """
        NUM_INPUTS = 3
        NUM_TARGETS = 2
        NUM_EXTRA = 1

        ids = [ex[-1] for ex in batch]
        docs = [ex[0] for ex in batch]
        features = [ex[1] for ex in batch]
        questions = [ex[2] for ex in batch]

        # Batch documents and features
        max_length = max(d.size(0) for d in docs)
        x1 = torch.zeros(len(docs), max_length, dtype=torch.long)
        x1_mask = torch.ones(len(docs), max_length, dtype=torch.bool)
        if features[0] is None:
            x1_f = None
        else:
            x1_f = torch.zeros(len(docs), max_length, features[0].size(1))
        for i, d in enumerate(docs):
            x1[i, :d.size(0)].copy_(d)
            x1_mask[i, :d.size(0)].fill_(0)
            if x1_f is not None:
                x1_f[i, :d.size(0)].copy_(features[i])

        # Batch questions
        max_length = max(q.size(0) for q in questions)
        x2 = torch.zeros(len(questions), max_length, dtype=torch.long)
        x2_mask = torch.ones(len(questions), max_length, dtype=torch.bool)
        for i, q in enumerate(questions):
            x2[i, :q.size(0)].copy_(q)
            x2_mask[i, :q.size(0)].fill_(0)

        # Maybe return without targets
        if len(batch[0]) == NUM_INPUTS + NUM_EXTRA:
            return x1, x1_f, x1_mask, x2, x2_mask, ids

        elif len(batch[0]) == NUM_INPUTS + NUM_EXTRA + NUM_TARGETS:
            # ...Otherwise add targets
            if torch.is_tensor(batch[0][3]):
                y_s = torch.cat([ex[3] for ex in batch])
                y_e = torch.cat([ex[4] for ex in batch])
            else:
                y_s = [ex[3] for ex in batch]
                y_e = [ex[4] for ex in batch]
        else:
            raise RuntimeError('Incorrect number of inputs per example.')

        return x1, x1_f, x1_mask, x2, x2_mask, y_s, y_e, ids

    def _decode(self, score_s, score_e, top_n=1, max_len=None):
        """Take argmax of constrained score_s * score_e.

        Args:
            score_s: independent start predictions (batch * len, on CPU)
            score_e: independent end predictions (batch * len, on CPU)
            top_n: number of top scored pairs to take
            max_len: max span length to consider
        Returns:
            (pred_s, pred_e, pred_score): per example, arrays with the start
            indices, end indices and scores of the selected spans, best
            first.
        """
        pred_s = []
        pred_e = []
        pred_score = []
        max_len = max_len or score_s.size(1)
        for i in range(score_s.size(0)):
            # Outer product of scores to get full p_s * p_e matrix
            scores = score_s[i].unsqueeze(1) * score_e[i].unsqueeze(0)

            # Zero out negative length and over-length span scores
            scores.triu_().tril_(max_len - 1)

            # Take argmax or top n
            scores = scores.numpy()
            scores_flat = scores.flatten()
            if top_n == 1:
                idx_sort = [np.argmax(scores_flat)]
            elif len(scores_flat) <= top_n:
                # argpartition needs kth < len, so when every candidate is
                # requested (len == top_n included) just sort all of them.
                idx_sort = np.argsort(-scores_flat)
            else:
                idx = np.argpartition(-scores_flat, top_n)[0:top_n]
                idx_sort = idx[np.argsort(-scores_flat[idx])]
            s_idx, e_idx = np.unravel_index(idx_sort, scores.shape)
            pred_s.append(s_idx)
            pred_e.append(e_idx)
            pred_score.append(scores_flat[idx_sort])

        return pred_s, pred_e, pred_score

    def _predict(self, ex, candidates=None, top_n=1, max_len=None):
        """Forward a batch of examples only to get predictions.

        Args:
            ex: batch as returned by ``_batchify`` (the first five entries
                are the model inputs).
            candidates: per-example answer candidates; not supported yet.
            top_n: number of spans to return per example.
            max_len: max span length in tokens (default ``MAX_SPAN_LEN``).
        Returns:
            The ``(pred_s, pred_e, pred_score)`` triple of ``_decode``.
        Raises:
            NotImplementedError: if ``candidates`` is given. Previously this
                path returned None, which made the caller fail later with
                an unrelated unpacking error.
        """
        if candidates:
            raise NotImplementedError(
                'Candidate-constrained decoding is not implemented.')
        if max_len is None:
            max_len = self.MAX_SPAN_LEN

        # Eval mode
        self.model.eval()

        # Transfer to GPU (the feature tensor may be None)
        inputs = [e if e is None else e.to(self.device) for e in ex[:5]]

        # Run forward
        with torch.no_grad():
            score_s, score_e = self.model(*inputs)

        # Decode predictions on the CPU
        return self._decode(score_s.cpu(), score_e.cpu(), top_n, max_len)

    def predict_batch(self, batch, top_n=1):
        """Predict a batch of document - question pairs.

        Args:
            batch: sequence of (document, question) or
                (document, question, candidates) items.
            top_n: number of answer spans to return per pair.
        Returns:
            One list per pair with ``(span text, score)`` tuples, best
            first. An empty batch yields an empty list.
        """
        if len(batch) == 0:
            return []

        documents, questions, candidates = [], [], []
        for b in batch:
            documents.append(b[0])
            questions.append(b[1])
            candidates.append(b[2] if len(b) == 3 else None)
        candidates = candidates if any(candidates) else None

        # Tokenize the inputs with the shared tokenizer.
        tok = self._get_tokenizer()
        q_tokens = [tok.tokenize(question) for question in questions]
        d_tokens = [tok.tokenize(document) for document in documents]

        examples = []
        for i in range(len(questions)):
            examples.append({
                'id': i,
                'question': q_tokens[i].words(),
                'qlemma': q_tokens[i].lemmas(),
                'document': d_tokens[i].words(),
                'lemma': d_tokens[i].lemmas(),
                'pos': d_tokens[i].pos(),
                'ner': d_tokens[i].entities(),
            })

        # Stick document tokens in candidates for decoding
        if candidates:
            candidates = [{'input': d_tokens[i], 'cands': candidates[i]}
                            for i in range(len(candidates))]

        # Build the batch and run it through the model
        batch_exs = self._batchify([self._vectorize(e, self.word_dict, self.feature_dict) for e in examples])
        s, e, score = self._predict(batch_exs, candidates, top_n)

        # Retrieve the predicted spans (end indices are inclusive)
        results = []
        for i in range(len(s)):
            predictions = []
            for j in range(len(s[i])):
                span = d_tokens[i].slice(s[i][j], e[i][j] + 1).untokenize()
                predictions.append((span, score[i][j].item()))
            results.append(predictions)

        return results

### Try out pretrained DrQA Document Reader with a simple query

In [30]:
MODEL_FILE = 'models_lisandro/document_reader.mdl'

model_class = Model(MODEL_FILE)


query = "How long do Hamsters live?"

# Ids of the documents to read. Kept as a real tuple (not its string
# representation) so the values can be bound as SQL parameters below.
selected_ids = ("Animal testing on Syrian hamsters", "Domestication of the Syrian hamster",
                "Golden hamster", "Hamster", "The Hamsters (album)")

db_path = "data/wikipedia/docs.db"

# One "?" placeholder per id: the values are passed separately, so ids that
# contain quotes (or a single-element tuple) cannot break the statement.
placeholders = ", ".join("?" for _ in selected_ids)
sql = f"SELECT id, text FROM documents WHERE id IN ({placeholders})"

connection = sqlite3.connect(db_path, check_same_thread=False)
try:
    cursor = connection.cursor()
    try:
        cursor.execute(sql, selected_ids)
        rows = cursor.fetchall()
    finally:
        cursor.close()
finally:
    # Release the database handle even if the query fails.
    connection.close()

if not rows:
    raise ValueError(f"None of the selected documents were found in {db_path}")

data_json = {
    "id": [row[0] for row in rows],
    "text": [row[1] for row in rows],
    "predicted_answer": [],
    "score": [],
}

# One (document, question) pair per retrieved document.
examples = [[text, query] for text in data_json["text"]]

predictions = model_class.predict_batch(examples)

# Each prediction list is ordered best-first; keep the top (answer, score).
for p in predictions:
    data_json["predicted_answer"].append(p[0][0])
    data_json["score"].append(p[0][1])

data_df = pd.DataFrame.from_dict(data_json)

print(f"Question: {query}")
print("\nRelevant documents, with their corresponding answers:")
display(data_df.head(5))

# Argmax among all documents
answers = data_json["predicted_answer"]
scores = data_json["score"]
idx = int(torch.argmax(torch.tensor(scores)))

print(f"\nBest prediction : {answers[idx]}, with score : {scores[idx]}")

Question: How long do Hamsters live?

Relevant documents, with their corresponding answers:


Unnamed: 0,id,text,predicted_answer,score
0,Animal testing on Syrian hamsters,Animal testing on Syrian hamsters\n\nSyrian ha...,up to two weeks old,0.095373
1,Domestication of the Syrian hamster,Domestication of the Syrian hamster\n\nThe dom...,proliferated from these three colonies,0.054263
2,Golden hamster,"Golden hamster\n\nThe golden hamster, or Syria...",two to three years,0.286153
3,Hamster,Hamster\n\nHamsters are rodents belonging to t...,three weeks,0.014161
4,The Hamsters (album),The Hamsters (album)\n\nThe Hamsters (1993) (k...,1993,0.075838



Best prediction : two to three years, with score : 0.2861531972885132
