## QA over unstructured data

Using Match LSTM, Pointer Networks, as mentioned in paper https://arxiv.org/pdf/1608.07905.pdf

We start with the pre-processing provided by https://github.com/MurtyShikhar/Question-Answering to clean up the data and make neat para, ques files.


### @TODOs:

1. [done] _Figure out how to put in real, pre-trained embeddings in embeddings layer._
2. [done] _Explicitly provide batch size when instantiating model_
3. [done] is ./val.ids.* validation set or test set?: **validation**
4. [done:em] _Instead of test loss, calculate test acc metrics_
    1. todo: new metrics like P, R, F1
5. [done] Update unit test codes

In [6]:
from __future__ import unicode_literals, print_function, division
from pandas_ml import ConfusionMatrix as confusion_matrix
from progressbar import ProgressBar
from pprint import pprint
import matplotlib.pyplot as plt
from io import open
import numpy as np
import unicodedata
import traceback
import pickle
import string
import random
import pylab
import time
import re
import os


import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

# from networks import Encoder, MatchLSTMEncoder, PointerDecoder


# Importing models from - https://github.com/laddie132/Match-LSTM

from models import *
from models.loss import MyNLLLoss, RLLoss
from utils.load_config import init_logging, read_config



# Device handle shared by the cells below (passed as `device=` and to `.cuda(...)`).
# "cuda" is hard-coded here.
device = torch.device("cuda")

# Seed torch, numpy and the stdlib RNG with the same fixed value.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

#### Debug Legend

- 5: Print everything that goes in every tensor.
- 4: ??
- 3: Check every model individually
- 2: Print things in training loops
- 1: ??

In [2]:
# ---- Paths and verbosity ----
DATA_LOC = './data/squad'
MODEL_LOC = './models/mlstms/squad/'
DEBUG = 1                      # verbosity level; see the "Debug Legend" cell

# ---- Sequence and vocabulary sizes ----
QUES_LEN = 30
PARA_LEN = 30
VOCAB_SIZE = 120000
# @TODO: get the actual vocabulary size from the glove matrix instead of hard-coding it.

# ---- Network sizes ----
HIDDEN_DIM = 150
EMBEDDING_DIM = 300

# ---- Training schedule ----
BATCH_SIZE = 80                # Might have total 100 batches.
EPOCHS = 30
TEST_EVERY_ = 1
LR = 0.001
CROP = None

In [7]:
# Read the base model configuration from its YAML file and pretty-print it for inspection.
global_config = read_config('config/base_model.yaml')
pprint(global_config)

{'encoder': {'add_features': 110,
             'bidirection': True,
             'char_cnn_filter_num': [75, 75, 75, 75],
             'char_cnn_filter_size': [2, 3, 4, 5],
             'char_embedding_size': 64,
             'char_encode_type': 'LSTM',
             'char_layers': 1,
             'char_trainable': True,
             'enable_char': True,
             'mix_encode': False,
             'word_embedding_size': 300,
             'word_layers': 1},
 'global': {'dropout_p': 0.4,
            'emb_dropout_p': 0.1,
            'hidden_mode': 'GRU',
            'hidden_size': 150,
            'layer_norm': False},
 'interaction': {'birnn_after_self': True,
                 'enable_self_match': False,
                 'gated_attention': True,
                 'match_lstm_bidirection': True,
                 'question_match': False,
                 'self_gated': False,
                 'self_match_bidirection': True},
 'output': {'answer_search': True,
            'init_ptr_hidden'

In [15]:
class CustomGloveEmbedding(layers.GloveEmbedding):
    """GloveEmbedding variant whose lookup table is built from an in-memory weight tensor."""

    def __init__(self, n_embeddings, len_embedding, weights):
        # The MRO lookup starts *after* layers.GloveEmbedding, so the parent's own
        # __init__ is skipped and only the classes above it are initialised.
        super(layers.GloveEmbedding, self).__init__()

        table = torch.nn.Embedding(n_embeddings, len_embedding, _weight=weights)
        # The supplied vectors are excluded from gradient updates.
        table.weight.requires_grad = False
        self.embedding_layer = table
        

In [33]:
class CustomMatchLSTM(MatchLSTM):
    """
    Match-LSTM with a boundary pointer network, built on a caller-supplied embedding matrix.

    The hyper-parameters are fixed in ``__init__`` instead of being read from a
    configuration object; only the embedding table is supplied by the caller.
    """

    def __init__(self, n_embeddings, len_embedding, weights):
        """
        :param n_embeddings: number of rows in the embedding table
        :param len_embedding: width of one embedding vector
        :param weights: tensor used as the (frozen) embedding table
        """
        # The MRO lookup starts *after* MatchLSTM, so MatchLSTM.__init__ is skipped
        # and every sub-module is created explicitly below.
        super(MatchLSTM, self).__init__()

        # set config
        hidden_size = 150
        dropout_p = 0.4
        emb_dropout_p = 0.1
        enable_layer_norm = False
        hidden_mode = 'LSTM'

        # The encoder consumes the embedding output, so its input width has to
        # follow the embedding width rather than a hard-coded 300.
        word_embedding_size = len_embedding
        encoder_bidirection = True
        encoder_direction_num = 2 if encoder_bidirection else 1

        match_lstm_bidirection = True
        match_rnn_direction_num = 2 if match_lstm_bidirection else 1

        ptr_bidirection = True
        self.enable_search = True

        self.embedding = CustomGloveEmbedding(n_embeddings, len_embedding, weights)

        self.encoder = layers.MyRNNBase(mode=hidden_mode,
                                 input_size=word_embedding_size,
                                 hidden_size=hidden_size,
                                 bidirectional=encoder_bidirection,
                                 dropout_p=emb_dropout_p)
        encode_out_size = hidden_size * encoder_direction_num

        self.match_rnn = layers.MatchRNN(mode=hidden_mode,
                                  hp_input_size=encode_out_size,
                                  hq_input_size=encode_out_size,
                                  hidden_size=hidden_size,
                                  bidirectional=match_lstm_bidirection,
                                  gated_attention=True,
                                  dropout_p=dropout_p,
                                  enable_layer_norm=enable_layer_norm)
        match_rnn_out_size = hidden_size * match_rnn_direction_num

        self.pointer_net = layers.BoundaryPointer(mode=hidden_mode,
                                           input_size=match_rnn_out_size,
                                           hidden_size=hidden_size,
                                           bidirectional=ptr_bidirection,
                                           dropout_p=dropout_p,
                                           enable_layer_norm=enable_layer_norm)

    def forward(self, context, question, context_char=None, question_char=None, context_f=None, question_f=None):
        """
        Embed, encode and match the context against the question, then point at an answer range.

        context_char, question_char, context_f and question_f are accepted but not used.

        :return: (ans_range_prop, ans_range, vis_param) where ans_range_prop is the
                 pointer-net output with its first two axes swapped, ans_range is the
                 selected range and vis_param is {'match': <match_rnn extra output>}.
        """

        # get embedding: (seq_len, batch, embedding_size)
        context_vec, context_mask = self.embedding.forward(context)
        question_vec, question_mask = self.embedding.forward(question)

        # encode: (seq_len, batch, hidden_size)
        context_encode, _ = self.encoder.forward(context_vec, context_mask)
        question_encode, _ = self.encoder.forward(question_vec, question_mask)

        # Dumping a whole tensor on every forward pass floods the training logs,
        # so it is only done at the notebook's most verbose debug level.
        if DEBUG > 4:
            print("context mask is ", context_mask)

        # match lstm: (seq_len, batch, hidden_size)
        qt_aware_ct, qt_aware_last_hidden, match_para = self.match_rnn.forward(context_encode, context_mask,
                                                                               question_encode, question_mask)
        vis_param = {'match': match_para}

        # pointer net: (answer_len, batch, context_len)
        ans_range_prop = self.pointer_net.forward(qt_aware_ct, context_mask)
        ans_range_prop = ans_range_prop.transpose(0, 1)

        # answer range
        if not self.training and self.enable_search:
            # NOTE(review): `answer_search` is not defined in this notebook; presumably it
            # arrives through `from models import *` - confirm before calling in eval mode.
            ans_range = answer_search(ans_range_prop, context_mask)
        else:
            _, ans_range = torch.max(ans_range_prop, dim=2)

        return ans_range_prop, ans_range, vis_param


In [35]:
DEBUG = 5
if DEBUG > 4:

    # Smoke test: build CustomMatchLSTM on random embeddings and push one random
    # batch through it. Everything runs inside no_grad() because nothing here is
    # trained; previously only the `macros` dict sat inside the context manager.
    with torch.no_grad():

        macros = {
            "ques_len": QUES_LEN,
            "hidden_dim": HIDDEN_DIM,
            "vocab_size": VOCAB_SIZE,
            "batch_size": BATCH_SIZE,
            "para_len": PARA_LEN,
            "embedding_dim": EMBEDDING_DIM,
            "lr": LR,
            "debug":5,
            "device":device
        }

        glove_file = torch.randn((VOCAB_SIZE, EMBEDDING_DIM))
#         glove_emb = CustomGloveEmbedding(VOCAB_SIZE,EMBEDDING_DIM,glove_file)
        match_lstm = CustomMatchLSTM(VOCAB_SIZE,EMBEDDING_DIM,glove_file)
        print("embedding init")
        paragraph = torch.randint(0,VOCAB_SIZE-1,(PARA_LEN*BATCH_SIZE,)).view(BATCH_SIZE,PARA_LEN).long()
        question = torch.randint(0,VOCAB_SIZE-1,(QUES_LEN*BATCH_SIZE,)).view(BATCH_SIZE,QUES_LEN).long()
        a,b,c = match_lstm.forward(paragraph,question)

#         print(glove_emb.forward(torch.randint(0,VOCAB_SIZE-1,(PARA_LEN*BATCH_SIZE,)).view(BATCH_SIZE,PARA_LEN).long()))
    

embedding init
context mask is  tensor([[ 1.,  1.,  1.,  ...,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  ...,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  ...,  1.,  1.,  1.],
        ...,
        [ 1.,  1.,  1.,  ...,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  ...,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  ...,  1.,  1.,  1.]])


In [41]:
# `b` is the second value returned by match_lstm.forward in the smoke-test cell.
# Print its column 1, its column 0, and then the whole tensor.
print(b[:,1])
print(b[:,0])
print(b)

tensor([ 14,  13,  24,  14,  28,  10,  28,  18,  10,   9,  24,   4,
         26,  13,  16,  15,  26,  26,  20,  23,  27,  29,  24,  11,
          5,  19,  25,  11,   4,  15,  26,   5,  28,  13,   6,   7,
         24,  13,   5,  25,  21,  22,  16,  23,  26,   2,  14,  15,
         28,  10,   4,  15,  15,   6,  24,  29,   6,  19,   7,   6,
         26,   5,  26,  10,  28,  11,  10,   7,   8,  13,  18,  15,
         13,  29,  21,   8,   1,  16,  20,  18])
tensor([ 14,  13,  24,  14,  28,  10,  28,  18,  10,   9,  24,   4,
         26,  13,  16,  15,  26,  26,  20,  23,  27,  29,  24,  11,
          5,  19,  25,  11,   4,  15,  26,   5,  28,  13,   6,   7,
         24,  13,   5,  25,  21,  22,  16,  23,  26,   2,  14,  15,
         28,  10,   4,  15,  15,   6,  24,  29,   6,  19,   7,   6,
         26,   5,  26,  10,  28,  11,  10,   7,   8,  13,  18,  15,
         13,  29,  21,   8,   1,  16,  20,  18])
tensor([[ 14,  14],
        [ 13,  13],
        [ 24,  24],
        [ 14,  14],
      

### Encoder 
Use a simple lstm class to have encoder for question and paragraph. 
The output of these will be used in the match lstm

$H^p = LSTM(P)$ 


$H^q = LSTM(Q)$

In [None]:
class Encoder(nn.Module):
    """
    Bidirectional LSTM encoder over pre-trained word embeddings.

    The notebook builds one instance for the question and one for the paragraph.
    """

    def __init__(self, inputlen, macros, glove_file, device):
        super(Encoder, self).__init__()

        # Sizes and flags taken from the shared macros dict
        self.inputlen = inputlen
        self.hiddendim = macros['hidden_dim']
        self.embeddingdim = macros['embedding_dim']
        self.vocablen = macros['vocab_size']
        self.batch_size = macros['batch_size']
        self.debug = macros['debug']

        # Embedding table initialised from glove_file and left trainable
        pretrained = torch.FloatTensor(glove_file)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        self.embedding.weight.requires_grad = True

        # Sequence-first bidirectional LSTM on top of the embeddings
        self.lstm = nn.LSTM(self.embeddingdim, self.hiddendim, bidirectional=True, batch_first=False)

    def init_hidden(self, batch_size, device):
        """Return a fresh (h, c) pair of zero tensors, each (2, batch_size, hiddendim)."""
        state_shape = (2, batch_size, self.hiddendim)
        return tuple(torch.zeros(state_shape, device=device) for _ in range(2))

    def forward(self, x, h, device):
        """
        Encode a batch of token ids.

        :param x: token ids, (batch, len)
        :param h: (h, c) LSTM state as produced by init_hidden
        :param device: accepted for call-site symmetry; not used here
        :return: (ycap, h) - the LSTM outputs and the final LSTM state
        """
        verbose = self.debug > 4
        if verbose:
            print("x:\t", x.shape)
            print("h:\t", h[0].shape, h[1].shape)

        x_emb = self.embedding(x)
        if verbose:
            print("x_emb:\t", x_emb.shape)

        # The LSTM is sequence-first, so swap the batch and length axes before feeding it.
        ycap, h = self.lstm(x_emb.transpose(1, 0), h)
        if verbose:
            print("ycap:\t", ycap.shape)

        return ycap, h
    
    
# # with torch.no_grad():
# #     print ("Trying out question encoder LSTM")
# #     model = Encoder(QUES_LEN, HIDDEN_DIM, EMBEDDING_DIM, VOCAB_SIZE)
# #     dummy_x = torch.tensor([22,45,12], dtype=torch.long)
# #     hidden = model.init_hidden()
# #     ycap, h = model(dummy_x, hidden)
    
# #     print(ycap.shape)
# #     print(h[0].shape, h[1].shape)


# Smoke test (DEBUG > 4 only): encode one random question batch and one random
# paragraph batch with two separate Encoder instances on `device`.
if DEBUG > 4:
    with torch.no_grad():

        macros = {
            "ques_len": QUES_LEN,
            "hidden_dim": HIDDEN_DIM,
            "vocab_size": VOCAB_SIZE,
            "batch_size": BATCH_SIZE,
            "para_len": PARA_LEN,
            "embedding_dim": EMBEDDING_DIM,
            "lr": LR,
            "debug":5,
            "device":device
        }

        dummy_para = torch.randint(0,VOCAB_SIZE-1,(PARA_LEN*BATCH_SIZE,), device=device).view(BATCH_SIZE,PARA_LEN).long()
    #     print (dummy_para.shape)
        dummy_question = torch.randint(0,VOCAB_SIZE-1,(QUES_LEN*BATCH_SIZE,), device=device).view(BATCH_SIZE,QUES_LEN).long()
    #     print (dummy_question.shape)
        glove_file = torch.randn((VOCAB_SIZE, EMBEDDING_DIM))

    #     print("LSTM with batches")
        ques_model = Encoder(QUES_LEN, macros, glove_file, device).cuda(device)
        # The paragraph encoder is sized with PARA_LEN (it was built with QUES_LEN before).
        para_model = Encoder(PARA_LEN, macros, glove_file, device).cuda(device)
        ques_hidden = ques_model.init_hidden(BATCH_SIZE, device)
        para_hidden = para_model.init_hidden(BATCH_SIZE, device)
        ques_embedded,hidden_ques = ques_model(dummy_question,ques_hidden, device)
        para_embedded,hidden_para = para_model(dummy_para,para_hidden, device)
        
#         print (ques_embedded.shape) # question_length,batch,embedding_dim
#         print (para_embedded.shape) # para_length,batch,embedding_dim
#         print (hidden_para[0].shape,hidden_para[1].shape)

### Match LSTM

Use a match LSTM to compute a **summarized sequential vector** for the paragraph w.r.t the question.

Consider the summarized vector ($H^r$) as the output of a new decoder, where the inputs are $H^p, H^q$ computed above. 

1. Attend the para word $i$ with the entire question ($H^q$)
  
    1. $\vec{G}_i = tanh(W^qH^q + repeat(W^ph^p_i + W^r\vec{h}^r_{i-1} + b^p))$
    
    2. *Computing it*: Here, $\vec{G}_i$ is equivalent to `energy`, computed differently.
    
    3. Use a linear layer to compute the content within the $repeat$ fn.
    
    4. Add with another linear (without bias) with $H_q$
    
    5. $tanh$ the bloody thing
  
  
2. Softmax over it to get $\alpha$ weights.

    1. $\vec{\alpha}_i = softmax(w^T\vec{G}_i + repeat(b))$
    
3. Use the attention weight vector $\vec{\alpha_i}$ to obtain a weighted version of the question and concat it with the current token of the passage to form a vector $\vec{z_i}$

4. Use $\vec{z_i}$ to compute the desired $h^r_i$:

    1. $ h^r_i = LSTM(\vec{z_i}, h^r_{i-1}) $
    


In [None]:
class MatchLSTMEncoder(nn.Module):
    """
    Match-LSTM: summarises every paragraph position with respect to the whole question.

    For paragraph position i (forward direction only):
        G_i     = tanh(W^q H^q + repeat(W^p h^p_i + W^r h^r_{i-1} + b^p))
        alpha_i = softmax(w^T G_i + repeat(b))            (over question positions)
        z_i     = [alpha_i-weighted H^q (flattened) ; h^p_i]
        h^r_i   = LSTM(z_i, h^r_{i-1})

    The flattened (rather than summed) weighted question is what the input size
    of ``lstm_summary`` - (ques_len + 1) * 2 * hidden_dim - requires, so H_q must
    contain exactly ``ques_len`` positions.
    """

    def __init__(self, macros, device):

        super(MatchLSTMEncoder, self).__init__()

        self.hidden_dim = macros['hidden_dim']
        self.ques_len = macros['ques_len']
        self.batch_size = macros['batch_size']
        self.debug = macros['debug']

        # W^p (with bias b^p) applied to h^p_i, W^r applied to h^r_{i-1}, W^q applied to H^q
        self.lin_g_repeat_a_dense = nn.Linear(2*self.hidden_dim, self.hidden_dim)
        self.lin_g_repeat_b_dense = nn.Linear(self.hidden_dim, self.hidden_dim, bias=False)
        self.lin_g_nobias = nn.Linear(2*self.hidden_dim, self.hidden_dim, bias=False)

        # w and b of the attention scorer
        self.alpha_i_w = nn.Parameter(torch.rand((self.hidden_dim, 1)))
        self.alpha_i_b = nn.Parameter(torch.rand((1)))

        self.lstm_summary = nn.LSTM((self.ques_len+1)*2*self.hidden_dim, self.hidden_dim, batch_first=False)

    def forward(self, H_p, h_ri, H_q, hidden, device):
        """
        Unroll the match-LSTM over the paragraph.

        :param H_p: encoded paragraph, expected as (para_len, batch, 2*hidden_dim)
        :param h_ri: initial hidden state h^r_0, (1, batch, hidden_dim)
        :param H_q: encoded question, expected as (ques_len, batch, 2*hidden_dim)
        :param hidden: initial LSTM cell state, (1, batch, hidden_dim) (see init_hidden)
        :param device: accepted for call-site symmetry; not used here
        :return: H_r, the stacked hidden states, (para_len, batch, hidden_dim)
        """
        # Find batchsize
        batch_size = H_p.shape[1]

        if self.debug > 4:
            print("H_p:\t", H_p.shape)
            print("h_ri:\t", h_ri.shape)
            print("H_q:\t", H_q.shape)
            print("hidden:\t", hidden.shape)

        # Nothing to unroll over: hand back an empty sequence instead of failing in torch.cat.
        if H_p.shape[0] == 0:
            return h_ri.new_zeros((0, batch_size, self.hidden_dim))

        # W^q H^q does not depend on the paragraph position, so compute it once.
        G_input_q = self.lin_g_nobias(H_q)

        H_r = []
        for i in range(H_p.shape[0]):
            h_pi = H_p[i]

            # Content of repeat(...): one vector per batch item, broadcast over the question axis.
            G_input_rep = self.lin_g_repeat_a_dense(h_pi).unsqueeze(0) + self.lin_g_repeat_b_dense(h_ri)
            G_i = torch.tanh(G_input_q + G_input_rep)

            # One score per question position, normalised across the question axis (dim 0).
            alpha_i_input = G_i.matmul(self.alpha_i_w).squeeze(-1) + self.alpha_i_b
            alpha_i = F.softmax(alpha_i_input, dim=0)

            # Go batch-first *before* flattening so that every row of z_i holds
            # values from a single batch item only.
            weighted_q = (H_q * alpha_i.unsqueeze(-1)).transpose(1, 0).contiguous().view(batch_size, -1)
            z_i = torch.cat((weighted_q, h_pi), dim=1).unsqueeze(0)

            _, (h_ri, hidden) = self.lstm_summary(z_i, (h_ri, hidden))
            H_r.append(h_ri)

        return torch.cat(H_r, dim=0)

    def init_hidden(self, batch_size, device):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return torch.zeros((1, batch_size, self.hidden_dim), device=device)



# Smoke test (DEBUG > 4 only): build a MatchLSTMEncoder on `device`, run it once on
# random tensors and print the shape of what it returns.
if DEBUG > 4:
    with torch.no_grad():
        
        macros = {
            "ques_len": QUES_LEN,
            "hidden_dim": HIDDEN_DIM, 
            "vocab_size": VOCAB_SIZE, 
            "batch_size": BATCH_SIZE,
            "para_len": PARA_LEN,
            "embedding_dim": EMBEDDING_DIM,
            "lr": LR,
            "debug":5,
            "device":device
        }
            
        matchLSTMEncoder = MatchLSTMEncoder(macros,device).cuda(device)
        hidden = matchLSTMEncoder.init_hidden(BATCH_SIZE,device)
        # Random stand-ins for the two encoder outputs: (length, batch, 2*HIDDEN_DIM).
        para_embedded = torch.rand((PARA_LEN, BATCH_SIZE, 2*HIDDEN_DIM), device=device)
        ques_embedded = torch.rand((QUES_LEN, BATCH_SIZE, 2*HIDDEN_DIM), device=device)
        # Random initial match-LSTM state: (1, batch, HIDDEN_DIM).
        h_ri = torch.randn(1, BATCH_SIZE, HIDDEN_DIM, device=device)
    #     if DEBUG:
    #         print ("init h_ri shape is: ", h_ri.shape)
    #         print ("the para length is ", len(para_embedded))
        # para_embedded is already (PARA_LEN, BATCH_SIZE, 2*HIDDEN_DIM), so the .view() keeps its shape.
        H_r = matchLSTMEncoder(para_embedded.view(-1,BATCH_SIZE,2*HIDDEN_DIM),
                               h_ri, 
                               ques_embedded, 
                               hidden,
                               device)
        print("H_r: ", H_r.shape)
        
        
        

### Pointer Network

Using a ptrnet over $H_r$ to unfold and get most probable spans.
We use the **boundary model** to do that (predict start and end of seq).

- We first calculate $\vec{F}_k$ (energy) vector which will be softmaxed to get the attention weights ($\beta$).

  - $F_k = tanh(VH^r + (W^ah^a_{k-1} + b^a).repeat(P))$

  - $\beta_k = softmax(v^TF_k + c.repeat(P))$

- Using the $\beta$ values, we annotate the hidden states of the match-lstm output, which we directly throw to a decoder.

- Finally, the $\beta$ values are used as the **desired** output pointer.

- We unroll the decoder twice (manually) to get a **start** and an **end** pointer.

A simple energy -> softmax -> decoder. Where softmaxed energy is supervised.

In [None]:
class PointerDecoder(nn.Module):
    """
    Boundary-model pointer network over the match-LSTM summary H_r.

    One call performs one decoding step:
        F_k    = tanh(V H^r + repeat(W^a h^a_{k-1} + b^a))
        beta_k = softmax(v^T F_k + repeat(c))
    and feeds the beta-weighted H^r to an LSTM cell to obtain h^a_k.
    Callers unroll it twice (start pointer, then end pointer).
    """

    def __init__(self, macros, device):
        super(PointerDecoder, self).__init__()

        # Keep args
        self.hidden_dim = macros['hidden_dim']
        self.batch_size = macros['batch_size']
        self.para_len = macros['para_len']
        self.debug = macros['debug']

        # W^a (with bias b^a) applied to h^a_{k-1}; V applied to H^r
        self.lin_f_repeat = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.lin_f_nobias = nn.Linear(self.hidden_dim, self.hidden_dim, bias=False)

        # v and c of the attention scorer
        self.beta_k_w = nn.Parameter(torch.randn(self.hidden_dim, 1))
        self.beta_k_b = nn.Parameter(torch.randn(1))

        # The LSTM input is the flattened weighted paragraph, so the paragraph
        # length is fixed by macros['para_len'].
        self.lstm = nn.LSTM(self.hidden_dim*self.para_len, self.hidden_dim, batch_first=False)

    def init_hidden(self, batch_size, device):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return torch.zeros((1, batch_size, self.hidden_dim), device=device)

    def forward(self, h_ak, H_r, hidden, device):
        """
        Run one pointer-decoding step.

        :param h_ak: previous decoder hidden state, (1, batch, hidden_dim)
        :param H_r: match-LSTM summary of the paragraph, (para_len, batch, hidden_dim)
        :param hidden: previous LSTM cell state, (1, batch, hidden_dim)
        :param device: accepted for call-site symmetry; not used here
        :return: (h_ak, hidden, log_beta_k) - the new hidden state, the new cell state and
                 the log attention weights over paragraph positions, (batch, 1, para_len)
        """
        para_len = H_r.shape[0]
        batch_size = H_r.shape[1]

        if self.debug > 4:
            print("h_ak:\t\t\t", h_ak.shape)
            print("H_r:\t\t\t", H_r.shape)
            print("hidden:\t\t\t", hidden.shape)

        # Prepare inputs for the tanh used to compute energy
        f_input_b = self.lin_f_repeat(h_ak)
        if self.debug > 4: print("f_input_b unrepeated:  ", f_input_b.shape)

        f_input_b = f_input_b.repeat(para_len, 1, 1)
        if self.debug > 4: print("f_input_b repeated:\t", f_input_b.shape)

        f_input_a = self.lin_f_nobias(H_r)
        if self.debug > 4: print("f_input_a:\t\t", f_input_a.shape)

        # Send it off to tanh now
        F_k = torch.tanh(f_input_a + f_input_b)
        if self.debug > 4: print("F_k:\t\t\t", F_k.shape)

        # Attention energies, batch-first: (batch, 1, para_len)
        beta_k_input_a = F_k.transpose(1, 0).matmul(self.beta_k_w).view(batch_size, 1, -1)
        if self.debug > 4: print("beta_k_input_a:\t\t", beta_k_input_a.shape)

        # The scalar bias broadcasts over every position; no in-place update needed.
        beta_k_input = beta_k_input_a + self.beta_k_b
        if self.debug > 4: print("beta_k_input:\t\t", beta_k_input.shape)

        # log_softmax stays finite where log(softmax(x)) underflows to -inf.
        log_beta_k = F.log_softmax(beta_k_input, dim=-1)
        beta_k = log_beta_k.exp()
        if self.debug > 4: print("beta_k:\t\t\t", beta_k.shape)

        # Weight every paragraph position by its attention: (batch, para_len, hidden_dim)
        lstm_input_a = H_r.transpose(1, 0) * beta_k.view(batch_size, para_len, 1)
        if self.debug > 4: print("lstm_input_a:\t\t", lstm_input_a.shape)

        # Flatten while still batch-first. Flattening the (para_len, batch, hidden)
        # layout instead would hand each batch row values from other batch items.
        lstm_input = lstm_input_a.contiguous().view(1, batch_size, -1)
        _, (h_ak, hidden) = self.lstm(lstm_input, (h_ak, hidden))

        return h_ak, hidden, log_beta_k
            
# Smoke test (DEBUG > 4 only): run a single PointerDecoder step on random inputs
# on `device` and print the shape of the returned log attention weights.
if DEBUG > 4:
    with torch.no_grad():
        macros = {
            "ques_len": QUES_LEN,
            "hidden_dim": HIDDEN_DIM, 
            "vocab_size": VOCAB_SIZE, 
            "batch_size": BATCH_SIZE,
            "para_len": PARA_LEN,
            "embedding_dim": EMBEDDING_DIM,
            "lr": LR,
            "debug":5,
            "device":device
        }
        
        pointerDecoder = PointerDecoder(macros, device).cuda(device)
        # Random previous decoder state (1, batch, hidden) and paragraph summary (para_len, batch, hidden).
        h_ak = torch.randn(1,BATCH_SIZE,HIDDEN_DIM, device=device)
        H_r = torch.randn(PARA_LEN, BATCH_SIZE, HIDDEN_DIM, device=device)
        hidden = pointerDecoder.init_hidden(BATCH_SIZE, device)
        # The names h_ak and hidden are rebound to the values the decoder returns.
        h_ak, hidden, beta_k = pointerDecoder(h_ak, H_r, hidden, device)
        print (beta_k.shape)

In [None]:
# NOTE(review): scratch expression that mirrors the weighting step inside
# PointerDecoder.forward. It references `self` and `batch_size`, which no visible
# cell defines at notebook level, so running it as-is presumably raises NameError.
H_r.transpose(1,0) * \
(beta_k.view(batch_size, self.para_len, -1).repeat(1,1,self.hidden_dim))

# Pull the real data from disk.

Files stored in `./data/squad/train.ids.*`
Pull both train and test.

In [None]:
def _read_id_rows(path):
    """Read a file of whitespace-separated integers into a 1-D object array holding one list per line."""
    with open(path) as handle:
        rows = [[int(token) for token in line.split()] for line in handle]

    # Fill the object array element by element: rows have different lengths, and
    # recent numpy releases refuse to infer an array layout from ragged input.
    ragged = np.empty(len(rows), dtype=object)
    for position, row in enumerate(rows):
        ragged[position] = row
    return ragged


def _pad_examples(paras, questions, spans, para_len, ques_len):
    """Drop examples whose span falls outside para_len; zero-pad/truncate the rest and one-hot the span."""
    kept = [i for i in range(len(paras))
            if spans[i][0] < para_len and spans[i][1] < para_len]

    P = np.zeros((len(kept), para_len))
    Q = np.zeros((len(kept), ques_len))
    Y_start = np.zeros((len(kept), para_len))
    Y_end = np.zeros((len(kept), para_len))

    for row, i in enumerate(kept):
        p, q, y = paras[i], questions[i], spans[i]
        P[row, :min(para_len, len(p))] = p[:para_len]
        Q[row, :min(ques_len, len(q))] = q[:ques_len]
        Y_start[row, y[0]] = 1
        Y_end[row, y[1]] = 1

    return P, Q, Y_start, Y_end, len(paras) - len(kept)


def prepare_data(data_loc, macros, crop=None, max_train=10000):
    """
        Load the id files found in data_loc and turn them into padded, trainable matrices.

        Reads train.ids.question / train.ids.context / train.span and the matching
        val.* files, shuffles them, drops every example whose answer span starts or
        ends at an index >= macros['para_len'], and zero-pads / truncates paragraphs
        and questions to macros['para_len'] / macros['ques_len'].

        :param data_loc: folder containing the id files and glove.new.trimmed.300.npy
        :param macros: dict providing 'para_len', 'ques_len' and 'debug'
        :param crop: if given, `crop` examples are drawn at random (np.random.choice,
                     i.e. with replacement) from each split instead of shuffling the full split
        :param max_train: keep at most this many training rows (None keeps all);
                          the default matches the previously hard-coded 10000

        **return_type**: np matrices, in this order:
            train_P, train_Q, train_Y_start, train_Y_end,
            test_P, test_Q, test_Y_start, test_Y_end, vectors
    """

    # Unpacking macros
    para_len = macros['para_len']
    ques_len = macros['ques_len']
    debug = macros['debug']

    train_q = _read_id_rows(os.path.join(data_loc, 'train.ids.question'))
    train_p = _read_id_rows(os.path.join(data_loc, 'train.ids.context'))
    train_y = _read_id_rows(os.path.join(data_loc, 'train.span'))

    test_q = _read_id_rows(os.path.join(data_loc, 'val.ids.question'))
    test_p = _read_id_rows(os.path.join(data_loc, 'val.ids.context'))
    test_y = _read_id_rows(os.path.join(data_loc, 'val.span'))

    if debug > 3:
        print("Train Q: ", train_q.shape)
        print("Train P: ", train_p.shape)
        print("Train Y: ", train_y.shape)
        print("Test Q: ", test_q.shape)
        print("Test P: ", test_p.shape)
        print("Test Y: ", test_y.shape)

    # Shuffle data (train indices are drawn before test indices so the numpy
    # RNG is consumed in a fixed order).
    if crop:
        index_train = np.random.choice(np.arange(len(train_p)), crop)
        index_test = np.random.choice(np.arange(len(test_p)), crop)
    else:
        index_train, index_test = np.arange(len(train_p)), np.arange(len(test_p))
        np.random.shuffle(index_train)
        np.random.shuffle(index_test)

    train_p, train_q, train_y = train_p[index_train], train_q[index_train], train_y[index_train]
    test_p, test_q, test_y = test_p[index_test], test_q[index_test], test_y[index_test]

    if debug >= 5:
        print("Max q len: ", max(len(q) for q in train_q))

    # Pad and prepare
    train_P, train_Q, train_Y_start, train_Y_end, n_crop_train = _pad_examples(
        train_p, train_q, train_y, para_len, ques_len)
    test_P, test_Q, test_Y_start, test_Y_end, n_crop_test = _pad_examples(
        test_p, test_q, test_y, para_len, ques_len)

    if debug >= 1:
        print("Train Q: ", train_Q.shape)
        print("Train P: ", train_P.shape)
        print("Train Y: ", train_Y_start.shape)
        print("Test Q: ", test_Q.shape)
        print("Test P: ", test_P.shape)
        print("Test Y: ", test_Y_start.shape)
        print("Crop_train: ", n_crop_train)
        print("Crop_test: ", n_crop_test)

    # Let's free up some memory before the embedding matrix is loaded
    del train_p, train_q, train_y, test_p, test_q, test_y

    # Load embedding matrics
    vectors = np.load(os.path.join(data_loc, 'glove.new.trimmed.300.npy'))

    return (train_P[:max_train], train_Q[:max_train],
            train_Y_start[:max_train], train_Y_end[:max_train],
            test_P, test_Q, test_Y_start, test_Y_end, vectors)

In [None]:
# macros = {
#     "ques_len": QUES_LEN,
#     "hidden_dim": HIDDEN_DIM, 
#     "vocab_size": VOCAB_SIZE, 
#     "batch_size": BATCH_SIZE,
#     "para_len": PARA_LEN,
#     "embedding_dim": EMBEDDING_DIM,
#     "debug": 5
# } 

# a = prepare_data(DATA_LOC, macros=macros, crop=None)

# Training, and running the model
- Write a train fn
- Write a training loop invoking it
- Fill in real data

----------

Feats:
- Function to test every n epochs.
- Report train accuracy every epoch
- Store the train, test accuracy for every instance.


In [None]:
# Function to save the model
def save_model(loc, models, epochs=0, optimizer=None):
    """
        Save every model in `models` to `<loc>/<model_name>.torch`.

        Input:
            loc: str of the folder where the models are to be saved
                 (created if it does not exist yet)
            models: non-empty dict of 'model_name': model_object
            epochs, optimizers are int, torch.optims (discarded right now).

        Raises:
            AssertionError: if `models` is not a non-empty dict.
    """

    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if not isinstance(models, dict) or not models:
        raise AssertionError("models must be a non-empty dict of 'model_name': model_object")

    # torch.save does not create missing folders.
    if loc and not os.path.isdir(loc):
        os.makedirs(loc)

    # Doesn't save device/epochs/optimizer right now.
    for name, model in models.items():
        torch.save(model, os.path.join(loc, name+'.torch'))
    

In [None]:
def train(para_batch,
          ques_batch,
          answer_start_batch,
          answer_end_batch,
          ques_model,
          para_model,
          mlstm_model,
          pointer_decoder_model,
          optimizer, 
          loss_fn,
          macros,
          debug=2,
          train=True):

    """
    Run one batch through encoders -> match-LSTM -> pointer decoder, compute the
    loss and, when `train` is true, back-propagate and take an optimizer step.

    The module-level `device` is used for every tensor created here.

    :param para_batch: paragraphs (batch, max_seq_len_para) 
    :param ques_batch: questions corresponding to para (batch, max_seq_len_ques)
    :param answer_start_batch: one-hot tensor denoting pos of span start. It is reduced
        with .max(dim=2), so it must be 3-D - presumably (batch, 1, max_seq_len_para); TODO confirm
    :param answer_end_batch: one-hot tensor denoting pos of span end, same layout as the start tensor
    
    # Models
    :param ques_model: model to encode ques
    :param para_model: model to encode para
    :param mlstm_model: model to match para, ques to get para summary
    :param pointer_decoder_model: model to get a pointer over start and end span pointer
    
    # Loss and Optimizer.
    :param loss_fn: called as loss_fn(scores, target) with scores (batch, para_len) and
        class-index targets (batch,), once for the start and once for the end pointer
    :param optimizer: optimizer over the models' parameters
    
    :param macros: dict with 'batch_size', 'hidden_dim' and 'para_len'. 'batch_size' is
        temporarily overwritten with the actual batch size and always restored on exit.
    :param debug: verbosity; progress is printed when >= 2
    :param train: when false, the backward pass and the optimizer step are skipped
    
    :return: (beta_k_start, beta_k_end, loss), or None when an exception was raised
        (the traceback is printed instead of propagating)
    
    
    NOTE: When using MSE, 
        - target labels are one-hot
        - target label is float tensor
        - shape (batch, 1, len)
        
        When using CrossEntropy
        - target is not onehot
        - long
        - shape (batch, )
    """
    # Remember the configured batch size; the `finally` below puts it back on every exit path.
    configured_batch_size = macros['batch_size']
    try:
        # Temporarily use the batch size of the data actually received
        macros['batch_size'] = para_batch.shape[0]

        if debug >=2: 
            print("\tpara_batch:\t\t", para_batch.shape)
            print("\tques_batch:\t\t", ques_batch.shape)
            print("\tanswer_start_batch:\t", answer_start_batch.shape)
            print("\tanswer_end_batch:\t\t", answer_end_batch.shape)

        # Wiping all gradients
        optimizer.zero_grad()

        # Initializing all hidden states.
        hidden_quesenc = ques_model.init_hidden(macros['batch_size'], device)
        hidden_paraenc = para_model.init_hidden(macros['batch_size'], device)
        hidden_mlstm = mlstm_model.init_hidden(macros['batch_size'], device)
        hidden_ptrnet = pointer_decoder_model.init_hidden(macros['batch_size'], device)
        h_ri = torch.zeros((1, macros['batch_size'], macros['hidden_dim']), dtype=torch.float, device=device)
        h_ak = torch.zeros((1, macros['batch_size'], macros['hidden_dim']), dtype=torch.float, device=device)
        if debug >= 2: print("------------Instantiated hidden states------------")

        #passing the data through LSTM pre-processing layer
        H_q, ques_model_hidden = ques_model(ques_batch, hidden_quesenc, device=device)
        H_p, para_model_hidden = para_model(para_batch, hidden_paraenc, device=device)
        if debug >= 2: 
            print("\tH_q:\t\t", H_q.shape)
            print("\tH_p:\t\t", H_p.shape)
            print("\tH_ri:\t\t", h_ri.shape)
            print("------------Encoded hidden states------------")

        H_r = mlstm_model(H_p.view(-1, macros['batch_size'], 2*macros['hidden_dim']), h_ri, H_q, hidden_mlstm, device=device)
        if debug >= 2: print("------------Passed through matchlstm------------")

        # Unroll the pointer network twice: first step yields the start pointer, second the end pointer.
        h_ak, hidden_ptrnet, beta_k_start = pointer_decoder_model(h_ak, H_r, hidden_ptrnet, device=device)
        h_ak, hidden_ptrnet, beta_k_end = pointer_decoder_model(h_ak, H_r, hidden_ptrnet, device=device)
        if debug >= 2: print("------------Passed through pointernet------------")

        # Turn the one-hot targets into class indices of shape (batch, )
        _, answer_start_batch = answer_start_batch.max(dim=2)
        _, answer_end_batch = answer_end_batch.max(dim=2)
        answer_start_batch = answer_start_batch.view(-1).long()
        answer_end_batch = answer_end_batch.view(-1).long()

        # Calculate Loss
        loss = loss_fn(beta_k_start.view(-1, macros['para_len']), answer_start_batch)
        loss += loss_fn(beta_k_end.view(-1, macros['para_len']), answer_end_batch)
        if debug >= 2: print("------------Calculated loss------------")

        if train:
            loss.backward()
            if debug >= 2: print("------------Calculated Gradients------------")

            #optimization step
            optimizer.step()
            if debug >= 2: print("------------Updated weights.------------")

        return beta_k_start, beta_k_end, loss

    except Exception:
        # Best-effort behaviour is kept (report and return None), but KeyboardInterrupt
        # and SystemExit are no longer swallowed as they were by the bare `except:`.
        traceback.print_exc()
        return None

    finally:
        macros['batch_size'] = configured_batch_size

In [None]:
def custom_train(para_batch,
                 ques_batch,
                 answer_start_batch,
                 answer_end_batch,
                 matchlstm_model,
                 optimizer, 
                 loss_fn,
                 macros,
                 debug=2,
                 train=True):

    """
    Run one training (or evaluation) step of a single end-to-end model.
    
    :param para_batch: paragraphs (batch, max_seq_len_para) 
    :param ques_batch: questions corresponding to para (batch, max_seq_len_ques)
    :param answer_start_batch: one-hot vector denoting pos of span start; it is
        arg-maxed over dim=2, so it must be 3-D, e.g. (batch, 1, max_seq_len_para)
    :param answer_end_batch: one-hot vector denoting pos of span end (same layout)
    
    # Model
    :param matchlstm_model: callable as model(ques_batch, para_batch) returning a
        3-tuple whose first item is fed to the loss and whose second item `beta`
        is indexed as beta[:, 0] (start) and beta[:, 1] (end)
    
    # Loss and Optimizer.
    :param loss_fn: callable as loss_fn(prediction, target)
    :param optimizer: optimizer over the model parameters
    
    # Misc
    :param macros: config dict. 'batch_size' is temporarily overwritten with the
        real size of this batch and always restored before returning. An optional
        'clip_grad_max' entry gives the max gradient norm (falls back to a
        module-level `clip_grad_max` if one exists; no clipping otherwise).
    :param debug: verbosity; >= 2 prints shapes and progress markers
    :param train: when False, the loss is computed but no backward/step happens
    
    :return: (beta[:, 0], beta[:, 1], loss), or None if the step raised
        (the traceback is printed).
    """
    # Temporarily infer batch size, use and then at last, put this value back
    configured_batch_size = macros['batch_size']
    macros['batch_size'] = para_batch.shape[0]

    try:
        if debug >=2: 
            print("\tpara_batch:\t\t", para_batch.shape)
            print("\tques_batch:\t\t", ques_batch.shape)
            print("\tanswer_start_batch:\t", answer_start_batch.shape)
            print("\tanswer_end_batch:\t\t", answer_end_batch.shape)

        # Wiping all gradients
        optimizer.zero_grad()

        # Passing the data through the model that was handed in (the previous
        # version silently used a global called `model` instead).
        ans_range_prop, beta, _ = matchlstm_model(ques_batch, para_batch)

        # One-hot targets -> class indices of shape (batch,)
        _, start_index = answer_start_batch.max(dim=2)
        _, end_index = answer_end_batch.max(dim=2)
        start_index = start_index.view(-1).long()
        end_index = end_index.view(-1).long()

        # Pair them as (batch, 2) so that column 0 / 1 line up with
        # beta[:, 0] / beta[:, 1]. Stays a torch tensor on the original device
        # (np.hstack produced a flat NumPy array and cannot read CUDA tensors).
        # NOTE(review): assumes loss_fn wants (batch, 2) index targets -- confirm
        # against the loss implementation in use.
        answer_range_batch = torch.stack((start_index, end_index), dim=1)

        # passing it through custom loss function
        loss = loss_fn(ans_range_prop, answer_range_batch)
        if debug >= 2: print("------------Calculated loss------------")

        if train:
            loss.backward()
            if debug >= 2: print("------------Calculated Gradients------------")

            # fix gradient explosion (only when a limit is configured)
            max_norm = macros.get('clip_grad_max', globals().get('clip_grad_max'))
            if max_norm is not None:
                torch.nn.utils.clip_grad_norm_(matchlstm_model.parameters(), max_norm)

            optimizer.step()
            if debug >= 2: print("------------Updated weights.------------")

        return beta[:,0], beta[:,1], loss

    except Exception:
        # Best effort: report and return None. KeyboardInterrupt is deliberately
        # NOT caught here so that the caller can stop training.
        traceback.print_exc()
        return None

    finally:
        macros['batch_size'] = configured_batch_size

In [None]:
# Predict function (no grad, no eval)
def predict(para_batch,
            ques_batch,
            ques_model,
            para_model,
            mlstm_model,
            pointer_decoder_model,
            macros,
            loss_fn=None,
            debug=DEBUG):
    """
        Forward-only pass through the four sub-models for a batch of P&Q's.

        Nothing is decoded to strings; the raw pointer outputs are returned.
        Runs under torch.no_grad() and does not touch the models' train/eval mode.

        Expects:
            para_batch, ques_batch: the data (batch size is read from ques_batch.shape[0])
            ques_model, para_model, mlstm_model, pointer_decoder_model: the four models
            macros: misc macros; only 'hidden_dim' is read here
            loss_fn: accepted for signature compatibility, not used
            debug: verbosity; >= 2 prints shapes and progress markers

        Returns:
            (beta_k_start, beta_k_end, 0.0) -- the third slot is a placeholder
            where the training function returns a loss.
    """
    # The real batch size comes from the data, not from macros['batch_size'].
    n_samples = ques_batch.shape[0]
    hidden_dim = macros['hidden_dim']
    verbose = debug >= 2

    if verbose:
        print("\tpara_batch:\t\t", para_batch.shape)
        print("\tques_batch:\t\t", ques_batch.shape)

    with torch.no_grad():

        # Fresh hidden states for every sub-model, plus zeroed recurrent inputs.
        ques_state = ques_model.init_hidden(n_samples, device)
        para_state = para_model.init_hidden(n_samples, device)
        match_state = mlstm_model.init_hidden(n_samples, device)
        pointer_state = pointer_decoder_model.init_hidden(n_samples, device)
        h_ri = torch.zeros((1, n_samples, hidden_dim), dtype=torch.float, device=device)
        h_ak = torch.zeros((1, n_samples, hidden_dim), dtype=torch.float, device=device)
        if verbose:
            print("------------Instantiated hidden states------------")

        # LSTM pre-processing layer: encode question and paragraph.
        H_q, _ques_state_out = ques_model(ques_batch, ques_state, device)
        H_p, _para_state_out = para_model(para_batch, para_state, device)
        if verbose:
            print("\tH_q:\t\t", H_q.shape)
            print("\tH_p:\t\t", H_p.shape)
            print("\tH_ri:\t\t", h_ri.shape)
            print("------------Encoded hidden states------------")

        # Match-LSTM over the paragraph encoding, viewed as (-1, batch, 2 * hidden).
        paragraph_view = H_p.view(-1, n_samples, 2 * hidden_dim)
        H_r = mlstm_model(paragraph_view, h_ri, H_q, match_state, device)
        if verbose:
            print("------------Passed through matchlstm------------")

        # Two pointer-network steps: the first yields the start pointer, the
        # second (fed with the updated h_ak / hidden state) the end pointer.
        h_ak, pointer_state, beta_k_start = pointer_decoder_model(h_ak, H_r, pointer_state, device)
        _h_unused, _state_unused, beta_k_end = pointer_decoder_model(h_ak, H_r, pointer_state, device)
        if verbose:
            print("------------Passed through pointernet------------")

        return (beta_k_start, beta_k_end, 0.0)


In [None]:
# Eval function (no grad no eval no nothing)
def eval(y_cap, y):
    """ 
        Score predicted answer spans against the true spans.

        NOTE: this intentionally keeps the name `eval` (callers use it), which
        shadows the Python builtin inside this module.

        Inputs:
        - y_cap: pair of tensors (start, end) of dim [BATCH_SIZE, PARA_LEN] each;
                 the arg-max over dim=1 is taken as the predicted position.
        - y: pair of tensors (start, end) of dim [BATCH_SIZE, PARA_LEN] each
             (e.g. one-hot); the arg-max over dim=1 is the true position.
          A 1-D tensor (a single squeezed sample) is treated as a batch of one.

        A span is the *inclusive* token range [start, end]. If start > end the
        two are swapped, for predictions and for the truth alike.

        Returns a dict with batch-averaged
        - 'em': exact match (both start and end agree)
        - 'p':  overlap / predicted span length
        - 'r':  overlap / true span length
        - 'f1': harmonic mean of p and r per sample (0 when there is no overlap)
    """

    def _spans(pair):
        # (start, end) tensors -> int array of shape (batch, 2), start <= end.
        columns = []
        for scores in pair:
            # Explicit detach + cpu: NumPy cannot read CUDA tensors directly.
            scores = scores.detach().cpu()
            if scores.dim() == 1:
                scores = scores.unsqueeze(0)
            columns.append(torch.argmax(scores, dim=1).numpy())
        # Sorting each row swaps start/end wherever start > end.
        return np.sort(np.stack(columns, axis=1), axis=1)

    pred = _spans(y_cap)
    true = _spans(y)

    exact = np.all(pred == true, axis=1)

    # Number of tokens shared by the two inclusive ranges (0 if disjoint).
    # The previous implementation used range(start, end), which dropped the end
    # token for every span with start != end and so mis-scored p / r / f1.
    overlap = np.minimum(pred[:, 1], true[:, 1]) - np.maximum(pred[:, 0], true[:, 0]) + 1
    overlap = np.clip(overlap, 0, None).astype(float)

    # Inclusive spans always hold at least one token, so no division by zero.
    precision = overlap / (pred[:, 1] - pred[:, 0] + 1)
    recall = overlap / (true[:, 1] - true[:, 0] + 1)

    pr_sum = precision + recall
    f_scores = np.divide(2 * precision * recall, pr_sum,
                         out=np.zeros_like(pr_sum), where=pr_sum > 0)

    metrics = {'em': np.mean(exact),
               'p': np.mean(precision),
               'r': np.mean(recall),
               'f1': np.mean(f_scores)}

    if DEBUG >= 3: 
        print("Test performance: ", metrics)
        print("------------Evaluated------------")
        
    return metrics

# Quick manual smoke test of eval() on a hand-made batch of two samples.
# `if True:` is just a switch: flip it to False to skip the check.
if True:
    # Testing this function
    metrics = {'em':None}
#     y = torch.tensor([[3]]).float(), torch.tensor([[4]]).float()
    # y: (start, end) truth rows; the largest entry in each row marks the position.
    y = torch.tensor([[0,0,3,0], [0,2,0,0]]), torch.tensor([[0,0,0,3], [0,0,0,3]])
    # y_cap: (start, end) prediction scores with the same layout.
    y_cap = torch.tensor([[3,0,0,0],[0,0,3,0]]), torch.tensor([[0,2,0,1],[0,1,0,0]])
#     y = torch.randint(0, PARA_LEN, (BATCH_SIZE,)).float(), torch.randint(0, PARA_LEN, (BATCH_SIZE,)).float()
#     y_cap = torch.rand((BATCH_SIZE, PARA_LEN)), torch.rand((BATCH_SIZE, PARA_LEN))
    print(eval(y_cap, y))   

In [None]:
def training_loop(_models, _data, _macros, _epochs, _save=0, _test_eval=0, _train_eval=0, _debug=2):
    """
        Train the four sub-models and collect losses / metrics / traces.

        > Instantiate loss, optimizer
        > Instantiate ways to store loss

        > Per epoch
            > sample int(len(train) / batch_size) random mini-batches (with
              replacement) and give each to train()
            > get loss (and em / f1 per mini-batch if _train_eval)
            > if _test_eval and epoch % _test_eval == 0: evaluate on the test split

        :param _models: [ques_model, para_model, mlstm_model, pointer_decoder_model]
        :param _data: {'train': {...}, 'test': {...}}, each with keys 'P', 'Q', 'Ys', 'Ye'
        :param _macros: config dict; 'batch_size', 'para_len', 'lr', 'debug' and
                        'save_model_loc' are read here
        :param _epochs: number of epochs to run
        :param _save: int
            > 0: no conditional checkpoint
            > 1+: save every _save epoch (overwrite)
            > -1 -> save best test f1 (turned to 1 if test evals dont happen.)
        :param _test_eval: evaluate on the test split every _test_eval epochs (0: never)
        :param _train_eval: if truthy, compute em / f1 on every training mini-batch
        :param _debug: currently unused; the verbosity handed to train() is
                       _macros['debug']

        NOTE: independently of _save, a checkpoint is written after *every*
        epoch (the block marked TEMP below, kept from the original code).

        :return: always the 9-tuple
            (train_losses, train_em, train_f, test_losses, test_em, test_f,
             best_test, traces_train, traces_test)
            even after Ctrl+C or an error (the traceback is printed), holding
            whatever was computed so far. train_* are per-epoch lists of
            per-batch values; test_* hold one value per evaluated epoch;
            traces_* stay None until a test f1 above 0.0 is seen.
    """

    # Unpack data
    train_P = _data['train']['P']
    train_Q = _data['train']['Q']
    train_Y_start = _data['train']['Ys']
    train_Y_end = _data['train']['Ye']
    test_P = _data['test']['P']
    test_Q = _data['test']['Q']
    test_Y_start = _data['test']['Ys']
    test_Y_end = _data['test']['Ye']

    ques_model, para_model, mlstm_model, pointer_decoder_model = _models
    _data = None  # drop the local reference to the data dict

    # Everything is read from the _macros argument (not the global `macros` /
    # BATCH_SIZE). train() restores _macros['batch_size'] after each call, so
    # reading it once is safe.
    batch_size = _macros['batch_size']
    para_len = _macros['para_len']
    batches_per_epoch = int(len(train_P) / batch_size)

    # "save best" needs test evaluations; without them fall back to every epoch.
    if _save == -1 and not _test_eval:
        _save = 1

    # Instantiate Loss
    loss_fn = nn.NLLLoss()

    # Encoders may hold frozen parameters, so only their trainable ones are
    # optimised; the other two models contribute all of theirs.
    trainable = [p for p in ques_model.parameters() if p.requires_grad]
    trainable += [p for p in para_model.parameters() if p.requires_grad]
    trainable += list(mlstm_model.parameters())
    trainable += list(pointer_decoder_model.parameters())
    optimizer = optim.Adam(trainable, lr=_macros['lr'])

    def _checkpoint(epoch):
        # Persist all four models plus optimizer state for this epoch.
        models = { 'ques_model': ques_model,
                   'para_model': para_model,
                   'mlstm_model':  mlstm_model,
                   'pointer_decoder_model': pointer_decoder_model
                 }
        save_model(_macros['save_model_loc'], models,
                   epochs=epoch,
                   optimizer=optimizer)
        print("Saving new model on epoch %d" % epoch)

    # Losses
    train_losses = []
    train_em = []
    train_f = []
    test_losses = []
    test_em = []
    test_f = []
    best_test = 0.0
    traces_test = None
    traces_train = None

    try: 

        # Training Loop
        for epoch in range(_epochs):
            print("Epoch: ", epoch, "/", _epochs)

            epoch_loss = []
            epoch_train_em = []
            epoch_train_f = []
            epoch_time = time.time()
            epoch_traces = {'true':{'start':[], 'end':[]}, 'pred':{'start':[], 'end':[]}}
            epoch_traces_train = {'true':{'start':[], 'end':[]}, 'pred':{'start':[], 'end':[]}}
            found_best_test = False
            run_test_eval = bool(_test_eval) and epoch % _test_eval == 0
            report_train_eval = bool(_train_eval) and epoch % _train_eval == 0

            for batch_no in range(batches_per_epoch):

                batch_time = time.time()

                # Sample batch and train on it
                sample_index = np.random.randint(0, len(train_P), batch_size)

                y_cap_start, y_cap_end, loss = train(
                    para_batch = torch.tensor(train_P[sample_index], dtype=torch.long, device=device),
                    ques_batch = torch.tensor(train_Q[sample_index], dtype=torch.long, device=device),
                    answer_start_batch = torch.tensor(train_Y_start[sample_index], dtype=torch.float, device=device).view(batch_size, 1, para_len),
                    answer_end_batch = torch.tensor(train_Y_end[sample_index], dtype=torch.float, device=device).view(batch_size, 1, para_len),
                    ques_model = ques_model,
                    para_model = para_model,
                    mlstm_model = mlstm_model,
                    pointer_decoder_model = pointer_decoder_model,
                    optimizer = optimizer, 
                    loss_fn= loss_fn,
                    macros=_macros,
                    debug=_macros['debug']
                )

                epoch_traces_train['true']['start'].append(train_Y_start[sample_index])
                epoch_traces_train['true']['end'].append(train_Y_end[sample_index])
                epoch_traces_train['pred']['start'].append(y_cap_start.cpu().detach().numpy())
                epoch_traces_train['pred']['end'].append(y_cap_end.cpu().detach().numpy())

                if _train_eval: 

                    # Calculate train accuracy for this minibatch
                    metrics = eval(
                        y=(torch.tensor(train_Y_start[sample_index], device=device),
                            torch.tensor(train_Y_end[sample_index], device=device)),
                        y_cap=[y_cap_start.squeeze(), y_cap_end.squeeze()])

                    epoch_train_em.append(metrics['em'])
                    epoch_train_f.append(metrics['f1'])

                epoch_loss.append(loss.item())

                print("Batch:\t%d" % batch_no,"/%d\t\b: " % (len(train_P)/batch_size),
                      str("%s" % (time.time() - batch_time))[:8], 
                      str("\t\b%s" % (time.time() - epoch_time))[:10], 
                      "\tl:%f" % loss.item(),
                      "\tem:%f" % epoch_train_em[-1] if _train_eval else "",
                      "\t\bf1:%f" % epoch_train_f[-1] if _train_eval else "")

            train_losses.append(epoch_loss)

            if _train_eval: 
                train_em.append(epoch_train_em)
                train_f.append(epoch_train_f)

            # TEMP (kept from the original): save the model after every epoch,
            # whatever _save says.
            # NOTE(review): this overwrites a "best" checkpoint written by the
            # conditional logic below on an earlier epoch -- drop it once the
            # _save semantics are to be trusted.
            _checkpoint(epoch)

            if run_test_eval:

                metrics_epoch_test = []
                test_batch_losses = []

                # Evaluation only: no autograd graph needed.
                with torch.no_grad():
                    for i_batch in range(0, len(test_P), batch_size):
                        test_p = test_P[i_batch: i_batch+batch_size]
                        test_q = test_Q[i_batch: i_batch+batch_size]
                        test_y_start = test_Y_start[i_batch: i_batch+batch_size]
                        test_y_end =  test_Y_end[i_batch: i_batch+batch_size]

                        # train(..., train=False) computes outputs and loss
                        # without backward / optimizer step.
                        y_cap_start, y_cap_end, test_loss = train(
                            para_batch = torch.tensor(test_p, dtype=torch.long, device=device),
                            ques_batch = torch.tensor(test_q, dtype=torch.long, device=device),
                            answer_start_batch = torch.tensor(test_y_start, dtype=torch.float, device=device).view(-1, 1, para_len),
                            answer_end_batch = torch.tensor(test_y_end, dtype=torch.float, device=device).view(-1, 1, para_len),
                            ques_model = ques_model,
                            para_model = para_model,
                            mlstm_model = mlstm_model,
                            pointer_decoder_model = pointer_decoder_model,
                            optimizer = optimizer, 
                            loss_fn= loss_fn,
                            macros=_macros,
                            debug=_macros['debug'],
                            train=False
                        )
                        metrics = eval(
                            y=(torch.tensor(test_y_start, device=device),
                                torch.tensor(test_y_end, device=device)),
                            y_cap=[y_cap_start.squeeze(), y_cap_end.squeeze()])
                        metrics_epoch_test.append(metrics)
                        test_batch_losses.append(test_loss.item())

                        epoch_traces['true']['start'].append(test_y_start)
                        epoch_traces['true']['end'].append(test_y_end)
                        epoch_traces['pred']['start'].append(y_cap_start.cpu().detach().numpy())
                        epoch_traces['pred']['end'].append(y_cap_end.cpu().detach().numpy())

                # Find em and f1
                em = np.mean([metric['em'] for metric in metrics_epoch_test])
                f1 = np.mean([metric['f1'] for metric in metrics_epoch_test])

                # Mean over all test batches (previously only the last batch's
                # loss was recorded).
                test_losses.append(float(np.mean(test_batch_losses)))
                test_em.append(em)
                test_f.append(f1)

                # Check if we outperformed the best one.
                if f1 > best_test:
                    found_best_test = True
                    best_test = f1

                    # Update traces for the best one.
                    traces_train = epoch_traces_train
                    traces_test = epoch_traces

            # Saving logic
            if (_save > 0 and epoch % _save == 0) or \
               (_save == -1 and found_best_test):
                _checkpoint(epoch)

            # At the end of every epoch, do print the average epoch loss, and other stat.
            # The test loss is only shown once at least one test evaluation has
            # happened (indexing the empty list used to abort training).
            print("\nEpoch performance: ",
                  "%ssec" % str(time.time() - epoch_time)[:6],
                  "Trl:%f" % np.mean(epoch_loss, axis=0),
                  "Tel:%f" % test_losses[-1] if test_losses else "",
                  "\n\tTrem:%f" % np.mean(epoch_train_em) if report_train_eval else "",
                  "\tTrf1:%f" % np.mean(epoch_train_f) if report_train_eval else "",
                  "\tTeem:%f" % test_em[-1] if run_test_eval else "",
                  "\tTef1:%f\n" % test_f[-1] if run_test_eval else "\n")

    except KeyboardInterrupt:

        # someone called a ctrl+c on it. Let's return the things computed so far at least.
        print("Found keyboard interrupt. Stopping training loop")

    except Exception:
        traceback.print_exc()

    return train_losses, train_em, train_f, test_losses, test_em, test_f, best_test, traces_train, traces_test
            
            

In [None]:
def visualize_loss(loss, loss2=None, _label="Some label", _label2="Some other label", _name="Generic Name", _only_epoch=True):
    """
        Fn to visualize loss (or any other per-epoch / per-batch metric).

        Expects, for `loss` and optionally `loss2`, either
            - [num, num, ...] for epoch level stuff
            - [ [num, num], [num, num] ] for batch level data (one inner list per epoch).
        The format is detected per series, so the two may differ.

        :param loss: first series, drawn in blue
        :param loss2: optional second series, drawn in red (skipped if None or empty)
        :param _label: legend label of `loss`
        :param _label2: legend label of `loss2`
        :param _name: y-axis label
        :param _only_epoch: for batch level data, plot the mean of every epoch
            when True; otherwise plot every batch value back to back.

        An empty `loss` no longer raises; it simply produces an empty plot.
    """

    def _as_series(values):
        # Reduce one input to a flat list of numbers ready for plotting.
        if values is None:
            return None
        values = list(values)
        if values and isinstance(values[0], (list, tuple)):
            if _only_epoch:
                return [np.mean(epoch_values) for epoch_values in values]
            return [value for epoch_values in values for value in epoch_values]
        return values

    series = _as_series(loss)
    series2 = _as_series(loss2)

    plt.rcParams['figure.figsize'] = [15, 8] 

    plt.plot(series, '-b', label=_label)
    if series2:
        plt.plot(series2, '-r', label=_label2)
    plt.ylabel(_name)
    plt.legend(loc='upper left')
    plt.show()

In [None]:
def visualize_traces(trace):
    """
        Plot confusion matrices of true vs predicted positions, first for the
        span start and then for the span end.

        :param trace: {'true': {'start': [...], 'end': [...]},
                       'pred': {'start': [...], 'end': [...]}}
            where every list holds one array per batch. Each array is reshaped
            to (-1, macros['para_len']) and arg-maxed along axis 1.
    """

    def _positions(batches):
        # NOTE(review): the last batch is dropped ([:-1]) exactly as before --
        # presumably a workaround for a differently-sized final batch; confirm
        # whether it is still needed.
        per_batch = [np.argmax(chunk.reshape(-1, macros['para_len']), axis=1)
                     for chunk in batches[:-1]]
        return np.hstack(per_batch)

    for heading, key in (('Start', 'start'), ('End', 'end')):
        print(heading)
        expected = _positions(trace['true'][key])
        predicted = _positions(trace['pred'][key])
        cm = confusion_matrix(expected, predicted)
        cm.plot()
        plt.show()

## Orchestrator

One cell which instantiates and runs everything

In [None]:
"""
    Cell which pulls everything together.

    > init models
    > get data prepared
    > pass models and data to training loop
    > gets trained models and loss
    > saves models
    > visualizes loss?

No other function but this one ever sees global macros!
"""
# Single configuration dict built from the module-level constants. It is
# handed to the data preparation, the models and the training loop below.
macros = {
    "ques_len": QUES_LEN,
    "hidden_dim": HIDDEN_DIM, 
    "vocab_size": VOCAB_SIZE, 
    "batch_size": BATCH_SIZE,
    "para_len": PARA_LEN,
    "embedding_dim": EMBEDDING_DIM,
    "lr": LR,
    "debug":DEBUG,
    "save_model_loc": MODEL_LOC,
    "epochs": EPOCHS
#     "device": device
} 

In [None]:
# Load the dataset into the nested layout training_loop() expects:
# data[split][key] with split in {'train', 'test'} and key in
# {'P', 'Q', 'Ys', 'Ye'}. prepare_data() also returns `vectors`, which is
# passed to the Encoder constructors further down.
data = {'train':{}, 'test':{}}
data['train']['P'], data['train']['Q'], data['train']['Ys'], data['train']['Ye'], \
data['test']['P'], data['test']['Q'], data['test']['Ys'], data['test']['Ye'], vectors = \
    prepare_data(DATA_LOC, macros, crop=CROP)

In [None]:
def stringify(index,test=False):
    """
        Placeholder, not implemented yet: both arguments are ignored and the
        function always returns None.
    """
    return None

In [None]:
# # Instantiate models
# Instantiate the four sub-models and move them to the GPU.
# Question and paragraph get separate Encoder instances, each built with its
# own sequence length; both receive the same `vectors` from prepare_data().
ques_model = Encoder(QUES_LEN, macros, vectors, device).cuda(device)
para_model = Encoder(PARA_LEN, macros, vectors, device).cuda(device)
mlstm_model = MatchLSTMEncoder(macros, device).cuda(device)
pointer_decoder_model = PointerDecoder(macros, device).cuda(device)

In [None]:
# Echo the configuration dict as the cell output (sanity check before training).
macros

In [None]:
# Run the training. `op` is the 9-tuple returned by training_loop():
#   0 train_losses, 1 train_em, 2 train_f,
#   3 test_losses,  4 test_em,  5 test_f,
#   6 best_test,    7 traces_train, 8 traces_test
# _save=-1: checkpoint when the test f1 improves; _test_eval=1: evaluate on the
# test split every epoch; _train_eval=1: compute em / f1 on training batches.
op = training_loop(_models=[ques_model, para_model, mlstm_model, pointer_decoder_model],
                   _data=data,
                   _debug=macros['debug'],
                   _save=-1,
                   _test_eval=1,
                   _train_eval=1,
                   _epochs=macros['epochs'],
                   _macros=macros)    

#### Visualizations

So far, we plot the training loss. 
Shall we superimpose test loss on it too? We don't calculate test loss per batch though (fortunately).

In [None]:
# Visualizations
# `op` is the tuple returned by training_loop():
#   0 train_losses, 1 train_em, 2 train_f,
#   3 test_losses,  4 test_em,  5 test_f,
#   6 best_test,    7 traces_train, 8 traces_test
print("Training Loss")
visualize_loss(loss=op[0], _name="train loss", _only_epoch=True)
# loss2=op[3], _label="train loss", _label2="test_loss",

print("Test Loss")
# Test losses live at index 3 (index 4, plotted here before, is test exact-match).
visualize_loss(loss=op[3], _name="test loss", _only_epoch=True)

print("Exact Match")
visualize_loss(loss=op[1], loss2=op[4], _label="train", _label2="test", _name="Exact Match", _only_epoch=True)

print("F-Measure")
visualize_loss(loss=op[2], loss2=op[5], _label="train", _label2="test", _name="F-Measure")

# Traces are only recorded once a test evaluation improved on the best f1,
# so they can still be None after a short or interrupted run.
print("Conf Mat Train")
if op[7] is not None:
    visualize_traces(op[7])
else:
    print("No training traces recorded.")

print("Conf Mat Test")
if op[8] is not None:
    visualize_traces(op[8])
else:
    print("No test traces recorded.")

In [None]:
# Ad-hoc inspection: op[-1] is the last item returned by training_loop(),
# i.e. the test traces of the best epoch.
a = op[-1]
# List with one array of true start labels per test batch.
b = a['true']['start']
# True start position of every sample in the second test batch.
np.argmax(b[1], axis=1)

In [None]:
# pickle.dump(op, open('./performance/domain-noglove-17-07-2018/op.dump', 'wb+'))
# best_test (the best test f1) is item 6 of the tuple returned by
# training_loop(); op[-1], printed here before, is the test traces dict.
print("Best F1: ", op[6])

In [None]:
# # Testing (temp)
# models = { 'ques_model': ques_model,
#            'para_model': para_model,
#            'mlstm_model':  mlstm_model,
#            'pointer_decoder_model': pointer_decoder_model
#          }
# save_model(loc=macros['save_model_loc'], models=models, epochs=0)