In [1]:
import numpy as np
import torch
from torch.autograd import Variable


# Global vars
hidden_size = 100
input_size = 200
num_differing_questions = 20

# Produces tensor [1 x num_words x input_size] for one particular question
def get_question_matrix(questionID, word2vec, id2Data, input_size):
    # Get the vector representation for each word in this question as list [v1,v2,v3,...]
    q_word_vecs = []
    for word in id2Data[questionID]:
        try:
            word_vec = np.array(word2vec[word]).astype(np.float32).reshape(len(word2vec[word]), -1)
            q_word_vecs.append(word_vec)
        except KeyError:
            pass

    # num_words x dim_words
    q_matrix = torch.Tensor(np.concatenate(q_word_vecs, axis=1).T)
    num_words_found = q_matrix.size()[0]
    
    if num_words_found < 100:
        padding_rows = torch.zeros(100-num_words_found, input_size)
        q_matrix = torch.cat((q_matrix, padding_rows), 0)
    
    return [q_matrix.unsqueeze(0), num_words_found]


# Given ids of main qs in this batch
# Returns:
# 1. ids in ordered list as: 
# [ q_1+, q_1-, q_1--,..., q_1++, q_1-, q_1--,...,
# q_2+, q_2-, q_2--,..., q_2++, q_2-, q_2--,...,]
# All n main questions have their pos,neg,neg,neg,... interleaved
# 2. A dict mapping main question id --> its interleaved sequence length
def organize_ids_training(q_ids, data):
    sequence_ids = []
    dict_sequence_lengths = {}
    
    for q_main in q_ids:
        p_pluses = data[q_main][0]
        p_minuses = list(np.random.choice(data[q_main][1], num_differing_questions, replace = False))
        sequence_length = len(p_pluses) * num_differing_questions + len(p_pluses)
        dict_sequence_lengths[q_main] = sequence_length
        for p_plus in p_pluses:
            sequence_ids += [p_plus] + p_minuses

    return sequence_ids, dict_sequence_lengths


# Given ids of main qs in this batch
# Returns:
# 1. ids of the 20 questions for each q_main
# Note: Varying number of p_plus
# 2. A dict mapping main question id --> its p_pluses ids
def organize_test_ids(q_ids, data):
    sequence_ids = []
    dict_p_pluses = {}
    
    for i, q_main in enumerate(q_ids):
        all_p = data[q_main][1]
        p_pluses = data[q_main][0]
        p_pluses_indices = []
        for pos_id in p_pluses:
            p_pluses_indices += [all_p.index(pos_id)] 
        sequence_ids += all_p
        dict_p_pluses[i] = p_pluses_indices
        
    return sequence_ids, dict_p_pluses


# A tuple is (q+, q-, q--, q--- ...)
# Let all main questions be set Q
# Each q in Q has a number of tuples equal to number of positives |q+, q++, ...|
# Each q in Q will have a 2D matrix of: num_tuples x num_candidates_in_tuple
# Concatenate this matrix for all q in Q and you get a matrix of: |Q| x num_tuples x num_candidates_in_tuple

# The above is for candidates
# To do cosine_similarity, need same structure with q's
# Basically each q will be a matrix of repeated q's: num_tuples x num_candidates_in_tuple, all elts are q (repeated)

# This method constructs those matrices, use candidates=True for candidates matrix
def construct_qs_matrix_training(q_ids_sequential, lstm, h0, c0, word2vec, id2Data, dict_sequence_lengths, candidates=False):
    if not candidates:
        q_ids_complete = []
        for q in q_ids_sequential:
            q_ids_complete += [q] * dict_sequence_lengths[q]
    
    else: q_ids_complete = q_ids_sequential

    qs_matrix_list = []
    qs_seq_length = []
    
    for q in q_ids_complete:
        q_matrix_3d, q_num_words = get_question_matrix(q, word2vec, id2Data, input_size)
        qs_matrix_list.append(q_matrix_3d)
        qs_seq_length.append(q_num_words)

    qs_padded = Variable(torch.cat(qs_matrix_list, 0))
    print('qs_padded', qs_padded)
    qs_hidden = lstm(qs_padded, (h0, c0)) # [ [num_q, num_word_per_q, hidden_size] i.e. all hidden, [1, num_q, hidden_size]  i.e. final hidden]
    print('qs_hidden', qs_hidden)
    sum_h_qs = torch.sum(qs_hidden[0], dim=1)
    print('sum', sum_h_qs)
    mean_pooled_h_qs = torch.div(sum_h_qs, torch.autograd.Variable(torch.FloatTensor(qs_seq_length)[:, np.newaxis]))
    print('mean', mean_pooled_h_qs)
    qs_tuples = mean_pooled_h_qs.split(1+num_differing_questions)
    print('tuples', qs_tuples)
    final_matrix_tuples_by_constituent_qs_by_hidden_size = torch.stack(qs_tuples, dim=0, out=None)
    print('final', final_matrix_tuples_by_constituent_qs_by_hidden_size)
    sss
    return final_matrix_tuples_by_constituent_qs_by_hidden_size


# Case candidates: gives a matrix with a row for each q_main, with 20 p's
# Case not candidates: gives a matrix with a row for each q_main, with 20 q_main's repeated
def construct_qs_matrix_testing(q_ids_sequential, lstm, h0, c0, word2vec, id2Data, candidates=False):
    num_ps_per_q = 20
    
    if not candidates:
        q_ids_complete = []
        for q in q_ids_sequential:
            q_ids_complete += [q] * num_ps_per_q
    
    else: q_ids_complete = q_ids_sequential

    qs_matrix_list = []
    qs_seq_length = []
    
    for q in q_ids_complete:
        q_matrix_3d, q_num_words = get_question_matrix(q, word2vec, id2Data, input_size)
        qs_matrix_list.append(q_matrix_3d)
        qs_seq_length.append(q_num_words)

    qs_padded = Variable(torch.cat(qs_matrix_list, 0))
    qs_hidden = lstm(qs_padded, (h0, c0)) # [ [num_q, num_word_per_q, hidden_size] i.e. all hidden, [1, num_q, hidden_size]  i.e. final hidden]
    sum_h_qs = torch.sum(qs_hidden[0], dim=1)
    mean_pooled_h_qs = torch.div(sum_h_qs, torch.autograd.Variable(torch.FloatTensor(qs_seq_length)[:, np.newaxis]))
    qs_tuples = mean_pooled_h_qs.split(num_ps_per_q)
    final_matrix_tuples_by_constituent_qs_by_hidden_size = torch.stack(qs_tuples, dim=0, out=None)
    
    return final_matrix_tuples_by_constituent_qs_by_hidden_size

In [2]:
from preprocess import *
from scoring_metrics import *

import torch
from torch.autograd import Variable

import time

saved_model_name = "bestbest"


'''Hyperparams dashboard'''
dropout = 0.2
margin = 0.4
lr = 10**-3


''' Data Prep '''
word2vec = get_words_and_embeddings()
id2Data = questionID_to_questionData_truncate(100)

training_data = training_id_to_similar_different()
trainingQuestionIds = list(training_data.keys())[:100]

dev_data = devTest_id_to_similar_different(dev=True)
dev_question_ids = list(dev_data.keys())[:20]

test_data = devTest_id_to_similar_different(dev=False)
test_question_ids = list(test_data.keys())


''' Model Specs '''
input_size = len(word2vec[list(word2vec.keys())[0]])
hidden_size = 100
num_layers = 1
bias = True
batch_first = True
bidirectional = False

lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional)
loss_function = torch.nn.MultiMarginLoss(margin=margin)
optimizer = torch.optim.Adam(lstm.parameters(), lr=lr)

first_dim = num_layers * 2 if bidirectional else num_layers
h0 = Variable(torch.zeros(1, 1, hidden_size), requires_grad=True)
c0 = Variable(torch.zeros(1, 1, hidden_size), requires_grad=True)

''' Procedural parameters '''
batch_size = 2
num_differing_questions = 20
num_epochs = 10
num_batches = round(len(trainingQuestionIds)/batch_size)

def train_model(lstm, optimizer, batch_ids, batch_data, word2vec, id2Data, dev_set=False):
    lstm.train()
    optimizer.zero_grad()

    sequence_ids, dict_sequence_lengths = organize_ids_training(batch_ids, batch_data)

    candidates_qs_tuples_matrix = construct_qs_matrix_training(sequence_ids, lstm, h0, c0, word2vec, id2Data, dict_sequence_lengths, candidates=True)
    main_qs_tuples_matrix = construct_qs_matrix_training(batch_ids, lstm, h0, c0, word2vec, id2Data, dict_sequence_lengths, candidates=False)
    similarity_matrix = torch.nn.functional.cosine_similarity(candidates_qs_tuples_matrix, main_qs_tuples_matrix, dim=2, eps=1e-08)

    target = Variable(torch.LongTensor([0] * int(len(sequence_ids)/(1+num_differing_questions))))
    loss_batch = loss_function(similarity_matrix, target)

    loss_batch.backward()
    optimizer.step()

    if dev_set: print("Trained on dev set with loss:", loss_batch.data[0], " time_on_batch:", time.time() - start)
    else: print("loss_on_batch:", loss_batch.data[0], " time_on_batch:", time.time() - start)
    return

def eval_model(lstm, ids, data, word2vec, id2Data):
    lstm.eval()
    sequence_ids, p_pluses_indices_dict = organize_test_ids(ids, data)

    candidates_qs_tuples_matrix = construct_qs_matrix_testing(sequence_ids, lstm, h0, c0, word2vec, id2Data, candidates=True)
    main_qs_tuples_matrix = construct_qs_matrix_testing(ids, lstm, h0, c0, word2vec, id2Data, candidates=False)

    similarity_matrix = torch.nn.functional.cosine_similarity(candidates_qs_tuples_matrix, main_qs_tuples_matrix, dim=2, eps=1e-08)
    MRR_score = get_MRR_score(similarity_matrix, p_pluses_indices_dict)
    return MRR_score


'''Begin training'''
for epoch in range(num_epochs):

    # Train on whole training data set
    for batch in range(1, num_batches+1):
        start = time.time()
        questions_this_training_batch = trainingQuestionIds[batch_size * (batch - 1):batch_size * batch]
        print("Working on batch #: ", batch)
        train_model(lstm, optimizer, questions_this_training_batch, training_data, word2vec, id2Data, dev_set=False)
        
    # Evaluate on dev and test sets for MRR score
    dev_MRR_score = eval_model(lstm, dev_question_ids, dev_data, word2vec, id2Data)
    test_MRR_score = eval_model(lstm, test_question_ids, test_data, word2vec, id2Data)
    print("MRR score on dev set:", dev_MRR_score)
    print("MRR score on test set:", test_MRR_score)

    # Log results to local logs.txt file
    with open('logs.txt', 'a') as log_file:
        log_file.write('epoch: ' + str(epoch) + '\n')
        log_file.write('lr: ' + str(lr) +  ' marg: ' + str(margin) + ' drop: ' + str(dropout) + '\n' )        
        log_file.write('dev_MRR: ' +  str(dev_MRR_score) + '\n')
        log_file.write('test_MRR: ' +  str(test_MRR_score) + '\n')

    # Save model for this epoch
    # torch.save(lstm, '../Pickle/' + saved_model_name + '_epoch' + str(epoch) + '.pt')


Working on batch #:  1
qs_padded Variable containing:
( 0 ,.,.) = 
  0.1442 -0.1867 -0.0002  ...   0.0169 -0.0526 -0.1357
  0.1020 -0.1044 -0.0128  ...   0.0344 -0.0136 -0.0370
  0.1425 -0.1104  0.0730  ...  -0.0451  0.0040  0.0159
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000

( 1 ,.,.) = 
  0.0587 -0.1163  0.0160  ...   0.1748 -0.0052 -0.1124
 -0.0123 -0.1376 -0.0129  ...  -0.0811  0.0452  0.0052
  0.0104 -0.0439  0.0399  ...   0.0976 -0.0064  0.0516
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000

( 2 ,.,.) = 
  0.1019 -0.0890 -0.0525  ...  -0.0341  0.1238  0.1201
 -0.0385 -0.0178 -0.1324  ...  -0.0632  0.0141 -0.0471
  0.0149  0.1627 -0.0706 

NameError: name 'sss' is not defined