In [2]:
import numpy as np
import torch
import torch.utils.data
from torch import Tensor

In [3]:
def read_text_tokenized(text_tokenized_file, truncate_length=100):
    # returns a dictionary of {question_id : (title, body)} key-value pairs
    question_id_to_title_body_tuple = {}
    for line in open(text_tokenized_file, 'r'):
        question_id, title, body = line.split('\t')
        question_id_to_title_body_tuple[question_id] = (title.split()[:truncate_length], 
                                                        body.split()[:truncate_length])
    return question_id_to_title_body_tuple

In [4]:
def read_train_ids(train_file):
    # returns list of (question_id, positive_id, [negative_id, ...]) tuples
    # where all ids are strings
    train_id_instances = []
    for line in open(train_file):
        qid, positive_ids, negative_ids = line.split('\t')
        negative_ids = negative_ids.split()
        for positive_id in positive_ids.split():
            train_id_instances.append((qid, positive_id, negative_ids))
    return train_id_instances

In [5]:
def make_word_to_vec_dict(word_embeddings_file):
    word_to_vec = {}
    for line in open(word_embeddings_file):
        split_line = line.split()
        word, vector = split_line[0], split_line[1:]
        vector = np.array([float(x) for x in vector])
        word_to_vec[word] = vector
    return word_to_vec

In [6]:
word_embeddings_file = 'askubuntu/vector/vectors_pruned.200.txt'
word_to_vec = make_word_to_vec_dict(word_embeddings_file)

In [7]:
def get_sentence_matrix_embedding(words, num_words=100):
    # returns [num_words x length_embedding] np matrix
    # matrix may be padded
    if len(words) >  num_words:
        # we shouldn't be printing here because we should have truncated already
        print(len(words))
    num_features = len(word_to_vec['.'])
    sentence_mat = np.zeros((num_words, num_features))
    i = 0
    for word in words:
        # TODO: IS JUST SKIPPING THE WORD THE RIGHT APPROACH?
        if word in word_to_vec:
            sentence_mat[i] = word_to_vec[word]
        i += 1
        if i == num_words:
            break
    return sentence_mat

In [106]:
class QuestionDataset(torch.utils.data.Dataset):
    def __init__(self, text_tokenized_file, train_file, truncate=100):
        # id_to_question is an optional pre-computed id_to_question
        self.truncate = truncate
        self.id_to_question = read_text_tokenized(text_tokenized_file, truncate_length=self.truncate)
        self.train_id_instances = read_train_ids(train_file)
        self.num_features = len(word_to_vec['.'])
        
    def __len__(self):
        return len(self.train_id_instances)
    
#     def get_question_embedding(self, title_body_tuple):
#         title_embedding = Tensor(get_sentence_matrix_embedding(title_body_tuple[0], self.truncate))
#         body_embedding = Tensor(get_sentence_matrix_embedding(title_body_tuple[1], self.truncate))
#         return title_embedding, body_embedding
    
    def get_question_embeddings(self, title_body_tuples):
        num_questions = len(title_body_tuples)
        title_embeddings = np.zeros((num_questions, self.truncate, self.num_features))
        body_embeddings = np.zeros((num_questions, self.truncate, self.num_features))
        for i, (title, body) in enumerate(title_body_tuples):
            title_embeddings[i] = get_sentence_matrix_embedding(title, self.truncate)
            body_embeddings[i] = get_sentence_matrix_embedding(body, self.truncate)
        return Tensor(title_embeddings), Tensor(body_embeddings)
    
    def __getitem__(self, index):
        (q_id, positive_id, negative_ids) = self.train_id_instances[index]
        q = self.id_to_question[q_id]
        p = self.id_to_question[positive_id]
        negatives = [self.id_to_question[neg_id] for neg_id in negative_ids]
        q_title_embedding, q_body_embedding = self.get_question_embeddings([q])
        p_title_embedding, p_body_embedding = self.get_question_embeddings([p])
        neg_title_embeddings, neg_body_embeddings = self.get_question_embeddings(negatives)
        # negative_body_matrices is tensor of [num_negs x truncate_length x 200]
        # q_body_matrix and positive_body_matrix are tensors of [1 x truncate_length x 200]
        return dict(q_body=q_body_embedding, q_title=q_title_embedding, 
                    p_body=p_body_embedding, p_title=p_title_embedding, 
                    neg_bodies=neg_body_embeddings, neg_titles=neg_title_embeddings)

#dataset = QuestionDataset('askubuntu/text_tokenized.txt', 'askubuntu/train_random.txt', truncate=150)

In [107]:
dataset = QuestionDataset('askubuntu/text_tokenized.txt', 'askubuntu/train_random.txt', truncate=150)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=13,
                                              shuffle=True, drop_last=True)

In [108]:
for batch in data_loader:
    batch = batch
    break

In [109]:
q_body = Variable(batch["q_body"], requires_grad=True)
p_body = Variable(batch["p_body"], requires_grad=True)
neg_bodies = Variable(batch["neg_bodies"], requires_grad=True)
q_title = Variable(batch["q_title"], requires_grad=True)
p_title = Variable(batch["p_title"], requires_grad=True)
neg_titles = Variable(batch["neg_titles"], requires_grad=True)
q_body_enc = q_body[:, :, :, 1] # batch_size x 1 x enc_length
p_body_enc = p_body[:, :, :, 1]
neg_body_encs = neg_bodies[:, :, :, 1] #batch_size x num_negs x enc_length
q_title_enc = q_title[:, :, :, 1]
p_title_enc = p_title[:, :, :, 1]
neg_title_encs = neg_titles[:, :, :, 1]
q_enc = q_title_enc + q_body_enc / 2.0
p_enc = p_title_enc + p_body_enc / 2.0
neg_encs = neg_title_encs + neg_body_encs / 2.0
#p_enc = p_enc.resize(13, 1, 150)
candidate_encs = torch.cat((p_enc, neg_encs), dim=1) #batch_size x (num_negs + 1) x enc_length
#q_enc = q_enc.resize(13, 1, 150) 
query_encs = q_enc.repeat(1, 101, 1) # batch_size x (num_negs + 1) x enc_length
cos = torch.nn.CosineSimilarity(dim=2, eps=1e-08)(candidate_encs, query_encs) # batch_size x (num_negs + 1)

Variable containing:
 0.6292  0.4047  0.4759  ...   0.3253  0.3262  0.5561
 0.4431  0.4948  0.4942  ...   0.3923  0.3356  0.3528
 0.5343  0.4228  0.4100  ...   0.5150  0.4775  0.3825
          ...             ⋱             ...          
 0.5013  0.5764  0.4346  ...   0.3528  0.6759  0.4552
 0.5808  0.5610  0.3754  ...   0.3485  0.5683  0.5332
 0.5924  0.3875  0.5187  ...   0.6583  0.3167  0.4455
[torch.FloatTensor of size 13x101]

In [125]:
target = Variable(torch.zeros(13).long(), requires_grad=True)

In [128]:
loss = torch.nn.MultiMarginLoss()(cos, target)

In [132]:
neg_titles.size()[1]+1

101L