In [None]:
import read_input
from read_input import read_word_embeddings, AndroidEvalQuestionDataset
from prettytable import PrettyTable

from evaluation import Evaluation
import torch
from torch.autograd import Variable
import torch.utils
import torch.utils.data
from tqdm import tqdm
from torch import nn
import numpy as np
import os
import sys

from meter_auc import AUCMeter
from models import LSTM, CNN, evaluate, DomainClassifier
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Module-level TF-IDF transformer shared by `tfidf_encoder`.
# NOTE: `tfidf_encoder` calls `fit_transform` on this object for every batch it
# encodes, so the fitted IDF statistics are overwritten on each call.
transformer = TfidfTransformer()

In [None]:
def tfidf_encoder(sentences, masks):
    """Encode a batch of token-index sequences as dense TF-IDF vectors.

    Args:
        sentences: 2-D tensor of word indices, one row per sentence
            (batch_size x truncate_length).
        masks: tensor with the same leading dimension as `sentences`; positions
            whose mask value is non-zero are counted, the rest are ignored.

    Returns:
        A `torch.Tensor` with one row per sentence and `padding_idx` columns
        holding the TF-IDF weights produced by the module-level `transformer`.

    Notes:
        Relies on the module-level names `padding_idx` (used both as the
        vocabulary size and as the index to skip) and `transformer`.
        NOTE(review): the transformer is re-fitted on every call, so the IDF
        weights depend only on the sentences in the current batch - confirm
        this is intended for the baseline.
    """
    num_rows = sentences.size(0)
    # One count column per vocabulary entry; the padding index gets no column.
    counts = np.zeros((num_rows, padding_idx))
    for row in range(num_rows):
        keep = masks[row].byte()
        kept_tokens = sentences[row][keep]
        for token in kept_tokens:
            if token == padding_idx:
                # Padding can survive the mask; never count it as a word.
                continue
            counts[row][token] += 1
    weighted = transformer.fit_transform(counts)
    sys.stdout.flush()
    dense = weighted.toarray()
    return torch.Tensor(dense)

In [None]:
def hasnan(var):
    """Return True if any element of `var` is NaN.

    Args:
        var: an object exposing `.data.numpy()` (e.g. a torch tensor/Variable
            living on the CPU).

    Returns:
        A boolean (numpy bool) that is True iff at least one element is NaN.

    The check is done element-wise rather than on the sum of the array:
    summing would report NaN for an array that contains both +inf and -inf
    even though no individual element is NaN.
    """
    values = var.data.numpy()
    return np.isnan(values).any()

In [None]:
def evaluate_model(dataset, model, batch_size):
    """Score every (query, candidate) pair in `dataset` and summarise with AUC.

    Args:
        dataset: a dataset whose items collate into a dict with the keys
            "q_body", "q_title", "q_body_mask", "q_title_mask",
            "candidate_bodies", "candidate_titles", "candidate_body_masks",
            "candidate_title_masks" and "labels".
        model: callable invoked as `model(tokens, mask)` on 2-D inputs and
            returning one encoding row per input row.
        batch_size: number of queries per DataLoader batch.

    Returns:
        A tuple `(meter.value(), meter.value(0.05))` from the `AUCMeter`
        (presumably full AUC and AUC truncated at 0.05 - see `meter_auc`).
    """
    # Evaluation must cover every example exactly once and be reproducible, so
    # neither shuffle the data nor drop the final (possibly smaller) batch.
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                              shuffle=False, drop_last=False)
    meter = AUCMeter()
    cosine = torch.nn.CosineSimilarity(dim=2, eps=1e-08)
    for batch in tqdm(data_loader):
        q_body = batch["q_body"]  # cur_batch x truncate_length
        cand_bodies = batch["candidate_bodies"]  # cur_batch x num_cands x truncate_length
        q_title = batch["q_title"]
        cand_titles = batch["candidate_titles"]
        q_body_mask = batch["q_body_mask"]  # cur_batch x truncate_length
        q_title_mask = batch["q_title_mask"]
        cand_body_masks = batch["candidate_body_masks"]  # cur_batch x num_cands x truncate_length
        cand_title_masks = batch["candidate_title_masks"]

        # Take the sizes from the batch itself: the last batch may hold fewer
        # than `batch_size` queries.
        cur_batch = cand_titles.size()[0]
        num_cands = cand_titles.size()[1]
        flat_rows = cur_batch * num_cands

        # Query encodings: cur_batch x enc_length
        q_body_enc = model(q_body, q_body_mask)
        q_title_enc = model(q_title, q_title_mask)
        # Candidate encodings: (cur_batch * num_cands) x enc_length
        cand_body_encs = model(cand_bodies.view(flat_rows, -1),
                               cand_body_masks.view(flat_rows, -1))
        cand_title_encs = model(cand_titles.view(flat_rows, -1),
                                cand_title_masks.view(flat_rows, -1))

        # Average title and body encodings. The parentheses matter: `/` binds
        # tighter than `+`, so without them only the body would be halved.
        q_enc = (q_title_enc + q_body_enc) / 2.0
        candidate_encs = (cand_title_encs + cand_body_encs) / 2.0

        candidate_encs = candidate_encs.view(cur_batch, num_cands, -1)  # cur_batch x num_cands x enc_length
        query_encs = q_enc.view(cur_batch, 1, -1).expand_as(candidate_encs)  # cur_batch x num_cands x enc_length
        cos = cosine(candidate_encs, query_encs)  # cur_batch x num_cands
        labels = batch["labels"]
        meter.add(cos.view(-1, 1), labels.view(-1, 1))

    return meter.value(), meter.value(0.05)


In [None]:
# Paths to the word-embedding vectors and the Android evaluation data.
# An alternative embeddings file, 'askubuntu/vector/vectors_pruned.200.txt', used
# to be assigned here and was immediately overwritten; only the path below is
# ever read, so it is now the single assignment.
WORD_EMBEDDINGS_FILE = 'vectors_stackexchange.txt'
ANDROID_DEV_NEG_FILE = 'Android/dev.neg.txt'
ANDROID_DEV_POS_FILE = 'Android/dev.pos.txt'
ANDROID_TEST_NEG_FILE = 'Android/test.neg.txt'
ANDROID_TEST_POS_FILE = 'Android/test.pos.txt'
ANDROID_TEXT_TOKENIZED_FILE = 'Android/corpus.tsv'

# Token length of each title/body sequence (assumed to match the `truncate`
# value the datasets are built with - keep the two in sync).
TRUNCATE_LENGTH = 100
# Vocabulary lookup, embedding matrix and padding index loaded from the vectors file.
word_to_idx, embeddings, padding_idx = read_word_embeddings(WORD_EMBEDDINGS_FILE)

In [None]:
# For doing android eval alone (i.e. Part 2a - baselines).
# Both splits are truncated to TRUNCATE_LENGTH so the sequence length stays in
# sync with the constant the rest of the notebook uses.
android_dev_dataset = AndroidEvalQuestionDataset(ANDROID_TEXT_TOKENIZED_FILE, ANDROID_DEV_POS_FILE, ANDROID_DEV_NEG_FILE,
                                                 word_to_idx, padding_idx, truncate=TRUNCATE_LENGTH, test_subset=None)
# The test split is given the dev dataset's `id_to_question` instead of the
# corpus path (presumably to reuse the already-parsed corpus - confirm against
# AndroidEvalQuestionDataset).
android_test_dataset = AndroidEvalQuestionDataset(android_dev_dataset.id_to_question, ANDROID_TEST_POS_FILE, ANDROID_TEST_NEG_FILE,
                                                  word_to_idx, padding_idx, truncate=TRUNCATE_LENGTH, test_subset=None)

In [None]:
# The encoder under evaluation: any callable taking (tokens, mask). Here it is
# the TF-IDF baseline function rather than a trained network.
model = tfidf_encoder
# Alternative: evaluate an LSTM encoder instead of the TF-IDF baseline.
#model = LSTM(embeddings, padding_idx, 240, 1, TRUNCATE_LENGTH, 0.1, False)
# Example of how to load a previously trained model's weights:
# model.load_state_dict(torch.load('lstm_saved_models/epoch10.pkl'))

In [None]:
# Evaluate the selected encoder on the Android dev and test splits.
batch_size = 2
print('TFIDF BASELINE')
dev_auc, dev_auc05 = evaluate_model(android_dev_dataset, model, batch_size)
print('dev auc {}, dev auc05 {}'.format(dev_auc, dev_auc05))
# The test metrics must come from the test split, not from a second pass over dev.
test_auc, test_auc05 = evaluate_model(android_test_dataset, model, batch_size)
print('dev auc {}, dev auc05 {}, test auc {}, test auc05 {}'.format(dev_auc, dev_auc05, test_auc, test_auc05))