In [None]:
import read_input
from read_input import read_word_embeddings
from prettytable import PrettyTable
import random
from evaluation import Evaluation
import torch
from torch.autograd import Variable
import torch.utils
import torch.utils.data
from tqdm import tqdm
from torch import nn
import numpy as np
import os
import sys

from meter_auc import AUCMeter
from models import LSTM, CNN, evaluate, DomainClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def read_text_tokenized(text_tokenized_file, truncate_length=100):
    """Load a tab-separated question corpus into memory.

    Every non-blank line must hold exactly three tab-separated fields:
    question id, title, body. The whole line is lower-cased before it is
    split, so ids are lower-cased as well.

    Args:
        text_tokenized_file: path of the corpus file to read.
        truncate_length: currently unused -- no truncation is applied to the
            title or body. The parameter is kept so existing callers that
            pass it keep working.

    Returns:
        A tuple ``(question_id_to_title_body_tuple, all_questions_pieces)``:
        a dict mapping question id -> ``(title, body)``, and a flat list with
        every title and body in file order (title first, then body).

    Raises:
        ValueError: if a non-blank line does not split into three fields.
    """
    print('read corpus', text_tokenized_file)
    question_id_to_title_body_tuple = {}
    all_questions_pieces = []
    # `with` guarantees the handle is closed even if a malformed line raises.
    with open(text_tokenized_file, 'r') as corpus_file:
        for line in corpus_file:
            # Drop the line terminator first: otherwise it stays glued to the
            # body and an empty body ('\n') is never detected as empty.
            line = line.rstrip('\r\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            question_id, title, body = line.lower().split('\t')
            # Substitute placeholders so each question has non-empty text.
            if len(title) == 0:
                title = 'title'
            if len(body) == 0:
                body = 'body'
            all_questions_pieces.append(title)
            all_questions_pieces.append(body)
            question_id_to_title_body_tuple[question_id] = (title, body)
    return question_id_to_title_body_tuple, all_questions_pieces

In [None]:
def evaluate_model(dataset, model):
    """Rank each sample's candidates by cosine similarity and accumulate AUC.

    For every sample the title and body are encoded separately with
    ``model.transform`` and averaged into one vector per question; the query
    vector is then compared against every candidate vector.

    Args:
        dataset: iterable of dicts with keys ``"q_title"``, ``"q_body"``,
            ``"candidate_titles"``, ``"candidate_bodies"`` and ``"labels"``.
        model: object exposing ``transform(texts)`` whose result supports
            ``+``, ``/`` and ``.toarray()`` (this notebook passes a fitted
            ``TfidfVectorizer``).

    Returns:
        ``(meter.value(), meter.value(0.05))`` from the ``AUCMeter`` fed with
        all scores. NOTE(review): the 0.05 argument is presumably a
        false-positive-rate cutoff -- confirm in ``meter_auc``.
    """
    meter = AUCMeter()
    for sample in tqdm(dataset):
        q_body_enc = model.transform([sample["q_body"]])
        q_title_enc = model.transform([sample["q_title"]])
        cand_body_encs = model.transform(sample["candidate_bodies"])
        cand_title_encs = model.transform(sample["candidate_titles"])

        # Parenthesised on purpose: `title + body / 2.0` halves only the body
        # and silently weights the title twice as much as the body.
        q_enc = (q_title_enc + q_body_enc) / 2.0
        candidate_encs = (cand_title_encs + cand_body_encs) / 2.0

        # One query row against all candidate rows gives a (1, num_cands)
        # matrix; there is no need to tile the query into a square matrix
        # just to read back its first row.
        sim = cosine_similarity(q_enc.toarray(), candidate_encs.toarray(),
                                dense_output=True)
        meter.add(sim[0], sample["labels"])
    return meter.value(), meter.value(0.05)

In [None]:
# Configuration for the Android TF-IDF baseline.
# Word-embedding paths are kept for reference only: the embedding load at the
# bottom of this cell is commented out.
#WORD_EMBEDDINGS_FILE = 'askubuntu/vector/vectors_pruned.200.txt'
#WORD_EMBEDDINGS_FILE = 'vectors_stackexchange.txt'
# Relative paths to the Android evaluation id files and the tokenized corpus.
# (Presumably pos/neg hold positive and negative candidate pairs -- confirm
# against read_input.read_android_eval_ids.)
ANDROID_DEV_NEG_FILE = 'Android/dev.neg.txt'
ANDROID_DEV_POS_FILE = 'Android/dev.pos.txt'
ANDROID_TEST_NEG_FILE = 'Android/test.neg.txt'
ANDROID_TEST_POS_FILE = 'Android/test.pos.txt'
ANDROID_TEXT_TOKENIZED_FILE = 'Android/corpus.tsv'

# Passed to read_text_tokenized as truncate_length; that function currently
# does not apply any truncation, so this value has no effect there.
TRUNCATE_LENGTH = 100
#word_to_idx, embeddings, padding_idx = read_word_embeddings(WORD_EMBEDDINGS_FILE)

In [None]:
class AndroidEvalQuestionDataset(torch.utils.data.Dataset):
    """Evaluation samples for the Android corpus: one query plus candidates.

    Same idea as UbuntuEval, only difference is text_tokenized corpus file
    will be different, and no bm25scores.

    Each item is a dict with the query title/body, arrays of candidate
    titles/bodies, and a label array in which the first candidate is marked
    1 and all others 0. This assumes the first candidate id of every instance
    is the positive one -- TODO confirm against
    ``read_input.read_android_eval_ids``.

    Args:
        eval_pos_file: path forwarded to ``read_input.read_android_eval_ids``.
        eval_neg_file: path forwarded to ``read_input.read_android_eval_ids``.
        truncate: accepted for interface compatibility; not used here.
        test_subset: forwarded to ``read_input.read_android_eval_ids``.
        num_negs: number of negatives to sample per item, or ``None`` to keep
            all of them.
        questions: optional mapping ``question_id -> (title, body)``. When
            omitted, the notebook-level ``question_id_to_questions`` global is
            looked up at item-access time, so it must be defined before the
            dataset is indexed (it may be defined after construction).
    """

    def __init__(self, eval_pos_file, eval_neg_file, truncate=100, test_subset=None, num_negs=None,
                 questions=None):
        self.eval_id_instances = read_input.read_android_eval_ids(eval_pos_file, eval_neg_file, test_subset)
        self.num_negs = num_negs
        self.questions = questions

    def __len__(self):
        return len(self.eval_id_instances)

    def _question_lookup(self):
        """Return the id -> (title, body) mapping, preferring the injected one."""
        if self.questions is not None:
            return self.questions
        # Resolved lazily so the dataset can be built before the corpus loads.
        return question_id_to_questions

    def __getitem__(self, index):
        (q_id, candidate_ids, labels) = self.eval_id_instances[index]
        pos_id = candidate_ids[0]
        negative_pool = list(candidate_ids[1:])
        if self.num_negs is None:
            negative_ids = negative_pool
        else:
            # Clamp so an instance with fewer negatives than requested yields
            # all of them instead of making random.sample raise ValueError.
            negative_ids = random.sample(negative_pool, min(self.num_negs, len(negative_pool)))
        candidate_ids = [pos_id] + negative_ids
        # Size the labels from the negatives actually kept; multiplying by
        # self.num_negs breaks when it is None or larger than the pool.
        candidate_labels = [1] + [0] * len(negative_ids)

        lookup = self._question_lookup()
        query_title, query_body = lookup[q_id]
        candidate_titles = np.array([lookup[i][0] for i in candidate_ids])
        candidate_bodies = np.array([lookup[i][1] for i in candidate_ids])

        return dict(q_title=query_title, q_body=query_body,
                    candidate_titles=candidate_titles,
                    candidate_bodies=candidate_bodies, labels=np.array(candidate_labels))

In [None]:
# For doing android eval alone (I.E. Part 2a - baselines)
# Both splits use identical settings. `truncate` now comes from the
# TRUNCATE_LENGTH config constant instead of a duplicated literal 100.
# The datasets only read the id files here; question text is looked up
# when an item is accessed, so the corpus may be loaded in a later cell.
# NOTE(review): negatives are drawn with random.sample at item-access time and
# no seed is set in this notebook, so scores can vary between runs.
android_dev_dataset = AndroidEvalQuestionDataset(
    ANDROID_DEV_POS_FILE, ANDROID_DEV_NEG_FILE,
    truncate=TRUNCATE_LENGTH, test_subset=100, num_negs=20)
android_test_dataset = AndroidEvalQuestionDataset(
    ANDROID_TEST_POS_FILE, ANDROID_TEST_NEG_FILE,
    truncate=TRUNCATE_LENGTH, test_subset=100, num_negs=20)

In [None]:
# Load the Android corpus. `question_id_to_questions` is the module-level
# mapping that AndroidEvalQuestionDataset.__getitem__ reads, so this cell must
# run before either dataset is iterated.
question_id_to_questions, all_questions_pieces = read_text_tokenized(ANDROID_TEXT_TOKENIZED_FILE, TRUNCATE_LENGTH)
# Fit TF-IDF on every title and body string from the corpus.
vectorizer = TfidfVectorizer()
vectorizer.fit(all_questions_pieces)

In [None]:
# Score the fitted TF-IDF vectorizer on the dev split, then on the test split.
print('TFIDF BASELINE')
dev_auc, dev_auc05 = evaluate_model(android_dev_dataset, vectorizer)
print('dev auc {}, dev auc05 {}'.format(dev_auc, dev_auc05))
# The test metrics must come from the test split; scoring the dev dataset
# again here would just report a second dev run under the "test" label.
test_auc, test_auc05 = evaluate_model(android_test_dataset, vectorizer)
print('dev auc {}, dev auc05 {}, test auc {}, test auc05 {}'.format(dev_auc, dev_auc05, test_auc, test_auc05))