In [None]:
# Part 2: transfer learning / domain adaptation (AskUbuntu -> Android)

In [None]:
import read_input
from read_input import TransferTrainQuestionDataset, AndroidEvalQuestionDataset, read_word_embeddings
from prettytable import PrettyTable

from evaluation import Evaluation
import torch
from torch.autograd import Variable
import torch.utils
import torch.utils.data
from tqdm import tqdm
from torch import nn
import numpy as np
import os
import sys
from meter_auc import AUCMeter

from models import LSTM, CNN, evaluate, DomainClassifier

In [None]:
def masked_select_rows(matrix, mask, mask_value=1):
    """Return the rows of ``matrix`` whose ``mask`` entry equals ``mask_value``.

    Args:
        matrix: 2-D tensor of shape [n x m].
        mask: 1-D tensor with one entry per row of ``matrix``.
        mask_value: value in ``mask`` that marks a row for selection
            (default 1).

    Returns:
        A new tensor of shape [new_n x m] holding the selected rows in their
        original order ([0 x m] when no entry matches).
    """
    # Index with a boolean row mask directly in torch instead of converting to
    # NumPy and back: Tensor.numpy() raises for tensors that require grad or
    # live on a GPU, and the round trip always detaches and copies via the host.
    return matrix[mask == mask_value]

In [None]:
def run_epoch(dataset, is_training, model, optimizer, batch_size, margin, save_path):
    """Run one pass over ``dataset`` (work-in-progress skeleton).

    The forward pass, the loss and the optimizer step are not written yet (see
    the TODO below). A training epoch therefore only iterates over the batches,
    and an evaluation epoch cannot complete until ``cos`` is computed.

    Args:
        dataset: dataset handed to ``torch.utils.data.DataLoader``.
        is_training: True puts ``model`` in train mode; False puts it in eval
            mode and feeds an AUC meter.
        model: module switched between ``train()`` and ``eval()``.
        optimizer: not used yet.
        batch_size: number of examples per batch.
        margin: not used yet.
        save_path: not used yet.

    Returns:
        None when ``is_training`` is True; otherwise the tuple
        ``(meter.value(), meter.value(0.05))``, i.e. AUC and AUC(0.05).
    """
    # NOTE(review): shuffle=True / drop_last=True also apply to evaluation, so
    # the final incomplete batch is left out of the AUC. Confirm whether the
    # model needs fixed-size batches before changing this.
    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                              shuffle=True, drop_last=True)
    losses = []  # unused so far; presumably for per-batch training losses
    if is_training:
        model.train()
    else:
        model.eval()
        # The meter must exist before the loop: it is fed per batch and read
        # after it. Presumably AUCMeter takes no constructor arguments --
        # confirm against meter_auc.
        meter = AUCMeter()
    for batch in tqdm(data_loader):
        # If is_training = True:
        #    Batch is mixed android and ubuntu examples.
        #    All examples have a binary isUbuntu label, and the usual
        #    q_body, q_title, q_body_mask, q_title_mask.
        #    For Android examples, the candidate_bodies, candidate_titles,
        #    candidate_body_masks, candidate_title_masks are all just tensors
        #    of all 0s, and SHOULD NOT BE USED!!!
        #
        # If is_training = False:
        #    - Batch is only ubuntu eval examples
        #      (NOTE(review): the eval datasets built in this notebook are
        #      AndroidEvalQuestionDataset -- confirm which domain is meant.)
        #    - Examples have q_body, q_title, q_body_mask, q_title_mask,
        #      candidate_bodies, candidate_titles, candidate_body_masks,
        #      candidate_title_masks, AND labels.

        if is_training:
            # Example of how to split a mixed batch by domain. Only training
            # batches carry "isUbuntu", so this must not run during evaluation.
            ubuntu_q_body = masked_select_rows(batch["q_body"], batch["isUbuntu"])
            android_q_body = masked_select_rows(batch["q_body"], batch["isUbuntu"], mask_value=0)

        # TODO: encode the questions/candidates with `model` and compute
        # cos = ....

        if not is_training:
            labels = batch["labels"]
            meter.add(cos.view(-1, 1), labels.view(-1, 1))
    if is_training:
        return None
    return meter.value(), meter.value(0.05)  # auc and auc(.05)


In [None]:
# Presumably the token limit applied to question text -- TODO confirm.
TRUNCATE_LENGTH = 100

# Android corpus and its dev/test positive/negative pair files.
ANDROID_TEXT_TOKENIZED_FILE = "Android/corpus.tsv"
ANDROID_DEV_POS_FILE = "Android/dev.pos.txt"
ANDROID_DEV_NEG_FILE = "Android/dev.neg.txt"
ANDROID_TEST_POS_FILE = "Android/test.pos.txt"
ANDROID_TEST_NEG_FILE = "Android/test.neg.txt"

# AskUbuntu corpus and its train/dev/test files.
UBUNTU_TEXT_TOKENIZED_FILE = "askubuntu/text_tokenized.txt"
UBUNTU_TRAIN_FILE = "askubuntu/train_random.txt"
UBUNTU_DEV_FILE = "askubuntu/dev.txt"
UBUNTU_TEST_FILE = "askubuntu/test.txt"

# Word vectors; read_word_embeddings returns a 3-tuple that is unpacked into
# the vocabulary lookup, the embeddings and the padding index.
WORD_EMBEDDINGS_FILE = "vectors_stackexchange.txt"
word_to_idx, embeddings, padding_idx = read_word_embeddings(WORD_EMBEDDINGS_FILE)

In [None]:
# For doing domain adaptation (i.e. Part 2b).
# Training data is built from the Android corpus plus the AskUbuntu corpus and
# training file.
# NOTE(review): test_subset=7000 presumably caps how many examples are loaded
# -- confirm against read_input.
transfer_train_dataset = TransferTrainQuestionDataset(
    ANDROID_TEXT_TOKENIZED_FILE, UBUNTU_TEXT_TOKENIZED_FILE, UBUNTU_TRAIN_FILE,
    word_to_idx, padding_idx, truncate=TRUNCATE_LENGTH, test_subset=7000)

# Android dev split, reusing the id -> question mapping already loaded by the
# training dataset.
android_dev_dataset = AndroidEvalQuestionDataset(
    transfer_train_dataset.android_dataset.id_to_question,
    ANDROID_DEV_POS_FILE, ANDROID_DEV_NEG_FILE,
    word_to_idx, padding_idx, truncate=TRUNCATE_LENGTH, test_subset=None)

# Android test split. It must read the *test* pos/neg files; pointing it at
# the dev files would make the test set a duplicate of the dev set.
android_test_dataset = AndroidEvalQuestionDataset(
    android_dev_dataset.id_to_question,
    ANDROID_TEST_POS_FILE, ANDROID_TEST_NEG_FILE,
    word_to_idx, padding_idx, truncate=TRUNCATE_LENGTH, test_subset=None)