In [1]:
import sys
from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import sklearn
import random
import time
import torch.utils.data

sys.path.append("../src/")
import data_reader as dr
from evaluate import Evaluation
from lstm import LSTM
from loss_function import loss_function
from loss_function import cs
from meter import AUCMeter

In [2]:
def normalize(x, dim):
    '''
    Scale x to unit L2 norm along dimension `dim`.

    x:   tensor (or Variable) of any rank
    dim: dimension along which the L2 norm is computed; negative values
         index from the end

    Returns a tensor of the same shape as x. Norms are clamped to a minimum
    of 1e-8 before dividing, so all-zero vectors stay zero instead of
    producing NaN.
    '''
    # keepdim=True leaves the reduced axis in place (with size 1) so the norm
    # broadcasts against x for any `dim`; previously the axis was re-inserted
    # at a hard-coded index 2, which was only correct when dim == 2.
    l2 = torch.norm(x, 2, dim, keepdim=True)
    return x / l2.clamp(min = 1e-8)

In [3]:
def _model_on_cuda(model):
    '''
    Return True when the model's first parameter lives on a CUDA device.
    '''
    try:
        return next(model.parameters()).is_cuda
    except StopIteration:
        # A model without parameters gives no hint; keep the historical
        # behaviour of moving every input to the GPU.
        return True


def _batch_variable(batch, key, use_cuda, add_sample_dim=False):
    '''
    Wrap batch[key] in a Variable, optionally insert a dimension at index 1,
    and move the result to the GPU when use_cuda is set.
    '''
    var = Variable(batch[key])
    if add_sample_dim:
        var = torch.unsqueeze(var, 1)
    return var.cuda() if use_cuda else var


def _masked_mean_encoding(model, tokens, pad_mask):
    '''
    Encode `tokens` with `model`, L2-normalise along dim 2 and mean-pool over
    dim 3 using `pad_mask` as weights.

    Per the original notes, the encoded tensor is
    batch_size x samples x output_size x (len - kernel + 1) and the pad mask is
    batch_size x samples x (len - kernel + 1); the result is
    batch_size x samples x output_size.
    '''
    encoded = normalize(model(tokens), 2)
    mask = torch.unsqueeze(pad_mask, 2).expand_as(encoded)
    summed = torch.sum(encoded * mask, dim = 3)
    # Divide by the mask total of each sample rather than the padded length;
    # clamp so a sample whose mask sums to zero divides by 1 instead of 0.
    counts = torch.sum(pad_mask, dim = 2).clamp(min = 1)
    return summed / torch.unsqueeze(counts, 2).expand_as(summed)


def run_epoch(data, is_training, model, optimizer):
    '''
    Run one pass over `data`; in evaluation mode return the AUC of the
    cosine-similarity ranking.

    data:        dataset whose items are dicts with the keys 'pid_title',
                 'pid_body', 'rest_title', 'rest_body', the matching '*_pad'
                 masks and, for evaluation, 'labels'
    is_training: if True the model is put in train mode, otherwise eval mode
    model:       encoder applied to title and body inputs
    optimizer:   only used when is_training is True (zero_grad is called once
                 per batch); may be None for evaluation

    Returns aucmeter.value(max_fpr=0.05) when is_training is False.

    NOTE: the training branch is not implemented. With is_training=True the
    forward pass runs, but no loss is computed, no optimizer step is taken and
    None is returned.
    '''
    # the number of candidates for each question is not the same, so the
    # loader is left at its default batch size of 1
    data_loader = torch.utils.data.DataLoader(
        data,
        #batch_size=40,
        drop_last=False)

    # Inputs follow the model's device instead of always going to the GPU, so
    # a model that lives on the CPU can be evaluated as well.
    use_cuda = _model_on_cuda(model)

    scores = []
    targets = []

    if is_training:
        model.train()
    else:
        model.eval()

    for batch in data_loader:
        # The query ("pid") inputs get an extra dimension at index 1 so they
        # have the same rank as the candidate ("rest") inputs.
        pid_title = _batch_variable(batch, 'pid_title', use_cuda, add_sample_dim=True)
        pid_body = _batch_variable(batch, 'pid_body', use_cuda, add_sample_dim=True)
        rest_title = _batch_variable(batch, 'rest_title', use_cuda)
        rest_body = _batch_variable(batch, 'rest_body', use_cuda)

        pid_title_pad = _batch_variable(batch, 'pid_title_pad', use_cuda, add_sample_dim=True)
        pid_body_pad = _batch_variable(batch, 'pid_body_pad', use_cuda, add_sample_dim=True)
        rest_title_pad = _batch_variable(batch, 'rest_title_pad', use_cuda)
        rest_body_pad = _batch_variable(batch, 'rest_body_pad', use_cuda)

        if is_training:
            optimizer.zero_grad()

        # The model is called in the same order as before:
        # pid title, pid body, rest title, rest body.
        pt = _masked_mean_encoding(model, pid_title, pid_title_pad)
        pb = _masked_mean_encoding(model, pid_body, pid_body_pad)
        rt = _masked_mean_encoding(model, rest_title, rest_title_pad)
        rb = _masked_mean_encoding(model, rest_body, rest_body_pad)

        # A question is represented by the average of its title and body.
        pid_tensor = (pt + pb)/2
        rest_tensor = (rt + rb)/2

        if is_training:
            # NOTE: no loss / backward / optimizer.step() here (see docstring).
            continue

        expanded = pid_tensor.expand_as(rest_tensor)
        similarity = cs(expanded, rest_tensor, dim=2)
        similarity = torch.FloatTensor(similarity.data.cpu().numpy())

        # Pair every row of scores with its own row of labels. Only the first
        # label row used to be read, which is identical at batch size 1 but
        # would misalign scores and targets for larger batches.
        # Assumes batch['labels'] is indexed by batch row first - TODO confirm.
        for row_scores, row_labels in zip(similarity, batch['labels']):
            scores.append(row_scores)
            targets.extend(row_labels)

    if is_training:
        return None

    # Calculate epoch level scores
    aucmeter = AUCMeter()
    aucmeter.reset()

    output = torch.cat(scores)
    expect = torch.LongTensor(targets)
    aucmeter.add(output, expect)
    return aucmeter.value(max_fpr=0.05)

In [4]:
# Load the Android corpus and the word embeddings, then convert the corpus
# with the embedding vocabulary. `ids_corpus` is reused by the evaluation
# cells further down.
corpus_path = "../data_Android/corpus.tsv.gz"

corpus = dr.read_corpus(corpus_path)

# Embedding files tried earlier are kept for reference; the combined
# 300-dimensional GloVe file is the one currently in use.
#embedding_path = "../data/vectors_pruned.200.txt.gz"
#embedding_path = "../data/glove.6B.200d.txt.gz"
embedding_path = "../data/glove.combined.300d.txt.gz"
# presumably returns the embedding matrix plus a word -> row-index mapping;
# verify against data_reader.getEmbeddingTensor
embedding_tensor, word_to_indx = dr.getEmbeddingTensor(embedding_path)

# NOTE(review): kernel_width = 1 looks like the setting for a model without
# a convolution window - confirm against data_reader.map_corpus
ids_corpus = dr.map_corpus(corpus, word_to_indx, kernel_width = 1)

In [41]:
# Checkpoint to evaluate; the cells below pass `model` to run_epoch.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
#model = torch.load("cnn_model") # error when loading
#model = torch.load("models_lr0.0003/model_epoch15") # best to now is epoch 7: 0.474, 0.466
model = torch.load("model_epoch1")


In [42]:
# Evaluate the loaded model on the Android dev annotations.
eval_path = "../data_Android/dev_android.txt"
# presumably K_neg = -1 / prune_pos_cnt = -1 keep every negative and positive
# candidate - TODO confirm in data_reader.read_annotations
eval_anno = dr.read_annotations(eval_path, K_neg = -1, prune_pos_cnt = -1)
eval_set = dr.create_dev_set(ids_corpus, eval_anno)

# is_training=False, so no optimizer is needed; z is the value returned by
# aucmeter.value(max_fpr=0.05) inside run_epoch.
z = run_epoch(eval_set, False, model, None)
print(z)

  output, hn = self.lstm(x) # hidden and cells are zero


0.567664057385


In [43]:
# dev-set result of run_epoch (AUC with max_fpr=0.05) for each checkpoint model_epochN, listed as N: value
# 1: 0.567664057385
# 2: 0.508170392997
# 3: 0.422351891972
# 4: 0.407533293215
# 5: 0.415909610287
# 6: 0.3984243719
# 7: 0.425630836314
# 8: 0.419762750268
# 9: 0.412219429248
#10: 0.423556593671
#11: 0.4314310939

#15: 0.366902339517

In [44]:
# Evaluate the same model on the Android test annotations. This overwrites
# eval_path / eval_anno / eval_set / z from the dev evaluation cell.
eval_path = "../data_Android/test_android.txt"
# presumably K_neg = -1 / prune_pos_cnt = -1 keep every negative and positive
# candidate - TODO confirm in data_reader.read_annotations
eval_anno = dr.read_annotations(eval_path, K_neg = -1, prune_pos_cnt = -1)
eval_set = dr.create_dev_set(ids_corpus, eval_anno)

# is_training=False, so no optimizer is needed; z is the value returned by
# aucmeter.value(max_fpr=0.05) inside run_epoch.
z = run_epoch(eval_set, False, model, None)
print(z)

  output, hn = self.lstm(x) # hidden and cells are zero


0.537318239591
