In [1]:
import sys
from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import sklearn
import random
import time
import torch.utils.data

sys.path.append("../src/")
import data_reader as dr
from evaluate import Evaluation
from lstm import LSTM
from loss_function import loss_function
from loss_function import cs
from meter import AUCMeter

In [2]:
# Raw corpus read from the data_Android directory.
corpus_path = "../data_Android/corpus.tsv.gz"
corpus = dr.read_corpus(corpus_path)

# Embedding tensor plus the word -> index lookup that goes with it.
embedding_path = "../data/glove.combined.300d.txt.gz"
embedding_tensor, word_to_indx = dr.getEmbeddingTensor(embedding_path)

# Corpus re-expressed through word_to_indx; reused by the later cells that
# build the train and dev/test sets.
ids_corpus = dr.map_corpus(corpus, word_to_indx, kernel_width=1)

In [3]:
# Input files for the training side.
train_corpus_path = "../data/text_tokenized.txt.gz"
train_path = "../data/train_random.txt"

# Training corpus, mapped with the same word_to_indx as the Android corpus.
train_corpus = dr.read_corpus(train_corpus_path)
train_ids_corpus = dr.map_corpus(train_corpus, word_to_indx, kernel_width=1)

# Training annotations combined with both mapped corpora into the examples
# consumed by train_model / run_epoch.
train = dr.read_annotations(train_path)
train_ex = dr.create_android_train_set(train_ids_corpus, ids_corpus, train)

In [4]:
# Dev annotations, read from a file relative to the notebook's working directory.
eval_path = "dev.txt"
eval_anno = dr.read_annotations(
    eval_path,
    K_neg=-1,
    prune_pos_cnt=-1,
    ignore_dup=True,
)

# Dev examples are built from the Android ids_corpus.
eval_set = dr.create_dev_set(ids_corpus, eval_anno)

In [5]:
from transfer_model import DomainClassifier, Encoder

# Fresh (untrained) encoder and domain classifier; train_model moves both to
# the GPU before optimizing them.
encoder = Encoder(200, embedding_tensor, 0.2)
dc = DomainClassifier(200, 200)

In [None]:
import torch.utils.data

def train_model(train_data, dev_data, encoder, dc, num_epochs=30,
                train_batch_size=40, dev_batch_size=5,
                encoder_lr=0.0001, dc_lr=-0.001):
    '''
    Train the encoder and the domain classifier together, evaluating on
    `dev_data` after every epoch.

    Both modules are moved to the GPU (CUDA is required). After each training
    epoch the whole encoder and dc modules are saved to "encoder<epoch>" and
    "dc<epoch>" in the current working directory.

    Args:
        train_data: dataset passed to run_epoch in training mode.
        dev_data: dataset passed to run_epoch in evaluation mode.
        encoder: encoder module to optimize.
        dc: domain classifier module to optimize.
        num_epochs: number of epochs to run (epochs are numbered from 1).
        train_batch_size: batch size for the training pass.
        dev_batch_size: batch size for the evaluation pass.
        encoder_lr: Adam learning rate for the encoder.
        dc_lr: Adam learning rate for the domain classifier. The default is
            deliberately negative: run_epoch backpropagates the single loss
            `ranking_loss - lmbda * bceloss`, so stepping dc against the
            gradient direction makes it reduce the BCE term while the encoder
            increases it. NOTE(review): this works on the PyTorch 0.1.12 used
            by this notebook; newer releases validate lr >= 0 in Adam and
            will raise ValueError.

    Returns:
        A list with one (epoch, train_loss, bce_loss, val_auc) tuple per epoch.
    '''
    encoder.cuda()
    dc.cuda()
    optimizer_encoder = torch.optim.Adam(encoder.parameters(), lr=encoder_lr)
    optimizer_dc = torch.optim.Adam(dc.parameters(), lr=dc_lr)

    history = []
    lasttime = time.time()
    for epoch in range(1, num_epochs + 1):
        print("-------------\nEpoch {}:\n".format(epoch))

        reg_loss, bceloss = run_epoch(train_data, True, encoder, dc,
                                      optimizer_encoder, optimizer_dc,
                                      train_batch_size)
        print('Train loss: {:.6f}, BCE loss: {:.6f}'.format(reg_loss, bceloss))

        # Checkpoint every epoch so any epoch can be reloaded with torch.load.
        torch.save(encoder, "encoder{}".format(epoch))
        torch.save(dc, "dc{}".format(epoch))

        AUC = run_epoch(dev_data, False, encoder, dc,
                        optimizer_encoder, optimizer_dc, dev_batch_size)
        print('Val AUC: {:.6f}'.format(AUC))

        # Read the clock once so the reported duration and the next epoch's
        # start time agree.
        now = time.time()
        print('This epoch took: {:.6f}'.format(now - lasttime))
        lasttime = now

        history.append((epoch, reg_loss, bceloss, AUC))

    return history

        
def run_epoch(data, is_training, encoder, dc, optimizer_encoder, optimizer_dc, batch_size):
    '''
    Run one pass over `data` in either training or evaluation mode.

    Each batch is a dict with 'pid_title', 'pid_body', 'rest_title' and
    'rest_body' entries; training batches also carry 'android_title' and
    'android_body', evaluation batches carry 'labels'. Every tensor is moved
    to the GPU, so CUDA is required.

    Args:
        data: dataset handed to torch.utils.data.DataLoader.
        is_training: True to update the models, False to score `data`.
        encoder: module applied to every title/body tensor.
        dc: domain classifier applied to the encoded query and Android
            tensors. Only used when training; may be None when evaluating.
        optimizer_encoder: optimizer stepped once per batch when training;
            ignored (may be None) when evaluating.
        optimizer_dc: same as optimizer_encoder, for the domain classifier.
        batch_size: DataLoader batch size.

    Returns:
        Training: (mean combined loss, mean BCE loss) over all batches.
        Evaluation: AUCMeter.value(max_fpr=0.05) over all scored candidates.
    '''
    data_loader = torch.utils.data.DataLoader(
        data,
        batch_size=batch_size,
        # Shuffle only while training; evaluation scores every example
        # anyway, so a fixed order keeps the evaluation pass deterministic.
        shuffle=is_training,
        num_workers=4,
        drop_last=False)

    losses = []
    bcelosses = []
    targets = []
    scores = []

    if is_training:
        encoder.train()
        dc.train()
    else:
        encoder.eval()
        # The evaluation branch never calls dc, so tolerate dc=None.
        if dc is not None:
            dc.eval()

    # Stateless helper modules and the adversarial weight are the same for
    # every batch, so build them once instead of once per batch.
    dropout = nn.Dropout(p = 0.2)
    # No `dim` argument: the notebook runs PyTorch 0.1.12, whose Softmax
    # does not take one.
    softmax = nn.Softmax()
    dc_loss = torch.nn.BCELoss()
    lmbda = 1e-3

    for batch in data_loader:
        pid_title = torch.unsqueeze(Variable(batch['pid_title']), 1)
        pid_body = torch.unsqueeze(Variable(batch['pid_body']), 1)
        rest_title = Variable(batch['rest_title'])
        rest_body = Variable(batch['rest_body'])

        if is_training:
            android_title = torch.unsqueeze(Variable(batch['android_title']), 1).cuda()
            android_body = torch.unsqueeze(Variable(batch['android_body']), 1).cuda()

        pid_title, pid_body = pid_title.cuda(), pid_body.cuda()
        rest_title, rest_body = rest_title.cuda(), rest_body.cuda()

        # The last batch can be smaller than batch_size (drop_last=False).
        true_batch_size = pid_title.size()[0]

        if is_training:
            optimizer_encoder.zero_grad()
            optimizer_dc.zero_grad()

        pt = encoder(pid_title)
        pb = encoder(pid_body)
        rt = encoder(rest_title)
        rb = encoder(rest_body)

        # A question is represented by the mean of its title and body encodings.
        pid_tensor = (pt + pb)/2
        rest_tensor = (rt + rb)/2

        if is_training:
            android_tensor = (encoder(android_title) + encoder(android_body))/2

            pid_tensor = dropout(pid_tensor)
            rest_tensor = dropout(rest_tensor)
            android_tensor = dropout(android_tensor)

            pid_domain = dc(pid_tensor.view(true_batch_size, -1))
            android_domain = dc(android_tensor.view(true_batch_size, -1))

            # Original author's note: vectors are of dim batch_size x 2.
            pid_prob = softmax(pid_domain)
            android_prob = softmax(android_domain)
            # Clamp away from 0 and 1 before handing the values to BCELoss.
            probs = torch.cat((pid_prob, android_prob)).clamp(min = 1e-5, max = 1 - 1e-5)
            # Domain targets: 0 for the query rows, 1 for the Android rows,
            # in the same order as the concatenation above.
            target = Variable(torch.FloatTensor([0] * true_batch_size + [1] * true_batch_size)).cuda()

            bceloss = dc_loss(probs[:,0], target)
            # One combined objective: the encoder is pushed to lower the
            # ranking loss while raising the domain BCE. train_model in this
            # notebook builds optimizer_dc with a negative learning rate, so
            # the same backward pass moves dc towards a lower BCE.
            loss = loss_function(pid_tensor, rest_tensor, margin = 1.0) - lmbda * bceloss

            loss.backward()
            # `.data[0]` is the PyTorch 0.1.12 way to read a scalar loss.
            losses.append(loss.cpu().data[0])
            bcelosses.append(bceloss.cpu().data[0])
            optimizer_encoder.step()
            optimizer_dc.step()
        else:
            expanded = pid_tensor.expand_as(rest_tensor)
            similarity = cs(expanded, rest_tensor, dim=2).squeeze(2)
            # Copy the scores to a plain CPU tensor, detached from the graph.
            similarity = torch.FloatTensor(similarity.data.cpu().numpy())
            labels = batch['labels']

            # One row of scores per query in the batch.
            scores.extend(similarity)
            targets.extend(labels.view(-1))

    # Calculate epoch level scores
    if is_training:
        avg_loss = np.mean(losses)
        avg_bce = np.mean(bcelosses)
        return (avg_loss, avg_bce)
    else:
        aucmeter = AUCMeter()
        aucmeter.reset()

        output = torch.cat(scores)
        expect = torch.LongTensor(targets)
        aucmeter.add(output, expect)
        return aucmeter.value(max_fpr=0.05)

In [15]:
# Launch training on the Android-augmented training examples, validating on
# the dev set after every epoch. The captured output below ends with a
# KeyboardInterrupt during epoch 7, i.e. this run was stopped manually.
z = train_model(train_ex, eval_set, encoder, dc)

-------------
Epoch 1:

Train loss: 1.023345, BCE loss: 0.390002
Val AUC: 0.481933
This epoch took: 132.349137
-------------
Epoch 2:

Train loss: 0.965928, BCE loss: 0.266772
Val AUC: 0.594675
This epoch took: 130.363563
-------------
Epoch 3:

Train loss: 0.906220, BCE loss: 0.216189
Val AUC: 0.596274
This epoch took: 132.697346
-------------
Epoch 4:

Train loss: 0.883714, BCE loss: 0.196960
Val AUC: 0.593224
This epoch took: 134.621222
-------------
Epoch 5:

Train loss: 0.864735, BCE loss: 0.180113
Val AUC: 0.591745
This epoch took: 134.793433
-------------
Epoch 6:

Train loss: 0.850426, BCE loss: 0.176465
Val AUC: 0.569053
This epoch took: 134.828876
-------------
Epoch 7:



Process Process-144:
Process Process-143:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/sunl/.conda/envs/my_pytorch/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/sunl/.conda/envs/my_pytorch/lib/python3.6/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/sunl/.conda/envs/my_pytorch/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/sunl/.conda/envs/my_pytorch/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/sunl/.conda/envs/my_pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 35, in _worker_loop
    r = index_queue.get()
  File "/home/sunl/.conda/envs/my_pytorch/lib/python3.6/multiprocessing/queues.py", line 341, in get
    with self._rlock:
  File "/home/sunl/.conda/envs/my_pytorch/lib/python3.6/site-packages

KeyboardInterrupt: 

In [None]:
# Reload the epoch-3 checkpoints that train_model wrote with torch.save
# ("encoder<epoch>" / "dc<epoch>") and score them on eval_set.
# NOTE: this rebinds the global `encoder` and `dc` to the loaded modules.
encoder = torch.load("encoder3")
dc = torch.load("dc3")
# The optimizers are only touched when is_training is True, so None is fine here.
z = run_epoch(eval_set, False, encoder, dc, None, None, 5)
print(z)

In [11]:
# Score a saved model on the test split.
# NOTE: "test.txt" is written by a cell further down this notebook, and this
# cell rebinds eval_path / eval_anno / eval_set from the dev split to the
# test split.
model = torch.load("model1")
eval_path = "test.txt"
eval_anno = dr.read_annotations(eval_path, K_neg=-1, prune_pos_cnt=-1, ignore_dup=True)
eval_set = dr.create_dev_set(ids_corpus, eval_anno)

# run_epoch as defined in this notebook takes seven positional arguments
# (data, is_training, encoder, dc, optimizer_encoder, optimizer_dc,
# batch_size); the earlier five-argument call raises TypeError against it.
# `dc` is passed because run_epoch calls dc.eval() even when not training;
# the optimizers are unused in evaluation mode, so None is fine for them.
# NOTE(review): assumes "model1" can be called the way run_epoch calls its
# encoder argument - confirm for this checkpoint.
z = run_epoch(eval_set, False, model, dc, None, None, 5)
print(z)

0.565994666816


In [11]:
# Build the Android test query sets from the positive and negative pair files.
# The result is unpacked in the next cell as (qIDs, qCandidates, labels).
out = dr.build_android_qsets("../data_Android/test.pos.txt", "../data_Android/test.neg.txt")

In [12]:
# qIDs is iterated as the query ids; qCandidates and labels are indexed by
# those ids (see the cell that writes test.txt).
qIDs, qCandidates, labels = out

In [13]:
# Write test.txt with one line per (query, positive) pair:
#     <qid> \t <positive id> \t <space-separated negative ids>
# The context manager closes the file even if a lookup below raises.
with open("test.txt", "w") as f:
    for qid in qIDs:
        l = len(labels[qid])
        if l == 101:
            # 101 labels: the first candidate is the positive, the rest are negatives.
            qPos = [qCandidates[qid][0]]
            qNeg = qCandidates[qid][1:]
        else:
            # Any other count is handled as two leading positives followed by negatives.
            qPos = qCandidates[qid][0:2]
            qNeg = qCandidates[qid][2:]
        for i, pos in enumerate(qPos):
            # The i-th positive is paired with the i-th block of 100 negatives.
            negatives = " ".join(qNeg[100 * i: 100 * i + 100])
            f.write(qid + "\t" + pos + "\t" + negatives + "\n")

In [21]:
# Count the elements across all parameter tensors of `model`, then subtract
# the embedding layer so only the remaining weights are reported.
lstm_parameters = list(model.parameters())
lstm_num_params = sum(np.prod(p.size()) for p in lstm_parameters)
lstm_num_params -= 126219 * 301  # embedding layer
print(lstm_num_params)

322400


In [23]:
# Record the PyTorch version this notebook was executed with.
torch.__version__

'0.1.12_2'

In [30]:
# Peek at the first ten keys of the mapped Android corpus.
list(ids_corpus.keys())[:10]

['1', '2', '5', '8', '9', '11', '16', '17', '27', '30']

In [26]:
# Scratch tensor of random normal values, 40 rows by 2 columns.
a = torch.randn(40, 2)

In [29]:
# Scratch check: take column 0 of every row of `a`, the same indexing
# run_epoch applies as probs[:,0].
a[:,0]


-0.2072
-0.8126
 1.2249
 2.0323
 1.2807
 1.0441
-0.9534
-1.1760
-0.5620
 1.2625
-0.4856
-0.5826
 0.0455
 0.8008
 1.2431
 1.5276
-3.2283
 0.1589
 0.0005
-0.3962
 1.7559
 0.5291
-0.1795
 0.3254
 0.6758
 1.0323
-0.2222
 1.4305
-0.7630
-0.3778
-1.1588
-0.2775
-0.6053
-0.0738
-1.4539
 0.2250
 2.4940
-0.8894
-0.7278
-0.1018
[torch.FloatTensor of size 40]