In [None]:
import torch
import torch.nn as nn
from torch.nn import LSTM,Embedding,Linear
from torch.nn import Module
import torch.nn.functional as F
from torch.autograd import Variable

class compare_regex(torch.nn.Module):
    """Binary/multi-class classifier over a pair of token-index sequences.

    Both sequences are embedded with one shared embedding table, encoded by
    two separate single-layer bidirectional LSTMs, mean-pooled over the time
    axis, concatenated, and fed through a three-layer tanh MLP that ends in a
    log-softmax over ``target_size`` classes.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of each LSTM direction.
        target_size: number of output classes.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size):
        super(compare_regex, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        # Index 0 is the padding index, so its embedding stays a zero vector.
        self.embed = Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm1 = LSTM(embedding_dim, hidden_dim, bidirectional=True, num_layers=1, batch_first=True)
        self.lstm2 = LSTM(embedding_dim, hidden_dim, bidirectional=True, num_layers=1, batch_first=True)
        # Input width: 2 directions per encoder * 2 encoders.
        self.fc1 = Linear(hidden_dim * 2 * 2, 60)
        self.fc2 = Linear(60, 20)
        self.fc3 = Linear(20, target_size)

    def init_hidden(self, bs, device=None, dtype=None):
        """Return zero-initialised ``(h_0, c_0)`` for one bidirectional LSTM.

        Args:
            bs: batch size.
            device: device to allocate on. Defaults to the device of the
                embedding weights, so the state always lives where the model
                lives (rather than on CUDA merely because CUDA is available).
            dtype: dtype of the state. Defaults to the embedding weights' dtype.

        Returns:
            Tuple of two tensors, each of shape ``(2, bs, hidden_dim)``.
        """
        reference = self.embed.weight
        if device is None:
            device = reference.device
        if dtype is None:
            dtype = reference.dtype
        # Leading 2 = num_layers (1) * num_directions (2).
        shape = (2, bs, self.hidden_dim)
        return (torch.zeros(shape, dtype=dtype, device=device),
                torch.zeros(shape, dtype=dtype, device=device))

    def forward(self, bs, line1, line2, input1_lengths, input2_lengths):
        """Score a batch of sequence pairs.

        Args:
            bs: batch size; must match the first dimension of ``line1``/``line2``.
            line1: LongTensor of token indices for the first sequences
                (batch-first, since the LSTMs use ``batch_first=True``).
            line2: LongTensor of token indices for the second sequences.
            input1_lengths: currently unused (sequence packing is disabled);
                kept so existing callers keep working.
            input2_lengths: currently unused, see ``input1_lengths``.

        Returns:
            Tensor of shape ``(bs, target_size)`` holding log-probabilities.
        """
        embeded1 = self.embed(line1)
        embeded2 = self.embed(line2)

        hidden1 = self.init_hidden(bs, embeded1.device, embeded1.dtype)
        lstm1_out, _ = self.lstm1(embeded1, hidden1)
        hidden2 = self.init_hidden(bs, embeded2.device, embeded2.dtype)
        lstm2_out, _ = self.lstm2(embeded2, hidden2)

        # Mean of the encoder outputs over the time axis (padded positions
        # included), concatenated across both encoders. The original author
        # noted 97.8% for this variant. Other variants the author noted:
        #   * element-wise product of the two means,
        #   * concatenation of the last hidden states (97.1%),
        #   * element-wise product of the last hidden states.
        pooled = torch.cat((lstm1_out.mean(1), lstm2_out.mean(1)), 1)

        # torch.tanh replaces the deprecated F.tanh; the result is the same.
        fc1_out = torch.tanh(self.fc1(pooled))
        fc2_out = torch.tanh(self.fc2(fc1_out))
        fc3_out = self.fc3(fc2_out)
        score = F.log_softmax(fc3_out, dim=1)
        return score

In [None]:
# Build the token -> index mapping from a tab-separated file in which each
# line holds "<token>\t<index>".
vocab = {}
# `with` guarantees the file handle is closed even if a line fails to parse.
with open('./compare_vocab.txt','r') as f:
    for entry in f.read().splitlines():
        splitted = entry.split('\t')
        vocab[splitted[0]] = int(splitted[1])
vocab_size = len(vocab)

In [None]:
def make_input_seq(lines1, lines2, targets, max_len=40, token_to_idx=None):
    """Convert paired whitespace-tokenised lines into padded index tensors.

    Each line is split on whitespace, right-padded with ``'<pad>'`` up to
    ``max_len`` tokens and mapped to indices. A pair is dropped entirely
    (together with its target) when either side has more than ``max_len``
    tokens.

    Args:
        lines1: list of strings, first element of each pair.
        lines2: list of strings, second element of each pair.
        targets: list of label strings; ``'0'`` becomes ``[1, 0]`` and any
            other value becomes ``[0, 1]``.
        max_len: padded sequence length (defaults to the original fixed 40).
        token_to_idx: mapping from token to index. Defaults to the
            module-level ``vocab``. It must contain ``'<pad>'`` and every
            token that occurs in the inputs.

    Returns:
        Tuple ``(lines1_idx, lines2_idx, targets_onehot)`` of LongTensors,
        placed on CUDA when it is available and on CPU otherwise.

    Raises:
        KeyError: if a token (or ``'<pad>'``) is missing from the mapping.
    """
    mapping = vocab if token_to_idx is None else token_to_idx
    lines1_seq2idx = list()
    lines2_seq2idx = list()
    targets_idx = list()
    lines1_seq = [s.split() for s in lines1]
    lines2_seq = [s.split() for s in lines2]
    for line_num in range(len(lines1_seq)):
        tokens1 = lines1_seq[line_num]
        tokens2 = lines2_seq[line_num]
        if len(tokens1) > max_len or len(tokens2) > max_len:
            continue
        lines1_padded = tokens1 + ['<pad>'] * (max_len - len(tokens1))
        lines2_padded = tokens2 + ['<pad>'] * (max_len - len(tokens2))
        lines1_seq2idx.append([mapping[tok] for tok in lines1_padded])
        lines2_seq2idx.append([mapping[tok] for tok in lines2_padded])

        # One-hot label: column 0 for the '0' class, column 1 for everything else.
        if targets[line_num] == '0':
            targets_idx.append([1, 0])
        else:
            targets_idx.append([0, 1])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return (torch.tensor(lines1_seq2idx, dtype=torch.long, device=device),
            torch.tensor(lines2_seq2idx, dtype=torch.long, device=device),
            torch.tensor(targets_idx, dtype=torch.long, device=device))
        
def _sequence_length(row):
    """Return (index of the last non-zero entry + 1) of a 1-D tensor, or 0 if all entries are zero."""
    nonzero_positions = row.nonzero()
    if nonzero_positions.numel() == 0:
        return 0
    return int(nonzero_positions.max().item()) + 1


def evaluate_test(model, test_input1, test_input2 , test_target):
    """Evaluate ``model`` one pair at a time and report precision/recall/F1.

    Class 1 is treated as the positive class. Prints the number of targets,
    then the metrics and the correct count, and returns the accuracy.

    Args:
        model: callable invoked as
            ``model(1, input1, input2, input1_lengths, input2_lengths)`` and
            returning a score tensor whose argmax is the predicted class.
        test_input1: tensor of index sequences, one row per example.
        test_input2: tensor of index sequences, one row per example.
        test_target: tensor of per-example target rows whose argmax is the
            true class.

    Returns:
        ``correct / len(test_target)``, or ``0.0`` for an empty test set.
        Precision, recall and F1 are reported as ``0.0`` whenever their
        denominator is zero instead of raising ``ZeroDivisionError``.
    """
    total = len(test_target)
    print(total)
    correct = 0
    tp = tn = fp = fn = 0
    for i in range(len(test_input1)):
        # Length = position of the last non-zero index + 1
        # (assumes 0 is the padding index -- TODO confirm against the vocab file).
        test_input1_len = [_sequence_length(test_input1[i])]
        test_input2_len = [_sequence_length(test_input2[i])]
        score = model(1, test_input1[i].unsqueeze(0), test_input2[i].unsqueeze(0),
                      test_input1_len, test_input2_len)
        predicted = score.argmax().item()
        actual = test_target[i].argmax().item()
        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted == 0 and actual == 0:
            tn += 1
        elif predicted == 1 and actual == 0:
            fp += 1
        elif predicted == 0 and actual == 1:
            fn += 1
        if predicted == actual:
            correct += 1

    # Guard every ratio: a model that never predicts class 1, or a test set
    # without class-1 examples, must not crash the evaluation.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1_score = 2 * ((precision * recall) / (precision + recall))
    else:
        f1_score = 0.0
    print('precision: {},recall: {},f1 score:{}'.format(precision,recall,f1_score))
    print('total: {}, correct: {}'.format(total, correct))
    return correct / total if total else 0.0

In [None]:
# Load the trained model from disk.
# SECURITY: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only
# machine, and places it on CUDA when CUDA is available.
# NOTE(review): the file presumably holds a whole pickled model object (it is
# called like a module below); recent PyTorch releases default to
# weights_only=True and would then need weights_only=False here -- confirm
# against the installed version.
compare_regex_model = torch.load(
    './compare_regex_model.pth',
    map_location='cuda' if torch.cuda.is_available() else 'cpu',
)

In [None]:
# Test pairs to evaluate. Alternative data sets used by the author:
#   '../pair_data/test_data.txt'
#   '../pair_data/data_pairs_test(5_depth).txt'
total_set = set()  # not used in the visible cells; kept in case other cells reference it
lines1 = list()
lines2 = list()
targets = list()

# Each line holds at least three tab-separated fields; the first three are
# used as first sequence, second sequence and label.
# `with` guarantees the file handle is closed once the data has been read.
with open('../pair_data/data_pairs_test(4_depth).txt','r') as f:
    for line in f.read().splitlines():
        splitted = line.split('\t')
        lines1.append(splitted[0])
        lines2.append(splitted[1])
        targets.append(splitted[2])

# Number of lines read from the file.
count = len(lines1)

test_input1, test_input2, test_targets = make_input_seq(lines1, lines2, targets)

In [None]:
# Inference only: run the evaluation with gradient tracking disabled.
with torch.no_grad():
    test_accuracy = evaluate_test(compare_regex_model, test_input1, test_input2, test_targets)
    print('test acc: {}'.format(test_accuracy))