In [1]:
import torch
import torch.nn as nn
from torch.nn import LSTM,Embedding,Linear
from torch.nn import Module
import torch.nn.functional as F
from torch.autograd import Variable

class compare_regex(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, target_size):
        super(compare_regex, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.embed = Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm1 = LSTM(embedding_dim ,hidden_dim, bidirectional=True, num_layers=1, batch_first=True)
        self.lstm2 = LSTM(embedding_dim, hidden_dim, bidirectional=True, num_layers=1, batch_first=True)
        self.fc1 = Linear(hidden_dim*2*2, 60)
        self.fc2 = Linear(60, 20)
        self.fc3 = Linear(20, target_size)

        
    def init_hidden(self, bs):
        if torch.cuda.is_available():
            return (torch.zeros(2, bs, self.hidden_dim).cuda(),
                   torch.zeros(2, bs, self.hidden_dim).cuda())
        else:
            return (torch.zeros(2, bs, self.hidden_dim),
                   torch.zeros(2, bs, self.hidden_dim))
    
    def forward(self, bs, line1, line2, input1_lengths,input2_lengths):
        embeded1 = self.embed(line1)
        embeded2 = self.embed(line2)
        hidden1 = self.init_hidden(bs)
        lstm1_out, last_hidden1 = self.lstm1(embeded1,hidden1)
        hidden2 = self.init_hidden(bs)
        lstm2_out, last_hidden2 = self.lstm1(embeded2,hidden2)
        fc1_out = self.fc1(torch.cat((lstm1_out.mean(1), lstm2_out.mean(1)),1))

        fc1_out = F.tanh(fc1_out)
        fc2_out = self.fc2(fc1_out)
        fc2_out = F.tanh(fc2_out)
        fc3_out = self.fc3(fc2_out)
        score = F.log_softmax(fc3_out,dim=1)
        return score

======================================================================================

In [2]:
f = open('../pair_data/train_data.txt','r')

total_set = set()
lines1 = list()
lines2 = list()
targets = list()


for line in f.read().splitlines():
    splitted = line.split('\t')
    a = '{}\t{}\t{}'.format(splitted[1],splitted[0],splitted[2])
    if not a in total_set:
        total_set.add(line)
        
train_lines1 = list()
train_lines2 = list()
train_targets = list()
for line in total_set:
    splitted = line.split('\t')
    train_lines1.append(splitted[0])
    train_lines2.append(splitted[1])
    train_targets.append(splitted[2])
    train_lines1.append(splitted[1])
    train_lines2.append(splitted[0])
    train_targets.append(splitted[2])

In [10]:
f = open('../pair_data/test_data.txt','r')

total_set = set()
lines1 = list()
lines2 = list()
targets = list()


for line in f.read().splitlines():
    splitted = line.split('\t')
    a = '{}\t{}\t{}'.format(splitted[1],splitted[0],splitted[2])
    if not a in total_set:
        total_set.add(line)
        
test_lines1 = list()
test_lines2 = list()
test_targets = list()
for line in total_set:
    splitted = line.split('\t')
    test_lines1.append(splitted[0])
    test_lines2.append(splitted[1])
    test_targets.append(splitted[2])

In [84]:
#delete cell
f = open('../pair_data/data_pairs_test(4_depth).txt','r')

total_set = set()
lines1 = list()
lines2 = list()
targets = list()


for line in f.read().splitlines():
    splitted = line.split('\t')
    a = '{}\t{}\t{}'.format(splitted[1],splitted[0],splitted[2])
    if not a in total_set:
        total_set.add(line)
        
test_lines1 = list()
test_lines2 = list()
test_targets = list()
for line in total_set:
    splitted = line.split('\t')
    test_lines1.append(splitted[0])
    test_lines2.append(splitted[1])
    test_targets.append(splitted[2])

In [11]:
print(len(train_lines1))
print(len(test_lines1))

372728
5297


==================================================================================================

In [12]:
vocab = {w:i for i,w in enumerate(set([t for s in train_lines1 for t in s.split(' ')]), 1)}
vocab['<pad>'] = 0
vocab_size = len(vocab)
print(len(vocab))

31


In [13]:
print(vocab)

{'b': 17, '(': 1, '[': 18, '3': 19, '5': 2, '<NUM>': 3, '<M0>': 21, '}': 22, '2': 23, '<CAP>': 4, ',': 24, '<M1>': 25, '4': 5, '{': 6, ')': 7, '<LET>': 8, '\\': 9, '.': 27, '<M3>': 10, '&': 11, '<pad>': 0, '*': 12, ']': 13, '6': 15, '<M2>': 30, '|': 28, '<VOW>': 20, '7': 26, '+': 29, '~': 14, '<LOW>': 16}


In [14]:
# f_w = open('./compare_vocab(uncleaned).txt','w')
# for i in vocab.items():
#     f_w.write('{}\t{}\n'.format(i[0],i[1]))
# f_w.close()

In [15]:
vocab = {}
f = open('./compare_vocab.txt','r')
for i in f.read().splitlines():
    splitted = i.split('\t')
    vocab[splitted[0]] = int(splitted[1])
vocab_size = len(vocab)

In [16]:
import random


def make_input_seq(lines1, lines2, targets):
    max_len = 40
    lines1_seq2idx = list()
    lines2_seq2idx = list()
    targets_idx = list()
    lines1_seq = [s.split() for s in lines1]
    lines2_seq = [s.split() for s in lines2]
    for line_num in range(len(lines1_seq)):
        if len(lines1_seq[line_num]) > max_len or len(lines2_seq[line_num]) > max_len:
            continue
        lines1_padded = lines1_seq[line_num]+['<pad>']*(max_len-len(lines1_seq[line_num]))
        lines2_padded = lines2_seq[line_num]+['<pad>']*(max_len-len(lines2_seq[line_num]))
        lines1_seq2idx.append([vocab[i] for i in lines1_padded])
        lines2_seq2idx.append([vocab[i] for i in lines2_padded])
        
        if targets[line_num] == '0':
            targets_idx.append([1,0])
        else:
            targets_idx.append([0,1])
    if torch.cuda.is_available():
        return torch.LongTensor(lines1_seq2idx).cuda(), torch.LongTensor(lines2_seq2idx).cuda(), torch.LongTensor(targets_idx).cuda()
    else:
        return torch.LongTensor(lines1_seq2idx), torch.LongTensor(lines2_seq2idx), torch.LongTensor(targets_idx)
        


lines1_seq2idx, lines2_seq2idx, targets_idx = make_input_seq(train_lines1, train_lines2, train_targets)
test_input1, test_input2, test_targets = make_input_seq(test_lines1, test_lines2, test_targets)

In [17]:
print(len(lines1_seq2idx))
print(len(lines2_seq2idx))
print(len(test_targets))
print(test_targets.tolist().count([1,0]))

372728
372728
5297
2500


In [18]:
import torch.optim as optim
import time

In [19]:
def evaluate_test(model, test_input1, test_input2 , test_target):
    correct = 0
    print(len(test_target))
    tp=0
    tn=0
    fp=0
    fn=0
    for i in range(len(test_input1)):
        test_input1_len = torch.tensor([torch.max(test_input1[i].data.nonzero()+1)])
        test_input2_len = torch.tensor([torch.max(test_input2[i].data.nonzero()+1)])
        score = model(1, test_input1[i].unsqueeze(0), test_input2[i].unsqueeze(0) , test_input1_len.tolist(), test_input2_len.tolist())
        if score.argmax().item() == 1 and test_target[i].argmax().item()==1:
            tp+=1
        elif score.argmax().item() == 0 and test_target[i].argmax().item()==0:
            tn+=1
        elif score.argmax().item() == 1 and test_target[i].argmax().item()==0:
            fp+=1
        elif score.argmax().item() == 0 and test_target[i].argmax().item()==1:
            fn+=1
        if score.argmax().item() == test_target[i].argmax().item():
            correct += 1
    try:
        precision =  tp/(tp+fp)
        recall =  tp/(tp+fn)
        f1_score = 2*((precision*recall)/(precision+recall))
    except:
        precision = 0
        recall = 0
        f1_score = 0
    print('precision: {},recall: {},f1 score:{}'.format(precision,recall,f1_score))
    print('total: {}, correct: {}'.format(len(test_target), correct))
    return correct/len(test_target)

In [20]:
if torch.cuda.is_available():
    compare_regex_model = compare_regex(vocab_size, 4, 256, 2).cuda()
else:
    compare_regex_model = compare_regex(vocab_size, 4, 256, 2)

In [21]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(compare_regex_model.parameters(), lr=0.1)
batch_size = 256

batch_num = int(len(lines1_seq2idx)/batch_size)
for epoch in range(200):
    epoch_loss = 0
    start_time = time.time()
    for batch in range(batch_num):
        compare_regex_model.zero_grad()
        lines1_batch = lines1_seq2idx[batch * batch_size:(batch+1) * batch_size]
        lines2_batch = lines2_seq2idx[batch * batch_size:(batch+1) * batch_size]
        lines1_batch_lengths = torch.tensor([torch.max(lines1_batch[i].data.nonzero()+1) for i in range(len(lines1_batch))]).cuda()
        lines2_batch_lengths = torch.tensor([torch.max(lines2_batch[i].data.nonzero()+1) for i in range(len(lines2_batch))]).cuda()
        tag_score = compare_regex_model(batch_size, lines1_batch, lines2_batch, lines1_batch_lengths.tolist(), lines2_batch_lengths.tolist())
        targets_batches = targets_idx[batch * batch_size:(batch+1) * batch_size]
        loss = loss_function(tag_score.squeeze(1).squeeze(1), targets_batches[:,1])
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        with torch.no_grad():
            test_acc = evaluate_test(compare_regex_model, test_input1, test_input2, test_targets)
            print('step:{}, test acc: {}'.format(epoch, test_acc))
    
    if test_acc == 1.0:
        break
    print('epoch: {}, epoch_loss: {}'.format(epoch,epoch_loss/batch_num))


5297
precision: 0,recall: 0,f1 score:0
total: 5297, correct: 2500
step:0, test acc: 0.4719652633566169
epoch: 0, epoch_loss: 0.6589458014956864


KeyboardInterrupt: 

In [90]:
batch_num = int(len(lines1_seq2idx)/batch_size)

with torch.no_grad():
    test_acc = evaluate_test(compare_regex_model, test_input1, test_input2, test_targets)
    print('test acc: {}'.format(test_acc))
    
    print('epoch: {}, epoch_loss: {}'.format(epoch,epoch_loss/batch_num))

995
precision: 0.9435665914221218,recall: 0.8444444444444444,f1 score:0.8912579957356077
total: 995, correct: 893
test acc: 0.8974874371859296
epoch: 0, epoch_loss: 6.862730427910782e-07


In [83]:
torch.save(compare_regex_model, './compare_regex_model_share.pth')

RuntimeError: cuda runtime error (2) : out of memory at /pytorch/torch/csrc/generic/serialization.cpp:17

In [None]:
compare_regex_model = torch.load('./compare_regex_model.pth')

### single input test

In [None]:
gold = ['( ( <M0> ) & ( [ <LET> ] ) ) . * ( [ <CAP> ] ) . *']
predict = ['( <M0> ) . * ( ( [ <CAP> ] ) & ( [ <CAP> ] ) ) . *']
target = [0]
gold_input, predict_input, target_input = make_input_seq(gold, predict, target)
gold_len = torch.tensor([torch.max(gold_input[0].data.nonzero()+1)])
predict_len = torch.tensor([torch.max(predict_input[0].data.nonzero()+1)])
print(gold_input)

In [None]:
new_vocab = dict(map(reversed, vocab.items()))
print(new_vocab)

In [None]:
' '.join([new_vocab[j] for j in [i for i in gold_input.tolist()[0]]]).replace('<pad>','')


In [None]:
import math
with torch.no_grad():
    score = compare_regex_model(1, gold_input, predict_input, [gold_len], [predict_len])
    print(math.exp(score[0][0]))
    print(math.exp(score[0][1]))


In [None]:
f = open('../pair_data/test.txt','r')


total_set = set()
lines1 = list()
lines2 = list()
targets = list()

count = 0
for line in f.read().splitlines():
    count += 1
    total_set.add(line)
    splitted = line.split('\t')
#     total_set.add('{}\t{}\t{}'.format(splitted[1],splitted[0],splitted[2]))
print(count)

count = 0
for line in total_set:
    count += 1
    splitted = line.split('\t')
    lines1.append(splitted[0])
    lines2.append(splitted[1])
    targets.append(splitted[2])


In [None]:
test_input1, test_input2, test_targets = make_input_seq(lines1, lines2, targets)

In [None]:
with torch.no_grad():
        print('test acc: {}'.format(evaluate_test(compare_regex_model, test_input1, test_input2,  test_targets)))