In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as Data
from tqdm import tqdm

import numpy as np
import json

USE_CUDA = True

In [2]:
def readfile(data):
    with open(data, "r", encoding="utf-8") as f:
        content = f.read().splitlines()
        
    return content

def schema_load(schema_root):
    raw_dict = "".join(readfile(schema_root))
    dict2json = "".join(raw_dict.split()[2:])

    json_acceptable_string = dict2json.replace("'", "\"")
    schema = json.loads(json_acceptable_string)
    
    return schema

def define_entity(schema):
    tag_type = list(schema['tagging'])
    
    entity_tag = []
    for k in list(schema['entity'].keys()):
        entity_tag.append(schema['entity'][k]['tag'])
        
    TAG = []
    for t in tag_type:
        for e in entity_tag:
            if t!='O':
                TAG.append(t+'-'+e)  
                
    TAG = [UNKOWN_TAG, PAD_TAG] + TAG + ['O']   

    return TAG

def tag2ix(TAG):
    tag_to_ix={t:i for i,t in enumerate(TAG)}
    return tag_to_ix

def define_relation(schema):
    relation_type = list(schema['relation'])
    
    relation_tag = []
    for k in list(schema['relation'].keys()):
        relation_tag.append(schema['relation'][k]['tag'])
    
    relation_tag = [REL_PAD] + [REL_NONE] + relation_tag
        
    return relation_tag

# ==================================================

def get_word_and_label(_content, start_w, end_w):
    word_list = []
    tag_list = []
    rel_list = []
    
    for word_set in _content[start_w:end_w]:
        word_set = word_set.split()
        if len(word_set)==1:
            word_list.append(' ')
            tag_list.append('O')
            rel_list.append(REL_NONE)
        
        else:
            word_list.append(word_set[0])
            tag_list.append(word_set[1])

            try:
                testerror = word_set[2]
            except IndexError:
                rel_list.append(REL_NONE)
            else:
                rel_list.append(word_set[2:])
    
    return word_list, tag_list, rel_list

def split_to_list(content):
    init = 0
    word_list = []
    tag_list = []
    rel_list = []

    for now_token, c in enumerate(content):
        if c=='':
            words, tags, rels = get_word_and_label(content, init, now_token)
            init = now_token+1
            word_list.append(words)
            tag_list.append(tags)
            rel_list.append(rels)
            
    return word_list, tag_list, rel_list

# ==================================================

def word2index(word_list):
    word_to_ix = {"<UNKNOWN>":0, "<PAD>":1}
    for sentence in word_list:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
                
    return word_to_ix

def dict_inverse(tag_to_ix):
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    return ix_to_tag

def index2tag(indexs, ix_to):
    to_tags = [ix_to[i] for i in indexs.cpu().numpy()]
    return to_tags

# ==================================================

def find_max_len(word_list):
    max_len = 0
    for i in range(len(word_list)):
        if max_len<len(word_list[i]):
            max_len=len(word_list[i])
            
    return max_len

# ====== filter the length of sentence more than MAX_LEN =======

def filter_len(word_list):
    reserved_index = []
    for i in range(len(word_list)):
        if len(word_list[i])<MAX_LEN:
            reserved_index.append(i)
            
    return reserved_index


def filter_sentence(reserved_index, word_list, tag_list, rel_list):
    filter_word = list(word_list[i] for i in reserved_index)
    filter_tag = list(tag_list[i] for i in reserved_index)
    filter_rel = list(rel_list[i] for i in reserved_index)
    return filter_word, filter_tag, filter_rel

# ==================================================

def pad_seq(seq, isrel):
    if isrel:
        seq += [REL_NONE for i in range(MAX_LEN-len(seq))]
    else:
        seq += [PAD_TAG for i in range(MAX_LEN-len(seq))]
    return seq

def pad_all(filter_word, filter_tag, filter_rel):
    input_padded = [pad_seq(s, False) for s in filter_word]
    target_padded = [pad_seq(s, False) for s in filter_tag]
    rel_padded = [pad_seq(s, True) for s in filter_rel]
    
    return input_padded, target_padded, rel_padded

# ==================================================

def prepare_sequence(seq, to_ix):
    idxs = []
    for w in seq:
        if w not in to_ix:
            idxs.append(to_ix[UNKOWN_TAG])
        else:
            idxs.append(to_ix[w])
    
#     idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def prepare_all(seqs, to_ix):
    seq_list = []
    for i in range(len(seqs)):
        seq_list.append(prepare_sequence(seqs[i], to_ix))
        
    seq_list = torch.stack(seq_list)
        
    return seq_list



def prepare_rel(rel_padded, to_ix):
    
    rel_ptr = torch.zeros(len(rel_padded), MAX_LEN, MAX_LEN, dtype=torch.long) 
    
    # 對當前的token，去比較之前所有出現過的entity，是否有關係，建成矩陣
    # [B*ML*ML]，第二維ML是當前token，第三維ML是根據當前token對之前出現過的entity紀錄關係，以index紀錄
    for i, rel_seq in enumerate(rel_padded):
        rel_dict = {}
        for j, token_seq in enumerate(rel_seq):
            rel_ptr[i][j][:j+1] = 1
            if token_seq != REL_NONE:
                for k, rel in enumerate(token_seq):

                    # if 是第一次出現，紀錄後面數字(標第幾對)和關係位置(A OR B)
                    # 假如下次出現又是同個關係位置(A)，依然紀錄
                    # 直到下次出現關係位置B，依照之前紀錄的A位置的字，然後在第三維去標關係

                    rel_token = rel.split('-')
                    if rel_token[1] not in rel_dict:
                        rel_dict[rel_token[1]] = {'rel':rel_token[0], 'loc':rel_token[2], 'idx':[j]}

                    elif rel_token[1] in rel_dict and rel_dict[rel_token[1]]['loc']==rel_token[2]:
                        rel_dict[rel_token[1]]['idx'].append(j)

                    else:
                        record_loc = rel_dict[rel_token[1]]['idx']
                        for idxx in record_loc:
                            rel_ptr[i][j][idxx] = to_ix[rel_token[0]]
                            
    return rel_ptr
                


# ==================================================

def dataload(input_var, target_var, rel_var):
    torch_dataset = Data.TensorDataset(input_var, target_var, rel_var)

    loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               
        num_workers=2,       
        drop_last=True
    )
    
    return loader

# ==================================================
def softmax_output(output):
    output = output.view(BATCH_SIZE,tagset_size).argmax(1)
    return output

In [3]:
class Attn(nn.Module):
    def __init__(self, attn_input, attn_output, rel_size):
        super(Attn, self).__init__()
        
        self.attn_input = attn_input
        self.attn_output = attn_output
        self.rel_size = rel_size
        
        self.w1 = nn.Linear(self.attn_input, self.attn_output)
        self.w2 = nn.Linear(self.attn_input, self.attn_output)
        self.tanh = nn.Tanh()
        self.v = nn.Linear(self.attn_output, self.rel_size, bias=False)
        
        
    def forward(self, encoder_outputs):
        
        decoder = encoder_outputs[:,-1,:].unsqueeze(1)                       #B*1*(ts+LE) [128,1,8]
        encoder_score = self.w1(encoder_outputs)                             #B*now len*ATTN_OUT
        decoder_score = self.w2(decoder)                                     #B*1*ATTN_OUT
        energy = self.tanh(encoder_score+decoder_score)                      #B*now len*ATTN_OUT            
        
        energy = self.v(energy)                                              #B*now len*rel_size
        
        
        # 針對每個entity做softmax，去顯示他們的關係權重
        # 主要都會是rel_none
        # 對第二維(rel)做softmax
#         p = F.softmax(energy, dim=2)                                         #B*now len*rel_size
        
        return energy
    

In [4]:
class Entity_Typing(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim1, hidden_dim2, \
                 label_embed_dim, rel_tag_to_ix):
        super(Entity_Typing, self).__init__()
        self.embedding_dim = embedding_dim                   #E
        self.hidden_dim1 = hidden_dim1                       #h1
        self.hidden_dim2 = hidden_dim2                       #h2
        self.label_embed_dim = label_embed_dim               #LE
        self.vocab_size = vocab_size                         #vs
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)                    #ts
        self.rel_to_ix = rel_tag_to_ix
        self.rel_size = len(rel_tag_to_ix)                   #rs           
        
        self.dropout = nn.Dropout(p=0.3)
        self.bn = nn.BatchNorm1d(DENSE_OUT, momentum=0.5)
        
        
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim1 // 2,
                            num_layers=2, bidirectional=True, batch_first=True, dropout=0.2)
        
        self.dense = nn.Linear(hidden_dim1, DENSE_OUT)
        
        self.top_hidden = nn.LSTMCell(DENSE_OUT+label_embed_dim, hidden_dim2)          
        

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim2, self.tagset_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.label_embed = nn.Linear(self.tagset_size, self.label_embed_dim)
        
        self.attn = Attn(ATTN_IN, ATTN_OUT, rel_size)
        
        
    def init_hidden1(self):       
        hidden = torch.randn(2*2, BATCH_SIZE, self.hidden_dim1 // 2)    #4*B*(h1/2)
#         hidden = Variable(hidden.data, requires_grad=True)

        return (hidden.cuda(), hidden.cuda())if USE_CUDA else (hidden,hidden)
    
    def init_hidden2(self):       
        hidden = torch.randn(BATCH_SIZE, self.hidden_dim2)              #B*h2
#         hidden = Variable(hidden.data, requires_grad=True)

        return (hidden.cuda(), hidden.cuda())if USE_CUDA else (hidden,hidden)
    
    def init_label_embed(self):
        hidden = torch.zeros(BATCH_SIZE, self.label_embed_dim)          #B*LE
        return hidden.cuda()if USE_CUDA else hidden
    
    def create_entity(self):
        output_tensor = torch.zeros(BATCH_SIZE, MAX_LEN, self.tagset_size)  #B*ML*ts
        return output_tensor.cuda()if USE_CUDA else output_tensor
    
    def create_rel_matrix(self):
        rel_tensor = torch.zeros(BATCH_SIZE, MAX_LEN, MAX_LEN, self.rel_size)  #B*ML*ML*rs
        return rel_tensor.cuda()if USE_CUDA else rel_tensor
    
    
        
    def forward(self, sentence):
        self.hidden1 = self.init_hidden1()                      #4*B*(h1/2)
        entity_tensor = self.create_entity()                    #B*ML*ts
        rel_tensor = self.create_rel_matrix()                   #B*ML*ML*rs
        
        embeds = self.word_embeds(sentence)                     #B*ML*E,[128, 100, 20]
        
        bilstm_out, self.hidden1 = self.bilstm(embeds, self.hidden1)
        # bilstm_out -> B*ML*h1,[128, 100, 10]
        # self.hidden1 -> ( 4*B*(h1/2), 4*B*(h1/2) )
        
        # dropout
#         bilstm_out = self.dropout(bilstm_out)
        
        # bn
#         bilstm_out = self.bn(bilstm_out)
        
        
        dense_out = self.dense(bilstm_out)                      #B*ML*DENSE_OUT,[128, 100, 100]
        
        
        encoder_sequence_l = [] 

        for length in range(MAX_LEN):
            now_token = dense_out[:,length,:]
            now_token = torch.squeeze(now_token, 1)
            if length==0:
                
#                 fake_hidden=(100)
#                 noise_x = random(100)
                self.hidden2 = self.init_hidden2()
                self.zero_label_embed = self.init_label_embed()
                combine_x = torch.cat((now_token, self.zero_label_embed),1)  #B*(DENSE_OUT+LE),[128, 103]
                
            else:
#                 fake_hidden=h
                self.hidden2 = (h_next, c_next)
                combine_x = torch.cat((now_token, label),1)

            h_next, c_next = self.top_hidden(combine_x, self.hidden2)    #B*h2,[128, 8]           
            to_tags = self.hidden2tag(h_next)                            #B*ts,[128, 5]            
            output = self.softmax(to_tags)                               #B*ts,[128, 5]             
            label = self.label_embed(output)                             #B*LE,[128, 3]
            
            s_output = softmax_output(output)
            
            
            # Assignments to Variables are in-place operations.
            # Use that variable in lots of other contexts 
            # and some of the functions require it to not change. 
            to_tags_clone = to_tags.clone()
            label_clone = label.clone()
            
            
#             for i, tag in enumerate(s_output):
#                 if tag==ent_tag_to_ix['O']:
#                     to_tags_clone[i] = torch.FloatTensor([-999999 * self.tagset_size])
#                     label_clone[i] = torch.FloatTensor([-999999 * self.tagset_size])
                    
            # relation layer
            encoder_sequence_l.append(torch.cat((to_tags,label),1))
            encoder_sequence = torch.stack(encoder_sequence_l).t()     #B*len*(ts+LE), [128,1,8]          

            # Calculate attention weights 
            attn_weights = self.attn(encoder_sequence)

        
            entity_tensor[:,length,:] = output
            
            # rel_tensor[:,length, 頭~當前 ,:]
            rel_tensor[:,length,:length+1,:] = attn_weights

        
        
        '''NLLLoss input: Input: (N,C) where C = number of classes'''
        return entity_tensor.view(BATCH_SIZE*MAX_LEN, self.tagset_size), \
               rel_tensor.view(BATCH_SIZE*MAX_LEN*MAX_LEN, self.rel_size)

In [5]:
root = '/notebooks/sinica/dataset/'
train_data = root+'facial.train'
dev_data = root+'facial.dev'
test_data = root+'facial.test'

relation_data_old = root+'facial_r.old.train'
# relation_data = root+'facial_r.train'
relation_data = root+'facial_r2.train'
schema_root = root+'schema.txt'
dev_data = root+'facial_r2.dev'


UNKOWN_TAG = "<UNKNOWN>"
PAD_TAG = "<PAD>"
REL_NONE = 'Rel-None'
REL_PAD = 'Rel-Pad'

schema = schema_load(schema_root)
ENT_TAG = define_entity(schema)
REL_TAG = define_relation(schema)
ent_tag_to_ix = tag2ix(ENT_TAG)
'''{'<PAD>': 1,
 '<UNKNOWN>': 0,
 'B-FUNC': 2,
 'B-STAT': 3,
 'I-FUNC': 4,
 'I-STAT': 5,
 'O': 6}'''
rel_tag_to_ix = tag2ix(REL_TAG)
'''{'ApplyTo': 2, 'Rel-None': 1, 'Rel-Pad': 0}'''

# ========hyper-parameter-set==========

tagset_size = len(ent_tag_to_ix)
rel_size = len(rel_tag_to_ix)
MAX_LEN = 100
BATCH_SIZE = 18

EMBEDDING_DIM = 20
HIDDEN_DIM1 = 10
HIDDEN_DIM2 = 8
LABEL_EMBED_DIM = 3
DENSE_OUT = 100

ATTN_IN = tagset_size+LABEL_EMBED_DIM
ATTN_OUT = 6

In [6]:
def preprocess(relation_data):
    content = readfile(relation_data)
    word_list, tag_list, rel_list = split_to_list(content)
    word_to_ix = word2index(word_list)
    reserved_index = filter_len(word_list)
    filter_word, filter_tag, filter_rel = filter_sentence(reserved_index, word_list, tag_list, rel_list)
    input_padded, target_padded, rel_padded = pad_all(filter_word, filter_tag, filter_rel)
    #================================================
    input_var = prepare_all(input_padded, word_to_ix)
    target_var = prepare_all(target_padded, ent_tag_to_ix)
    rel_var = prepare_rel(rel_padded, rel_tag_to_ix)
    #================================================
    vocab_size = len(word_to_ix)
    
    return input_var, target_var, rel_var, vocab_size

In [7]:
ix_to_ent_tag = dict_inverse(ent_tag_to_ix)
ix_to_rel_tag = dict_inverse(rel_tag_to_ix)
#===============================================
input_var, target_var, rel_var, vocab_size = preprocess(relation_data)

In [8]:
loader = dataload(input_var, target_var, rel_var)
model = Entity_Typing(vocab_size, ent_tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM1, HIDDEN_DIM2, \
              LABEL_EMBED_DIM, rel_tag_to_ix).cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
criterion_tag = nn.NLLLoss()
criterion_rel = nn.CrossEntropyLoss()

In [9]:
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [10]:
len(rel_var)

306

In [11]:
n_iters = 50
print_every = 12
all_losses = []
total_loss = 0 # Reset every plot_every iters
loss = 0

start = time.time()

for epoch in tqdm(range(n_iters)):  
    for step, (batch_x, batch_tag, batch_rel) in enumerate(loader):
        optimizer.zero_grad()
        tag_output, rel_output = model(batch_x.cuda() if USE_CUDA else batch_x)
        
        batch_tag = batch_tag.view(BATCH_SIZE*MAX_LEN)
        batch_rel = batch_rel.view(BATCH_SIZE*MAX_LEN*MAX_LEN)
        
        loss_tag = criterion_tag(tag_output, batch_tag.cuda() if USE_CUDA else batch_tag)
        loss_rel = criterion_rel(rel_output, batch_rel.cuda() if USE_CUDA else batch_rel)
        loss = loss_tag+loss_rel
        loss.backward()

#         loss.backward(retain_graph=True)
        optimizer.step()
        
        if step % print_every == 1:
            all_losses.append(loss.cpu())
        #    print('%.4f| epoch: %d| step: %d| %s' % (loss, epoch, step, timeSince(start)))
    print("epoch: %d | tag loss %.4f | rel loss %.4f | total loss %.4f" \
          % (epoch, loss_tag, loss_rel, loss))


  2%|▏         | 1/50 [00:06<05:37,  6.89s/it]

epoch: 0 | tag loss 0.6993 | rel loss 0.6970 | total loss 1.3963


  4%|▍         | 2/50 [00:14<05:39,  7.06s/it]

epoch: 1 | tag loss 0.3780 | rel loss 0.5813 | total loss 0.9594


  6%|▌         | 3/50 [00:20<05:26,  6.95s/it]

epoch: 2 | tag loss 0.3591 | rel loss 0.5603 | total loss 0.9195


  8%|▊         | 4/50 [00:27<05:17,  6.90s/it]

epoch: 3 | tag loss 0.2969 | rel loss 0.5561 | total loss 0.8530


 10%|█         | 5/50 [00:33<05:05,  6.79s/it]

epoch: 4 | tag loss 0.2931 | rel loss 0.5545 | total loss 0.8476


 12%|█▏        | 6/50 [00:41<05:00,  6.84s/it]

epoch: 5 | tag loss 0.2597 | rel loss 0.5519 | total loss 0.8116


 14%|█▍        | 7/50 [00:47<04:54,  6.85s/it]

epoch: 6 | tag loss 0.2061 | rel loss 0.5508 | total loss 0.7569


 16%|█▌        | 8/50 [00:54<04:46,  6.82s/it]

epoch: 7 | tag loss 0.1605 | rel loss 0.5506 | total loss 0.7111


 18%|█▊        | 9/50 [01:01<04:39,  6.81s/it]

epoch: 8 | tag loss 0.1469 | rel loss 0.5494 | total loss 0.6963


 20%|██        | 10/50 [01:08<04:33,  6.84s/it]

epoch: 9 | tag loss 0.1462 | rel loss 0.5488 | total loss 0.6950


 22%|██▏       | 11/50 [01:15<04:27,  6.85s/it]

epoch: 10 | tag loss 0.1204 | rel loss 0.5481 | total loss 0.6685


 24%|██▍       | 12/50 [01:22<04:20,  6.85s/it]

epoch: 11 | tag loss 0.1040 | rel loss 0.5484 | total loss 0.6525


 26%|██▌       | 13/50 [01:29<04:13,  6.85s/it]

epoch: 12 | tag loss 0.1016 | rel loss 0.5478 | total loss 0.6494


 28%|██▊       | 14/50 [01:36<04:07,  6.88s/it]

epoch: 13 | tag loss 0.1027 | rel loss 0.5478 | total loss 0.6505


 30%|███       | 15/50 [01:43<04:01,  6.89s/it]

epoch: 14 | tag loss 0.0918 | rel loss 0.5474 | total loss 0.6392


 32%|███▏      | 16/50 [01:50<03:53,  6.88s/it]

epoch: 15 | tag loss 0.0826 | rel loss 0.5474 | total loss 0.6301


 34%|███▍      | 17/50 [01:56<03:46,  6.86s/it]

epoch: 16 | tag loss 0.0697 | rel loss 0.5470 | total loss 0.6166


 36%|███▌      | 18/50 [02:03<03:39,  6.86s/it]

epoch: 17 | tag loss 0.0638 | rel loss 0.5466 | total loss 0.6104


 38%|███▊      | 19/50 [02:10<03:32,  6.85s/it]

epoch: 18 | tag loss 0.0563 | rel loss 0.5461 | total loss 0.6024


 40%|████      | 20/50 [02:16<03:25,  6.84s/it]

epoch: 19 | tag loss 0.0821 | rel loss 0.5468 | total loss 0.6289


 42%|████▏     | 21/50 [02:24<03:18,  6.86s/it]

epoch: 20 | tag loss 0.0564 | rel loss 0.5465 | total loss 0.6030


 44%|████▍     | 22/50 [02:31<03:12,  6.86s/it]

epoch: 21 | tag loss 0.0414 | rel loss 0.5463 | total loss 0.5877


 46%|████▌     | 23/50 [02:38<03:05,  6.87s/it]

epoch: 22 | tag loss 0.0397 | rel loss 0.5464 | total loss 0.5862


 48%|████▊     | 24/50 [02:44<02:58,  6.87s/it]

epoch: 23 | tag loss 0.0355 | rel loss 0.5459 | total loss 0.5814


 50%|█████     | 25/50 [02:51<02:51,  6.87s/it]

epoch: 24 | tag loss 0.0457 | rel loss 0.5459 | total loss 0.5916


 52%|█████▏    | 26/50 [02:58<02:44,  6.86s/it]

epoch: 25 | tag loss 0.0429 | rel loss 0.5466 | total loss 0.5895


 54%|█████▍    | 27/50 [03:04<02:37,  6.85s/it]

epoch: 26 | tag loss 0.0355 | rel loss 0.5461 | total loss 0.5816


 56%|█████▌    | 28/50 [03:11<02:30,  6.84s/it]

epoch: 27 | tag loss 0.0276 | rel loss 0.5459 | total loss 0.5735


 58%|█████▊    | 29/50 [03:18<02:23,  6.83s/it]

epoch: 28 | tag loss 0.0274 | rel loss 0.5460 | total loss 0.5734


 60%|██████    | 30/50 [03:24<02:16,  6.82s/it]

epoch: 29 | tag loss 0.0376 | rel loss 0.5464 | total loss 0.5839


 62%|██████▏   | 31/50 [03:31<02:09,  6.81s/it]

epoch: 30 | tag loss 0.0337 | rel loss 0.5458 | total loss 0.5796


 64%|██████▍   | 32/50 [03:37<02:02,  6.81s/it]

epoch: 31 | tag loss 0.0251 | rel loss 0.5457 | total loss 0.5708


 66%|██████▌   | 33/50 [03:44<01:55,  6.81s/it]

epoch: 32 | tag loss 0.0333 | rel loss 0.5457 | total loss 0.5790


 68%|██████▊   | 34/50 [03:51<01:48,  6.81s/it]

epoch: 33 | tag loss 0.0259 | rel loss 0.5455 | total loss 0.5715


 70%|███████   | 35/50 [03:58<01:42,  6.81s/it]

epoch: 34 | tag loss 0.0273 | rel loss 0.5452 | total loss 0.5725


 72%|███████▏  | 36/50 [04:04<01:35,  6.80s/it]

epoch: 35 | tag loss 0.0204 | rel loss 0.5455 | total loss 0.5659


 74%|███████▍  | 37/50 [04:11<01:28,  6.80s/it]

epoch: 36 | tag loss 0.0125 | rel loss 0.5454 | total loss 0.5578


 76%|███████▌  | 38/50 [04:18<01:21,  6.79s/it]

epoch: 37 | tag loss 0.0115 | rel loss 0.5453 | total loss 0.5568


 78%|███████▊  | 39/50 [04:24<01:14,  6.78s/it]

epoch: 38 | tag loss 0.0131 | rel loss 0.5452 | total loss 0.5583


 80%|████████  | 40/50 [04:31<01:07,  6.79s/it]

epoch: 39 | tag loss 0.0205 | rel loss 0.5453 | total loss 0.5658


 82%|████████▏ | 41/50 [04:38<01:01,  6.79s/it]

epoch: 40 | tag loss 0.0233 | rel loss 0.5454 | total loss 0.5687


 84%|████████▍ | 42/50 [04:45<00:54,  6.80s/it]

epoch: 41 | tag loss 0.0097 | rel loss 0.5453 | total loss 0.5549


 86%|████████▌ | 43/50 [04:52<00:47,  6.80s/it]

epoch: 42 | tag loss 0.0255 | rel loss 0.5452 | total loss 0.5707


 88%|████████▊ | 44/50 [04:59<00:40,  6.80s/it]

epoch: 43 | tag loss 0.0251 | rel loss 0.5456 | total loss 0.5707


 90%|█████████ | 45/50 [05:05<00:33,  6.80s/it]

epoch: 44 | tag loss 0.0180 | rel loss 0.5453 | total loss 0.5633


 92%|█████████▏| 46/50 [05:12<00:27,  6.79s/it]

epoch: 45 | tag loss 0.0128 | rel loss 0.5450 | total loss 0.5577


 94%|█████████▍| 47/50 [05:19<00:20,  6.80s/it]

epoch: 46 | tag loss 0.0205 | rel loss 0.5453 | total loss 0.5657


 96%|█████████▌| 48/50 [05:26<00:13,  6.80s/it]

epoch: 47 | tag loss 0.0102 | rel loss 0.5450 | total loss 0.5553


 98%|█████████▊| 49/50 [05:33<00:06,  6.80s/it]

epoch: 48 | tag loss 0.0171 | rel loss 0.5452 | total loss 0.5623


100%|██████████| 50/50 [05:39<00:00,  6.80s/it]

epoch: 49 | tag loss 0.0083 | rel loss 0.5452 | total loss 0.5535





In [12]:
import random
def random_choose(input_var):
    r_choose = []
    for i in range(BATCH_SIZE):
        r_choose.append(random.randint(0,len(input_var)))
    return r_choose
        
def total_output(output):
    output = output.view(BATCH_SIZE,MAX_LEN,tagset_size).argmax(2)
    return output

In [13]:
# Check predictions after training
with torch.no_grad():
    r_choose = random_choose(input_var)
    model.eval()
    tag_output, rel_output = model(input_var[r_choose].cuda() if USE_CUDA else input_var)
    
    tag_loss = criterion_tag(tag_output.cpu(), target_var[r_choose].view(BATCH_SIZE*MAX_LEN))
    tag_output = total_output(tag_output)
    
    rel_loss = criterion_rel(rel_output.cpu(), rel_var[r_choose].view(BATCH_SIZE*MAX_LEN*MAX_LEN))
    
    
#     print('predict :', tag_output[0])
#     print('true :', target_var[r_choose[0]])
    print()
    print('predict :', index2tag(tag_output[0], ix_to_ent_tag))
    print('true :', index2tag(target_var[r_choose[0]], ix_to_ent_tag))
    print()
    print('===================================================')
    print()
#     print('predict :', tag_output[1])
#     print('true :', target_var[r_choose[1]])
    print()
    print('predict :', index2tag(tag_output[1], ix_to_ent_tag))
    print('true :', index2tag(target_var[r_choose[1]], ix_to_ent_tag))
    
    print()
    print("Entity loss : %.4f" % tag_loss)
    print("Rel loss : %.4f" % rel_loss)


predict : ['B-FUNC', 'I-FUNC', 'O', 'O', 'O', 'B-FUNC', 'I-FUNC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-FUNC', 'I-FUNC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-STAT', 'I-STAT', 'I-STAT', 'I-STAT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-FUNC', 'I-FUNC', 'O', 'O', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
true : ['B-FUNC', 'I-FUNC', 'O', 'O', 'O', 'B-FUNC', 'I-FUNC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-FUNC', 'I-FUNC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-STAT', 

In [14]:
def dev_dataload(input_var, target_var, rel_var):
    torch_dataset = Data.TensorDataset(input_var, target_var, rel_var)

    loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               
        num_workers=2,       
        drop_last=True
    )
    
    return loader


input_dev, target_dev, rel_dev, _ = preprocess(dev_data)
dev_loader = dev_dataload(input_dev, target_dev, rel_dev)


# with torch.no_grad():
#     for step, (batch_x, batch_tag, batch_rel) in enumerate(dev_loader):
#         tag_output, rel_output = model(batch_x.cuda() if USE_CUDA else batch_x)
        
#         tag_loss = criterion_tag(tag_output.cpu(), batch_tag.view(BATCH_SIZE*MAX_LEN))
#         tag_output = total_output(tag_output)
    
#         print()
#         print('predict :', index2tag(tag_output[0], ix_to_ent_tag))
#         print('true :', index2tag(batch_tag[0], ix_to_ent_tag))
#         print()
        
#         print("Entity loss : %.4f" % tag_loss)


with torch.no_grad():
    r_choose = random_choose(input_dev)
    model.eval()
    tag_output, rel_output = model(input_dev[r_choose].cuda() if USE_CUDA else input_dev)
    
    tag_loss = criterion_tag(tag_output.cpu(), target_dev[r_choose].view(BATCH_SIZE*MAX_LEN))
    tag_output = total_output(tag_output)
    
    print(r_choose[0])
    print()
    print('predict :', index2tag(tag_output[0], ix_to_ent_tag))
    print('true :', index2tag(target_dev[r_choose[0]], ix_to_ent_tag))
    print()

    print("Entity loss : %.4f" % tag_loss)
    

36

predict : ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
true : ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-FUNC', 'I-FUNC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-STAT', 'I-STAT', 'I-STAT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '<PAD>', 

In [15]:
rel_output.size()

torch.Size([180000, 3])

In [16]:
rel_output = rel_output.view(BATCH_SIZE, MAX_LEN, MAX_LEN, rel_size).argmax(3)


In [17]:
index2tag(rel_var[0], ix_to_rel_tag)

TypeError: unhashable type: 'numpy.ndarray'

In [None]:
to_tags = [ix_to_rel_tag[i] for i in rel_output[0][28].cpu().numpy()]

In [None]:
ss = ''
rel_output = rel_output.cpu().numpy()
for now_word in range(len(rel_output[0])):
    for per_word in rel_output[0][now_word]:
        ss = ix_to_rel_tag[per_word] 

In [None]:
tag_output[0]

In [None]:
to_tags

In [None]:
len(rel_output[0])

In [None]:
content = readfile(dev_data)
word_list, tag_list, rel_list = split_to_list(content)
word_to_ix = word2index(word_list)
reserved_index = filter_len(word_list)
filter_word, filter_tag, filter_rel = filter_sentence(reserved_index, word_list, tag_list, rel_list)

In [None]:
filter_word[30]

In [None]:
filter_tag[30]