In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as Data
from tqdm import tqdm

import numpy as np
import json

USE_CUDA = True

In [34]:
def readfile(data):
    with open(data, "r", encoding="utf-8") as f:
        content = f.read().splitlines()
        
    return content

def schema_load(schema_root):
    raw_dict = "".join(readfile(schema_root))
    dict2json = "".join(raw_dict.split()[2:])

    json_acceptable_string = dict2json.replace("'", "\"")
    schema = json.loads(json_acceptable_string)
    
    return schema

def define_entity(schema):
    tag_type = list(schema['tagging'])
    
    entity_tag = []
    for k in list(schema['entity'].keys()):
        entity_tag.append(schema['entity'][k]['tag'])
        
    TAG = []
    for t in tag_type:
        for e in entity_tag:
            if t!='O':
                TAG.append(t+'-'+e)  
                
    TAG = [UNKOWN_TAG, PAD_TAG] + TAG + ['O']   

    return TAG

def tag2ix(TAG):
    tag_to_ix={t:i for i,t in enumerate(TAG)}
    return tag_to_ix

def define_relation(schema):
    relation_type = list(schema['relation'])
    
    relation_tag = []
    for k in list(schema['relation'].keys()):
        relation_tag.append(schema['relation'][k]['tag'])
    
    relation_tag = [REL_NONE] + relation_tag
        
    return relation_tag

# ==================================================

def get_word_and_label(_content, start_w, end_w):
    word_list = []
    tag_list = []
    rel_list = []
    
    for word_set in _content[start_w:end_w]:
        word_set = word_set.split()
        if len(word_set)==1:
            word_list.append(' ')
            tag_list.append('O')
            rel_list.append(REL_NONE)
        
        else:
            word_list.append(word_set[0])
            tag_list.append(word_set[1])

            try:
                testerror = word_set[2]
            except IndexError:
                rel_list.append(REL_NONE)
            else:
                rel_list.append(word_set[2:])
    
    return word_list, tag_list, rel_list

def split_to_list(content):
    init = 0
    word_list = []
    tag_list = []
    rel_list = []

    for now_token, c in enumerate(content):
        if c=='':
            words, tags, rels = get_word_and_label(content, init, now_token)
            init = now_token+1
            word_list.append(words)
            tag_list.append(tags)
            rel_list.append(rels)
            
    return word_list, tag_list, rel_list

# ==================================================

def word2index(word_list):
    word_to_ix = {"<UNKNOWN>":0, "<PAD>":1}
    for sentence in word_list:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
                
    return word_to_ix

def dict_inverse(tag_to_ix):
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    return ix_to_tag

def index2tag(indexs, ix_to):
    to_tags = [ix_to[i] for i in indexs.cpu().numpy()]
    return to_tags

# ==================================================

def find_max_len(word_list):
    max_len = 0
    for i in range(len(word_list)):
        if max_len<len(word_list[i]):
            max_len=len(word_list[i])
            
    return max_len

# ====== filter the length of sentence more than MAX_LEN =======

def filter_len(word_list):
    reserved_index = []
    for i in range(len(word_list)):
        if len(word_list[i])<MAX_LEN:
            reserved_index.append(i)
            
    return reserved_index


def filter_sentence(reserved_index, word_list, tag_list, rel_list):
    filter_word = list(word_list[i] for i in reserved_index)
    filter_tag = list(tag_list[i] for i in reserved_index)
    filter_rel = list(rel_list[i] for i in reserved_index)
    return filter_word, filter_tag, filter_rel

# ==================================================

def pad_seq(seq, isrel):
    if isrel:
        seq += [REL_NONE for i in range(MAX_LEN-len(seq))]
    else:
        seq += [PAD_TAG for i in range(MAX_LEN-len(seq))]
    return seq

def pad_all(filter_word, filter_tag, filter_rel):
    input_padded = [pad_seq(s, False) for s in filter_word]
    target_padded = [pad_seq(s, False) for s in filter_tag]
    rel_padded = [pad_seq(s, True) for s in filter_rel]
    
    return input_padded, target_padded, rel_padded

# ==================================================

def prepare_sequence(seq, to_ix):
    idxs = []
    for w in seq:
        if w not in to_ix:
            idxs.append(to_ix[UNKOWN_TAG])
        else:
            idxs.append(to_ix[w])
    
#     idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def prepare_all(seqs, to_ix):
    seq_list = []
    for i in range(len(seqs)):
        seq_list.append(prepare_sequence(seqs[i], to_ix))
        
    seq_list = torch.stack(seq_list)
        
    return seq_list


def prepare_rel(seqs, to_ix):
    seq_list = []
    for seq in seqs:
        idxs = []
        for s in seq:
            if s != REL_NONE:
                rel_l = []
                for l in s:
                    rel_token = l.split('-')
                    to_ix[rel_token[0]]
                    idxs.append(to_ix[rel_token[0]])
            else:
                idxs.append(to_ix[REL_NONE])
        seq_list.append(torch.tensor(idxs, dtype=torch.long))
    
    seq_list = torch.stack(seq_list)
    
    return seq_list
                


# ==================================================

def dataload(input_var, target_var):
    torch_dataset = Data.TensorDataset(input_var, target_var)

    loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               
        num_workers=2,       
        drop_last=True
    )
    
    return loader

# ==================================================
def softmax_output(output):
    output = output.view(BATCH_SIZE,tagset_size).argmax(1)
    return output

In [3]:
class Attn(nn.Module):
    def __init__(self, attn_input, attn_output):
        super(Attn, self).__init__()
        
        self.attn_input = attn_input
        self.attn_output = attn_output
        
        self.w1 = nn.Linear(self.attn_input, self.attn_output)
        self.w2 = nn.Linear(self.attn_input, self.attn_output)
        self.tanh = nn.Tanh()
        self.v = nn.Linear(self.attn_output, self.attn_output, bias=False)
#         self.v = nn.Parameter(torch.FloatTensor(128,1,attn_output))
        
        
    def forward(self, encoder_outputs):
#         this_batch_size = encoder_outputs.size(0)
#         max_len = encoder_outputs.size(1)
        
        decoder = encoder_outputs[:,-1,:].unsqueeze(1)                       #B*1*(ts+LE) [128,1,8]
        print(decoder.size())
        
        encoder_score = self.w1(encoder_outputs)
        decoder_score = self.w2(decoder)
        energy = self.tanh(encoder_score+decoder_score)
        print(energy.size())
        
        energy = torch.bmm(self.v, energy)
#         energy = self.v.bmm(energy)
        
        return F.softmax(energy)
    

In [18]:
class Entity_Typing(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim1, hidden_dim2, \
                 label_embed_dim):
        super(Entity_Typing, self).__init__()
        self.embedding_dim = embedding_dim                   #E
        self.hidden_dim1 = hidden_dim1                       #h1
        self.hidden_dim2 = hidden_dim2                       #h2
        self.label_embed_dim = label_embed_dim               #LE
        self.vocab_size = vocab_size                         #vs
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)                    #ts
        
        
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim1 // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        
        self.dense = nn.Linear(hidden_dim1, DENSE_OUT)
        
        self.top_hidden = nn.LSTMCell(DENSE_OUT+label_embed_dim, hidden_dim2)          
        

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim2, self.tagset_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.label_embed = nn.Linear(self.tagset_size, self.label_embed_dim)
        
        
    def init_hidden1(self):       
        hidden = torch.randn(2*2, BATCH_SIZE, self.hidden_dim1 // 2)    #4*B*(h1/2)
#         hidden = Variable(hidden.data, requires_grad=True)

        return (hidden.cuda(), hidden.cuda())if USE_CUDA else (hidden,hidden)
    
    def init_hidden2(self):       
        hidden = torch.randn(BATCH_SIZE, self.hidden_dim2)              #B*h2
#         hidden = Variable(hidden.data, requires_grad=True)

        return (hidden.cuda(), hidden.cuda())if USE_CUDA else (hidden,hidden)
    
    def init_label_embed(self):
        hidden = torch.zeros(BATCH_SIZE, self.label_embed_dim)          #B*LE
        return hidden.cuda()if USE_CUDA else hidden
    
    def create_output(self):
        output_tensor = torch.zeros(BATCH_SIZE, MAX_LEN, self.tagset_size)  #B*ML*ts
        return output_tensor.cuda()if USE_CUDA else output_tensor
        
    def forward(self, sentence):
        self.hidden1 = self.init_hidden1()                      #4*B*(h1/2)
        output_tensor = self.create_output()                    #B*ML*ts
        
        embeds = self.word_embeds(sentence)                     #B*ML*E,[128, 100, 20]
        
        bilstm_out, self.hidden1 = self.bilstm(embeds, self.hidden1)
        # bilstm_out -> B*ML*h1,[128, 100, 10]
        # self.hidden1 -> ( 4*B*(h1/2), 4*B*(h1/2) )
        
        dense_out = self.dense(bilstm_out)                      #B*ML*DENSE_OUT,[128, 100, 100]
               
        encoder_sequence_l = [] 
        decoder_sequence = []
        

        for length in range(MAX_LEN):
            now_token = dense_out[:,length,:]
            now_token = torch.squeeze(now_token, 1)
            if length==0:
                
#                 fake_hidden=(100)
#                 noise_x = random(100)
                self.hidden2 = self.init_hidden2()
                self.zero_label_embed = self.init_label_embed()
                combine_x = torch.cat((now_token, self.zero_label_embed),1)  #B*(DENSE_OUT+LE),[128, 103]
                
            else:
#                 fake_hidden=h
                self.hidden2 = (h_next, c_next)
                combine_x = torch.cat((now_token, label),1)

            h_next, c_next = self.top_hidden(combine_x, self.hidden2)    #B*h2,[128, 8]           
            to_tags = self.hidden2tag(h_next)                            #B*ts,[128, 5]            
            output = self.softmax(to_tags)                               #B*ts,[128, 5]             
            label = self.label_embed(output)                             #B*LE,[128, 3]
            
            s_output = softmax_output(output)   
            
            for i, tag in enumerate(s_output):
                if tag==ent_tag_to_ix['O']:
                    to_tags[i] = torch.FloatTensor([-999999 * self.tagset_size])
                    label[i] = torch.FloatTensor([-999999 * self.tagset_size])
                    
            # relation layer
            encoder_sequence_l.append(torch.cat((to_tags,label),1))
            encoder_sequence = torch.stack(encoder_sequence_l).t()
#             print(encoder_sequence)
            print(encoder_sequence.size())                              #B*len*(ts+LE), [128,1,8]

            # Calculate attention weights 
#             attn_weights = self.attn(encoder_sequence)

        
        
            
            output_tensor[:,length,:] = output
        
        
        '''NLLLoss input: Input: (N,C) where C = number of classes'''
        return output_tensor.view(BATCH_SIZE*MAX_LEN, self.tagset_size)

In [31]:
root = '/notebooks/sinica/dataset/'
train_data = root+'facial.train'
dev_data = root+'facial.dev'
test_data = root+'facial.test'

relation_data_old = root+'facial_r.old.train'
relation_data = root+'facial_r.train'
schema_root = root+'schema.txt'


UNKOWN_TAG = "<UNKNOWN>"
PAD_TAG = "<PAD>"
REL_NONE = 'Rel-None'

schema = schema_load(schema_root)
ENT_TAG = define_entity(schema)
REL_TAG = define_relation(schema)
ent_tag_to_ix = tag2ix(ENT_TAG)
'''{'<PAD>': 1,
 '<UNKNOWN>': 0,
 'B-FUNC': 2,
 'B-STAT': 3,
 'I-FUNC': 4,
 'I-STAT': 5,
 'O': 6}'''
rel_tag_to_ix = tag2ix(REL_TAG)
'''{'ApplyTo': 1, 'Rel-None': 0}'''

# ========hyper-parameter-set==========

tagset_size = len(ent_tag_to_ix)
MAX_LEN = 100
BATCH_SIZE = 2

EMBEDDING_DIM = 20
HIDDEN_DIM1 = 10
HIDDEN_DIM2 = 8
LABEL_EMBED_DIM = 3
DENSE_OUT = 100

ATTN_IN = tagset_size+LABEL_EMBED_DIM
ATTN_OUT = 6

In [35]:
tag_ix_to_tag = dict_inverse(ent_tag_to_ix)
#===============================================
content = readfile(relation_data)
word_list, tag_list, rel_list = split_to_list(content)
word_to_ix = word2index(word_list)
reserved_index = filter_len(word_list)
filter_word, filter_tag, filter_rel = filter_sentence(reserved_index, word_list, tag_list, rel_list)
input_padded, target_padded, rel_padded = pad_all(filter_word, filter_tag, filter_rel)
#================================================
input_var = prepare_all(input_padded, word_to_ix)
target_var = prepare_all(target_padded, ent_tag_to_ix)
rel_var = prepare_rel(rel_padded, rel_tag_to_ix)
#================================================
vocab_size = len(word_to_ix)

AttributeError: 'list' object has no attribute 'split'

In [19]:
loader = dataload(input_var, target_var)
model = Entity_Typing(vocab_size, ent_tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM1, HIDDEN_DIM2, \
              LABEL_EMBED_DIM).cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
criterion = nn.NLLLoss()

In [8]:
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [20]:
n_iters = 50
print_every = 12
all_losses = []
total_loss = 0 # Reset every plot_every iters
loss = 0

start = time.time()

for epoch in tqdm(range(n_iters)):  
    for step, (batch_x, batch_y) in enumerate(loader):
        optimizer.zero_grad()
        output = model(batch_x.cuda() if USE_CUDA else batch_x)
        batch_y = batch_y.view(BATCH_SIZE*MAX_LEN)
        loss = criterion(output, batch_y.cuda() if USE_CUDA else batch_y)
        loss.backward()
        break
#         loss.backward(retain_graph=True)
        optimizer.step()
        
        if step % print_every == 1:
            all_losses.append(loss.cpu())
        #    print('%.4f| epoch: %d| step: %d| %s' % (loss, epoch, step, timeSince(start)))
    print("epoch: %d | loss %.4f" % (epoch,loss))
break

  0%|          | 0/50 [00:00<?, ?it/s]

torch.Size([2, 1, 10])
torch.Size([2, 2, 10])
torch.Size([2, 3, 10])
torch.Size([2, 4, 10])
torch.Size([2, 5, 10])
torch.Size([2, 6, 10])
torch.Size([2, 7, 10])
torch.Size([2, 8, 10])
torch.Size([2, 9, 10])
torch.Size([2, 10, 10])
torch.Size([2, 11, 10])
torch.Size([2, 12, 10])
torch.Size([2, 13, 10])
torch.Size([2, 14, 10])
torch.Size([2, 15, 10])
torch.Size([2, 16, 10])
torch.Size([2, 17, 10])
torch.Size([2, 18, 10])
torch.Size([2, 19, 10])
torch.Size([2, 20, 10])
torch.Size([2, 21, 10])
torch.Size([2, 22, 10])
torch.Size([2, 23, 10])
torch.Size([2, 24, 10])
torch.Size([2, 25, 10])
torch.Size([2, 26, 10])
torch.Size([2, 27, 10])
torch.Size([2, 28, 10])
torch.Size([2, 29, 10])
torch.Size([2, 30, 10])
torch.Size([2, 31, 10])
torch.Size([2, 32, 10])
torch.Size([2, 33, 10])
torch.Size([2, 34, 10])
torch.Size([2, 35, 10])
torch.Size([2, 36, 10])
torch.Size([2, 37, 10])
torch.Size([2, 38, 10])
torch.Size([2, 39, 10])
torch.Size([2, 40, 10])
torch.Size([2, 41, 10])
torch.Size([2, 42, 10])
t




TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor

In [10]:
import random
def random_choose(input_var):
    r_choose = []
    for i in range(BATCH_SIZE):
        r_choose.append(random.randint(0,len(input_var)))
    return r_choose
        
def total_output(output):
    output = output.view(BATCH_SIZE,MAX_LEN,tagset_size).argmax(2)
    return output

In [11]:
# Check predictions after training
with torch.no_grad():
    r_choose = random_choose(input_var)
    output = model(input_var[[3,5]].cuda() if USE_CUDA else input_var)
    
    loss = criterion(output.cpu(), target_var[[3,5]].view(BATCH_SIZE*100))
    output = total_output(output)
    
    print('predict :', output[0])
    print('true :', target_var[3])
    print()
    print('predict :', index2tag(output[0], tag_ix_to_tag))
    print('true :', index2tag(target_var[3], tag_ix_to_tag))
    print()
    
    print("Loss : %.4f" % loss)

predict : tensor([ 5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
         5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
         5,  5], device='cuda:0')
true : tensor([ 6,  6,  3,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  3,  5,  6,
         6,  6,  2,  4,  4,  4,  6,  6,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1])

predict : ['I-FUNC', 'I-FUNC', 'I-F

In [33]:
REL_TAG

['Rel-None', 'ApplyTo']

In [13]:
ent_tag_to_ix

{'<PAD>': 1,
 '<UNKNOWN>': 0,
 'B-FUNC': 3,
 'B-STAT': 2,
 'I-FUNC': 5,
 'I-STAT': 4,
 'O': 6}

In [32]:
rel_tag_to_ix

{'ApplyTo': 1, 'Rel-None': 0}

In [27]:
rel_padded

[['Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel

In [28]:
target_padded

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>',
  '<PAD>'],