In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as Data
from tqdm import tqdm

import numpy as np
import json

USE_CUDA = True

In [2]:
def readfile(data):
    with open(data, "r", encoding="utf-8") as f:
        content = f.read().splitlines()
        
    return content

def schema_load(schema_root):
    raw_dict = "".join(readfile(schema_root))
    dict2json = "".join(raw_dict.split()[2:])

    json_acceptable_string = dict2json.replace("'", "\"")
    schema = json.loads(json_acceptable_string)
    
    return schema

def define_entity(schema):
    tag_type = list(schema['tagging'])
    
    entity_tag = []
    for k in list(schema['entity'].keys()):
        entity_tag.append(schema['entity'][k]['tag'])
        
    TAG = []
    for t in tag_type:
        for e in entity_tag:
            if t!='O':
                TAG.append(t+'-'+e)  
                
    TAG = [UNKOWN_TAG, PAD_TAG] + TAG + ['O']   

    return TAG

def tag2ix(TAG):
    tag_to_ix={t:i for i,t in enumerate(TAG)}
    return tag_to_ix

def define_relation(schema):
    relation_type = list(schema['relation'])
    
    relation_tag = []
    for k in list(schema['relation'].keys()):
        relation_tag.append(schema['relation'][k]['tag'])
    
    relation_tag += [REL_NONE]
        
    return relation_tag

# ==================================================

def get_word_and_label(_content, start_w, end_w):
    word_list = []
    tag_list = []
    rel_list = []
    
    for word_set in _content[start_w:end_w]:
        word_set = word_set.split()
        if len(word_set)==1:
            word_list.append(' ')
            tag_list.append('O')
            rel_list.append(REL_NONE)
        
        else:
            word_list.append(word_set[0])
            tag_list.append(word_set[1])

            try:
                testerror = word_set[2]
            except IndexError:
                rel_list.append(REL_NONE)
            else:
                rel_list.append(word_set[2:])
    
    return word_list, tag_list, rel_list

def split_to_list(content):
    init = 0
    word_list = []
    tag_list = []
    rel_list = []

    for now_token, c in enumerate(content):
        if c=='':
            words, tags, rels = get_word_and_label(content, init, now_token)
            init = now_token+1
            word_list.append(words)
            tag_list.append(tags)
            rel_list.append(rels)
            
    return word_list, tag_list, rel_list

# ==================================================
    
def prepare_sequence(seq, to_ix):
    idxs = []
    for w in seq:
        if w not in to_ix:
            idxs.append(to_ix[UNKOWN_TAG])
        else:
            idxs.append(to_ix[w])
    
#     idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def prepare_all(seqs, to_ix):
    seq_list = []
    for i in range(len(seqs)):
        seq_list.append(prepare_sequence(seqs[i], to_ix))
        
    seq_list = torch.stack(seq_list)
        
    return seq_list

# ==================================================

def word2index(word_list):
    word_to_ix = {"<UNKNOWN>":0, "<PAD>":1}
    for sentence in word_list:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
                
    return word_to_ix

def dict_inverse(tag_to_ix):
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    return ix_to_tag

def index2tag(indexs, ix_to):
    to_tags = [ix_to[i] for i in indexs.cpu().numpy()]
    return to_tags

# ==================================================

def find_max_len(word_list):
    max_len = 0
    for i in range(len(word_list)):
        if max_len<len(word_list[i]):
            max_len=len(word_list[i])
            
    return max_len

# ====== filter the length of sentence more than MAX_LEN =======

def filter_len(word_list):
    reserved_index = []
    for i in range(len(word_list)):
        if len(word_list[i])<MAX_LEN:
            reserved_index.append(i)
            
    return reserved_index


def filter_sentence(reserved_index, word_list, tag_list):
    filter_word = list(word_list[i] for i in reserved_index)
    filter_tag = list(tag_list[i] for i in reserved_index)
    return filter_word, filter_tag

# ==================================================

def pad_seq(seq):
    seq += [PAD_TAG for i in range(MAX_LEN-len(seq))]
    return seq

def pad_all(filter_word, filter_tag):
    input_padded = [pad_seq(s) for s in filter_word]
    target_padded = [pad_seq(s) for s in filter_tag]
    
    return input_padded, target_padded

# ==================================================
def dataload(input_var, target_var):
    torch_dataset = Data.TensorDataset(input_var, target_var)

    loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               
        num_workers=2,       
        drop_last=True
    )
    
    return loader

# ==================================================
def softmax_output(output):
    output = output.view(BATCH_SIZE,tagset_size).argmax(1)
    return output

In [3]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        
        self.method = method
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs):
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)

        # Create variable to store attention energies
        attn_energies = Variable(torch.zeros(this_batch_size, max_len)) # B x S

        if USE_CUDA:
            attn_energies = attn_energies.cuda()

        # For each batch of encoder outputs
        for b in range(this_batch_size):
            # Calculate energy for each encoder output
            for i in range(max_len):
                attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))

        # Normalize energies to weights in range 0 to 1, resize to 1 x B x S
        return F.softmax(attn_energies).unsqueeze(1)
    
    def score(self, hidden, encoder_output):
        
        if self.method == 'dot':
            energy = hidden.dot(encoder_output)
            return energy
        
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = hidden.dot(energy)
            return energy
        
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = self.v.dot(energy)
            return energy

In [4]:
class Entity_Typing(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim1, hidden_dim2, \
                 label_embed_dim):
        super(Entity_Typing, self).__init__()
        self.embedding_dim = embedding_dim                   #E
        self.hidden_dim1 = hidden_dim1                       #h1
        self.hidden_dim2 = hidden_dim2                       #h2
        self.label_embed_dim = label_embed_dim               #LE
        self.vocab_size = vocab_size                         #vs
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)                    #ts
        
        
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(embedding_dim, hidden_dim1 // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        
        self.dense = nn.Linear(hidden_dim1, DENSE_OUT)
        
        self.top_hidden = nn.LSTMCell(DENSE_OUT+label_embed_dim, hidden_dim2)          
        

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim2, self.tagset_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.label_embed = nn.Linear(self.tagset_size, self.label_embed_dim)
        
        
    def init_hidden1(self):       
        hidden = torch.randn(2*2, BATCH_SIZE, self.hidden_dim1 // 2)    #4*B*(h1/2)
#         hidden = Variable(hidden.data, requires_grad=True)

        return (hidden.cuda(), hidden.cuda())if USE_CUDA else (hidden,hidden)
    
    def init_hidden2(self):       
        hidden = torch.randn(BATCH_SIZE, self.hidden_dim2)              #B*h2
#         hidden = Variable(hidden.data, requires_grad=True)

        return (hidden.cuda(), hidden.cuda())if USE_CUDA else (hidden,hidden)
    
    def init_label_embed(self):
        hidden = torch.zeros(BATCH_SIZE, self.label_embed_dim)          #B*LE
        return hidden.cuda()if USE_CUDA else hidden
    
    def create_output(self):
        output_tensor = torch.zeros(BATCH_SIZE, MAX_LEN, self.tagset_size)  #B*ML*ts
        return output_tensor.cuda()if USE_CUDA else output_tensor
        
    def forward(self, sentence):
        self.hidden1 = self.init_hidden1()                      #4*B*(h1/2)
        output_tensor = self.create_output()                    #B*ML*ts
        
        embeds = self.word_embeds(sentence)                     #B*ML*E,[128, 100, 20]
        
        bilstm_out, self.hidden1 = self.bilstm(embeds, self.hidden1)
        # bilstm_out -> B*ML*h1,[128, 100, 10]
        # self.hidden1 -> ( 4*B*(h1/2), 4*B*(h1/2) )
        
        dense_out = self.dense(bilstm_out)                      #B*ML*DENSE_OUT,[128, 100, 100]
        
        
        encoder_sequence = []
        decoder_sequence = []
        
#         encoder_sequence = Variable(torch.zeros([BATCH_SIZE, 1]))
#         encoder_sequence = encoder_sequence.cuda() if USE_CUDA else encoder_sequence
        
        

        for length in range(MAX_LEN):
            now_token = dense_out[:,length,:]
            now_token = torch.squeeze(now_token, 1)
            if length==0:
                
#                 fake_hidden=(100)
#                 noise_x = random(100)
                self.hidden2 = self.init_hidden2()
                self.zero_label_embed = self.init_label_embed()
                combine_x = torch.cat((now_token, self.zero_label_embed),1)  #B*(DENSE_OUT+LE),[128, 103]
                
            else:
#                 fake_hidden=h
                self.hidden2 = (h_next, c_next)
                combine_x = torch.cat((now_token, label),1)

            h_next, c_next = self.top_hidden(combine_x, self.hidden2)    #B*h2,[128, 8]           
            to_tags = self.hidden2tag(h_next)                            #B*ts,[128, 5]            
            output = self.softmax(to_tags)                               #B*ts,[128, 5]             
            label = self.label_embed(output)                             #B*LE,[128, 3]
            
            s_output = softmax_output(output)            
            encoder_sequence.append(s_output)
            
        
        
            
            output_tensor[:,length,:] = output
        
        encoder_sequence = torch.stack(encoder_sequence).t()
#         print(encoder_sequence)
        
        '''NLLLoss input: Input: (N,C) where C = number of classes'''
        return output_tensor.view(BATCH_SIZE*MAX_LEN, self.tagset_size)

In [5]:
root = '/notebooks/sinica/dataset/'
train_data = root+'facial.train'
dev_data = root+'facial.dev'
test_data = root+'facial.test'

relation_data = root+'facial_r.train'
schema_root = root+'schema.txt'


UNKOWN_TAG = "<UNKNOWN>"
PAD_TAG = "<PAD>"
REL_NONE = 'Rel-None'

schema = schema_load(schema_root)
ENT_TAG = define_entity(schema)
REL_TAG = define_relation(schema)
ent_tag_to_ix = tag2ix(ENT_TAG)
rel_tag_to_ix = tag2ix(REL_TAG)

# ========hyper-parameter-set==========

tagset_size = len(ent_tag_to_ix)
MAX_LEN = 100
BATCH_SIZE = 2

EMBEDDING_DIM = 20
HIDDEN_DIM1 = 10
HIDDEN_DIM2 = 8
LABEL_EMBED_DIM = 3
DENSE_OUT = 100

In [6]:
tag_ix_to_tag = dict_inverse(ent_tag_to_ix)
#===============================================
content = readfile(relation_data)
word_list, tag_list, rel_list = split_to_list(content)
word_to_ix = word2index(word_list)
reserved_index = filter_len(word_list)
filter_word, filter_tag = filter_sentence(reserved_index, word_list, tag_list)
input_padded, target_padded = pad_all(filter_word, filter_tag)
#================================================
input_var = prepare_all(input_padded, word_to_ix)
target_var = prepare_all(target_padded, ent_tag_to_ix)
#================================================
vocab_size = len(word_to_ix)

In [7]:
loader = dataload(input_var, target_var)
model = Entity_Typing(vocab_size, ent_tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM1, HIDDEN_DIM2, \
              LABEL_EMBED_DIM).cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
criterion = nn.NLLLoss()

In [8]:
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [9]:
n_iters = 50
print_every = 12
all_losses = []
total_loss = 0 # Reset every plot_every iters
loss = 0

start = time.time()

for epoch in tqdm(range(n_iters)):  
    for step, (batch_x, batch_y) in enumerate(loader):
        optimizer.zero_grad()
        output = model(batch_x.cuda() if USE_CUDA else batch_x)
        batch_y = batch_y.view(BATCH_SIZE*MAX_LEN)
        loss = criterion(output, batch_y.cuda() if USE_CUDA else batch_y)
        loss.backward()
#         loss.backward(retain_graph=True)
        optimizer.step()
        
        if step % print_every == 1:
            all_losses.append(loss.cpu())
        #    print('%.4f| epoch: %d| step: %d| %s' % (loss, epoch, step, timeSince(start)))
    print("epoch: %d | loss %.4f" % (epoch,loss))

  2%|▏         | 1/50 [00:02<02:05,  2.56s/it]

epoch: 0 | loss 0.9074


  4%|▍         | 2/50 [00:05<02:02,  2.56s/it]

epoch: 1 | loss 0.7891


  6%|▌         | 3/50 [00:07<02:01,  2.59s/it]

epoch: 2 | loss 0.3236


  8%|▊         | 4/50 [00:10<01:59,  2.61s/it]

epoch: 3 | loss 0.1296


 10%|█         | 5/50 [00:13<01:57,  2.61s/it]

epoch: 4 | loss 0.1993


 12%|█▏        | 6/50 [00:15<01:54,  2.60s/it]

epoch: 5 | loss 0.0804


 14%|█▍        | 7/50 [00:18<01:51,  2.60s/it]

epoch: 6 | loss 0.1940


 16%|█▌        | 8/50 [00:20<01:48,  2.58s/it]

epoch: 7 | loss 0.0367


 18%|█▊        | 9/50 [00:23<01:45,  2.58s/it]

epoch: 8 | loss 0.1719


 20%|██        | 10/50 [00:25<01:43,  2.58s/it]

epoch: 9 | loss 0.1355


 22%|██▏       | 11/50 [00:28<01:40,  2.58s/it]

epoch: 10 | loss 0.1842


 24%|██▍       | 12/50 [00:30<01:38,  2.58s/it]

epoch: 11 | loss 0.0375


 26%|██▌       | 13/50 [00:33<01:35,  2.58s/it]

epoch: 12 | loss 0.0252


 28%|██▊       | 14/50 [00:36<01:32,  2.57s/it]

epoch: 13 | loss 0.3185


 30%|███       | 15/50 [00:38<01:30,  2.58s/it]

epoch: 14 | loss 0.0463


 32%|███▏      | 16/50 [00:41<01:27,  2.57s/it]

epoch: 15 | loss 0.1943


 34%|███▍      | 17/50 [00:43<01:25,  2.58s/it]

epoch: 16 | loss 0.1807


 36%|███▌      | 18/50 [00:46<01:22,  2.58s/it]

epoch: 17 | loss 0.0194


 38%|███▊      | 19/50 [00:49<01:20,  2.59s/it]

epoch: 18 | loss 0.2788


 40%|████      | 20/50 [00:51<01:17,  2.59s/it]

epoch: 19 | loss 0.1740


 42%|████▏     | 21/50 [00:54<01:14,  2.58s/it]

epoch: 20 | loss 0.1618


 44%|████▍     | 22/50 [00:56<01:11,  2.55s/it]

epoch: 21 | loss 0.1642


 46%|████▌     | 23/50 [00:58<01:08,  2.55s/it]

epoch: 22 | loss 0.0464


 48%|████▊     | 24/50 [01:01<01:06,  2.55s/it]

epoch: 23 | loss 0.1296


 50%|█████     | 25/50 [01:03<01:03,  2.55s/it]

epoch: 24 | loss 0.0168


 52%|█████▏    | 26/50 [01:06<01:01,  2.55s/it]

epoch: 25 | loss 0.0813


 54%|█████▍    | 27/50 [01:08<00:58,  2.55s/it]

epoch: 26 | loss 0.0867


 56%|█████▌    | 28/50 [01:11<00:56,  2.55s/it]

epoch: 27 | loss 0.0645


 58%|█████▊    | 29/50 [01:13<00:53,  2.55s/it]

epoch: 28 | loss 0.0603


 60%|██████    | 30/50 [01:16<00:50,  2.55s/it]

epoch: 29 | loss 0.0115


 62%|██████▏   | 31/50 [01:19<00:48,  2.55s/it]

epoch: 30 | loss 0.0641


 64%|██████▍   | 32/50 [01:21<00:45,  2.55s/it]

epoch: 31 | loss 0.0068


 66%|██████▌   | 33/50 [01:24<00:43,  2.55s/it]

epoch: 32 | loss 0.1071


 68%|██████▊   | 34/50 [01:26<00:40,  2.55s/it]

epoch: 33 | loss 0.1297


 70%|███████   | 35/50 [01:29<00:38,  2.55s/it]

epoch: 34 | loss 0.0051


 72%|███████▏  | 36/50 [01:32<00:35,  2.56s/it]

epoch: 35 | loss 0.0975


 74%|███████▍  | 37/50 [01:34<00:33,  2.56s/it]

epoch: 36 | loss 0.0680


 76%|███████▌  | 38/50 [01:37<00:30,  2.56s/it]

epoch: 37 | loss 0.0451


 78%|███████▊  | 39/50 [01:40<00:28,  2.56s/it]

epoch: 38 | loss 0.0441


 80%|████████  | 40/50 [01:42<00:25,  2.57s/it]

epoch: 39 | loss 0.0026


 82%|████████▏ | 41/50 [01:45<00:23,  2.57s/it]

epoch: 40 | loss 0.0571


 84%|████████▍ | 42/50 [01:47<00:20,  2.57s/it]

epoch: 41 | loss 0.0397


 86%|████████▌ | 43/50 [01:50<00:18,  2.57s/it]

epoch: 42 | loss 0.0024


 88%|████████▊ | 44/50 [01:53<00:15,  2.57s/it]

epoch: 43 | loss 0.0905


 90%|█████████ | 45/50 [01:55<00:12,  2.57s/it]

epoch: 44 | loss 0.0340


 92%|█████████▏| 46/50 [01:58<00:10,  2.58s/it]

epoch: 45 | loss 0.0039


 94%|█████████▍| 47/50 [02:01<00:07,  2.58s/it]

epoch: 46 | loss 0.0323


 96%|█████████▌| 48/50 [02:03<00:05,  2.58s/it]

epoch: 47 | loss 0.0048


 98%|█████████▊| 49/50 [02:06<00:02,  2.58s/it]

epoch: 48 | loss 0.0018


100%|██████████| 50/50 [02:08<00:00,  2.58s/it]

epoch: 49 | loss 0.0521





In [10]:
import random
def random_choose(input_var):
    r_choose = []
    for i in range(BATCH_SIZE):
        r_choose.append(random.randint(0,len(input_var)))
    return r_choose
        
def total_output(output):
    output = output.view(BATCH_SIZE,MAX_LEN,tagset_size).argmax(2)
    return output

In [33]:
# Check predictions after training
with torch.no_grad():
    r_choose = random_choose(input_var)
    output = model(input_var[[3,5]].cuda() if USE_CUDA else input_var)
    
    loss = criterion(output.cpu(), target_var[[3,5]].view(BATCH_SIZE*100))
    output = total_output(output)
    
    print('predict :', output[0])
    print('true :', target_var[3])
    print()
    print('predict :', index2tag(output[0], tag_ix_to_tag))
    print('true :', index2tag(target_var[3], tag_ix_to_tag))
    print()
    
    print("Loss : %.4f" % loss)

predict : tensor([ 6,  6,  2,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  6,
         6,  6,  6,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1], device='cuda:0')
true : tensor([ 6,  6,  2,  4,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  2,  4,  6,
         6,  6,  3,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,
         6,  6,  6,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1])

predict : ['O', 'O', 'B-STAT', 'I-F

In [34]:
rel_list

[['Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None'],
 ['Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None'],
 ['Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'Rel-None',
  'R