In [1]:
# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/dependency/gsd-ud-train.conllu.txt
# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/dependency/gsd-ud-test.conllu.txt
# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/dependency/gsd-ud-dev.conllu.txt
# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/dependency/augmented-dependency.json

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [3]:
with open('gsd-ud-train.conllu.txt') as fopen:
    corpus = fopen.read().split('\n')
    
with open('gsd-ud-test.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))
    
with open('gsd-ud-dev.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))

In [4]:
import malaya
import re
from malaya.texts._text_functions import split_into_sentences
from malaya.texts import _regex
import numpy as np
import itertools
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = malaya.preprocessing._tokenizer
splitter = split_into_sentences

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def preprocessing(w):
    if is_number_regex(w):
        return '<NUM>'
    elif re.match(_regex._money, w):
        return '<MONEY>'
    elif re.match(_regex._date, w):
        return '<DATE>'
    elif re.match(_regex._expressions['email'], w):
        return '<EMAIL>'
    elif re.match(_regex._expressions['url'], w):
        return '<URL>'
    else:
        w = ''.join(''.join(s)[:2] for _, s in itertools.groupby(w))
        return w

def process_string(string):
    splitted = string.split()
    return [preprocessing(w) for w in splitted]

In [6]:
word2idx = {'PAD': 0,'UNK':1, '_ROOT': 2}
tag2idx = {'PAD': 0, '_<ROOT>': 1}
char2idx = {'PAD': 0,'UNK':1, '_ROOT': 2}
word_idx = 3
tag_idx = 2
char_idx = 3

special_tokens = ['<NUM>', '<MONEY>', '<DATE>', '<URL>', '<EMAIL>']

for t in special_tokens:
    word2idx[t] = word_idx
    word_idx += 1
    char2idx[t] = char_idx
    char_idx += 1
    
word2idx, char2idx

({'PAD': 0,
  'UNK': 1,
  '_ROOT': 2,
  '<NUM>': 3,
  '<MONEY>': 4,
  '<DATE>': 5,
  '<URL>': 6,
  '<EMAIL>': 7},
 {'PAD': 0,
  'UNK': 1,
  '_ROOT': 2,
  '<NUM>': 3,
  '<MONEY>': 4,
  '<DATE>': 5,
  '<URL>': 6,
  '<EMAIL>': 7})

In [7]:
PAD = "_PAD"
PAD_POS = "_PAD_POS"
PAD_TYPE = "_<PAD>"
PAD_CHAR = "_PAD_CHAR"
ROOT = "_ROOT"
ROOT_POS = "_ROOT_POS"
ROOT_TYPE = "_<ROOT>"
ROOT_CHAR = "_ROOT_CHAR"
END = "_END"
END_POS = "_END_POS"
END_TYPE = "_<END>"
END_CHAR = "_END_CHAR"

def process_corpus(corpus, until = None):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    sentences, words, depends, labels, pos, chars = [], [], [], [], [], []
    temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []
    first_time = True
    for sentence in corpus:
        try:
            if len(sentence):
                if sentence[0] == '#':
                    continue
                if first_time:
                    print(sentence)
                    first_time = False
                sentence = sentence.split('\t')
                for c in sentence[1]:
                    if c not in char2idx:
                        char2idx[c] = char_idx
                        char_idx += 1
                if sentence[7] not in tag2idx:
                    tag2idx[sentence[7]] = tag_idx
                    tag_idx += 1
                sentence[1] = preprocessing(sentence[1])
                if sentence[1] not in word2idx:
                    word2idx[sentence[1]] = word_idx
                    word_idx += 1
                temp_word.append(word2idx[sentence[1]])
                temp_depend.append(int(sentence[6]))
                temp_label.append(tag2idx[sentence[7]])
                temp_sentence.append(sentence[1])
                temp_pos.append(sentence[3])
            else:
                if len(temp_sentence) < 2 or len(temp_word) != len(temp_label):
                    temp_word = []
                    temp_depend = []
                    temp_label = []
                    temp_sentence = []
                    temp_pos = []
                    continue
                words.append(temp_word)
                depends.append(temp_depend)
                labels.append(temp_label)
                sentences.append( temp_sentence)
                pos.append(temp_pos)
                char_ = [[char2idx['_ROOT']]]
                for w in temp_sentence:
                    if w in char2idx:
                        char_.append([char2idx[w]])
                    else:
                        char_.append([char2idx[c] for c in w])
                chars.append(char_)
                temp_word = []
                temp_depend = []
                temp_label = []
                temp_sentence = []
                temp_pos = []
        except Exception as e:
            print(e, sentence)
    return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1], chars[:-1]

In [8]:
sentences, words, depends, labels, _, _ = process_corpus(corpus)

1	Sembungan	sembungan	PROPN	X--	_	4	nsubj	_	MorphInd=^sembungan<x>_X--$


In [9]:
import json

with open('augmented-dependency.json') as fopen:
    augmented = json.load(fopen)

In [10]:
text_augmented = []
for a in augmented:
    text_augmented.extend(a[0])
    depends.extend((np.array(a[1]) - 1).tolist())
    labels.extend((np.array(a[2]) + 1).tolist())

In [11]:
def parse_XY(texts):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    outside, sentences = [], []
    for no, text in enumerate(texts):
        s = process_string(text)
        sentences.append(s)
        inside = []
        for w in s:
            for c in w:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
            
            if w not in word2idx:
                word2idx[w] = word_idx
                word_idx += 1
                
            inside.append(word2idx[w])
        outside.append(inside)
    return outside, sentences

In [12]:
outside, new_sentences = parse_XY(text_augmented)

In [13]:
words.extend(outside)
sentences.extend(new_sentences)

In [14]:
idx2word = {v:k for k, v in word2idx.items()}
idx2tag = {v:k for k, v in tag2idx.items()}
len(idx2word)

66579

In [15]:
from sklearn.model_selection import train_test_split

sentences_train, sentences_test, words_train, words_test, depends_train, depends_test, labels_train, labels_test \
= train_test_split(sentences, words, depends, labels, test_size = 0.2)

In [16]:
len(sentences_train), len(sentences_test)

(40289, 10073)

In [17]:
def generate_char_seq(batch, UNK = 2):
    maxlen_c = max([len(k) for k in batch])
    x = [[len(i) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((len(batch),maxlen_c,maxlen),dtype=np.int32)
    for i in range(len(batch)):
        for k in range(len(batch[i])):
            for no, c in enumerate(batch[i][k]):
                temp[i,k,-1-no] = char2idx.get(c, UNK)
    return temp

In [18]:
generate_char_seq(sentences_train[:5]).shape

(5, 28, 13)

In [19]:
train_X = words_train
train_Y = labels_train
train_depends = depends_train
train_char = sentences_train

test_X = words_test
test_Y = labels_test
test_depends = depends_test
test_char = sentences_test

In [20]:
class BiAAttention:
    def __init__(self, input_size_encoder, input_size_decoder, num_labels):
        self.input_size_encoder = input_size_encoder
        self.input_size_decoder = input_size_decoder
        self.num_labels = num_labels
        
        self.W_d = tf.get_variable("W_d", shape=[self.num_labels, self.input_size_decoder],
           initializer=tf.contrib.layers.xavier_initializer())
        self.W_e = tf.get_variable("W_e", shape=[self.num_labels, self.input_size_encoder],
           initializer=tf.contrib.layers.xavier_initializer())
        self.U = tf.get_variable("U", shape=[self.num_labels, self.input_size_decoder, self.input_size_encoder],
           initializer=tf.contrib.layers.xavier_initializer())
        
    def forward(self, input_d, input_e, mask_d=None, mask_e=None):
        batch = tf.shape(input_d)[0]
        length_decoder = tf.shape(input_d)[1]
        length_encoder = tf.shape(input_e)[1]
        out_d = tf.expand_dims(tf.matmul(self.W_d, tf.transpose(input_d, [0, 2, 1])), 3)
        out_e = tf.expand_dims(tf.matmul(self.W_e, tf.transpose(input_e, [0, 2, 1])), 2)
        output = tf.matmul(tf.expand_dims(input_d, 1), self.U)
        output = tf.matmul(output, tf.transpose(tf.expand_dims(input_e, 1), [0, 1, 3, 2]))
        
        output = output + out_d + out_e
        
        if mask_d is not None:
            d = tf.expand_dims(tf.expand_dims(mask_d, 1), 3)
            e = tf.expand_dims(tf.expand_dims(mask_e, 1), 2)
            output = output * d * e
            
        return output

class Model:
    def __init__(
        self,
        dim_word,
        dim_char,
        dropout,
        learning_rate,
        hidden_size_char,
        hidden_size_word,
        num_layers
    ):
        def cells(size, reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                output_keep_prob = dropout,
            )
        
        def luong(embedded, size):
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units = hidden_size_word, memory = embedded
            )
            return tf.contrib.seq2seq.AttentionWrapper(
                cell = cells(hidden_size_word),
                attention_mechanism = attention_mechanism,
                attention_layer_size = hidden_size_word,
            )
        
        self.word_ids = tf.placeholder(tf.int32, shape = [None, None])
        self.char_ids = tf.placeholder(tf.int32, shape = [None, None, None])
        self.labels = tf.placeholder(tf.int32, shape = [None, None])
        self.depends = tf.placeholder(tf.int32, shape = [None, None])
        self.maxlen = tf.shape(self.word_ids)[1]
        self.lengths = tf.count_nonzero(self.word_ids, 1)
        self.mask = tf.math.not_equal(self.word_ids, 0)
        float_mask = tf.cast(self.mask, tf.float32)
        
        self.arc_h = tf.layers.Dense(hidden_size_word)
        self.arc_c = tf.layers.Dense(hidden_size_word)
        self.attention = BiAAttention(hidden_size_word, hidden_size_word, 1)

        self.word_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(word2idx), dim_word], stddev = 1.0 / np.sqrt(dim_word)
            )
        )
        self.char_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(char2idx), dim_char], stddev = 1.0 / np.sqrt(dim_char)
            )
        )

        word_embedded = tf.nn.embedding_lookup(
            self.word_embeddings, self.word_ids
        )
        char_embedded = tf.nn.embedding_lookup(
            self.char_embeddings, self.char_ids
        )
        s = tf.shape(char_embedded)
        char_embedded = tf.reshape(
            char_embedded, shape = [s[0] * s[1], s[-2], dim_char]
        )

        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_char),
                cell_bw = cells(hidden_size_char),
                inputs = char_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_char_%d' % (n),
            )
            char_embedded = tf.concat((out_fw, out_bw), 2)
        output = tf.reshape(
            char_embedded[:, -1], shape = [s[0], s[1], 2 * hidden_size_char]
        )
        word_embedded = tf.concat([word_embedded, output], axis = -1)

        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = luong(word_embedded, hidden_size_word),
                cell_bw = luong(word_embedded, hidden_size_word),
                inputs = word_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_word_%d' % (n),
            )
            word_embedded = tf.concat((out_fw, out_bw), 2)

        logits = tf.layers.dense(word_embedded, len(idx2tag))
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, self.labels, self.lengths
        )
        arc_h = tf.nn.elu(self.arc_h(word_embedded))
        arc_c = tf.nn.elu(self.arc_c(word_embedded))
        out_arc = tf.squeeze(self.attention.forward(arc_h, arc_h, mask_d=float_mask, mask_e=float_mask), axis = 1)
        
        batch = tf.shape(out_arc)[0]
        batch_index = tf.range(0, batch)
        max_len = tf.shape(out_arc)[1]
        sec_max_len = tf.shape(out_arc)[2]
        
        minus_inf = -1e8
        minus_mask = (1 - float_mask) * minus_inf
        out_arc = out_arc + tf.expand_dims(minus_mask, axis = 2) + tf.expand_dims(minus_mask, axis = 1)
        loss_arc = tf.nn.log_softmax(out_arc, dim=1)
        loss_arc = loss_arc * tf.expand_dims(float_mask, axis = 2) * tf.expand_dims(float_mask, axis = 1)
        num = tf.reduce_sum(float_mask) - tf.cast(batch, tf.float32)
        
        child_index = tf.tile(tf.expand_dims(tf.range(0, max_len), 1), [1, batch])
        t = tf.transpose(self.depends)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0),
                                               tf.expand_dims(t, axis = 0),
                                               tf.expand_dims(child_index, axis = 0)], axis = 0))
        loss_arc = tf.gather_nd(loss_arc, concatenated)
        loss_arc = tf.transpose(loss_arc, [1, 0])[1:]
        
        loss_arc = tf.reduce_sum(-loss_arc) / num
        
        self.cost = tf.reduce_mean(-log_likelihood) + loss_arc
        
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        
        self.tags_seq, _ = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        
        out_arc = out_arc + tf.linalg.diag(tf.fill([max_len], -np.inf))
        minus_mask = tf.expand_dims(tf.cast(1.0 - float_mask, tf.bool), axis = 2)
        minus_mask = tf.tile(minus_mask, [1, 1, sec_max_len])
        out_arc = tf.where(minus_mask, tf.fill(tf.shape(out_arc), -np.inf), out_arc)
        self.heads = tf.argmax(out_arc, axis = 1)
        
        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(self.labels, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        self.prediction = tf.cast(tf.boolean_mask(self.heads, mask), tf.int32)
        mask_label = tf.boolean_mask(self.depends, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy_depends = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [21]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

dim_word = 128
dim_char = 256
dropout = 0.8
learning_rate = 1e-3
hidden_size_char = 128
hidden_size_word = 128
num_layers = 2

model = Model(dim_word,dim_char,dropout,learning_rate,hidden_size_char,hidden_size_word,num_layers)
sess.run(tf.global_variables_initializer())

Instructions for updating:
reduction_indices is deprecated, use axis instead
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
dim is deprecated, use axis instead


In [22]:
batch_x = train_X[:5]
batch_x = pad_sequences(batch_x,padding='post')
batch_char = train_char[:5]
batch_char = generate_char_seq(batch_char)
batch_y = train_Y[:5]
batch_y = pad_sequences(batch_y,padding='post')
batch_depends = train_depends[:5]
batch_depends = pad_sequences(batch_depends,padding='post')

In [23]:
sess.run([model.accuracy, model.accuracy_depends, model.cost],
        feed_dict = {model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends})

[0.08045977, 0.03448276, 64.58651]

In [24]:
from tqdm import tqdm

batch_size = 64
epoch = 10

for e in range(epoch):
    train_acc, train_loss = [], []
    test_acc, test_loss = [], []
    train_acc_depends, test_acc_depends = [], []
    
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i: index]
        batch_x = pad_sequences(batch_x,padding='post')
        batch_char = train_char[i: index]
        batch_char = generate_char_seq(batch_char)
        batch_y = train_Y[i: index]
        batch_y = pad_sequences(batch_y,padding='post')
        batch_depends = train_depends[i: index]
        batch_depends = pad_sequences(batch_depends,padding='post')
        
        acc_depends, acc, cost, _ = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        train_acc_depends.append(acc_depends)
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
        
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = test_X[i: index]
        batch_x = pad_sequences(batch_x,padding='post')
        batch_char = test_char[i: index]
        batch_char = generate_char_seq(batch_char)
        batch_y = test_Y[i: index]
        batch_y = pad_sequences(batch_y,padding='post')
        batch_depends = test_depends[i: index]
        batch_depends = pad_sequences(batch_depends,padding='post')
        
        acc_depends, acc, cost = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        test_acc_depends.append(acc_depends)
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    
    
    print(
    'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\n'
    % (e, np.mean(train_loss), 
       np.mean(train_acc), 
       np.mean(train_acc_depends), 
       np.mean(test_loss), 
       np.mean(test_acc), 
       np.mean(test_acc_depends)
    ))

train minibatch loop: 100%|██████████| 630/630 [07:59<00:00,  1.32it/s, accuracy=0.783, accuracy_depends=0.43, cost=16.9] 
test minibatch loop: 100%|██████████| 158/158 [00:59<00:00,  2.65it/s, accuracy=0.862, accuracy_depends=0.468, cost=9.66]
train minibatch loop:   0%|          | 0/630 [00:00<?, ?it/s]

epoch: 0, training loss: 26.720716, training acc: 0.667714, training depends: 0.311707, valid loss: 14.116356, valid acc: 0.827805, valid depends: 0.448278



train minibatch loop: 100%|██████████| 630/630 [08:00<00:00,  1.31it/s, accuracy=0.859, accuracy_depends=0.496, cost=10.5]
test minibatch loop: 100%|██████████| 158/158 [00:59<00:00,  2.65it/s, accuracy=0.904, accuracy_depends=0.513, cost=6.66]
train minibatch loop:   0%|          | 0/630 [00:00<?, ?it/s]

epoch: 1, training loss: 10.821845, training acc: 0.865350, training depends: 0.486100, valid loss: 9.962124, valid acc: 0.876815, valid depends: 0.506657



train minibatch loop: 100%|██████████| 630/630 [08:02<00:00,  1.31it/s, accuracy=0.932, accuracy_depends=0.515, cost=5.7] 
test minibatch loop: 100%|██████████| 158/158 [00:59<00:00,  2.63it/s, accuracy=0.919, accuracy_depends=0.548, cost=5.64]
train minibatch loop:   0%|          | 0/630 [00:00<?, ?it/s]

epoch: 2, training loss: 6.642692, training acc: 0.918603, training depends: 0.529994, valid loss: 7.397476, valid acc: 0.911560, valid depends: 0.532016



train minibatch loop: 100%|██████████| 630/630 [08:03<00:00,  1.30it/s, accuracy=0.961, accuracy_depends=0.537, cost=3.26]
test minibatch loop: 100%|██████████| 158/158 [01:00<00:00,  2.63it/s, accuracy=0.935, accuracy_depends=0.566, cost=5.11]
train minibatch loop:   0%|          | 0/630 [00:00<?, ?it/s]

epoch: 3, training loss: 3.982908, training acc: 0.953158, training depends: 0.554902, valid loss: 6.154591, valid acc: 0.929721, valid depends: 0.546142



train minibatch loop: 100%|██████████| 630/630 [08:03<00:00,  1.30it/s, accuracy=0.985, accuracy_depends=0.555, cost=2.11]
test minibatch loop: 100%|██████████| 158/158 [01:00<00:00,  2.63it/s, accuracy=0.951, accuracy_depends=0.587, cost=4.09]
train minibatch loop:   0%|          | 0/630 [00:00<?, ?it/s]

epoch: 4, training loss: 2.523261, training acc: 0.972771, training depends: 0.573750, valid loss: 5.353987, valid acc: 0.942186, valid depends: 0.563084



train minibatch loop: 100%|██████████| 630/630 [08:04<00:00,  1.30it/s, accuracy=0.993, accuracy_depends=0.576, cost=1.3]  
test minibatch loop: 100%|██████████| 158/158 [00:59<00:00,  2.64it/s, accuracy=0.961, accuracy_depends=0.603, cost=3.75]
train minibatch loop:   0%|          | 0/630 [00:00<?, ?it/s]

epoch: 5, training loss: 1.777379, training acc: 0.982540, training depends: 0.589181, valid loss: 4.836964, valid acc: 0.950547, valid depends: 0.576298



train minibatch loop: 100%|██████████| 630/630 [07:52<00:00,  1.33it/s, accuracy=0.989, accuracy_depends=0.599, cost=1.23] 
test minibatch loop: 100%|██████████| 158/158 [00:58<00:00,  2.72it/s, accuracy=0.939, accuracy_depends=0.613, cost=4.85]
train minibatch loop:   0%|          | 0/630 [00:00<?, ?it/s]

epoch: 6, training loss: 1.336731, training acc: 0.988133, training depends: 0.601860, valid loss: 4.743243, valid acc: 0.953598, valid depends: 0.586394



train minibatch loop: 100%|██████████| 630/630 [07:47<00:00,  1.35it/s, accuracy=0.994, accuracy_depends=0.602, cost=0.896]
test minibatch loop: 100%|██████████| 158/158 [00:57<00:00,  2.73it/s, accuracy=0.951, accuracy_depends=0.619, cost=5.12]
train minibatch loop:   0%|          | 0/630 [00:00<?, ?it/s]

epoch: 7, training loss: 1.126532, training acc: 0.990448, training depends: 0.611440, valid loss: 4.796130, valid acc: 0.955561, valid depends: 0.592836



train minibatch loop: 100%|██████████| 630/630 [07:45<00:00,  1.35it/s, accuracy=0.994, accuracy_depends=0.613, cost=0.924]
test minibatch loop: 100%|██████████| 158/158 [00:57<00:00,  2.72it/s, accuracy=0.951, accuracy_depends=0.611, cost=4.14]
train minibatch loop:   0%|          | 0/630 [00:00<?, ?it/s]

epoch: 8, training loss: 0.951201, training acc: 0.992444, training depends: 0.620417, valid loss: 4.656150, valid acc: 0.957689, valid depends: 0.601366



train minibatch loop: 100%|██████████| 630/630 [07:47<00:00,  1.35it/s, accuracy=0.997, accuracy_depends=0.617, cost=0.594]
test minibatch loop: 100%|██████████| 158/158 [00:57<00:00,  2.73it/s, accuracy=0.965, accuracy_depends=0.65, cost=3.59] 

epoch: 9, training loss: 0.833203, training acc: 0.993714, training depends: 0.628561, valid loss: 4.518313, valid acc: 0.960464, valid depends: 0.607785






In [25]:
tags_seq, heads = sess.run(
    [model.tags_seq, model.heads],
    feed_dict = {
        model.word_ids: batch_x,
        model.char_ids: batch_char
    },
)
tags_seq[0], heads[0], batch_depends[0]

(array([ 6, 14,  4, 23,  5, 11, 15, 26,  8, 24, 17, 13, 16, 15, 20,  8, 24,
        17, 13, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0], dtype=int32),
 array([ 5,  5,  1,  5,  0,  8,  8,  5, 10,  8, 11, 10, 17, 14, 17, 17, 17,
        16, 17,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0]),
 array([ 2,  5,  2,  5,  0,  5,  8,  5, 10,  8, 10, 10, 12, 12, 12, 17, 15,
        17, 18,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0], dtype=int32))

In [26]:
def evaluate(heads_pred, types_pred, heads, types, lengths,
             symbolic_root=False, symbolic_end=False):
    batch_size, _ = heads_pred.shape
    ucorr = 0.
    lcorr = 0.
    total = 0.
    ucomplete_match = 0.
    lcomplete_match = 0.

    corr_root = 0.
    total_root = 0.
    start = 1 if symbolic_root else 0
    end = 1 if symbolic_end else 0
    for i in range(batch_size):
        ucm = 1.
        lcm = 1.
        for j in range(start, lengths[i] - end):

            total += 1
            if heads[i, j] == heads_pred[i, j]:
                ucorr += 1
                if types[i, j] == types_pred[i, j]:
                    lcorr += 1
                else:
                    lcm = 0
            else:
                ucm = 0
                lcm = 0

            if heads[i, j] == 0:
                total_root += 1
                corr_root += 1 if heads_pred[i, j] == 0 else 0

        ucomplete_match += ucm
        lcomplete_match += lcm
    
    return ucorr / total, lcorr / total, corr_root / total_root

In [27]:
arc_accuracy, type_accuracy, root_accuracy = evaluate(heads, tags_seq, batch_depends, batch_y, 
        np.count_nonzero(batch_x, axis = 1))
arc_accuracy, type_accuracy, root_accuracy

(0.6293279022403259, 0.6028513238289206, 0.84)

In [28]:
arcs, types, roots = [], [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'test minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_X))
    batch_x = test_X[i: index]
    batch_x = pad_sequences(batch_x,padding='post')
    batch_char = test_char[i: index]
    batch_char = generate_char_seq(batch_char)
    batch_y = test_Y[i: index]
    batch_y = pad_sequences(batch_y,padding='post')
    batch_depends = test_depends[i: index]
    batch_depends = pad_sequences(batch_depends,padding='post')
    
    tags_seq, heads = sess.run(
        [model.tags_seq, model.heads],
        feed_dict = {
            model.word_ids: batch_x,
            model.char_ids: batch_char
        },
    )
    
    arc_accuracy, type_accuracy, root_accuracy = evaluate(heads, tags_seq, batch_depends, batch_y, 
            np.count_nonzero(batch_x, axis = 1))
    pbar.set_postfix(arc_accuracy = arc_accuracy, type_accuracy = type_accuracy, 
                     root_accuracy = root_accuracy)
    arcs.append(arc_accuracy)
    types.append(type_accuracy)
    roots.append(root_accuracy)

test minibatch loop: 100%|██████████| 158/158 [00:57<00:00,  2.74it/s, arc_accuracy=0.621, root_accuracy=0.88, type_accuracy=0.597] 


In [29]:
print('arc accuracy:', np.mean(arcs))
print('types accuracy:', np.mean(types))
print('root accuracy:', np.mean(roots))

arc accuracy: 0.606092917335251
types accuracy: 0.5906962311942543
root accuracy: 0.8976780063291139
