# TF2 기반 다대다 POS 태깅 

* GRU 사용
* label이 배열인 경우
* masked accuracy 설계

In [1]:
import os
import sys
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, optimizers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
from tensorflow import feature_column as fc
import tensorflow_datasets as tfds
plt.rcParams["font.family"] = 'NanumBarunGothic'
TENSORBOARD_BINARY = '/home/hoondori/anaconda3/envs/ai/bin/tensorboard'
os.environ['TENSORBOARD_BINARY'] =  TENSORBOARD_BINARY
%load_ext tensorboard

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # 텐서플로가 첫 번째 GPU만 사용하도록 제한
    # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # 프로그램 시작시에 접근 가능한 장치가 설정되어야만 합니다
        print(e)

In [2]:
import nltk
import numpy as np
import os
import shutil
import tensorflow as tf

# 데이터 확보

In [3]:
def download_and_read(dataset_dir, num_pairs=None):
    sent_filename = os.path.join(dataset_dir, "treebank-sents.txt")
    poss_filename = os.path.join(dataset_dir, "treebank-poss.txt")
    if not(os.path.exists(sent_filename) and os.path.exists(poss_filename)):
        import nltk    

        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)
        fsents = open(sent_filename, "w")
        fposs = open(poss_filename, "w")
        sentences = nltk.corpus.treebank.tagged_sents()      # list of sentence, each sentence is list of (word, tag)
        for sent in sentences:
            fsents.write(" ".join([w for w, p in sent]) + "\n")
            fposs.write(" ".join([p for w, p in sent]) + "\n")

        fsents.close()
        fposs.close()
    sents, poss = [], []
    with open(sent_filename, "r") as fsent:
        for idx, line in enumerate(fsent):
            sents.append(line.strip())
            if num_pairs is not None and idx >= num_pairs:
                break
    with open(poss_filename, "r") as fposs:
        for idx, line in enumerate(fposs):
            poss.append(line.strip())
            if num_pairs is not None and idx >= num_pairs:
                break
    return sents, poss


# download and read source and target data into data structure
NUM_PAIRS = None
sents, poss = download_and_read("./datasets", num_pairs=NUM_PAIRS)
assert(len(sents) == len(poss))
print("# of records: {:d}".format(len(sents)))


# of records: 3914


In [5]:
def tokenize_and_build_vocab(texts, vocab_size, lower=True):
    
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=vocab_size+1, oov_token='oov', lower=lower      # vocab_size +1 due to oov inclusion
    )
    tokenizer.fit_on_texts(texts)
    
    # num_words가 안 먹히는 것의 work-around
    tokenizer.word_index = { word:idx for word, idx in tokenizer.word_index.items() if idx <=vocab_size+1 }  # cut down words only to topN
    
    # mapping
    word2idx = tokenizer.word_index
    idx2word  = { v:k for k, v in word2idx.items() }
    idx2word[0] = 'PAD'
    
    return word2idx, idx2word, tokenizer

word2idx_s, idx2word_s, tokenizer_s = tokenize_and_build_vocab(
    sents, vocab_size=9000)
word2idx_t, idx2word_t, tokenizer_t = tokenize_and_build_vocab(
    poss, vocab_size=38, lower=False)
source_vocab_size = len(word2idx_s)
target_vocab_size = len(word2idx_t)
print("vocab sizes (source): {:d}, (target): {:d}".format(
    source_vocab_size, target_vocab_size))

vocab sizes (source): 9001, (target): 39


In [6]:
max_seqlen = 271

sents_as_ints = tokenizer_s.texts_to_sequences(sents)
sents_as_ints = tf.keras.preprocessing.sequence.pad_sequences(sents_as_ints, maxlen=max_seqlen, padding="post")

poss_as_ints = tokenizer_t.texts_to_sequences(poss)
poss_as_ints = tf.keras.preprocessing.sequence.pad_sequences(poss_as_ints, maxlen=max_seqlen, padding="post")

dataset = tf.data.Dataset.from_tensor_slices((sents_as_ints, poss_as_ints))
for batch in dataset.take(1):
    sents_b, poss_b = batch
    print(sents_b.numpy()[:30])
    print(poss_b.numpy()[:30])

[5353 3744 1081   87  184   42 2453    2  132   23    6 2086  351  489
  787    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
[ 4  4  9  7  8 20 12  5  2  3  5  8  2  4  9  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]


In [7]:
# poss 를 one-hot로 변환 

poss_as_catints = []
for p in poss_as_ints:
    poss_as_catints.append(tf.keras.utils.to_categorical(p, num_classes=target_vocab_size, dtype="int32"))
poss_as_catints = tf.keras.preprocessing.sequence.pad_sequences(poss_as_catints, maxlen=max_seqlen)
dataset = tf.data.Dataset.from_tensor_slices((sents_as_ints, poss_as_catints))
for batch in dataset.take(1):
    sents_b, poss_b = batch
    print(sents_b.shape)
    print(poss_b.shape)

(271,)
(271, 39)


In [8]:
BATCH_SIZE = 128

# split into training, validation, and test datasets
dataset = dataset.shuffle(10000)
test_size = len(sents) // 3
val_size = (len(sents) - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

# create batches
batch_size = BATCH_SIZE
train_dataset = train_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)
train_dataset

<BatchDataset shapes: ((None, 271), (None, 271, 39)), types: (tf.int32, tf.int32)>

In [9]:
class PosTaggingModel(tf.keras.Model):
    
    def __init__(self, source_vocab_size, target_vocab_size, embed_sz, rnn_output_size, max_seqlen, **kwargs):
        
        super(PosTaggingModel, self).__init__(self, **kwargs)
        self.embed = tf.keras.layers.Embedding(source_vocab_size, embed_sz, input_length=max_seqlen)
        self.dropout = tf.keras.layers.SpatialDropout1D(0.2)
        self.rnn = tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(embed_sz, return_sequences=True)   # 다대다에서는 return_sequences=True 이어야 한다.
        )
        self.dense = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(target_vocab_size)
        )
        self.activation = tf.keras.layers.Activation('softmax')
        
    def call(self, x):
        
        x = self.embed(x)
        x = self.dropout(x)
        x = self.rnn(x)
        x = self.dense(x)
        x = self.activation(x)
            
        return x

EMBED_SZ = 100
RNN_OUTPUT_SIZE = 256
model = PosTaggingModel(source_vocab_size, target_vocab_size, EMBED_SZ, RNN_OUTPUT_SIZE, max_seqlen)
model.build(input_shape=(batch_size, max_seqlen))
model.summary()    

Model: "pos_tagging_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  900100    
_________________________________________________________________
spatial_dropout1d (SpatialDr multiple                  0         
_________________________________________________________________
bidirectional (Bidirectional multiple                  121200    
_________________________________________________________________
time_distributed (TimeDistri multiple                  7839      
_________________________________________________________________
activation (Activation)      multiple                  0         
Total params: 1,029,139
Trainable params: 1,029,139
Non-trainable params: 0
_________________________________________________________________


# 학습 

In [10]:
# 양쪽에 0인 것을 제외하고 정확도 계산 
def masked_accuracy():

    def masked_accuracy_fn(y_true, y_pred):
        
        # one-hot to integer label 
        y_true = tf.keras.backend.argmax(y_true, axis=-1)
        y_pred = tf.keras.backend.argmax(y_pred, axis=-1)
        
        # y_pred != 0 (즉 NOT PAD)인 위치 mask 생성
        mask = tf.keras.backend.cast(tf.keras.backend.not_equal(y_pred, 0), tf.int32)   # bool to int(0 or 1)
        
        # y_true = y_pred match 
        matches = tf.keras.backend.cast(tf.keras.backend.equal(y_true, y_pred), tf.int32) * mask 
        
        # 개수 세기 (only in mask only)
        nom = tf.keras.backend.sum(matches)
        
        # 분모 계산 
        denom = tf.keras.backend.maximum(tf.keras.backend.sum(mask), 1)
        
        accuracy = nom / denom
        
        return accuracy
    
    return masked_accuracy_fn

In [11]:
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam", 
    metrics=["accuracy", masked_accuracy()])

data_dir = '/tmp/logs/GRU'

# train

# 개선된 epoch 마다 모델 저장
checkpoint_path = data_dir + "/best-model.ckpt"
checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
    save_weights_only=True,verbose=0, save_best_only=True) # save_freq='epoch'
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=data_dir)
num_epochs = 50
history = model.fit(train_dataset, epochs=num_epochs, 
    validation_data=val_dataset,
    callbacks=[checkpoint, tensorboard])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50


Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
# evaluate with test set
best_model = PosTaggingModel(source_vocab_size, target_vocab_size, EMBED_SZ, RNN_OUTPUT_SIZE, max_seqlen)
best_model.build(input_shape=(batch_size, max_seqlen))
best_model.load_weights(checkpoint_path)
best_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam", 
    metrics=["accuracy", masked_accuracy()])

test_loss, test_acc, test_masked_acc = best_model.evaluate(test_dataset)
print("test loss: {:.3f}, test accuracy: {:.3f}, masked test accuracy: {:.3f}".format(
    test_loss, test_acc, test_masked_acc))

test loss: 0.096, test accuracy: 0.971, masked test accuracy: 0.663


In [18]:
labels, predictions = [], []
is_first_batch = True
accuracies = []

for test_batch in test_dataset:
    inputs_b, outputs_b = test_batch
    preds_b = best_model.predict(inputs_b)
    # convert from categorical to list of ints
    preds_b = np.argmax(preds_b, axis=-1)
    outputs_b = np.argmax(outputs_b.numpy(), axis=-1)
    for i, (pred_l, output_l) in enumerate(zip(preds_b, outputs_b)):
        assert(len(pred_l) == len(output_l))
        pad_len = np.nonzero(output_l)[0][0]
        acc = np.count_nonzero(
            np.equal(
                output_l[pad_len:], pred_l[pad_len:]
            )
        ) / len(output_l[pad_len:])
        accuracies.append(acc)
        if is_first_batch:
            words = [idx2word_s[x] for x in inputs_b.numpy()[i][pad_len:]]
            postags_l = [idx2word_t[x] for x in output_l[pad_len:] if x > 0]
            postags_p = [idx2word_t[x] for x in pred_l[pad_len:] if x > 0]
            print("labeled  : {:s}".format(" ".join(["{:s}/{:s}".format(w, p) 
                for (w, p) in zip(words, postags_l)])))
            print("predicted: {:s}".format(" ".join(["{:s}/{:s}".format(w, p) 
                for (w, p) in zip(words, postags_p)])))
            print(" ")
    is_first_batch = False

accuracy_score = np.mean(np.array(accuracies))
print("pos tagging accuracy: {:.3f}".format(accuracy_score))

labeled  : although/IN she/PRP was/VBD kind/JJ and/CC playful/JJ to/TO her/PRP children/NNS she/PRP was/VBD dreadful/JJ to/TO her/PRP war/JJ damaged/NN husband/PRP she/RB openly/VBD brought/PRP her/NN lover/IN into/PRP their/NN
predicted: although/IN she/PRP was/VBD kind/NN and/CC playful/JJ to/TO her/PRP children/NNS she/PRP was/VBD dreadful/JJ to/TO her/PRP war/NN damaged/NN husband/PRP she/PRP openly/VBD brought/VBD her/PRP lover/IN into/IN their/PRP
 
labeled  : furukawa/NNP said/VBD 0/NONE the/DT purchase/NN of/IN the/DT french/JJ and/CC german/JJ plants/NNS together/RB will/MD total/VB about/RB 40/CD billion/CD yen/NNS lrb/LRB 280/CD million/CD u/NONE rrb/RRB
predicted: furukawa/NNP said/VBD 0/NONE the/DT purchase/NN of/IN the/DT french/JJ and/CC german/JJ plants/NNS together/RB will/MD total/VB about/CD 40/CD billion/CD yen/NNS lrb/CD 280/CD million/CD u/NONE
 
labeled  : much/RB of/IN mr/NNP lane/NNP 's/POS film/NN takes/VBZ a/DT highly/RB romanticized/VBN view/NN of/IN life/NN

pos tagging accuracy: 0.971
