In [1]:
import os
import numpy as np
import tensorflow as tf
from transformers import *
from tensorflow.keras.callbacks import  ModelCheckpoint
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score
import pandas as pd
import matplotlib.pyplot as plt



In [2]:
import sys

import tensorflow.keras
import pandas as pd
import sklearn as sk
import tensorflow as tf


# Report the library and interpreter versions this notebook was run with.
print("\n".join([
    f"Tensor Flow Version: {tf.__version__}",
    f"Keras Version: {tensorflow.keras.__version__}",
    "",
    f"Python {sys.version}",
    f"Pandas {pd.__version__}",
    f"Scikit-Learn {sk.__version__}",
]))

# True when TensorFlow can see at least one physical GPU device.
gpu = bool(tf.config.list_physical_devices('GPU'))
print("GPU is", ("NOT AVAILABLE", "available")[gpu])

Tensor Flow Version: 2.5.0
Keras Version: 2.5.0

Python 3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]
Pandas 1.0.5
Scikit-Learn 0.23.1
GPU is available


In [3]:
# Fix the random seeds so that repeated runs start from the same state.
SEED = 1234
tf.random.set_seed(SEED)
np.random.seed(SEED)

BATCH_SIZE = 4
NUM_EPOCHS = 20
MAX_LEN = 512  # fixed sequence length handed to the tokenizer as max_length

In [4]:
# Locations of the tab-separated training and test files.
DATA_TRAIN_PATH, DATA_TEST_PATH = "train_path", "test_path"


def read_file(input_path):
    """Read a tab-separated file and return its sentences and raw label strings.

    The first line of the file is treated as a header and discarded. Every
    following line is stripped and split on tabs: the first field is the
    sentence, the second field is the (still unparsed) label string, and any
    further fields are ignored. Blank lines and lines with fewer than two
    fields are skipped instead of raising ``IndexError``.

    Args:
        input_path: Path of the UTF-8 encoded TSV file.

    Returns:
        A ``(sentences, labels)`` tuple of two equally long lists of strings.
    """
    sentences = []
    labels = []
    with open(input_path, "r", encoding="utf-8") as f:
        # Drop the header row; the default makes this a no-op on an empty file.
        next(f, None)
        for line in f:
            fields = line.strip().split("\t")
            if len(fields) < 2:
                # Blank or malformed row: nothing usable to pair up.
                continue
            sentences.append(fields[0])
            labels.append(fields[1])
    return sentences, labels

def _parse_label_string(raw):
    """Convert text such as "['O', 'B-PDT']" into ['O', 'B-PDT'] using plain string operations."""
    # Drop the first and last character, remove quotes and spaces, then split on commas.
    return raw[1:-1].replace('\'', '').replace(' ', '').split(",")


# Training split: one row per sentence, the label column holds a list of tag strings.
train_sentences, train_labels = read_file(DATA_TRAIN_PATH)
train_labels[:] = [_parse_label_string(raw) for raw in train_labels]
train_ner_dict = dict(sentence=train_sentences, label=train_labels)
train_ner_df = pd.DataFrame(train_ner_dict)

# Test split, built the same way.
test_sentences, test_labels = read_file(DATA_TEST_PATH)
test_labels[:] = [_parse_label_string(raw) for raw in test_labels]
test_ner_dict = dict(sentence=test_sentences, label=test_labels)
test_ner_df = pd.DataFrame(test_ner_dict)

print("개체명 인식 학습 데이터 개수: {}".format(len(train_ner_df)))
print("개체명 인식 테스트 데이터 개수: {}".format(len(test_ner_df)))

개체명 인식 학습 데이터 개수: 21525
개체명 인식 테스트 데이터 개수: 5382


In [5]:
def _drop_long_sentences(df, max_chars):
    """Return the rows of ``df`` whose 'sentence' has at most ``max_chars`` characters.

    The result is re-indexed from 0. One boolean mask replaces the row-by-row
    ``drop`` calls, and the filter no longer relies on the frame having a
    default 0..n-1 index.
    """
    keep = df['sentence'].map(len) <= max_chars
    return df[keep].reset_index(drop=True)


# NOTE(review): this compares the character count of the raw sentence with MAX_LEN,
# not the number of tokens; sentences that survive can still be truncated by the tokenizer.
train_ner_df = _drop_long_sentences(train_ner_df, MAX_LEN)
test_ner_df = _drop_long_sentences(test_ner_df, MAX_LEN)

print("개체명 인식 학습 데이터 개수: {}".format(len(train_ner_df)))
print("개체명 인식 테스트 데이터 개수: {}".format(len(test_ner_df)))

개체명 인식 학습 데이터 개수: 21500
개체명 인식 테스트 데이터 개수: 5378


In [6]:
# Tag vocabulary: a tag's position in this list is its integer label id.
# Index 0 ('UNK') is the same value as the pad/[CLS]/[SEP] label ids (all 0) defined in the tokenizer cell.
ner_labels = ['UNK', 'O', 'B-PDT', 'I-PDT', 'B-MOV', 'I-MOV', 'B-TRV', 'I-TRV']

In [7]:
# Set up the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
    cache_dir='bert_ckpt',
)

pad_token_id = tokenizer.pad_token_id # 0
# Label id written at padding, [CLS] and [SEP] positions (one shared value).
pad_token_label_id = cls_token_label_id = sep_token_label_id = 0

In [8]:
def bert_tokenizer(sent, MAX_LEN):
    """Encode one sentence into fixed-length BERT inputs.

    Uses the module-level ``tokenizer``. The sentence is wrapped in the
    special tokens, truncated to ``MAX_LEN`` and padded up to ``MAX_LEN``.
    ``padding="max_length"`` replaces the deprecated ``pad_to_max_length=True``
    flag and requests the same fixed-length padding.

    Args:
        sent: The raw sentence text.
        MAX_LEN: Length of every returned sequence.

    Returns:
        A tuple ``(input_id, attention_mask, token_type_id)`` taken from the
        tokenizer's ``input_ids``, ``attention_mask`` and ``token_type_ids``.
    """
    encoded_dict = tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,      # add '[CLS]' and '[SEP]'
        max_length=MAX_LEN,           # target length for padding and truncation
        padding="max_length",         # always pad up to max_length
        truncation=True,              # cut sentences longer than max_length
        return_attention_mask=True,   # build the attention mask
        return_token_type_ids=True,   # make sure the 'token_type_ids' key is present
    )

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    token_type_id = encoded_dict['token_type_ids']

    return input_id, attention_mask, token_type_id

def convert_label(words, labels_idx, ner_begin_label, max_seq_len):
    """Expand word-level label ids to word-piece level and pad to a fixed length.

    Each word is tokenized with the module-level ``tokenizer`` and its label
    is repeated once per word piece. For a label listed in ``ner_begin_label``
    only the first piece keeps the label; the remaining pieces get
    ``label + 1`` (this relies on each I- tag following its B- tag in the
    label list). A word that yields no word pieces contributes no label, so
    the labels stay aligned with the produced tokens; previously such a word
    still emitted one label when it carried a begin tag.

    ``words`` and ``labels_idx`` are paired with ``zip``, so surplus items in
    the longer one are ignored.

    Args:
        words: The whitespace-split words of one sentence.
        labels_idx: One label id per word.
        ner_begin_label: Collection of label ids that are "begin" tags.
        max_seq_len: Length of the returned list, special tokens included.

    Returns:
        A list of ``max_seq_len`` label ids laid out as
        ``[CLS] + piece labels + [SEP] + padding``.
    """
    label_ids = []

    for word, slot_label in zip(words, labels_idx):
        piece_count = len(tokenizer.tokenize(word))
        if piece_count == 0:
            # Nothing was produced for this word, so there is nothing to label.
            continue

        slot_label = int(slot_label)
        if slot_label in ner_begin_label:
            # A begin tag covers the first piece only; continuation pieces get the next id.
            label_ids.extend([slot_label] + [slot_label + 1] * (piece_count - 1))
        else:
            label_ids.extend([slot_label] * piece_count)

    # Leave room for [CLS] and [SEP].
    special_tokens_count = 2
    label_ids = label_ids[: (max_seq_len - special_tokens_count)]

    # Labels for the [CLS] and [SEP] positions.
    label_ids = [cls_token_label_id] + label_ids + [sep_token_label_id]

    # Pad on the right up to the fixed length.
    padding_length = max_seq_len - len(label_ids)
    label_ids = label_ids + ([pad_token_label_id] * padding_length)

    return label_ids

In [9]:
# For inspection only: the ids of the begin tags and the tag names they map back to.
# (Every tag in ner_labels is unique, so the enumerate position equals list.index.)
ner_begin_label = [idx for idx, name in enumerate(ner_labels) if "B" in name]
ner_begin_label_string = [ner_labels[idx] for idx in ner_begin_label]

print(ner_begin_label)
print(ner_begin_label_string)

[2, 4, 6]
['B-PDT', 'B-MOV', 'B-TRV']


In [10]:
# Ids of the begin tags; convert_label uses them to relabel continuation word pieces.
ner_begin_label = [ner_labels.index(begin_label) for begin_label in ner_labels if "B" in begin_label]

def create_inputs_targets(df):
    """Encode every row of ``df`` into BERT inputs and word-piece level label ids.

    Rows are read positionally, so the frame does not need a default
    0..n-1 index. Tags that are not in ``ner_labels`` are mapped to the id of
    "UNK".

    Args:
        df: DataFrame with a 'sentence' column (raw text) and a 'label'
            column (list of tag strings for that sentence).

    Returns:
        ``(inputs, label_list)`` where ``inputs`` is the tuple
        ``(input_ids, attention_masks, token_type_ids)`` of integer arrays
        and ``label_list`` is the integer array of label ids.
    """
    # Build the tag -> id lookup once; the first occurrence wins, like list.index.
    label_to_id = {}
    for idx, name in enumerate(ner_labels):
        label_to_id.setdefault(name, idx)
    unk_id = ner_labels.index("UNK")

    input_ids = []
    attention_masks = []
    token_type_ids = []
    label_list = []

    for sentence, labels in zip(df['sentence'], df['label']):
        words = sentence.split()
        labels_idx = [label_to_id.get(label, unk_id) for label in labels]

        # NOTE(review): words and labels_idx are not checked for equal length;
        # convert_label pairs them with zip, which ignores the surplus.
        #assert len(words) == len(labels_idx)

        input_id, attention_mask, token_type_id = bert_tokenizer(sentence, MAX_LEN)

        convert_label_id = convert_label(words, labels_idx, ner_begin_label, MAX_LEN)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        label_list.append(convert_label_id)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    label_list = np.asarray(label_list, dtype=int)  # word-piece level label ids
    inputs = (input_ids, attention_masks, token_type_ids)

    return inputs, label_list

In [11]:
# Encode both splits into model inputs and label-id arrays.
# NOTE: this rebinds train_labels / test_labels from the parsed tag-string lists
# to the integer label arrays returned by create_inputs_targets.
train_inputs, train_labels = create_inputs_targets(train_ner_df)
test_inputs, test_labels = create_inputs_targets(test_ner_df)



In [12]:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K


class CRF(tf.keras.layers.Layer):
    """CRF layer that decodes with ``tfa.text.crf_decode``.

    The layer owns a square ``transitions`` weight and, when ``units`` is
    given, a ``Dense`` projection applied to the inputs before decoding.

    Args:
        units: Number of tags. When ``None`` the size of the last input axis
            is used and no projection is applied.
        chain_initializer: Initializer (name, config or instance) for the
            transition matrix.
        regularizer: Optional regularizer (name, config or instance) for the
            transition matrix.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, units=None, chain_initializer="orthogonal", regularizer=None, **kwargs):
        super(CRF, self).__init__(**kwargs)
        self.chain_initializer = tf.keras.initializers.get(chain_initializer)
        # Normalised here so that get_config() can serialise whatever form was passed.
        self.regularizer = tf.keras.regularizers.get(regularizer)
        self.transitions = None
        self.supports_masking = True
        # NOTE(review): `mask` and `accuracy_fn` are not used anywhere in this class.
        # They are kept because removing an attached metric may change the layer's
        # tracked variables — confirm before deleting.
        self.mask = None
        self.accuracy_fn = tf.keras.metrics.Accuracy()
        self.units = units
        if units is not None:
            self.dense = tf.keras.layers.Dense(units)

    def get_config(self):
        """Return the layer configuration, including the actual constructor arguments."""
        config = super(CRF, self).get_config()
        config.update({
            "units": self.units,
            "chain_initializer": tf.keras.initializers.serialize(self.chain_initializer),
            "regularizer": tf.keras.regularizers.serialize(self.regularizer),
        })
        return config

    def build(self, input_shape):
        """Create the ``[units, units]`` transition matrix for a rank-3 input."""
        assert len(input_shape) == 3
        if self.units:
            units = self.units
        else:
            units = input_shape[-1]
        self.transitions = self.add_weight(
            name="transitions",
            shape=[units, units],
            initializer=self.chain_initializer,
            regularizer=self.regularizer
        )

    def call(self, inputs, mask=None, training=False):
        """Decode the best tag sequence for ``inputs``.

        Args:
            inputs: Rank-3 tensor of per-position scores (or features that
                the optional projection turns into scores).
            mask: Optional mask over the first two axes. When ``None`` a mask
                of ones is used, i.e. every position counts as part of the
                sequence.
            training: Accepted for Keras compatibility; not used here.

        Returns:
            A tuple ``(viterbi_sequence, potentials, sequence_lengths,
            transitions)`` where ``potentials`` are the (optionally projected)
            inputs that were decoded.
        """
        if mask is None:
            raw_input_shape = tf.slice(tf.shape(inputs), [0], [2])
            mask = tf.ones(raw_input_shape)
        # Sequence length = number of unmasked positions per example.
        sequence_lengths = K.sum(K.cast(mask, 'int32'), axis=-1)
        if self.units:
            inputs = self.dense(inputs)
        viterbi_sequence, _ = tfa.text.crf_decode(
            inputs, self.transitions, sequence_lengths
        )
        return viterbi_sequence, inputs, sequence_lengths, self.transitions

In [13]:
from typing import Union

import tensorflow as tf
from tensorflow_addons.text.crf import crf_log_likelihood


def unpack_data(data):
    """Normalise a Keras data container to ``(x, y, sample_weight)``.

    A container of size 3 is returned unchanged; one of size 2 gets ``None``
    appended as the sample weight. Any other size raises ``TypeError``.
    """
    size = len(data)
    if size == 3:
        return data
    if size == 2:
        return data[0], data[1], None
    raise TypeError("Expected data to be a tuple of size 2 or 3.")


class ModelWithCRFLoss(tf.keras.Model):
    """
    Wrapper around the base model for custom training logic.

    The base model must return the tuple
    ``(viterbi_sequence, potentials, sequence_length, chain_kernel)``.
    In training mode ``call`` returns that tuple; otherwise it returns only
    the decoded tag sequence, which is what ``predict`` yields.

    Args:
        base_model: The model including the CRF layer
        sparse_target: if the y label is sparse or one-hot, default True
        metric: the metric for training, default 'accuracy'. Warning: Currently
            tensorflow metrics like AUC need the output and y_true to be one-hot
            to calculate, they are not supported.
    """

    def __init__(self, base_model, sparse_target=True, metric: Union[str, object] = 'accuracy'):
        super().__init__()
        self.base_model = base_model
        self.sparse_target = sparse_target

        self.metric = metric
        if isinstance(metric, str):
            if metric == 'accuracy':
                self.metrics_fn = tf.keras.metrics.Accuracy(name='accuracy')
            else:
                raise ValueError('unknown metric name')
        else:
            # A metric object was supplied; use it as is.
            self.metrics_fn = self.metric
        self.loss_tracker = tf.keras.metrics.Mean(name='loss')

    def call(self, inputs, training=False):
        """Run the base model; return all CRF outputs when training, else only the decoded tags."""
        outputs = self.base_model(inputs, training=training)
        if training:
            return outputs
        return outputs[0]

    def compute_loss(self, x, y, training=False):
        """Return ``(viterbi_sequence, sequence_length, mean CRF negative log-likelihood)``.

        Args:
            x: Model inputs.
            y: Integer tag ids of shape ``(batch_size, seq_length)``.
            training: Whether the forward pass runs in training mode.
        """
        if training:
            outputs = self(x, training=True)
        else:
            # In inference mode `call` returns only the decoded tags, so ask the base
            # model directly for the potentials and transition matrix the loss needs.
            outputs = self.base_model(x, training=False)
        viterbi_sequence, potentials, sequence_length, chain_kernel = outputs
        # we now add the CRF loss:
        crf_loss = -crf_log_likelihood(potentials, y, sequence_length, chain_kernel)[0]
        return viterbi_sequence, sequence_length, tf.reduce_mean(crf_loss)

    def train_step(self, data):
        """One optimisation step on the CRF loss plus any regularisation losses.

        ``sample_weight`` is unpacked from ``data`` but not used.
        """
        x, y, sample_weight = unpack_data(data)
        # y : '(batch_size, seq_length)'
        if self.sparse_target:
            assert len(y.shape) == 2
        else:
            y = tf.argmax(y, axis=-1)
        with tf.GradientTape() as tape:
            viterbi_sequence, sequence_length, crf_loss = self.compute_loss(x, y, training=True)
            loss = crf_loss + tf.cast(tf.reduce_sum(self.losses), crf_loss.dtype)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.loss_tracker.update_state(loss)
        # The third argument weights each position by the sequence mask.
        self.metrics_fn.update_state(y, viterbi_sequence, tf.sequence_mask(sequence_length, y.shape[1]))
        return {"loss": self.loss_tracker.result(), self.metrics_fn.name: self.metrics_fn.result()}

    @property
    def metrics(self):
        """Metrics tracked by this model: the loss tracker and the training metric."""
        return [self.loss_tracker, self.metrics_fn]

    def test_step(self, data):
        """One evaluation step.

        The forward pass runs with ``training=False``. Metric names are
        returned without a manual ``val_`` prefix, because Keras prepends
        ``val_`` itself when this step is used for validation inside ``fit``.
        ``sample_weight`` is unpacked from ``data`` but not used.
        """
        x, y, sample_weight = unpack_data(data)
        # y : '(batch_size, seq_length)'
        if self.sparse_target:
            assert len(y.shape) == 2
        else:
            y = tf.argmax(y, axis=-1)
        viterbi_sequence, sequence_length, crf_loss = self.compute_loss(x, y, training=False)
        loss = crf_loss + tf.cast(tf.reduce_sum(self.losses), crf_loss.dtype)
        self.loss_tracker.update_state(loss)
        self.metrics_fn.update_state(y, viterbi_sequence, tf.sequence_mask(sequence_length, y.shape[1]))
        return {"loss": self.loss_tracker.result(), self.metrics_fn.name: self.metrics_fn.result()}

In [14]:
class TFBertNERClassifier(tf.keras.Model):
    """BERT encoder followed by dropout, a dense tag classifier and a CRF layer.

    Args:
        model_name: Name of the pretrained BERT checkpoint to load.
        dir_path: Cache directory passed to ``from_pretrained``.
        num_class: Number of tags; sizes both the dense classifier and the CRF.
    """

    def __init__(self, model_name, dir_path, num_class):
        super(TFBertNERClassifier, self).__init__()

        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_class,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),
                                                name="ner_classifier")
        # Sized from num_class rather than a hard-coded 8 so the head follows the label list.
        self.crf = CRF(units=num_class, name='crf')

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        """Run the full model and return the CRF layer's output tuple.

        Returns:
            ``(viterbi_sequence, potentials, sequence_lengths, transitions)``
            as produced by the CRF layer.
        """
        # BERT outputs: sequence_output, pooled_output, (hidden_states), (attentions)
        outputs = self.bert(inputs, attention_mask=attention_mask,
                            token_type_ids=token_type_ids, training=training)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output, training=training)
        logit = self.classifier(sequence_output)
        # NOTE(review): the CRF is called without a mask, so it treats every position
        # (padding included) as part of the sequence — consider passing the attention mask.
        logits = self.crf(logit)

        return logits

In [15]:
# Build the BERT + CRF tagger with one output class per entry of ner_labels.
ner_model = TFBertNERClassifier(model_name='bert-base-multilingual-cased',
                                  dir_path='bert_ckpt',
                                  num_class=len(ner_labels))

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [16]:
# Wrap the tagger so that fit() trains on the CRF loss defined in ModelWithCRFLoss.
# Guarded so that re-running this cell does not wrap an already wrapped model.
if not isinstance(ner_model, ModelWithCRFLoss):
    ner_model = ModelWithCRFLoss(ner_model)

In [17]:
# Prepare training: compile with the optimizer only (fixed learning rate 3e-5).
# No loss is passed here because ModelWithCRFLoss.train_step computes the CRF loss itself.
optimizer = tf.keras.optimizers.Adam(3e-5)
ner_model.compile(optimizer=optimizer)

In [18]:
# Same tags in the same order as before, rewritten in suffix form (TYPE-B / TYPE-I)
# for the seqeval calls below, which are made with suffix=True.
# NOTE(review): this rebinds ner_labels. Re-running the dataset-encoding cell after this
# cell would look tags up against these names instead of the B-/I- prefixed ones — confirm
# the format used in the data files before re-executing cells out of order.
ner_labels = ['UNK', 'O', 'PDT-B', 'PDT-I', 'MOV-B', 'MOV-I', 'TRV-B', 'TRV-I']

In [19]:
class F1Metrics(tf.keras.callbacks.Callback):
    """Callback that prints seqeval scores on a held-out set after every epoch.

    Args:
        x_eval: Model inputs of the evaluation set, passed to ``predict``.
        y_eval: Integer label ids of the evaluation set, indexed as
            ``[example, position]``.
    """

    def __init__(self, x_eval, y_eval):
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval

    def compute_f1_pre_rec(self, labels, preds):
        """Return accuracy, precision, recall and F1 for lists of tag-name sequences."""
        return {
            "accuracy": accuracy_score(labels, preds),
            "precision": precision_score(labels, preds, suffix=True),
            "recall": recall_score(labels, preds, suffix=True),
            "f1": f1_score(labels, preds, suffix=True)
        }

    def show_report(self, labels, preds):
        """Return the seqeval classification report as text."""
        return classification_report(labels, preds, suffix=True)

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the evaluation set and print the scores and the report.

        Positions whose gold label id is 0 are left out of the evaluation.
        At the remaining positions a predicted id of 0 is counted as id 1.
        """
        results = {}

        pred_argmax = self.model.predict(self.x_eval)
        label = self.y_eval

        slot_label_map = {i: label_name for i, label_name in enumerate(ner_labels)}

        out_label_list = []
        preds_list = []
        for gold_row, pred_row in zip(label, pred_argmax):
            # Keep only the positions that carry a non-zero gold label.
            keep = np.asarray(gold_row) != 0
            gold_ids = np.asarray(gold_row)[keep]
            pred_ids = np.asarray(pred_row)[keep]
            # A prediction of id 0 at a kept position is counted as id 1.
            pred_ids = np.where(pred_ids == 0, 1, pred_ids)
            out_label_list.append([slot_label_map[tag] for tag in gold_ids.tolist()])
            preds_list.append([slot_label_map[tag] for tag in pred_ids.tolist()])

        result = self.compute_f1_pre_rec(out_label_list, preds_list)
        results.update(result)

        print("********")
        print("F1 Score")
        for key in sorted(results.keys()):
            print("{}, {:.4f}".format(key, results[key]))
        print("\n" + self.show_report(out_label_list, preds_list))
        print("********")

# Score the model on the encoded test split at the end of every epoch.
f1_score_callback = F1Metrics(test_inputs, test_labels)

In [20]:
model_name = "tf2_bert_ner"

DATA_IN_PATH = 'data_in/KOR'
DATA_OUT_PATH = "data_out/KOR"

checkpoint_path = os.path.join(DATA_OUT_PATH , model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create the checkpoint folder unless it already exists
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))

# fit() below is called without validation data, so the default monitor ('val_loss')
# never appears in the logs and save_best_only would skip every save. Monitor the
# training loss, which train_step reports under the key "loss".
# NOTE(review): pass validation_data to fit() and monitor a validation metric instead
# if checkpoints should be selected on held-out performance.
cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='loss', verbose=1, save_best_only=True, save_weights_only=True)

history = ner_model.fit(train_inputs,train_labels, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
                       callbacks=[cp_callback, f1_score_callback])

print(history.history)

data_out/KOR\tf2_bert_ner -- Folder already exists 

Epoch 1/20
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


  return py_builtins.overload_of(f)(*args)


********
F1 Score
accuracy, 0.7873
f1, 0.1058
precision, 0.1368
recall, 0.0862

              precision    recall  f1-score   support

         MOV       0.00      0.00      0.00      1363
         PDT       0.16      0.15      0.16      2258
         TRV       0.01      0.00      0.00       369

   micro avg       0.14      0.09      0.11      3990
   macro avg       0.06      0.05      0.05      3990
weighted avg       0.09      0.09      0.09      3990

********
Epoch 2/20
********
F1 Score
accuracy, 0.8390
f1, 0.3152
precision, 0.3291
recall, 0.3025

              precision    recall  f1-score   support

         MOV       0.22      0.21      0.21      1363
         PDT       0.44      0.40      0.42      2258
         TRV       0.08      0.07      0.08       369

   micro avg       0.33      0.30      0.32      3990
   macro avg       0.25      0.23      0.24      3990
weighted avg       0.33      0.30      0.32      3990

********
Epoch 3/20
********
F1 Score
accuracy, 0.8446
f1,

********
F1 Score
accuracy, 0.8356
f1, 0.4562
precision, 0.4807
recall, 0.4341

              precision    recall  f1-score   support

         MOV       0.40      0.30      0.35      1363
         PDT       0.55      0.55      0.55      2258
         TRV       0.24      0.21      0.23       369

   micro avg       0.48      0.43      0.46      3990
   macro avg       0.40      0.35      0.37      3990
weighted avg       0.47      0.43      0.45      3990

********
Epoch 13/20
********
F1 Score
accuracy, 0.8394
f1, 0.4554
precision, 0.4747
recall, 0.4376

              precision    recall  f1-score   support

         MOV       0.41      0.30      0.35      1363
         PDT       0.53      0.56      0.54      2258
         TRV       0.27      0.20      0.23       369

   micro avg       0.47      0.44      0.46      3990
   macro avg       0.40      0.35      0.37      3990
weighted avg       0.46      0.44      0.45      3990

********
Epoch 14/20
********
F1 Score
accuracy, 0.8423
f

In [None]:
# Restore previously saved weights before running inference.
# NOTE(review): 'save_model' is not the checkpoint_path used by the ModelCheckpoint callback
# in the training cell (…/tf2_bert_ner/weights.h5) — confirm which weights are meant here.
ner_model.load_weights('save_model')

In [None]:
def new_predict(text):
    """Tag one raw sentence and print every token predicted to belong to an entity.

    The sentence is encoded with ``bert_tokenizer`` and sent through
    ``ner_model.predict``. The wrapped model returns decoded tag ids in
    inference mode, so they are used directly; if per-class scores come back
    instead, the arg-max over the last axis is taken. Padding positions,
    special tokens and the tags 'UNK' / 'O' are skipped. Each remaining token
    is printed next to its tag from ``ner_labels``.

    Args:
        text: The sentence to tag.

    Returns:
        None. The result is written to stdout, one "token tag" pair per line.
    """
    input_id, attention_mask, token_type_id = bert_tokenizer(text, MAX_LEN)

    # Batch of one example.
    new_inputs = (
        np.array([input_id], dtype=int),
        np.array([attention_mask], dtype=int),
        np.array([token_type_id], dtype=int),
    )

    predict = ner_model.predict(new_inputs, batch_size=512)
    predicted = np.asarray(predict[0])
    if predicted.ndim == 2:
        # Per-class scores for each position: reduce them to tag ids.
        predicted = predicted.argmax(axis=-1)

    tokens = tokenizer.convert_ids_to_tokens(input_id)
    special_tokens = set(tokenizer.all_special_tokens)
    non_entity_labels = ('UNK', 'O')

    for token, tag_id, is_real in zip(tokens, predicted.tolist(), attention_mask):
        if not is_real or token in special_tokens:
            # Padding or a special token such as [CLS] / [SEP] / [UNK].
            continue
        label = ner_labels[tag_id]
        if label in non_entity_labels:
            continue
        print(token, label)

In [None]:
# Example: run the tagger on a single product-review paragraph.
new_predict("깔끔하게 부직포 포장으로 되어 있어서 그냥 뜨거운 물에 풍덩 넣어놓고 좀 휘젓어주면 금방 우러난다. 목욕할 때마다 넣어봤는데(샤워는 자주 해도 목욕은 그렇게 자주가 아님.. 이것도 약재는 약재이므로 용법은 알아서;;)신선한 한약풀 냄새가 욕실에 퍼져서 기분이 좋아졌다. 아직 때가 안 되서 효과까지는 모르겠는데 가려운 피부에도 효과가 있었으면 좋겠네. 박하 같은 것도 팔던데 지금으로서는 대만족이라 다음에는 상쾌하게 박하 사고 싶다. 혹시 오래된 거 팔지 않나 고민했었는데 쑥향기 자체가 페퍼민트처럼 신선하고 포장도 깔끔하고 사용도 간편하고 참.. 우리나라 인터넷 시장도 좋은 거 같다. 주문하니 이렇게 물에 넣기만 하면 되게 딱딱 만들어서 집까지 슝 배달해주고..")