In [2]:
import os
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
import numpy as np
import re

In [3]:
# Seed TensorFlow's global random generator.
# NOTE(review): the next cell calls tf.random.set_seed(0) and np.random.seed(0),
# which overrides this value — confirm which seed is intended.
SEED_NUM = 1234
tf.random.set_seed(SEED_NUM)

In [4]:
# Load the cased multilingual BERT vocabulary; downloaded files are cached under
# 'bert_ckpt'. Lower-casing is disabled to match the cased checkpoint name.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased",
                                         cache_dir='bert_ckpt',
                                         do_lower_case=False)

# NOTE(review): these calls re-seed TensorFlow (and seed NumPy) with 0, replacing
# the SEED_NUM-based seed set in the previous cell — confirm which is intended.
tf.random.set_seed(0)
np.random.seed(0)

# Training hyper-parameters.
BATCH_SIZE = 32
NUM_EPOCHS = 3
VALID_SPLIT = 0.2  # not referenced in the cells visible here
MAX_LEN = 28 * 2   # max_length handed to the tokenizer for an encoded sentence pair

# Input / output data directories.
DATA_IN_PATH = 'data_in/KOR'
DATA_OUT_PATH = 'data_out/KOR'

# Show the special tokens and their ids.
print(tokenizer.all_special_tokens, '\n', tokenizer.all_special_ids)
# Example: ['[PAD]', '[SEP]', '[MASK]', '[CLS]', '[UNK]']
#          [0, 102, 103, 101, 100]
# (the ordering of the printed lists can differ; see the recorded output below)

# Round-trip a Korean and an English sentence through encode/decode to inspect
# the resulting ids and the [CLS]/[SEP] markers that encode() adds.
kor_encode = tokenizer.encode("안녕하세요, 반갑습니다.")
eng_encode = tokenizer.encode("Hello, world!")

kor_decode = tokenizer.decode(kor_encode)
eng_decode = tokenizer.decode(eng_encode)

print(kor_encode)
print(eng_encode)
print(kor_decode)
print(eng_decode)

['[MASK]', '[PAD]', '[SEP]', '[CLS]', '[UNK]'] 
 [103, 0, 102, 101, 100]
[101, 9521, 118741, 35506, 24982, 48549, 117, 9321, 118610, 119081, 48345, 119, 102]
[101, 31178, 117, 11356, 106, 102]
[CLS] 안녕하세요, 반갑습니다. [SEP]
[CLS] Hello, world! [SEP]


# KorSTS Dataset

In [5]:
# Locate the KorSTS train/dev TSV files under the input data directory.
KORSTS_DIR = os.path.join(DATA_IN_PATH, 'KorSTS')
TRAIN_STS_DF = os.path.join(KORSTS_DIR, 'sts-train.tsv')
DEV_STS_DF = os.path.join(KORSTS_DIR, 'sts-dev.tsv')

# Tab-separated, first row is the header; quoting=3 is csv.QUOTE_NONE, so quote
# characters inside sentences are kept as literal text.
tsv_read_options = dict(header=0, sep='\t', quoting=3)
train_data = pd.read_csv(TRAIN_STS_DF, **tsv_read_options)
dev_data = pd.read_csv(DEV_STS_DF, **tsv_read_options)

n_train, n_dev = len(train_data), len(dev_data)
print("Total # dataset: train - {}, dev - {}".format(n_train, n_dev))

Total # dataset: train - 5749, dev - 1500


In [6]:
def bert_tokenizer_v2(sent1, sent2, MAX_LEN):
    """Encode a sentence pair into fixed-length BERT inputs.

    Uses the module-level ``tokenizer`` to encode ``sent1`` and ``sent2`` as one
    pair with special tokens added, truncated and padded to ``MAX_LEN``.

    Args:
        sent1: First sentence of the pair.
        sent2: Second sentence of the pair.
        MAX_LEN: Value passed to the tokenizer as ``max_length``; sequences are
            truncated and padded to this length.

    Returns:
        A tuple ``(input_id, attention_mask, token_type_id)`` holding the
        ``input_ids``, ``attention_mask`` and ``token_type_ids`` entries of the
        encoding.
    """
    encoded_dict = tokenizer.encode_plus(
        text=sent1,
        text_pair=sent2,
        add_special_tokens=True,
        max_length=MAX_LEN,
        # `padding='max_length'` is the supported spelling of the deprecated
        # `pad_to_max_length=True`; with `max_length` set the result is the same.
        padding='max_length',
        return_attention_mask=True,
        truncation=True)

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']
    token_type_id = encoded_dict['token_type_ids']

    return input_id, attention_mask, token_type_id

In [7]:
def clean_text(sent):
    """Replace every disallowed character in ``sent`` with a single space.

    Allowed characters are ASCII letters and digits, the Hangul ranges
    ``ㄱ-ㅣ`` and ``가-힣``, and whitespace. Each other character is replaced
    one-for-one, so the length of the string is unchanged.
    """
    disallowed_chars = re.compile("[^a-zA-Z0-9ㄱ-ㅣ가-힣\\s]")
    return disallowed_chars.sub(" ", sent)

# Encode every training pair into BERT inputs. A row whose cleaning or
# tokenisation raises is printed and skipped, so features and labels stay aligned.
input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

train_rows = train_data[['sentence1', 'sentence2', 'score']].values
for sent1, sent2, score in train_rows:
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer_v2(
            clean_text(sent1), clean_text(sent2), MAX_LEN)
    except Exception as e:
        print(e)
        print(sent1, sent2)
    else:
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(score)

# Stack the per-example lists into arrays and bundle them as the model input tuple.
train_input_ids = np.array(input_ids, dtype=int)
train_attention_masks = np.array(attention_masks, dtype=int)
train_type_ids = np.array(token_type_ids, dtype=int)
train_inputs = (train_input_ids, train_attention_masks, train_type_ids)
train_data_labels = np.array(data_labels)

In [26]:
# Inspect the raw (sentence1, sentence2, score) rows that feed the encoding loop.
train_data[['sentence1', 'sentence2', 'score']].to_numpy()

array([['비행기가 이륙하고 있다.', '비행기가 이륙하고 있다.', 5.0],
       ['한 남자가 큰 플루트를 연주하고 있다.', '남자가 플루트를 연주하고 있다.', 3.8],
       ['한 남자가 피자에 치즈를 뿌려놓고 있다.', '한 남자가 구운 피자에 치즈 조각을 뿌려놓고 있다.', 3.8],
       ...,
       ['바레인으로 향하는 대통령', '시 주석 : 에볼라 퇴치를 계속 돕기 위한 중국', 0.0],
       ['중국, 인도는 양국 관계를 증진시키겠다고 맹세한다',
        '중국은 불안한 주식 거래자들을 안심시키기 위해 뒤뚱거리고 있다.', 0.0],
       ['푸틴 대변인 : 도핑 혐의는 근거 없는 것으로 보인다.',
        '가장 최근의 심한 날씨 : 토네이도 후 텍사스에서 1명 사망', 0.0]], dtype=object)

In [27]:
# (rows, columns) of the three-column training selection.
train_data[['sentence1', 'sentence2', 'score']].shape

(5749, 3)

# DEV SET Preprocessing

In [8]:
# Encode every dev pair the same way as the training set. A row whose cleaning
# or tokenisation raises is printed and skipped.
input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

dev_rows = dev_data[['sentence1', 'sentence2', 'score']].values
for sent1, sent2, score in dev_rows:
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer_v2(
            clean_text(sent1), clean_text(sent2), MAX_LEN)
    except Exception as e:
        print(e)
        print(sent1, sent2)
    else:
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(score)

# Stack the per-example lists into arrays and bundle them as the model input tuple.
dev_input_ids = np.array(input_ids, dtype=int)
dev_attention_masks = np.array(attention_masks, dtype=int)
dev_type_ids = np.array(token_type_ids, dtype=int)
dev_inputs = (dev_input_ids, dev_attention_masks, dev_type_ids)
dev_data_labels = np.array(data_labels)

In [9]:
# Sanity check: number of successfully encoded examples per split.
label_counts = (len(train_data_labels), len(dev_data_labels))
print("# train labels: {}, #dev labels: {}".format(*label_counts))

# train labels: 5749, #dev labels: 1500


In [10]:
class TFBertRegressor(tf.keras.Model):
    """Pre-trained BERT encoder followed by dropout and a dense output layer.

    Args:
        model_name: Name of the pre-trained checkpoint handed to
            ``TFBertModel.from_pretrained``.
        dir_path: Cache directory for the downloaded checkpoint.
        num_class: Number of units of the final dense layer.
    """

    def __init__(self, model_name, dir_path, num_class):
        super().__init__()

        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        self.num_class = num_class

        # Dropout rate and initializer range are taken from the encoder's config.
        bert_config = self.bert.config
        self.dropout = tf.keras.layers.Dropout(bert_config.hidden_dropout_prob)
        head_initializer = tf.keras.initializers.TruncatedNormal(bert_config.initializer_range)
        self.regressor = tf.keras.layers.Dense(
            self.num_class,
            kernel_initializer=head_initializer,
            name='regressor',
        )

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        """Run the encoder and map element 1 of its outputs to the dense head.

        ``training`` controls the dropout layer applied before the dense head.
        Returns the output of the dense ``regressor`` layer.
        """
        encoder_outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Element 1 of the encoder outputs is what this model treats as the pooled output.
        pooled = self.dropout(encoder_outputs[1], training=training)
        return self.regressor(pooled)

In [11]:
# One output unit: the model predicts a single similarity score per sentence pair.
regression_model = TFBertRegressor(
    model_name='bert-base-multilingual-cased',
    dir_path='bert_ckpt',
    num_class=1,
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.


In [12]:
class PearsonCorrelationMetric(tf.keras.metrics.Metric):
    """Pearson correlation computed over all batches seen since the last reset.

    Each batch given to ``update_state`` is flattened and appended to a Python
    list; ``result`` concatenates the stored batches and computes a single
    correlation coefficient over them. State is held in plain Python lists of
    tensors rather than Keras variables.
    """

    def __init__(self, name="pearson_correlation", **kwargs):
        super().__init__(name=name, **kwargs)
        self.y_true_list = []
        self.y_pred_list = []

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Store one batch of targets and predictions, flattened to 1-D.

        ``sample_weight`` is accepted for API compatibility but is not used.
        """
        y_true = tf.reshape(y_true, shape=[-1])
        y_pred = tf.reshape(y_pred, shape=[-1])
        self.y_true_list.append(y_true)
        self.y_pred_list.append(y_pred)

    def result(self):
        """Concatenate the stored batches and return their Pearson correlation.

        Returns 0.0 when no batch has been stored yet, because concatenating an
        empty list of tensors would raise.
        """
        if not self.y_true_list:
            return tf.constant(0.0)

        y_true = tf.concat(self.y_true_list, -1)
        y_pred = tf.concat(self.y_pred_list, -1)
        return self.pearson(y_true, y_pred)

    def reset_state(self):
        """Drop all stored batches."""
        self.y_true_list = []
        self.y_pred_list = []

    def reset_states(self):
        """Deprecated spelling of ``reset_state``.

        Kept so Keras versions and callers that still invoke ``reset_states()``
        keep working; newer Keras warns when only this name is overridden.
        """
        self.reset_state()

    def pearson(self, true, pred):
        """Return the Pearson correlation coefficient of two 1-D tensors."""
        centered_true = true - tf.reduce_mean(true)
        centered_pred = pred - tf.reduce_mean(pred)
        num = tf.reduce_sum(tf.multiply(centered_true, centered_pred))
        # The small epsilon avoids division by zero when either input is constant.
        den = tf.sqrt(tf.multiply(tf.reduce_sum(tf.square(centered_true)),
                                  tf.reduce_sum(tf.square(centered_pred)))) + 1e-12

        return num / den

In [13]:
# Fine-tuning setup: Adam optimizer, mean-squared-error loss, and the custom
# Pearson correlation metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.MeanSquaredError()
metric = PearsonCorrelationMetric()

# run_eagerly=True keeps the custom metric from raising errors: it stores
# per-batch tensors in Python lists.
regression_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
    run_eagerly=True,
)

In [18]:
# Run training
model_name = "tf2_BERT_KorSTS"

# Build the reduced train/dev subsets inside this cell. These names were
# otherwise only assigned in a cell placed further down the notebook, so a
# top-to-bottom run (Restart & Run All) would fail here with a NameError.
TRAIN_SUBSET_SIZE = 100
DEV_SUBSET_SIZE = 10
train_inputs_short = tuple(i[:TRAIN_SUBSET_SIZE] for i in train_inputs)
train_data_labels_short = train_data_labels[:TRAIN_SUBSET_SIZE]
dev_inputs_short = tuple(i[:DEV_SUBSET_SIZE] for i in dev_inputs)
dev_data_labels_short = dev_data_labels[:DEV_SUBSET_SIZE]

# Early stopping to limit overfitting.
# min_delta: the validation Pearson correlation must improve by at least 0.0001
#            to count as an improvement.
# patience:  training stops after 2 consecutive epochs without such an improvement.
earlystop_callback = EarlyStopping(monitor='val_pearson_correlation', min_delta=0.0001, patience=2, mode='max')

checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create the checkpoint directory if it does not exist yet
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))

# Keep only the weights of the epoch with the best validation Pearson correlation.
cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_pearson_correlation', verbose=1, save_best_only=True, save_weights_only=True, mode='max')

# Start training and per-epoch evaluation
history = regression_model.fit(train_inputs_short, train_data_labels_short, epochs=NUM_EPOCHS,
            validation_data=(dev_inputs_short, dev_data_labels_short),
            batch_size=BATCH_SIZE, callbacks=[earlystop_callback, cp_callback])

# Per-epoch loss and metric values
print(history.history)

data_out/KOR\tf2_BERT_KorSTS -- Folder already exists 

Epoch 1/3


  m.reset_state()


Epoch 1: val_pearson_correlation improved from -inf to -0.03929, saving model to data_out/KOR\tf2_BERT_KorSTS\weights.h5
Epoch 2/3
Epoch 2: val_pearson_correlation improved from -0.03929 to 0.62759, saving model to data_out/KOR\tf2_BERT_KorSTS\weights.h5
Epoch 3/3
Epoch 3: val_pearson_correlation improved from 0.62759 to 0.67898, saving model to data_out/KOR\tf2_BERT_KorSTS\weights.h5
{'loss': [7.0894246101379395, 2.219385862350464, 1.801527976989746], 'pearson_correlation': [-0.04010386019945145, 0.03434368222951889, 0.08315519988536835], 'val_loss': [3.8617241382598877, 1.5127766132354736, 1.5706486701965332], 'val_pearson_correlation': [-0.039288006722927094, 0.6275914311408997, 0.6789785027503967]}


In [17]:
# Small subsets for a quick run: the first 100 training and first 10 dev examples.
train_inputs_short = tuple(array[:100] for array in train_inputs)
train_data_labels_short = train_data_labels[:100]

dev_inputs_short = tuple(array[:10] for array in dev_inputs)
dev_data_labels_short = dev_data_labels[:10]

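Note: training on the full dataset instead of the subsets above means 180 batches per epoch, and the run takes about two and a half hours.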

# KorSTS Test Dataset

In [19]:
# Load the KorSTS test split (tab-separated, header row, quoting=3 is csv.QUOTE_NONE).
TEST_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-test.tsv')

test_data = pd.read_csv(TEST_STS_DF, header=0, sep='\t', quoting=3)
print(test_data.head())

# Encode every test pair the same way as train/dev. A row whose cleaning or
# tokenisation raises is printed and skipped.
input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []

test_rows = test_data[['sentence1', 'sentence2', 'score']].values
for sent1, sent2, score in test_rows:
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer_v2(
            clean_text(sent1), clean_text(sent2), MAX_LEN)
    except Exception as e:
        print(e)
        print(sent1, sent2)
    else:
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(score)

# Stack the per-example lists into arrays and bundle them as the model input tuple.
test_input_ids = np.array(input_ids, dtype=int)
test_attention_masks = np.array(attention_masks, dtype=int)
test_type_ids = np.array(token_type_ids, dtype=int)
test_inputs = (test_input_ids, test_attention_masks, test_type_ids)
test_data_labels = np.array(data_labels)

n_test_sents, n_test_labels = len(test_input_ids), len(test_data_labels)
print("# sents: {}, # labels: {}".format(n_test_sents, n_test_labels))

# Restore the checkpoint written during training, then score the test set.
regression_model.load_weights(checkpoint_path)

results = regression_model.evaluate(test_inputs, test_data_labels, batch_size=512)
print("test loss, test pearson correlation: ", results)

           genre filename      year  id  score                sentence1  \
0  main-captions   MSRvid  2012test  24    2.5     한 소녀가 머리를 스타일링하고 있다.   
1  main-captions   MSRvid  2012test  33    3.6  한 무리의 남자들이 해변에서 축구를 한다.   
2  main-captions   MSRvid  2012test  45    5.0  한 여성이 다른 여성의 발목을 재고 있다.   
3  main-captions   MSRvid  2012test  63    4.2        한 남자가 오이를 자르고 있다.   
4  main-captions   MSRvid  2012test  66    1.5       한 남자가 하프를 연주하고 있다.   

                    sentence2  
0            한 소녀가 머리를 빗고 있다.  
1  한 무리의 소년들이 해변에서 축구를 하고 있다.  
2      한 여자는 다른 여자의 발목을 측정한다.  
3           한 남자가 오이를 자르고 있다.  
4         한 남자가 키보드를 연주하고 있다.  
# sents: 1379, # labels: 1379


  m.reset_state()


test loss, test pearson correlation:  [2.8333382606506348, 0.3583824038505554]
