In [1]:
import os
import tensorflow as tf 
from transformers import TFGPT2Model

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import gluonnlp as nlp
from gluonnlp.data import SentencepieceTokenizer

import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
import re

In [2]:
# One shared seed, applied to TensorFlow first and then to NumPy's global RNG.
SEED_NUM = 1234
for seed_fn in (tf.random.set_seed, np.random.seed):
    seed_fn(SEED_NUM)

## 데이터 준비

In [3]:
# SentencePiece model file used by both the tokenizer and the vocabulary.
TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'

# Special-token settings forwarded to BERTVocab.from_sentencepiece.
# The mask and cls tokens are explicitly disabled (None).
special_tokens = dict(
    mask_token=None,
    sep_token='<unused0>',
    cls_token=None,
    unknown_token='<unk>',
    padding_token='<pad>',
    bos_token='<s>',
    eos_token='</s>',
)

tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH, **special_tokens)

In [4]:
# Training hyper-parameters.
BATCH_SIZE, NUM_EPOCHS = 32, 3
VALID_SPLIT = 0.1    # passed to fit() as validation_split
SENT_MAX_LEN = 39    # maxlen handed to pad_sequences (BOS/EOS are added on top)

# Root folders for input data and produced artifacts.
DATA_IN_PATH = './data_in/KOR'
DATA_OUT_PATH = './data_out/KOR'

# Both rating files live in the same 'naver_movie' sub-folder.
NAVER_MOVIE_DIR = os.path.join(DATA_IN_PATH, 'naver_movie')
DATA_TRAIN_PATH = os.path.join(NAVER_MOVIE_DIR, 'ratings_train.txt')
DATA_TEST_PATH = os.path.join(NAVER_MOVIE_DIR, 'ratings_test.txt')


# Read the tab-separated training file (first row is the header, quoting=3
# turns quote handling off) and discard rows that contain missing values.
train_data = pd.read_csv(DATA_TRAIN_PATH, sep='\t', header=0, quoting=3).dropna()

print("Total # dataset: train - {}".format(len(train_data)))

def clean_text(sent):
    """Return ``sent`` with every non-Hangul, non-whitespace character removed.

    Characters that survive: Hangul syllables (가-힣), the jamo ranges
    ㄱ-ㅎ and ㅏ-ㅣ, and whitespace. Everything else (Latin letters, digits,
    punctuation, ...) is deleted; surrounding whitespace is left as is.
    """
    disallowed = re.compile("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]")
    return disallowed.sub("", sent)

# Encode the training set as fixed-length id rows: [BOS] + padded tokens + [EOS].
#
# The special-token ids do not change between sentences, so they are looked up
# once instead of on every iteration (original author's notes: bos=0, pad=3, eos=1).
bos_id = vocab[vocab.bos_token]
pad_id = vocab[vocab.padding_token]
eos_id = vocab[vocab.eos_token]

train_token_ids = []
train_data_labels = []

for train_sent, train_label in train_data[['document', 'label']].values:
    # clean -> SentencePiece pieces -> vocabulary ids
    train_token_ids.append(vocab[tokenizer(clean_text(train_sent))])
    train_data_labels.append(train_label)

# Pad/truncate every sentence to SENT_MAX_LEN in a single call rather than
# one pad_sequences call per sentence.
# NOTE(review): `truncating` is left at its Keras default ('pre'), so sentences
# longer than SENT_MAX_LEN lose their leading tokens - confirm this is intended.
train_padded = pad_sequences(train_token_ids,
                             maxlen=SENT_MAX_LEN,
                             value=pad_id,
                             padding='post')

# Frame each padded row with a BOS column on the left and an EOS column on the
# right; EOS therefore sits after the padding, at the last position.
num_train_rows = len(train_padded)
train_data_sents = np.concatenate(
    [np.full((num_train_rows, 1), bos_id, dtype=np.int64),
     train_padded,
     np.full((num_train_rows, 1), eos_id, dtype=np.int64)],
    axis=1,
).astype(np.int64)
train_data_labels = np.array(train_data_labels, dtype=np.int64)

Total # dataset: train - 149995


## 모델 학습

In [5]:
class TFGPT2Classifier(tf.keras.Model):
    """Pretrained GPT-2 backbone followed by dropout and a dense classification head."""

    def __init__(self, dir_path, num_class):
        """Load the backbone and build the head.

        Args:
            dir_path: location handed to ``TFGPT2Model.from_pretrained``.
            num_class: number of output units (classes) of the dense head.
        """
        super().__init__()

        self.gpt2 = TFGPT2Model.from_pretrained(dir_path)
        self.num_class = num_class

        # Dropout rate and initializer spread both come from the backbone config.
        gpt2_config = self.gpt2.config
        head_initializer = tf.keras.initializers.TruncatedNormal(
            stddev=gpt2_config.initializer_range)

        self.dropout = tf.keras.layers.Dropout(gpt2_config.summary_first_dropout)
        self.classifier = tf.keras.layers.Dense(self.num_class,
                                                kernel_initializer=head_initializer,
                                                name="classifier")

    def call(self, inputs):
        """Return class logits computed from the last sequence position.

        Args:
            inputs: whatever the wrapped GPT-2 model accepts (the notebook
                passes a batch of token-id rows).

        Returns:
            The dense head's output; no softmax is applied.
        """
        # First element of the GPT-2 outputs, sliced at the final position.
        hidden_states = self.gpt2(inputs)[0]
        last_position = hidden_states[:, -1]

        return self.classifier(self.dropout(last_position))

In [6]:
# Directory holding the pretrained GPT-2 checkpoint.
BASE_MODEL_PATH = './gpt_ckpt'
cls_model = TFGPT2Classifier(num_class=2, dir_path=BASE_MODEL_PATH)

# The classifier returns raw logits, hence from_logits=True on the loss.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=6.25e-5)

cls_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

All model checkpoint weights were used when initializing TFGPT2Model.

All the weights of TFGPT2Model were initialized from the model checkpoint at ./gpt_ckpt.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFGPT2Model for predictions without further training.


In [10]:
model_name = 'tf2_gpt2_naver_movie'

# Train on a small subset (first 1000 rows). The subset is built here, right
# before it is used, so this cell runs on a fresh kernel without relying on a
# cell that appears later in the notebook.
train_data_sents_short = train_data_sents[:1000]
train_data_labels_short = train_data_labels[:1000]

# Stop when validation accuracy has not improved by at least 0.0001 for 2 epochs.
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)

# Weights are written to <DATA_OUT_PATH>/<model_name>/weights.h5.
checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))

# Keep only the weights of the epoch with the best validation accuracy.
cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

history = cls_model.fit(train_data_sents_short,
                        train_data_labels_short,
                        epochs=NUM_EPOCHS,
                        batch_size=BATCH_SIZE,
                        validation_split=VALID_SPLIT,
                        callbacks=[earlystop_callback, cp_callback])

./data_out/KOR\tf2_gpt2_naver_movie -- Folder already exists 

Epoch 1/3
Epoch 1: val_accuracy improved from -inf to 0.42000, saving model to ./data_out/KOR\tf2_gpt2_naver_movie\weights.h5
Epoch 2/3
Epoch 2: val_accuracy improved from 0.42000 to 0.55000, saving model to ./data_out/KOR\tf2_gpt2_naver_movie\weights.h5
Epoch 3/3
Epoch 3: val_accuracy did not improve from 0.55000


In [8]:
# Number of encoded training sentences (size of the array's first axis).
train_data_sents.shape[0]

149995

In [9]:
# First 1000 encoded sentences and their labels; these names are read by the
# fit() call in the training cell.
short_subset = slice(1000)
train_data_sents_short = train_data_sents[short_subset]
train_data_labels_short = train_data_labels[short_subset]

## 모델 테스트

In [12]:
# Read the tab-separated test file, drop rows with missing values
# (49997 test rows per the original author's note), then keep the first 100.
test_data = (
    pd.read_csv(DATA_TEST_PATH, header=0, delimiter='\t', quoting=3)
    .dropna()
    .iloc[:100]
)

# Encode the test set as fixed-length id rows: [BOS] + padded tokens + [EOS],
# the same layout produced by the training-set encoding loop.
#
# The special-token ids do not change between sentences, so look them up once.
test_bos_id = vocab[vocab.bos_token]
test_pad_id = vocab[vocab.padding_token]
test_eos_id = vocab[vocab.eos_token]

test_token_ids = []
test_data_labels = []

for test_sent, test_label in test_data[['document', 'label']].values:
    # clean -> SentencePiece pieces -> vocabulary ids
    test_token_ids.append(vocab[tokenizer(clean_text(test_sent))])
    test_data_labels.append(test_label)

# Pad/truncate all sentences to SENT_MAX_LEN in one call rather than one
# pad_sequences call per sentence.
# NOTE(review): `truncating` is left at its Keras default ('pre'), so sentences
# longer than SENT_MAX_LEN lose their leading tokens - confirm this is intended.
test_padded = pad_sequences(test_token_ids,
                            maxlen=SENT_MAX_LEN,
                            value=test_pad_id,
                            padding='post')

# Frame each padded row with a BOS column on the left and an EOS column on the right.
num_test_rows = len(test_padded)
test_data_sents = np.concatenate(
    [np.full((num_test_rows, 1), test_bos_id, dtype=np.int64),
     test_padded,
     np.full((num_test_rows, 1), test_eos_id, dtype=np.int64)],
    axis=1,
).astype(np.int64)
test_data_labels = np.array(test_data_labels, dtype=np.int64)

print("num sents, labels {}, {}".format(len(test_data_sents), len(test_data_labels)))

# Restore the weights stored at the checkpoint path used by the training
# callback before scoring.
cls_model.load_weights(checkpoint_path)

# evaluate() reports the compiled loss followed by the compiled metric(s).
results = cls_model.evaluate(x=test_data_sents, y=test_data_labels, batch_size=1024)
print("test loss, test acc: ", results)

num sents, labels 100, 100
test loss, test acc:  [0.6902951002120972, 0.7099999785423279]
