# GPT(Generative Pre-trained Transformer) 2

* 참고: https://github.com/NLP-kr/tensorflow-ml-nlp-tf2

* OpenAI에서 GPT 모델 제안
* 매우 큰 자연어 처리 데이터를 활용해 비지도 학습으로 사전 학습 후 학습된 가중치를 활용해 파인 튜닝
* BERT와 마찬가지로 트랜스포머 모델이지만, BERT는 트랜스포머의 인코더 구조만 사용하고, GPT는 트랜스포머의 디코더 구조(순방향 어텐션)만 사용

* GPT2는 GPT1을 개선한 모델로, 레이어 정규화를 각 부분 블록(sub-block)의 입력 쪽으로 옮기고 마지막 셀프 어텐션 블록 뒤에 레이어 정규화를 추가로 적용
* GPT2는 GPT1에 비해 모델 크기가 매우 커진 향상된 모델 사용

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 라이브러리

In [None]:
# Install the exact package versions this notebook was written against.
!pip install transformers==2.11.0
!pip install tensorflow==2.2.0
!pip install sentencepiece==0.1.85
!pip install gluonnlp==0.9.1
!pip install mxnet==1.6.0

## 데이터 다운로드

* https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/finetune_data.txt

In [None]:
!mkdir -p gpt2
!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/finetune_data.txt

In [None]:
import os
import numpy as np
import gluonnlp as nlp
from gluonnlp.data import SentencepieceTokenizer
from nltk.tokenize import sent_tokenize

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import TFGPT2LMHeadModel

## 사전 학습 모델

* https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip

In [None]:
!wget https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip -0 gpt_ckpt.zip
!upzip -o gpt_ckpt.zip

In [None]:
class GPT2Model(tf.keras.Model):
    """Keras wrapper around a pretrained ``TFGPT2LMHeadModel``.

    The wrapped model is loaded from ``dir_path`` and kept on ``self.gpt2``.
    """

    def __init__(self, dir_path):
        super().__init__()
        self.gpt2 = TFGPT2LMHeadModel.from_pretrained(dir_path)

    def call(self, inputs):
        """Run the wrapped model and return only the first element of its output."""
        lm_outputs = self.gpt2(inputs)
        return lm_outputs[0]

In [None]:
# Directory holding the pretrained checkpoint; used to build the LM wrapper.
BASE_MODEL_PATH = './gpt_ckpt'
gpt_model = GPT2Model(BASE_MODEL_PATH)

In [None]:
# Hyper-parameters for language-model fine-tuning.
BATCH_SIZE = 16
NUM_EPOCHS = 10
MAX_LEN = 30
TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'

# SentencePiece tokenizer and the matching vocabulary object.
# The vocabulary must be bound to `vocab`: the generation and data
# preparation code below looks tokens up through that name.
tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,
                                               mask_token = None,
                                               sep_token = None,
                                               cls_token = None,
                                               unknown_token = '<unk>',
                                               padding_token = '<pad>',
                                               bos_token = '<s>',
                                               eos_token = '</s>')

In [None]:
def tf_top_k_top_p_filtering(logits, top_k = 0, top_p = 0.0, filter_value = -99999):
    """Suppress unlikely tokens with top-k and/or nucleus (top-p) filtering.

    Args:
        logits: eager tensor of logits for one position (indexed along its
            last axis; assumed 1-D, as passed by ``generate_sentence``).
        top_k: keep only the ``top_k`` largest logits; ``0`` disables it.
        top_p: keep the smallest set of tokens whose cumulative probability
            exceeds ``top_p``; ``0.0`` disables it.
        filter_value: value written over filtered logits. It must be a large
            negative number so filtered tokens get (almost) zero probability.

    Returns:
        A constant tensor holding the filtered logits with a leading axis of
        size one added.
    """
    _logits = logits.numpy()
    top_k = min(top_k, logits.shape[-1])
    if top_k > 0:
        # Everything strictly below the k-th largest logit is removed.
        kth_largest = tf.math.top_k(logits, top_k)[0][..., -1, None]
        indices_to_remove = (logits < kth_largest).numpy()
        _logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits = tf.sort(logits, direction = 'DESCENDING')
        sorted_indices = tf.argsort(logits, direction = 'DESCENDING')
        cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis = -1), axis = -1)

        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the mask right by one so the first token that crosses the
        # threshold is still kept.
        sorted_indices_to_remove = tf.concat([[False], sorted_indices_to_remove[..., :-1]], axis = 0)
        indices_to_remove = sorted_indices[sorted_indices_to_remove].numpy().tolist()

        _logits[indices_to_remove] = filter_value

    return tf.constant([_logits])

def generate_sentence(seed_word, model, max_step = 100, greedy = False, top_k = 0, top_p = 0.):
    """Generate text token by token, starting from ``seed_word``.

    Args:
        seed_word: initial text that the generated tokens are appended to.
        model: callable mapping a batch of token ids to logits that are
            indexed as ``[:, -1, :]`` (last position of each sequence).
        max_step: maximum number of tokens to generate.
        greedy: if true, always pick the highest-scoring token; otherwise
            sample after top-k / top-p filtering.
        top_k: forwarded to ``tf_top_k_top_p_filtering``.
        top_p: forwarded to ``tf_top_k_top_p_filtering``.

    Returns:
        The seed text followed by the generated continuation.
    """
    sentence = seed_word
    toked = tokenizer(sentence)

    for _ in range(max_step):
        # Prepend BOS and add a batch axis of size one.
        input_ids = tf.constant([vocab[vocab.bos_token], ] + vocab[toked])[None, :]
        outputs = model(input_ids)[:, -1, :]
        if greedy:
            gen = vocab.to_tokens(tf.argmax(outputs, axis = -1).numpy().tolist()[0])
        else:
            output_logit = tf_top_k_top_p_filtering(outputs[0], top_k = top_k, top_p = top_p)
            gen = vocab.to_tokens(tf.random.categorical(output_logit, 1).numpy().tolist()[0])[0]
        # Stop as soon as the end-of-sentence token is produced.
        if gen == '</s>':
            break
        # SentencePiece marks word boundaries with U+2581; turn it into a space.
        sentence += gen.replace('▁', ' ')
        toked = tokenizer(sentence)

    return sentence

In [None]:
# Greedy decoding from the seed text with gpt_model.
generate_sentence('일부', gpt_model, greedy = True)

In [None]:
# Sampling with top-p filtering (top_p = 0.95, top-k disabled) with gpt_model.
generate_sentence('일부', gpt_model, top_k = 0, top_p = 0.95)

## 데이터 준비

In [None]:
# Directory and file name of the fine-tuning corpus; they are concatenated
# directly, so DATA_IN_PATH keeps its trailing slash.
DATA_IN_PATH = './gpt2/'
TRAIN_DATA_FILE = 'finetune_data.txt'

In [None]:
# Read one sentence per line. The file is closed deterministically, decoded
# as UTF-8, and only the line terminator is stripped (so a final line without
# a trailing newline keeps its last character).
with open(DATA_IN_PATH + TRAIN_DATA_FILE, encoding = 'utf-8') as data_file:
    sentences = [line.rstrip('\n') for line in data_file]

input_data = []
output_data = []

for sentence in sentences:
    # <bos> + sentence tokens + <eos>; the target is the input shifted by one.
    tokens = [vocab[vocab.bos_token], ] + vocab[tokenizer(sentence)] + [vocab[vocab.eos_token], ]
    input_data.append(tokens[:-1])
    output_data.append(tokens[1:])

# Pad / truncate every sequence to MAX_LEN with the padding token id.
input_data = pad_sequences(input_data, MAX_LEN, value = vocab[vocab.padding_token])
output_data = pad_sequences(output_data, MAX_LEN, value = vocab[vocab.padding_token])

input_data = np.array(input_data, dtype = np.int64)
output_data = np.array(output_data, dtype = np.int64)

## 모델 학습

In [None]:
# Cross-entropy computed on logits with reduction disabled, so loss_function
# can mask padding positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True,
                                                            reduction = 'none')
# Metric object used by accuracy_function.
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name = 'accuracy')

def loss_function(real, pred):
    """Return the mean of the element-wise loss with padding targets zeroed out.

    ``real`` holds target token ids and ``pred`` the corresponding logits.
    """
    per_token_loss = loss_object(real, pred)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    is_padding = tf.math.equal(real, vocab[vocab.padding_token])
    keep = tf.cast(tf.math.logical_not(is_padding), dtype = per_token_loss.dtype)

    return tf.reduce_mean(per_token_loss * keep)

def accuracy_function(real, pred):
    """Feed padding-masked predictions to ``train_accuracy`` and return its mean.

    Logits at positions whose target is the padding token are multiplied by
    zero before the metric is evaluated.
    """
    is_padding = tf.math.equal(real, vocab[vocab.padding_token])
    keep = tf.cast(tf.math.logical_not(is_padding), dtype = pred.dtype)
    # Add a trailing axis so the mask broadcasts over the last axis of pred.
    masked_pred = pred * tf.expand_dims(keep, axis = -1)

    return tf.reduce_mean(train_accuracy(real, masked_pred))

In [None]:
# Configure training with the padding-aware loss/accuracy defined above
# and Adam at a learning rate of 1e-4.
gpt_model.compile(loss = loss_function,
                  optimizer = tf.keras.optimizers.Adam(1e-4),
                  metrics = [accuracy_function])

In [None]:
# Fine-tune the language model; validation_split = 0.1 is passed to Keras.
history = gpt_model.fit(input_data, output_data,
                        batch_size = BATCH_SIZE, epochs = NUM_EPOCHS,
                        validation_split = 0.1)

In [None]:
# Output location for the fine-tuned language model.
DATA_OUT_PATH = './data_out'
model_name = 'tf2_gpt2_finetuned_model'

save_path = os.path.join(DATA_OUT_PATH, model_name)

# Create the directory if needed; exist_ok avoids a separate existence check.
os.makedirs(save_path, exist_ok = True)

# Save the wrapped transformers model in its pretrained format.
gpt_model.gpt2.save_pretrained(save_path)

# Reload the saved weights into a fresh wrapper (variable name kept as-is).
loadded_gpt_model = GPT2Model(save_path)

In [None]:
# Greedy decoding with gpt_model after fine-tuning.
generate_sentence('일부', gpt_model, greedy = True)

In [None]:
# Top-p sampling (top_p = 0.95) with gpt_model after fine-tuning.
generate_sentence('일부', gpt_model, top_k = 0, top_p = 0.95)

# GPT2 네이버 영화 리뷰 분류

## 데이터 다운로드

In [None]:
import re
import urllib.request

import pandas as pd
import matplotlib.pyplot as plt
# Select the Matplotlib style used for the plots in this notebook.
# NOTE(review): the 'seaborn-white' style name may be unavailable in newer
# Matplotlib releases — confirm against the installed version.
plt.style.use('seaborn-white')

from transformers import TFGPT2Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Fix the TensorFlow and NumPy random seeds.
tf.random.set_seed(111)
np.random.seed(111)

## 데이터 준비

In [None]:
# Settings for the movie-review classifier.
BATCH_SIZE = 32      # batch size passed to cls_model.fit
NUM_EPOCHS = 3       # number of training epochs
VALID_SPLIT = 0.1    # validation_split passed to cls_model.fit
SENT_MAX_LEN = 39    # review length in tokens before <bos>/<eos> are added

In [None]:
TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'

# SentencePiece tokenizer and matching vocabulary for the classifier.
# '<unused0>' is registered as the separator token here.
tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,
                                               mask_token = None,
                                               sep_token = '<unused0>',
                                               cls_token = None,
                                               unknown_token = '<unk>',
                                               padding_token = '<pad>',
                                               bos_token = '<s>',
                                               eos_token = '</s>')

* https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
* https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt


In [None]:
# Download the NSMC train/test splits and parse them as tab-separated tables.
# The HTTP responses are used as context managers so they are always closed.
with urllib.request.urlopen("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt") as train_file:
    train_data = pd.read_table(train_file)
with urllib.request.urlopen("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt") as test_file:
    test_data = pd.read_table(test_file)

# Drop rows with missing values.
train_data = train_data.dropna()
test_data = test_data.dropna()

In [None]:
# Preview the first rows of the training table.
train_data.head()

In [None]:
# Preview the first rows of the test table.
test_data.head()

In [None]:
def clean_text(text):
    """Strip every character that is neither Hangul (syllables or jamo) nor whitespace."""
    return re.sub(r"[^가-힣ㄱ-ㅎㅏ-ㅣ\s]", "", text)

In [None]:
# Token ids of every cleaned training review.
train_token_ids = [vocab[tokenizer(clean_text(document))]
                   for document in train_data['document'].values]

# Pad / truncate all reviews to SENT_MAX_LEN in a single call (padding at the end).
train_padded_rows = pad_sequences(train_token_ids,
                                  SENT_MAX_LEN,
                                  value = vocab[vocab.padding_token],
                                  padding = 'post').tolist()

# Wrap every padded review as <bos> ... <eos>.
bos_id = vocab[vocab.bos_token]
eos_id = vocab[vocab.eos_token]
train_data_sents = np.array([[bos_id] + row + [eos_id] for row in train_padded_rows],
                            dtype = np.int64)
train_data_labels = np.array(train_data['label'].values, dtype = np.int64)

## 모델 학습

In [None]:
class TFGPT2Classifier(tf.keras.Model):
    """Pretrained ``TFGPT2Model`` followed by dropout and a dense classification layer."""

    def __init__(self, dir_path, num_class):
        super().__init__()

        self.gpt2 = TFGPT2Model.from_pretrained(dir_path)
        self.num_class = num_class

        # Dropout rate and initializer range come from the loaded model's config.
        config = self.gpt2.config
        head_initializer = tf.keras.initializers.TruncatedNormal(stddev = config.initializer_range)

        self.dropout = tf.keras.layers.Dropout(config.summary_first_dropout)
        self.classifier = tf.keras.layers.Dense(self.num_class,
                                                kernel_initializer = head_initializer,
                                                name = 'classifier')

    def call(self, inputs):
        """Return class logits computed from the output at the last sequence position."""
        hidden_states = self.gpt2(inputs)[0]
        last_position = hidden_states[:, -1]
        return self.classifier(self.dropout(last_position))

In [None]:
# Build the two-class classifier on top of the pretrained checkpoint.
BASE_MODEL_PATH = './gpt_ckpt'
cls_model = TFGPT2Classifier(dir_path = BASE_MODEL_PATH, num_class = 2)

In [None]:
# Adam optimizer, cross-entropy on logits, and accuracy reported as 'accuracy'.
optimizer = tf.keras.optimizers.Adam(learning_rate = 6.25e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
cls_model.compile(optimizer = optimizer, loss = loss, metrics = [metric])

In [None]:
model_name = 'tf2_gpt2_naver_movie'

# Stop when validation accuracy fails to improve by 0.0001 for 2 epochs.
es_callback = EarlyStopping(monitor = 'val_accuracy', min_delta = 0.0001, patience = 2)

checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

# Make sure the checkpoint directory exists and report what happened.
if os.path.exists(checkpoint_dir):
    print("{} directory already exists\n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok = True)
    print("{} directory create complete\n".format(checkpoint_dir))

# Keep only the weights of the epoch with the best validation accuracy.
cp_callback = ModelCheckpoint(checkpoint_path,
                              monitor = 'val_accuracy',
                              verbose = 1,
                              save_best_only = True,
                              save_weights_only = True)

history = cls_model.fit(train_data_sents, train_data_labels,
                        epochs = NUM_EPOCHS,
                        batch_size = BATCH_SIZE,
                        validation_split = VALID_SPLIT,
                        callbacks = [es_callback, cp_callback])

In [None]:
# Plot training and validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'], '')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(['Loss', 'Validation Loss'])
plt.show()

In [None]:
# Plot training and validation accuracy per epoch.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'], '')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(['Accuracy', 'Validation Accuracy'])
plt.show()

## 모델 평가

In [None]:
# Token ids of every cleaned test review.
test_token_ids = [vocab[tokenizer(clean_text(document))]
                  for document in test_data['document'].values]

# Pad / truncate all reviews to SENT_MAX_LEN in a single call (padding at the end).
test_padded_rows = pad_sequences(test_token_ids,
                                 SENT_MAX_LEN,
                                 value = vocab[vocab.padding_token],
                                 padding = 'post').tolist()

# Wrap every padded review as <bos> ... <eos>.
bos_id = vocab[vocab.bos_token]
eos_id = vocab[vocab.eos_token]
test_data_sents = np.array([[bos_id] + row + [eos_id] for row in test_padded_rows],
                           dtype = np.int64)
test_data_labels = np.array(test_data['label'].values, dtype = np.int64)

In [None]:
# Load the weights saved at checkpoint_path, then evaluate on the test set.
cls_model.load_weights(checkpoint_path)
cls_model.evaluate(test_data_sents, test_data_labels, batch_size = 1024)