In [None]:
# Optional one-time environment setup; uncomment to install the third-party
# packages this notebook imports. The Hugging Face tokenizer library is
# published as `tokenizers` (plural), matching the `from tokenizers import ...`
# lines in the import cell.
# !pip install transformers
# !pip install tokenizers
# !pip install gensim

In [None]:
import tensorflow as tf
from transformers import GPT2Tokenizer, GPT2Config, TFGPT2LMHeadModel
from transformers import WEIGHTS_NAME, CONFIG_NAME
from pathlib import Path
import os
from gensim.corpora import WikiCorpus
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, NFKD, Sequence, NFC, NFD
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel

In [1]:
# Build an untrained byte-level BPE tokenizer plus the trainer that will fit it.
#
# The unknown token must be one of the trainer's special tokens ("<unk>");
# "[UNK]" is never added to the vocabulary, so the model would point at a
# token that does not exist.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))

# Only the BPE model files are saved and later reloaded through GPT2Tokenizer,
# so this normalizer is NOT applied at encode time. NFD would split every
# Hangul syllable into jamo and the learned merges would never match real
# (precomposed) input; NFC keeps syllables in the form they normally appear in.
tokenizer.normalizer = Sequence([NFC()])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(
    vocab_size=50000,
    show_progress=True,
    # Seed the vocabulary with every byte-level symbol so that bytes unseen in
    # the training corpus are still encodable.
    initial_alphabet=ByteLevel.alphabet(),
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>"
    ],
)

NameError: name 'Tokenizer' is not defined

In [None]:
# Gather every .txt file under the corpus directory (recursively) as plain
# string paths, then fit the BPE tokenizer on them with the trainer built above.
paths = list(map(str, Path("./working/ko_corpuss/").glob("**/*.txt")))
tokenizer.train(files=paths, trainer=trainer)

In [None]:
# Persist the trained BPE model files; the GPT2Tokenizer cell further down
# loads them back from this same directory.
save_path = './working/tokenized_data'
# exist_ok=True creates the directory if needed and is a no-op when it already
# exists, avoiding the separate exists() check and its race window.
os.makedirs(save_path, exist_ok=True)
tokenizer.model.save(save_path)

Train GPT2

In [None]:
# Reload the trained vocabulary/merges as a transformers GPT-2 tokenizer.
# "<unk>" is the unknown token that was trained into the vocabulary (it is in
# the trainer's special_tokens list); "[UNK]" is not part of that vocabulary.
tokenizer = GPT2Tokenizer.from_pretrained('./working/tokenized_data', unk_token="<unk>")

In [None]:
# Tell the GPT-2 tokenizer which literal strings play each special-token role.
# These are the same five strings that were passed to the BPE trainer as
# special tokens. The call's return value is shown as the cell output.
tokenizer.add_special_tokens(dict(
    eos_token="</s>",
    bos_token="<s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
))

In [None]:
# Model configuration sized to the tokenizer.
# len(tokenizer) counts the base vocabulary plus any added tokens, whereas
# tokenizer.vocab_size reports only the base vocabulary. Sizing the embedding
# from len(tokenizer) guarantees every id the tokenizer can emit is in range.
config = GPT2Config(
    vocab_size=len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

In [None]:
# Instantiate the TensorFlow GPT-2 language-model head from the config alone
# (constructed from `config`, not loaded via from_pretrained).
model = TFGPT2LMHeadModel(config)


In [None]:
# Read up to 10,000 corpus files, join them into one string with an EOS token
# after every document, and tokenize the result into a flat list of ids.
#
# NOTE(review): the tokenizer-training cell globs "./working/ko_corpuss/" but
# this cell globs "./ko_corpuss/" - confirm which directory is intended.
#
# Sorting makes the 10,000-file subset reproducible across runs; glob order is
# otherwise whatever the filesystem happens to return.
paths = sorted(str(x) for x in Path("./ko_corpuss/").glob("**/*.txt"))[:10000]

# Collect pieces and join once instead of repeatedly extending a string.
pieces = []
for filename in paths:
    with open(filename, "r", encoding='utf-8') as f:
        pieces.append(f.read())
    pieces.append(tokenizer.eos_token)
single_string = ''.join(pieces)

# tokenize dataset
string_tokenized = tokenizer.encode(single_string)
print("Done tokenizing")

In [None]:
# Windowing and batching parameters.
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000

# Cut the token stream into consecutive, non-overlapping windows of exactly
# block_size ids; a trailing remainder shorter than block_size is dropped.
examples = [
    string_tokenized[start:start + block_size]
    for start in range(0, len(string_tokenized) - block_size + 1, block_size)
]

# Next-token prediction pairs: the input is the window without its last id,
# the label is the same window shifted left by one position.
inputs = [window[:-1] for window in examples]
labels = [window[1:] for window in examples]

# Shuffle within a BUFFER_SIZE buffer and emit full batches only.
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, labels))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
print("Done creating dataset")

In [None]:
# Training objective and metric: sparse categorical cross-entropy on raw
# logits, tracked alongside token-level accuracy.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Adam with gradient-norm clipping at 1.0.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)

# The loss list has one real loss for the first model output followed by one
# None per transformer layer, so only the first output is trained on.
# NOTE(review): this presumes the model returns 1 + n_layer outputs - confirm
# against the installed transformers version.
model.compile(
    optimizer=optimizer,
    loss=[loss] + [None] * model.config.n_layer,
    metrics=[metric],
)

In [None]:
# Number of full passes over the batched dataset.
num_epoch = 10
# Train the compiled model; `history` keeps the object returned by fit()
# and verbose=1 requests progress output while training.
history = model.fit(dataset, epochs=num_epoch, verbose=1)

In [None]:
# Persist the trained model and its tokenizer side by side so the prediction
# cells can reload both from one directory.
save_path = './working/gpt2_korean'
os.makedirs(save_path, exist_ok=True)

# Unwrap the model if it is held inside a wrapper exposing `.module`;
# otherwise use it directly.
model_to_save = model.module if hasattr(model, 'module') else model

# save_pretrained() writes the weights and the model configuration into
# save_path, so no separate config dump or weights path is needed.
model_to_save.save_pretrained(save_path)

# save tokenizer
tokenizer.save_pretrained(save_path)

Prediction

In [None]:
# Re-import here so the prediction section can be run on its own.
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

# Directory written by the save cell; holds both the model and the tokenizer.
output_dir = "./working/gpt2_korean/"
model = TFGPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

In [None]:
# Korean prompt used to seed generation.
text = "수학은 발전"
# Encode the prompt to token ids, requesting TensorFlow tensors ('tf') so the
# result can be passed straight to the TF model's generate().
input_ids = tokenizer.encode(text, return_tensors='tf')

In [None]:
# Beam-search generation from the encoded prompt: 5 beams, at most 50 tokens
# in total, no 2-gram may repeat, and 5 sequences are returned.
# NOTE(review): do_sample is not set; whether `temperature` has any effect
# without sampling depends on the installed transformers version - confirm.
beam_output = model.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    temperature=0.7,
    no_repeat_ngram_size=2,
    num_return_sequences=5
)

In [None]:
# Decode and print only the first returned sequence (index 0), with special
# tokens stripped; the other returned sequences are not shown.
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))