# TF2 로 Chracter 생성 모델을 RNN 기반으로 만든다.

In [3]:
import os
import sys
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, optimizers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
from tensorflow import feature_column as fc
import tensorflow_datasets as tfds
plt.rcParams["font.family"] = 'NanumBarunGothic'
TENSORBOARD_BINARY = '/home/hoondori/anaconda3/envs/ai/bin/tensorboard'
os.environ['TENSORBOARD_BINARY'] =  TENSORBOARD_BINARY
%load_ext tensorboard

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # 텐서플로가 첫 번째 GPU만 사용하도록 제한
    # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # 프로그램 시작시에 접근 가능한 장치가 설정되어야만 합니다
        print(e)

# 데이터 준비

In [11]:
DATA_DIR = "/tmp/logs/CharModel"
CHECKPOINT_DIR = os.path.join(DATA_DIR, "checkpoints")
LOG_DIR = os.path.join(DATA_DIR, "logs")

import re

def clean_logs():
    shutil.rmtree(CHECKPOINT_DIR, ignore_errors=True)
    shutil.rmtree(LOG_DIR, ignore_errors=True)


def download_and_read(urls):
    texts = []
    for i, url in enumerate(urls):
        p = tf.keras.utils.get_file("ex1-{:d}.txt".format(i), url,
            cache_dir=".")
        text = open(p, mode="r", encoding="utf-8").read()
        # remove byte order mark
        text = text.replace("\ufeff", "")
        # remove newlines
        text = text.replace('\n', ' ')
        text = re.sub(r'\s+', " ", text)
        # add it to the list
        texts.extend(text)
    return texts




# download and read into local data structure (list of chars)
texts = download_and_read([
    "http://www.gutenberg.org/cache/epub/28885/pg28885.txt",
    "https://www.gutenberg.org/files/12/12-0.txt"
])
#clean_logs()
texts[:5]

['P', 'r', 'o', 'j', 'e']

In [15]:
# 어휘 생성
vocab = sorted(set(texts))
char2idx = {c:i for i, c in enumerate(vocab)}
idx2char = {i:c for i, c in enumerate(vocab)}
print("vocab size: {:d}".format(len(vocab)))
print(f"vocab: {vocab[:20]}")

vocab size: 90
vocab: [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4']


In [25]:
# Char seq => Int seq => tf dataset
text_arr = np.array([char2idx[c] for c in texts])
dataset = tf.data.Dataset.from_tensor_slices(text_arr)

# 무한히 긴 시퀀스를 일정 크기의 배치로 변경
#  - Inf => (None, 100)
seq_length = 100
dataset = dataset.batch(seq_length+1, drop_remainder=True)  # +1 은 X, y 처리를 고려

# X 는 0:-1 까지, y는 1:end 
def split_train_labels(sequence):
    input_seq = sequence[0:-1]
    output_seq = sequence[1:]
    return input_seq, output_seq

dataset = dataset.map(split_train_labels)

# 훈련 배치 설정 
# X = (None, 64, 100), Y = (None, 64, 100)
batch_size = 64
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

# 모델 정의

In [42]:
class CharGenModel(tf.keras.Model):
    
    def __init__(self, vocab_size, emb_sz, rnn_output_dim, **kwargs):
        super(CharGenModel, self).__init__()
    
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size,output_dim=emb_sz)
    
        self.rnn = tf.keras.layers.GRU(
            rnn_output_dim,
            recurrent_initializer='glorot_uniform', # default: orthogonal
            recurrent_activation='sigmoid',  # default : tanh
            return_sequences=True,
            stateful=True
        )
        self.dense = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        x = self.embedding(x)
        x = self.rnn(x)
        x = self.dense(x)
        
        return x
    
vocab_size = len(vocab)
embedding_dim = 256
rnn_output_dim = embedding_dim

model = CharGenModel(vocab_size, embedding_dim, rnn_output_dim)
model.build(input_shape=(batch_size, seq_length))
model.summary()

Model: "char_gen_model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     multiple                  23040     
_________________________________________________________________
gru_8 (GRU)                  multiple                  394752    
_________________________________________________________________
dense_8 (Dense)              multiple                  23130     
Total params: 440,922
Trainable params: 440,922
Non-trainable params: 0
_________________________________________________________________


In [43]:
# output shape 확인
for input_batch, label_batch in dataset.take(1):
    pred_batch = model(input_batch)
    print(pred_batch.shape)
    
assert(pred_batch.shape[0] == batch_size)
assert(pred_batch.shape[1] == seq_length)
assert(pred_batch.shape[2] == vocab_size)    

(64, 100, 90)


# 학습

In [44]:
# text 생성기
def generate_text(model, prefix_string, char2idx, idx2char,
        num_chars_to_generate=1000, temperature=1.0):
    input = [char2idx[s] for s in prefix_string]
    input = tf.expand_dims(input, 0)
    #print(f'prefix: {input}')
    
    text_generated = []
    model.reset_states()
    for i in range(num_chars_to_generate):
        preds = model(input)
        preds = tf.squeeze(preds, 0) / temperature    # 카테고리 분포 pdf  리턴했으므로 아래에서 sampling
        
        # predict char returned by model
        pred_id = tf.random.categorical(preds, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[pred_id])
        # pass the prediction as the next input to the model
        input = tf.expand_dims([pred_id], 0)

    return prefix_string + "".join(text_generated)

In [45]:
def loss(labels, predictions):
    return tf.losses.sparse_categorical_crossentropy(
        labels,
        predictions,
        from_logits=True
    )

model.compile(optimizer=tf.optimizers.Adam(), loss=loss)

# we will train our model for 50 epochs, and after every 10 epochs
# we want to see how well it will generate text
num_epochs = 50
steps_per_epoch = len(texts) // seq_length // batch_size
for i in range(num_epochs // 10):
    model.fit(
        dataset.repeat(),
        epochs=10,
        steps_per_epoch=steps_per_epoch
        # callbacks=[checkpoint_callback, tensorboard_callback]
    )
    checkpoint_file = os.path.join(
        CHECKPOINT_DIR, "model_epoch_{:d}".format(i+1))
    model.save_weights(checkpoint_file)

    # create a generative model using the trained model so far
    gen_model = CharGenModel(vocab_size, embedding_dim, rnn_output_dim)
    gen_model.load_weights(checkpoint_file)
    gen_model.build(input_shape=(1, seq_length))

    print("after epoch: {:d}".format(i+1)*10)
    print(generate_text(gen_model, "Alice ", char2idx, idx2char))
    print("---")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1after epoch: 1
Alice Red Hum. And the Project Gry dever all, I'm somerenodem to creethed in she prothers, camest &m, astersaippdation? Orly pair, silo he too mest natten it?’ said SReeave On the Fromming fould agaid, but thin Dorto tem with she jainguns alled in acmish, so he’r-t ArTw such the cried as siet aftalled ap beganbler, ithat she ay, whot anything her atree! Don it!’ said the Mobe my in’t it: and she had and undinum).. Alice wed down reak it was quere to you and show is, its she cam o) consed?’ the thought, no bes--’ Mideraly,' shat in she!" Alice, ‘and some suing that’s a head wat THaTurn ageable sonen, as a shamp a notale trages, and sheme?" saids not. Alice had litterse dren herself.’ Hustle feat look a cridiceching and mene he had my justr

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
after epoch: 3after epoch: 3after epoch: 3after epoch: 3after epoch: 3after epoch: 3after epoch: 3after epoch: 3after epoch: 3after epoch: 3
Alice loudly (this--but she had never have do you have scroom half shawing the large was in the terms of this momenton-by it itted somewhere!" thought she trietly Speak round; at tick of the tables stoprief failt lodit: The Caterpillar. "As it each! How you don’t think I don’t means I know,’ cried Two," Said “Mich of dispeather out a rowong in a way. "I many walked by, that they gastaid the kittens agled to the door is sich dread.) And the Project Gutenberg-tm License (callu knew it!" and at his ranch of them was to the idea Drunched or other something losations looking back on his hurry and had flowly a good little direction Twreat little tarts queers a might nonser. ‘Well, I she had something. "May _They mysted at all-facted: ‘but, that afreed up,

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
after epoch: 5after epoch: 5after epoch: 5after epoch: 5after epoch: 5after epoch: 5after epoch: 5after epoch: 5after epoch: 5after epoch: 5
Alice went on expensation's quarked down!’ Alice asked "man,’ said the Queen, but she wanted to herself, all round, And as the Mouse could nos mouth we." Holder of a rule to get now,’ she said, as she spoke. ‘Yes, in char; came of sightion. [Illustration] The chow bagled to pretend the shalls Te up changed the owner in a grast. ‘'st" said Alice; "would not think of no "--the use of anything in her hair in fressy with a little anxiously: "jection with alshe. I should have sighs. "Very soundself logated when it was that likelt, before head to see the Queen creature. ‘You may come out." "There were dark with one of the three could me!’ thought Alice, I’m dark over his eyes * * * * Gryphon. "Nothing, you know." "Well, then the Hatter. "No, she’d smim-pl