<a href="https://colab.research.google.com/github/kimhwijin/HandsOnMachineLearing/blob/main/NLP_RNN_and_Attention_16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
import tensorflow as tf
from tensorflow import keras 
import matplotlib.pyplot as plt
import numpy as np


# Seed TensorFlow's and NumPy's global random generators with the same fixed value.
tf.random.set_seed(42)
np.random.seed(42)

In [47]:
# Fetch the Tiny Shakespeare corpus; keras.utils.get_file hands back the local
# file path, which is then read fully into memory as one string.
dataset_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = keras.utils.get_file("shakespeare", dataset_url)
# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [48]:
# Preview the first 148 characters of the corpus.
print(shakespeare_text[0:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [49]:
# Every distinct character of the lower-cased corpus, sorted and joined into one string.
"".join(sorted(set(shakespeare_text.lower())))

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

# Tokenizer

In [50]:
# Character-level tokenizer: every distinct character gets its own integer ID.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# The raw string (not a list of texts) is passed on purpose: the tokenizer then
# counts each character as one document, and the notebook later reads
# tokenizer.document_count as the total number of characters.
tokenizer.fit_on_texts(shakespeare_text)

In [51]:
# Character -> ID mapping learned by the tokenizer (IDs start at 1).
print(tokenizer.word_index)
# Encode a sample string; the text is lower-cased and the unknown '@' yields no ID.
print(tokenizer.texts_to_sequences(texts=["Shakespeare@e"]))
# Decode a list of IDs back into (space-separated) characters.
print(tokenizer.sequences_to_texts(sequences=[[8, 7, 5, 25, 2, 8, 23, 2, 5, 9, 2]]))

{' ': 1, 'e': 2, 't': 3, 'o': 4, 'a': 5, 'i': 6, 'h': 7, 's': 8, 'r': 9, 'n': 10, '\n': 11, 'l': 12, 'd': 13, 'u': 14, 'm': 15, 'y': 16, 'w': 17, ',': 18, 'c': 19, 'f': 20, 'g': 21, 'b': 22, 'p': 23, ':': 24, 'k': 25, 'v': 26, '.': 27, "'": 28, ';': 29, '?': 30, '!': 31, '-': 32, 'j': 33, 'q': 34, 'x': 35, 'z': 36, '3': 37, '&': 38, '$': 39}
[[8, 7, 5, 25, 2, 8, 23, 2, 5, 9, 2, 2]]
['s h a k e s p e a r e']


In [52]:
# Number of distinct characters known to the tokenizer.
max_id = len(tokenizer.word_index)
# Total number of characters: the tokenizer was fitted on the raw string, so its
# document count matches the summed per-character counts printed last.
dataset_size = tokenizer.document_count
print(max_id, dataset_size, sum(tokenizer.word_counts.values()), sep="\n")

39
1115394
1115394


# Dataset

In [53]:
# Tokenizer IDs start at 1, so subtract 1 to get IDs that start at 0.
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text])[0]) - 1
# Keep the first 90% of the characters for training.
train_size = dataset_size * 90 // 100
train_ids = encoded[:train_size]
dataset = tf.data.Dataset.from_tensor_slices(train_ids)
del train_ids  # temporary only; leave no extra name behind in the kernel

In [54]:
# Turn the single long 1-D sequence of character IDs into many short windows
# with the window() method.
n_steps = 100
# One extra character per window: inputs are the first n_steps characters and
# targets are the same window shifted one character ahead.
window_length = 1 + n_steps
# shift=1 slides the window one character at a time (0..100, 1..101, ...), so
# the data is used as densely as possible. window() yields a nested dataset:
# each element is itself a small dataset, like a list of lists.
dataset = dataset.window(size=window_length, shift=1, drop_remainder=True)

In [55]:
# Peek at one window: at this stage each element is a nested dataset, not a tensor.
for window in dataset.take(1):
    print(len(window), window)

101 <_VariantDataset shapes: (), types: tf.int64>


In [56]:
# Flatten the nested dataset into a dataset of tensors: batching each window by
# its full length turns it into one tensor of window_length character IDs.
dataset = dataset.flat_map(lambda sub_ds: sub_ds.batch(window_length))
for flat_window in dataset.take(1):
    print(len(flat_window), flat_window)

101 tf.Tensor(
[19  5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1
  0 22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1
  4  8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24
 17  0  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23
 10 15  3 13  0], shape=(101,), dtype=int64)


In [57]:
batch_size = 32
# Shuffle individual windows (buffer of 10,000), then group them into batches.
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.batch(batch_size)
# Split each window into inputs (all but the last character) and targets (the
# same window shifted one character ahead), so the target at every time step is
# the next character.
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [58]:
# Replace each input character ID with a one-hot vector of depth max_id; the
# targets stay as integer IDs. With a large vocabulary an embedding would
# normally be used instead.
dataset = dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
).prefetch(1)

In [59]:
# Show the first sample of one batch: its one-hot inputs, then its integer targets.
for inputs, targets in dataset.take(1):
    print(inputs[0])
    print(targets[0])


tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(100, 39), dtype=float32)
tf.Tensor(
[ 5  7  0  7  4 15  7  0  2  6  1  0 21  1 11 11 15 17  0 14  4  8 24  0
 14  1 17 31 31 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23 10  4 15
 17  0  7  5  8 28  0 16  1 11 11 17  0 16  1 11 11 26 10 10 14  1  9  1
  9  5 13  7 23 10 27  2  6  3 13 20  6  0  4 11 11  0  4  2  0  3  9 18
  1  0 18  4], shape=(100,), dtype=int64)


# Model

In [60]:
# Character-level language model: two stacked GRU layers that emit an output at
# every time step, followed by a per-step softmax over the max_id characters.
model = keras.Sequential([
    # None in the input shape leaves the sequence length unspecified.
    keras.layers.GRU(128, input_shape=(None, max_id), return_sequences=True, dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation="softmax")
    ),
])
# Sparse loss: the targets are integer character IDs rather than one-hot vectors.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

In [None]:
# Train for 10 epochs on the windowed dataset; the result of fit() is kept in `history`.
history = model.fit(x=dataset, epochs=10)

Epoch 1/10
  12396/Unknown - 200s 15ms/step - loss: 1.6762

In [None]:
import os

# Save the trained model under a Google Drive folder (a Colab-style path;
# presumably Drive must already be mounted - no mount call is visible here).
# NOTE: if something already exists at model_path, nothing is saved, so a
# retrained model does not replace an earlier save.
model_name = 'shakespeare'
model_path = os.path.join('/content/drive/MyDrive/Model/shakespeare/', model_name)
if not os.path.exists(model_path):
    model.save(model_path)

In [45]:
# Reload the saved model from model_path, which the save cell above defines.
new_model = keras.models.load_model(filepath=model_path)