<a href="https://colab.research.google.com/github/kimhwijin/HandsOnMachineLearing/blob/main/NLP_RNN_and_Attention_16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import tensorflow as tf
from tensorflow import keras 
import matplotlib.pyplot as plt
import numpy as np


# Seed NumPy's and TensorFlow's global random generators with a fixed value.
np.random.seed(42)
tf.random.set_seed(42)

In [41]:
# Fetch the tiny-Shakespeare corpus via keras.utils.get_file and read it fully into memory.
dataset_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = keras.utils.get_file("shakespeare", dataset_url)
# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default locale encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [42]:
# Show the first 148 characters of the corpus as a quick sanity check.
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [43]:
# Every distinct character of the lower-cased corpus, joined into one sorted string.
"".join(sorted((set(shakespeare_text.lower()))))

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

# Tokenizer

In [44]:
# char_level=True makes the tokenizer treat every character as a token.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# NOTE: a single str is passed here rather than a list of texts; the counts printed
# later (document_count equals the total character count) show that each character
# was counted as its own text.
tokenizer.fit_on_texts(shakespeare_text)

In [45]:
# Character -> id mapping learned from the corpus (ids start at 1).
print(tokenizer.word_index)
# The input is lower-cased, and '@' is not in the vocabulary so it is dropped
# from the encoded sequence.
print(tokenizer.texts_to_sequences(["Shakespeare@e"]))
# Decoding joins the characters with spaces.
print(tokenizer.sequences_to_texts([[8, 7, 5, 25, 2, 8, 23, 2, 5, 9, 2]]))

{' ': 1, 'e': 2, 't': 3, 'o': 4, 'a': 5, 'i': 6, 'h': 7, 's': 8, 'r': 9, 'n': 10, '\n': 11, 'l': 12, 'd': 13, 'u': 14, 'm': 15, 'y': 16, 'w': 17, ',': 18, 'c': 19, 'f': 20, 'g': 21, 'b': 22, 'p': 23, ':': 24, 'k': 25, 'v': 26, '.': 27, "'": 28, ';': 29, '?': 30, '!': 31, '-': 32, 'j': 33, 'q': 34, 'x': 35, 'z': 36, '3': 37, '&': 38, '$': 39}
[[8, 7, 5, 25, 2, 8, 23, 2, 5, 9, 2, 2]]
['s h a k e s p e a r e']


In [46]:
# Number of distinct characters in the vocabulary.
max_id = len(tokenizer.word_index)
print(max_id)
# Total number of characters: the tokenizer counted each character as one document.
dataset_size = tokenizer.document_count
print(dataset_size)
# Cross-check: the per-character counts add up to the same total.
print(sum(tokenizer.word_counts.values()))

39
1115394
1115394


# Dataset

In [47]:
# Tokenizer ids start at 1; subtract 1 so the character ids start at 0.
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text]))[0] - 1
# The first 90% of the characters form the training portion.
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [48]:
# Use window() to cut the one long 1-D character sequence into many short windows.
n_steps = 100
window_length = n_steps + 1 # n_steps input characters plus one more, so the target can be the input shifted by one
# shift=1 makes consecutive windows start one character apart (0~100, 1~101, ...),
# so the training data is used as densely as possible.
# window() produces a nested dataset: a dataset of datasets, like a list of lists.
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [49]:
for a in dataset.take(1):
    # Each element is itself a dataset (one window), not a tensor.
    print(len(a), a)

101 <_VariantDataset shapes: (), types: tf.int64>


In [50]:
# Flatten the nested dataset: batching each window by its full length turns
# every window into a single tensor of window_length character ids.
dataset = dataset.flat_map(lambda window : window.batch(window_length))
for a in dataset.take(1):
    print(len(a), a)

101 tf.Tensor(
[19  5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1
  0 22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1
  4  8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24
 17  0  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23
 10 15  3 13  0], shape=(101,), dtype=int64)


In [51]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
# Inputs are every character of a window except the last; targets are the same
# window shifted one step ahead, so each time step has the next character as its target.
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [52]:
# One-hot encode the input characters (the targets stay as integer ids).
# With a larger vocabulary an embedding would normally be used instead.
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

In [53]:
# Print the first (input, target) pair of one batch.
for x, y in dataset.take(1):
    print(x[0])
    print(y[0])


tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(100, 39), dtype=float32)
tf.Tensor(
[ 5  7  0  7  4 15  7  0  2  6  1  0 21  1 11 11 15 17  0 14  4  8 24  0
 14  1 17 31 31 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23 10  4 15
 17  0  7  5  8 28  0 16  1 11 11 17  0 16  1 11 11 26 10 10 14  1  9  1
  9  5 13  7 23 10 27  2  6  3 13 20  6  0  4 11 11  0  4  2  0  3  9 18
  1  0 18  4], shape=(100,), dtype=int64)


# Model

In [54]:
# Two stacked GRU layers that emit an output at every time step, followed by a
# per-time-step softmax over the max_id characters.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,
                     dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
])
# Targets are integer character ids, hence the sparse categorical cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

In [55]:
import os

# Location of the saved model; training is skipped when something already exists there.
model_name = 'shakespeare'
model_path = os.path.join('/content/drive/MyDrive/Model/shakespeare/', model_name)

if os.path.exists(model_path):
    # A previous run already saved the model: reuse it.
    model = tf.keras.models.load_model(model_path)
else:
    # No saved model yet: train for 10 epochs and save the result.
    history = model.fit(dataset, epochs=10)
    model.save(model_path)
    print('-'*20)

# Text Generation

In [56]:
def preprocess(texts):
    """Encode a list of strings as one-hot character tensors.

    Each text is converted to character ids with the fitted `tokenizer`, the
    ids are shifted down by one (the tokenizer's ids start at 1), and the
    result is one-hot encoded with depth `max_id`.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    zero_based_ids = np.array(sequences) - 1
    return tf.one_hot(zero_based_ids, max_id)

# Predict the character that follows "who are yo": take the most likely id at
# every time step, then decode only the last one (+1 restores the tokenizer's
# 1-based ids).
X_new = preprocess(["who are yo"])
Y_pred = np.argmax(model(X_new), axis=-1)
print("who are yo" + tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]) # 1st sentence, last char


who are you


In [57]:
def next_char(text, temperature=1):
    """Sample the character that follows `text` from the model's prediction.

    Args:
        text: Prompt string; it is one-hot encoded with `preprocess`.
        temperature: Positive number that divides the log-probabilities
            before sampling. Smaller values concentrate the sampling on the
            most likely characters; larger values flatten the distribution.

    Returns:
        The sampled character as a string.

    Raises:
        ValueError: If `temperature` is not strictly positive (dividing by it
            would otherwise produce inf/nan logits and meaningless samples).
    """
    if temperature <= 0:
        raise ValueError(
            "temperature must be strictly positive, got {!r}".format(temperature))
    # One-hot encode the prompt.
    X_new = preprocess([text])
    # Keep only the last time step's prediction; the -1: slice keeps that axis,
    # so the result is 2-D.
    y_proba = model(X_new)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    # +1 converts the sampled 0-based class back to the tokenizer's 1-based id.
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

def complete_text(text, n_chars=50, temperature=1):
    """Extend `text` by `n_chars` characters, sampling one character at a time.

    Every step passes the text generated so far to `next_char` with the given
    `temperature`, and appends the character it returns.
    """
    generated = text
    for _step in range(n_chars):
        sampled = next_char(generated, temperature)
        generated = generated + sampled
    return generated

In [58]:
print(complete_text("i", n_chars=100, temperature=0.2))
print('---')
print(complete_text("i", n_chars=100, temperature=1))

in the rest was so beheld and hearting and my father raiment for the rest words signior gremio.

grum
---
ice?

petruchio:
bepake her burst in all in happy eneep,
that s sasder it, sir, yet myself as's solen


# Stateful RNN

In [59]:
# Dataset for the stateful RNN: non-overlapping, consecutive windows
# (shift=n_steps) that are not shuffled, so each batch continues the text where
# the previous batch stopped.
batch_size = 1
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
# Use batch_size rather than a second literal so the dataset and the model's
# batch_input_shape cannot drift apart.
# NOTE(review): with this simple pipeline only batch_size = 1 keeps row i of one
# batch followed by its continuation in row i of the next batch.
dataset = dataset.batch(batch_size)
# Inputs are the window minus its last character; targets are the window shifted by one.
dataset = dataset.map(lambda window: (window[:, :-1], window[:, 1:]))
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

# Stateful RNN Model

In [60]:
# Same layer stack as the stateless model, but both GRU layers are created with
# stateful=True and the first layer fixes the batch size through batch_input_shape.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2, batch_input_shape=[batch_size, None, max_id]),
    keras.layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

# Callback

In [61]:
class ResetStatesCallback(keras.callbacks.Callback):
    """Keras callback that resets the model's states at the start of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        # `logs` defaults to None to match the base Callback signature, so the
        # hook also works when it is invoked without a logs argument.
        self.model.reset_states()

# Stateful Training

In [62]:
# Targets are integer character ids, hence the sparse categorical cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

In [63]:
import os

# Location of the saved stateful model.
model_path = '/content/drive/MyDrive/Model/shakespeare/'
model_name = 'shakespeare_stateful'
model_path = model_path + model_name

# Treat the model as saved only if the path is a file or a non-empty directory.
# A bare existence check would also accept an empty directory left behind by an
# interrupted run, and loading from it would fail.
_model_is_saved = os.path.isfile(model_path) or (
    os.path.isdir(model_path) and bool(os.listdir(model_path)))

if not _model_is_saved:
    # Train first and create directories only afterwards, so an interrupted
    # training run does not leave a directory that looks like a saved model.
    history = model.fit(dataset, epochs=50, callbacks=[ResetStatesCallback()])
    # makedirs (unlike mkdir) also creates missing parent directories and does
    # not fail if they already exist.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)
    print('-'*20)
else:
    model = tf.keras.models.load_model(model_path)

# Sentiment Analysis

In [64]:
# Load the IMDB reviews in their pre-encoded form: each review is a list of word ids.
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()

In [65]:
# First ten word ids of the first review, and that review's label.
print(X_train[0][:10])
print(y_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]
1


In [66]:
word_index = keras.datasets.imdb.get_word_index()
# Ids 0, 1 and 2 are reserved for the padding token, the start-of-sequence (SOS)
# token and the unknown-word token, so every word id is shifted up by 3.
id_to_word = {word_id + 3: word for word, word_id in word_index.items()}
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))
# Decode the first review back into text.
" ".join(id_to_word[word_id] for word_id in X_train[0])

"<sos> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and shou

In [67]:
import tensorflow_datasets as tfds

# Load the raw (not yet preprocessed) review texts; as_supervised=True yields
# (review, label) pairs and with_info=True also returns the dataset metadata.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

In [68]:
# Number of examples per split, read from the dataset metadata.
# NOTE: this rebinds train_size, which earlier held the Shakespeare training
# character count.
train_size = info.splits['train'].num_examples
test_size = info.splits['test'].num_examples
# List the available splits.
print(*datasets.keys())

test train unsupervised


In [69]:
# Print the first 200 characters and the label of the first two training reviews.
for X, y in datasets['train'].take(2):
    review = X.numpy()
    label = y.numpy()
    print("Review:", review.decode("utf-8")[:200], "...")
    print("Label:", label, "= Positive" if label else "= Negative")
    print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative



# Preprocessing

In [70]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into a padded tensor of word tokens.

    Steps: truncate each review to its first 300 units (tf.strings.substr
    counts bytes unless a unit is given), replace `<br />` tags and every
    character other than a letter or an apostrophe with a space, split on
    whitespace, and pad the ragged result with b"<pad>". Labels pass through
    unchanged.

    NOTE(review): this reuses the name `preprocess` of the earlier
    character-level helper that `next_char` calls; once this cell has run, that
    earlier helper is shadowed.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    padded_tokens = tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

# Try the preprocessing on one batch of three reviews.
for X_batch, y_batch in datasets['train'].batch(3).take(1):
    print(preprocess(X_batch, y_batch))

(<tf.Tensor: shape=(3, 53), dtype=string, numpy=
array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
        b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
        b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
        b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
        b'their', b'worst', b'role', b'in', b'history', b'Even',
        b'their', b'great', b'acting', b'could', b'not', b'redeem',
        b'this', b"movie's", b'ridiculous', b'storyline', b'This',
        b'movie', b'is', b'an', b'early', b'nineties', b'US',
        b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
       [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
        b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
        b'to', b'a', b'combination', b'of', b'things', b'including',
        b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
        b'on', b'the', b'sette', b'and', b'having', b'just', b'eaten'

In [71]:
from collections import Counter

# Count how often each token occurs across the preprocessed training reviews.
# The padding token b"<pad>" is counted as well.
vocabulary = Counter()
for X_batch, y_batch in datasets['train'].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

# Ten most frequent tokens and the total number of distinct tokens.
print(vocabulary.most_common()[:10])
print(len(vocabulary))

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564), (b'of', 33983), (b'and', 33431), (b'to', 27707), (b'I', 27019), (b'is', 25719), (b'in', 18966), (b'this', 18490)]
53893


In [72]:
# Number of most frequent words kept in the vocabulary.
vocab_size = 10000
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]
# Each kept word maps to its frequency rank (0 = most frequent).
word_to_id = dict(zip(truncated_vocabulary, range(len(truncated_vocabulary))))

In [73]:
# Build a lookup table that maps each kept word to its index in truncated_vocabulary.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
# Words missing from the table are assigned to one of num_oov_buckets extra
# out-of-vocabulary buckets.
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets=num_oov_buckets)

In [74]:
# "crazzzzy" is not in the vocabulary, so it gets an out-of-vocabulary id (>= vocab_size).
table.lookup(tf.constant([b"this is so crazzzzy".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[    9,     7,    34, 10991]])>

In [75]:
def encode_words(X_batch, y_batch):
    """Replace every word token in `X_batch` with its id from the lookup `table`.

    The labels in `y_batch` are returned unchanged.
    """
    token_ids = table.lookup(X_batch)
    return token_ids, y_batch

In [76]:
# Training pipeline: batches of 32 raw reviews -> padded word tokens -> word ids.
train_set = datasets['train'].batch(32).map(preprocess).map(encode_words).prefetch(1)

In [77]:
# Sentiment classifier: word embeddings -> two GRU layers -> one sigmoid output.
embed_size = 128
model = keras.models.Sequential([
    # One embedding row per vocabulary word plus one per out-of-vocabulary bucket.
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    # The last GRU returns only its final output, which feeds the classifier.
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Masking (Functional API)

In [82]:
# Train while ignoring the <pad> tokens, by masking them explicitly.
K = keras.backend
embed_size = 128
inputs = keras.layers.Input(shape=[None])
# True wherever the token id is not 0. Id 0 is <pad> because <pad> is the most
# frequent entry of the vocabulary and ids are assigned by frequency rank.
mask = keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)
z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
# Pass the mask to the second GRU as well, instead of relying on implicit mask
# propagation from the first one, so padded steps are skipped in both layers.
z = keras.layers.GRU(128)(z, mask=mask)
outputs = keras.layers.Dense(1, activation="sigmoid")(z)
model = keras.models.Model(inputs=[inputs], outputs=[outputs])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
