## Setup

In [84]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
# NOTE(review): the right-hand side is a string, so this looks like a
# lexicographic (not numeric) version comparison — confirm it behaves as
# intended for the versions you target.
import sklearn
assert sklearn.__version__ >= "0.20"

# Colab detection: if anything in the try body raises, IS_COLAB ends up False.
# NOTE(review): the two pip installs use -U without a version pin.
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -q -U tensorflow-addons
    !pip install -q -U transformers
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
# NOTE(review): same string-based comparison as the Scikit-Learn check above.
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Warn when TensorFlow lists no physical GPU device.
if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: ./images/nlp (created if it does not exist yet)
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: base name of the file (without extension); also echoed to stdout.
        tight_layout: when truthy, call plt.tight_layout() before saving.
        fig_extension: file extension, also passed to savefig as the format.
        resolution: value passed to savefig as dpi.
    """
    filename = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, filename)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

No GPU was detected. LSTMs and CNNs can be very slow without a GPU.
Go to Runtime > Change runtime and select a GPU hardware accelerator.


## 16.1 Char-RNN을 사용해 셰익스피어 문체 생성하기

### 16.1.1 훈련 데이터셋 만들기

In [85]:
# Fetch the Shakespeare corpus via keras.utils.get_file and read it into a
# single string.
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [86]:
# Display the first 100 characters of the corpus
shakespeare_text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [87]:
# Character-level tokenizer (char_level=True), fitted on the whole corpus
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [88]:
# The `>>>` values below are results recorded by the original author.
# texts_to_sequences: texts -> lists of character ids
tokenizer.texts_to_sequences(['First']) # >>> [[20, 6, 9, 8, 3]]
# sequences_to_texts: lists of character ids -> texts
tokenizer.sequences_to_texts([[20, 6, 9]]) # >>> ['f i r']
# word_index: char -> index mapping; its length is used as the number of distinct characters
max_id = len(tokenizer.word_index) # >>> 39; a-z, digits, special characters.
# document_count
dataset_size = tokenizer.document_count # >>> 1115394; corpus

In [89]:
# Convert the whole 'shakespeare_text' into a sequence of character ids and
# subtract 1 so that the ids start at 0.
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

### 16.1.2 순차 데이터셋을 나누는 방법

In [90]:
# Use the first 90% of the encoded text as the training portion
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

### 16.1.3 순차 데이터를 윈도 여러 개로 자르기

In [91]:
n_steps = 100
window_length = n_steps + 1 # one extra step: the target is the input shifted one character ahead
# Nested dataset: each element is itself a dataset holding one window
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [92]:
# Nested -> flat dataset: batching each window to window_length turns it into a single tensor
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [93]:
# Shuffle & Batch: shuffle with a buffer of 10000 windows, then group into
# batches of batch_size; the last line displays the resulting element spec.
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset.element_spec

TensorSpec(shape=(None, None), dtype=tf.int64, name=None)

In [94]:
# Split features & target: inputs are all steps but the last,
# targets are all steps but the first (i.e. inputs shifted by one).
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [95]:
# Decode one sample of the first batch back to text to check that the target
# is the input shifted by one character.
batch_index = 2
for X, y in dataset.take(1):
    # ids were shifted down by 1 when encoding, so add 1 back before decoding
    sample_input_ids = X[batch_index].numpy() + 1
    sample_target_ids = y[batch_index].numpy() + 1
    texts = tokenizer.sequences_to_texts([sample_input_ids])
    target = tokenizer.sequences_to_texts([sample_target_ids])
    print(f"[inputs]:\n{texts[0]}")
    print(f"\n[ouputs]:\n{target[0]}")

[inputs]:
l d   b y   t h e   c o r m o r a n t   b e l l y   b e   r e s t r a i n ' d , 
 w h o   i s   t h e   s i n k   o '   t h e   b o d y , - - 
 
 m e n e n i u s : 
 w e l l ,   w h a t   t h e n ? 


[ouputs]:
d   b y   t h e   c o r m o r a n t   b e l l y   b e   r e s t r a i n ' d , 
 w h o   i s   t h e   s i n k   o '   t h e   b o d y , - - 
 
 m e n e n i u s : 
 w e l l ,   w h a t   t h e n ? 
 



In [96]:
# Encoding: one-hot vectors for the inputs; targets are passed through unchanged
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [97]:
# Prefetch with a buffer size of 1
dataset = dataset.prefetch(1)

In [98]:
# To use the GRU layer on the GPU, recurrent_dropout (among other arguments)
# must keep its default value, hence it is left commented out below.
model = tf.keras.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                    #  recurrent_dropout=0.2,
                     dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,
                    #  recurrent_dropout=0.2,
                     dropout=0.2),
    # One output unit per character id, applied at every time step
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, 'softmax'))
])

In [99]:
# !pip install -q pyyaml h5py  # required in order to save the model in HDF5 format

In [100]:
# Checkpoint location for the Char-RNN weights.
# NOTE(review): absolute path that looks like a mounted Google Drive on Colab —
# adjust it when running elsewhere.
model_path = "/content/drive/MyDrive/Colab_Handson/model/weights.ckpt"
model_dir = os.path.dirname(model_path)

# tf.train.latest_checkpoint() returns None when no checkpoint is found, so
# branch on that explicitly instead of using a bare `except:` as control flow
# (which also swallowed unrelated errors and KeyboardInterrupt).
latest = tf.train.latest_checkpoint(model_dir)
if latest is not None:
    # Reuse previously saved weights; the model is left uncompiled in this branch.
    model.load_weights(latest)
else:
    import time  # local import: only needed to time the training run

    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy")
    # `%%time` is a cell magic and is only valid as the first line of a cell,
    # so the training run is timed by hand instead.
    fit_start = time.perf_counter()
    history = model.fit(dataset, epochs=1)
    print(f"Training took {time.perf_counter() - fit_start:.1f} s")
    model.save_weights(model_path)

### 16.1.5 Char-RNN 모델 사용하기

In [101]:
def preprocess(text):
    """One-hot encode texts for the Char-RNN.

    The texts are converted to character ids with the module-level
    `tokenizer`, shifted down by 1 so that ids start at 0, and one-hot
    encoded with depth `max_id`.

    Args:
        text: texts in the form accepted by tokenizer.texts_to_sequences
            (the callers in this notebook pass a list of strings).

    Returns:
        The tensor produced by tf.one_hot for the shifted ids.
    """
    char_ids = tokenizer.texts_to_sequences(text)
    zero_based_ids = np.array(char_ids) - 1
    return tf.one_hot(zero_based_ids, max_id)

In [102]:
# Predict the character that follows "How are yo": take the arg-max id at every
# step, add 1 to undo the id shift, decode, and keep the last character.
X_new = preprocess(["How are yo"])
Y_pred = np.argmax(model.predict(X_new), axis=-1)
tokenizer.sequences_to_texts(Y_pred+1)[0][-1]

'u'

### 16.1.6 가짜 셰익스피어 텍스트를 생성하기

In [103]:
def next_char(text, temperature=1):
    """Sample the character that follows `text` from the model's prediction.

    Args:
        text: the text generated so far (a single string).
        temperature: divisor applied to the log-probabilities before sampling.

    Returns:
        The first decoded text for the sampled character id.
    """
    encoded_text = preprocess([text])
    # Keep only the last time step; the `-1:` slice keeps a leading axis of size 1.
    last_step_proba = model.predict(encoded_text)[0, -1:, :]
    scaled_log_proba = tf.math.log(last_step_proba) / temperature
    sampled_id = tf.random.categorical(scaled_log_proba, num_samples=1)
    # Add 1 to undo the id shift applied in preprocess() before decoding.
    char_id = sampled_id + 1
    decoded = tokenizer.sequences_to_texts(char_id.numpy())
    return decoded[0]

In [104]:
def complete_text(text, n_chars=50, temperature=1.):
    """Extend `text` by `n_chars` characters, one next_char() call at a time.

    Each sampled character is appended before the next call, so the growing
    text is fed back into next_char().

    Args:
        text: the seed text.
        n_chars: number of characters to append.
        temperature: forwarded to next_char().

    Returns:
        The seed text followed by the generated characters.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated

In [105]:
# Generate text from the seed "t" with a low temperature (0.2)
print(complete_text("t", temperature=0.2))

ther for my elders.

gremio:
and i will not hear me


In [106]:
# Generate text from the seed "t" with temperature 1
print(complete_text("t", temperature=1))

teatly't putlumed you all be blayers.
that wea is w


In [107]:
# Generate text from the seed "w" with temperature 1
print(complete_text("w", temperature=1))

wweysian furst;
as she, o great ill nother? but hea


### 16.1.7 상태가 있는 RNN

In [108]:
# Number of encoded characters in the training portion
len(encoded[:train_size])

1003854

In [109]:
# Shape of the dataset
# tf.data.experimental.cardinality(dataset)

In [110]:
# Rebuild the training dataset for a stateful RNN: consecutive windows
# (shift=n_steps), no shuffling, one window per batch.
# The shape annotations below are the original author's notes.
# (1003854,)
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
# (10038, 101)
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
# (10038, 1, 101)
dataset = dataset.batch(1)
# ((10038, 1, 100), (10038, 1, 100))
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
# ((10038, 1, 100, 39), (10038, 1, 100))
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch)
)
dataset = dataset.prefetch(1)

In [111]:
# Print the shapes of the first (inputs, targets) element
for X, y in dataset.take(1):
    print(X.shape)
    print(y.shape)

(1, 100, 39)
(1, 100)


In [112]:
# Build batches for a stateful RNN with batch_size > 1: split the training text
# into batch_size parts, window each part separately, then zip the per-part
# datasets and stack one window from every part into each batch.
# The shape annotations below are the original author's notes.
batch_size = 32
encoded_parts = np.array_split(encoded[:train_size], batch_size)
datasets = []
for encoded_part in encoded_parts:
    # (31371,)
    dataset = tf.data.Dataset.from_tensor_slices(encoded_part)
    # (313, 101)
    dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(window_length))
    datasets.append(dataset)
# Build consecutive batches: Dataset.zip >>> tf.stack
# (32, 313, 101)
dataset = tf.data.Dataset.zip(tuple(datasets))
dataset = dataset.map(lambda *windows: tf.stack(windows))
# ((32, 313, 100), (32, 313, 100))
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
# ((32, 313, 100, 39), (32, 313, 100))
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

In [113]:
# Stateful two-layer GRU model; the batch size is fixed through
# batch_input_shape on the first layer. recurrent_dropout is left out
# (see the commented-out variants below).
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     #dropout=0.2, recurrent_dropout=0.2,
                     dropout=0.2,
                     batch_input_shape=[batch_size, None, max_id]),
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     #dropout=0.2, recurrent_dropout=0.2),
                     dropout=0.2),
    # One softmax output per character id at every time step
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
])

In [114]:
class ResetStatesCallback(keras.callbacks.Callback):
    """Callback that resets the model's states at the beginning of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the states of the attached model.

        Args:
            epoch: index of the epoch that is starting (unused).
            logs: metrics dict supplied by Keras (unused); defaults to None so
                the hook can also be called without it, matching the
                base-class hook signature.
        """
        self.model.reset_states()

In [115]:
# Directory and file prefix for the stateful model's weights.
# NOTE(review): absolute path that looks like a mounted Google Drive on Colab —
# adjust it when running elsewhere.
model_path = "/content/drive/MyDrive/Colab_Handson/model/statefulRNN"
if not os.path.isdir(model_path):
    # makedirs also creates missing parent directories; os.mkdir raises
    # FileNotFoundError when the parent does not exist yet.
    os.makedirs(model_path, exist_ok=True)
    print("경로를 생성하였습니다.")
weight_path = os.path.join(model_path, "weights.ckpt")
weight_dir = os.path.dirname(weight_path)

In [116]:
# tf.train.latest_checkpoint() returns None when no checkpoint is found, so
# branch on that explicitly instead of using a bare `except:` as control flow
# (which also swallowed unrelated errors and KeyboardInterrupt).
latest = tf.train.latest_checkpoint(weight_dir)
if latest is not None:
    # Reuse previously saved weights; the model is left uncompiled in this branch.
    model.load_weights(latest)
else:
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam')
    # Every epoch restarts from the beginning of the text, so the states left
    # over from the end of the previous epoch are reset before each epoch.
    history = model.fit(dataset, epochs=5,
                        callbacks=[ResetStatesCallback()])
    model.save_weights(weight_path)

#### 배치 내에서 연속적인 데이터셋

In [117]:
# Toy illustration with the integers 0..24: consecutive windows end up next to
# each other *inside* a batch.
# The window size is defined locally; the original batched each window with the
# unrelated global `window_length` (101), which only worked because it is
# larger than the window.
toy_window = 5
dataset = tf.data.Dataset.from_tensor_slices(np.arange(25))
dataset = dataset.window(toy_window, shift=toy_window - 1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(toy_window))
dataset = dataset.batch(3)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
# The toy dataset is tiny and finite, so iterate over all of it.
for X, y in dataset:
    print("[X]\n",X.numpy(), "\n[y]\n", y.numpy())
    print("================")

[X]
 [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]] 
[y]
 [[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
[X]
 [[12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]] 
[y]
 [[13 14 15 16]
 [17 18 19 20]
 [21 22 23 24]]


#### 다음 배치와 연속적인 데이터셋

In [118]:
# Toy illustration with the integers 0..26: row i of one batch is continued by
# row i of the *next* batch (the layout built for the stateful RNN).
# The window size is defined locally; the original batched each window with the
# unrelated global `window_length` (101), which only worked because it is
# larger than the window.
# NOTE(review): this overwrites the global batch_size used to build the
# stateful model in an earlier cell.
batch_size = 3
toy_window = 5
encoded_parts = np.array_split(np.arange(27), batch_size)
datasets = []
for encoded_part in encoded_parts:
    dataset = tf.data.Dataset.from_tensor_slices(encoded_part)
    dataset = dataset.window(toy_window, shift=toy_window - 1, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(toy_window))
    datasets.append(dataset)
# Zip the per-part datasets and stack one window from each part into a batch.
dataset = tf.data.Dataset.zip(tuple(datasets))
dataset = dataset.map(lambda *windows: tf.stack(windows))
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.prefetch(1)
# The toy dataset is tiny and finite, so iterate over all of it.
for X, y in dataset:
    print("[X]\n",X.numpy(), "\n[y]\n", y.numpy())
    print("================")

[X]
 [[ 0  1  2  3]
 [ 9 10 11 12]
 [18 19 20 21]] 
[y]
 [[ 1  2  3  4]
 [10 11 12 13]
 [19 20 21 22]]
[X]
 [[ 4  5  6  7]
 [13 14 15 16]
 [22 23 24 25]] 
[y]
 [[ 5  6  7  8]
 [14 15 16 17]
 [23 24 25 26]]


## 16.2 감성 분석

In [119]:
# Re-seed TensorFlow's random generator at the start of this section
tf.random.set_seed(42)

In [120]:
# Load the IMDb dataset bundled with Keras as train/test splits
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()





























































































In [121]:
# word -> id mapping for the Keras IMDb dataset
word_index = keras.datasets.imdb.get_word_index()

In [122]:
# Reverse mapping id -> word. Word ids are offset by 3 because ids 0, 1 and 2
# are assigned to the special tokens <pad>, <sos> and <unk>.
id_to_word = {index + 3: word for word, index in word_index.items()}
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

In [123]:
" ".join([id_to_word[id_] for id_ in X_train[0][:10]])

'<sos> this film was just brilliant casting location scenery story'

In [124]:
import tensorflow_datasets as tfds

In [125]:
%%time
# Load the IMDb reviews from TensorFlow Datasets as (text, label) pairs,
# together with the dataset metadata, and read the training-set size from it.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits["train"].num_examples

CPU times: user 165 ms, sys: 6.36 ms, total: 171 ms
Wall time: 167 ms


In [126]:
def preprocess(X_batch, y_batch):
    """Clean, tokenize and pad a batch of raw review strings.

    Steps: keep the substring at position 0 with length 300, replace <br>
    tags with a space, replace anything that is not a letter or an apostrophe
    with a space, split on whitespace, and pad the ragged result with
    b"<pad>" into a dense tensor.

    Args:
        X_batch: batch of review strings.
        y_batch: labels; returned unchanged.

    Returns:
        A (padded token tensor, y_batch) tuple.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    padded_tokens = tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

In [127]:
from collections import Counter

In [128]:
%%time
vocabulary = Counter()
for X_batch, y_batch in datasets['train'].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))
print(vocabulary.most_common()[:5])

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564), (b'of', 33983), (b'and', 33431)]
CPU times: user 12.2 s, sys: 1.34 s, total: 13.5 s
Wall time: 9.48 s


In [129]:
# Entry at index 10000 of the frequency-sorted vocabulary, i.e. the first
# token that falls outside a 10,000-entry vocabulary
print(vocabulary.most_common()[10000])

(b'Legion', 7)


In [130]:
# Keep only the vocab_size most frequent tokens (counts are dropped)
vocab_size = 10000
truncated_vocabulary = [
    entry[0] for entry in vocabulary.most_common(vocab_size)]

In [131]:
# Lookup table word -> id: each word of the truncated vocabulary is mapped to
# its position in that list, with 1000 extra buckets reserved for
# out-of-vocabulary words.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [132]:
# Look up a sample sentence; in the recorded output the unknown word
# "faaaaaantastic" gets an id above the vocabulary size (an OOV bucket)
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [133]:
def encode_words(X_batch, y_batch):
    """Replace the words in X_batch by their ids using the lookup `table`.

    Args:
        X_batch: batch of word tokens.
        y_batch: labels; returned unchanged.

    Returns:
        A (word ids, y_batch) tuple.
    """
    word_id_batch = table.lookup(X_batch)
    return word_id_batch, y_batch

In [134]:
# Training pipeline: batch -> clean/tokenize -> map words to ids -> prefetch
train_set = (
    datasets['train']
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [135]:
# Sentiment model: embedding -> two GRU layers -> one sigmoid output unit.
# The embedding input size covers the vocabulary plus the OOV buckets.
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation='sigmoid')
])





























































































In [136]:
# Compile for binary classification; the training call is left disabled
model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])
# history = model.fit(train_set, epochs=1)

### 16.2.1 마스킹

In [137]:
# Same architecture as the previous sentiment model, but with mask_zero=True
# on the embedding layer.
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation='sigmoid')
])

In [138]:
# Functional-API version with an explicit mask: the mask is True wherever the
# input id differs from 0 and is passed to both GRU layers.
K = keras.backend
embed_size = 128
inputs = keras.layers.Input(shape=[None])
mask = keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)
z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
z = keras.layers.GRU(128)(z, mask=mask)
outputs = keras.layers.Dense(1, activation="sigmoid")(z)
model = keras.models.Model(inputs=[inputs], outputs=[outputs])

In [139]:
# Compile for binary classification; the training call is left disabled
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# history = model.fit(train_set, epochs=1)

### 16.2.2 사전훈련된 임베딩 재사용하기

In [140]:
import tensorflow_hub as hub

In [141]:
# Reuse a pretrained sentence-embedding module from TF Hub (string input,
# 50-dimensional output per the arguments below), followed by a small dense
# classifier.
model = keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
                   dtype=tf.string, input_shape=[], output_shape=[50]),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
])

In [142]:
# Compile for binary classification and print the layer/parameter summary
model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_1 (KerasLayer)   (None, 50)                48190600  
_________________________________________________________________
dense_15 (Dense)             (None, 128)               6528      
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 129       
Total params: 48,197,257
Trainable params: 6,657
Non-trainable params: 48,190,600
_________________________________________________________________


In [143]:
# load dataset
# NOTE(review): this repeats the tfds.load call made earlier in the notebook.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits["train"].num_examples

# batch, prefetch (raw strings are fed directly; no preprocess/encode step here)
batch_size = 32
train_set = datasets["train"].batch(batch_size).prefetch(1)

In [144]:
# fit
# history = model.fit(train_set, epochs=5)
# model.predict(["This is my worst movie ever"])

## 16.3 신경망 기계 번역을 위한 인코더-디코더 네트워크

In [145]:
import tensorflow_addons as tfa

In [146]:
# Encoder–decoder model built with TensorFlow Addons.
# NOTE(review): vocab_size and embed_size are reused from the
# sentiment-analysis section above.

# Three inputs: encoder token ids, decoder token ids, and one sequence length
# per example.
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

# A single embedding layer is shared by the encoder and the decoder.
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

# The encoder's final LSTM states (h, c) become the decoder's initial state.
encoder = keras.layers.LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

# Decoder: an LSTM cell whose outputs go through a dense layer with one unit
# per vocabulary entry.
decoder_cell = keras.layers.LSTMCell(512)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler,
                                                 output_layer=output_layer)

final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings, initial_state=encoder_state,
    sequence_length=sequence_lengths)
# Turn the decoder's outputs into probabilities
Y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = keras.Model(inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
                    outputs=[Y_proba])

In [147]:
# Targets are integer token ids, hence the sparse categorical loss
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

In [148]:
# Random toy data for the encoder–decoder model: 1000 source sequences of
# length 10 and 1000 target sequences of length 15, ids drawn from [0, 100).
n_samples, encoder_steps, decoder_steps = 1000, 10, 15
X = np.random.randint(100, size=n_samples * encoder_steps).reshape(
    n_samples, encoder_steps)
Y = np.random.randint(100, size=n_samples * decoder_steps).reshape(
    n_samples, decoder_steps)
# Decoder inputs: the targets shifted right by one step, with a column of
# zeros prepended.
shifted_targets = Y[:, :-1]
X_decoder = np.c_[np.zeros((n_samples, 1)), shifted_targets]
# Every target sequence has the same length.
seq_lengths = np.full([n_samples], decoder_steps)

# history = model.fit([X, X_decoder, seq_lengths], Y, epochs=3)

### 16.3.1 양방향 RNN

In [149]:
# Wrap a 10-unit GRU in a Bidirectional layer
gru_layer = keras.layers.GRU(10, return_sequences=True)
bidir_layer = keras.layers.Bidirectional(gru_layer)

In [150]:
# Apply the bidirectional layer to a random (1, 3, 4) sample; the recorded
# output shape is (1, 3, 20), i.e. twice the 10 GRU units on the last axis
sample = np.random.rand(1, 3, 4)
bidir_layer(sample).shape

TensorShape([1, 3, 20])

### 16.3.2 빔 검색

```python
beam_width = 10
decoder = tfa.seq2seq.beam_search_decoder.BeamSearchDecoder(
    cell=decoder_cell, beam_width=beam_width, output_layer=output_layer
)
decoder_initial_state = tfa.seq2seq.beam_search_decoder.tile_batch(
    encoder_state, multiplier=beam_width
)
outputs, _, _ = decoder(
    embedding_decoder, start_tokens=start_tokens, end_token=end_token,
    initial_state=decoder_initial_state
)
```

## 16.4 어텐션 메커니즘