# Chapter 16

In [1]:
"""
Google Colaboratoryの場合のみ実行
"""
# Colab-only: google.colab is imported here, so this cell is meant to run inside Colab.
# Mounting Drive at /content/drive is what makes the Drive-based paths used below reachable.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Base directory (on the mounted Drive) that holds this notebook's data files.
# Keep the trailing slash: file names are appended to it by plain string concatenation.
colab_path = "/content/drive/MyDrive/100programming/pythonML/"

In [13]:
"""
tensorflow_datasets.features.textが使えない（tensorflow_datasets.deprecated.textでは使える）ので
その代わりにtensorflow-textを用いようと思ったけどよくわからんエラー出たので
tensorflow_datasets.deprecated.textを用いることにした。
"""
# !pip install tensorflow-text



In [14]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
# import tensorflow_text as tf_text
from collections import Counter
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, Dense, Embedding, GRU, LSTM, SimpleRNN

## 16.2

In [None]:
# Reproduce SimpleRNN's forward pass by hand and print it next to the layer's own output.
tf.random.set_seed(1)
rnn_layer = tf.keras.layers.SimpleRNN(units=2, use_bias=True, return_sequences=True)
rnn_layer.build(input_shape=(None, None, 5))
# The built layer exposes three weights; they are used below as the input matrix,
# the recurrent matrix (applied to the previous output) and the bias.
w_xh, w_oo, b_h = rnn_layer.weights
print("W_xh shape:", w_xh.shape)
print("W_oo shape:", w_oo.shape)
print("b_h shape:", b_h.shape)

# Three time steps with five features each.
x_seq = tf.convert_to_tensor([[1.0]*5, [2.0]*5, [3.0]*5], dtype=tf.float32)
# The layer is called on the same values with a leading axis of size 1 added.
output = rnn_layer(tf.reshape(x_seq, shape=(1, 3, 5)))
out_man = []
for t in range(len(x_seq)):
    xt = tf.reshape(x_seq[t], (1, 5))
    print(f"Time step {t} =>")
    print("   Input           :", xt.numpy())

    # Input contribution to the pre-activation: x_t @ W_xh + b_h.
    ht = tf.matmul(xt, w_xh) + b_h
    print("   Hidden          :", ht.numpy())

    # The recurrent term uses the previous step's manual output; zeros at the first step.
    if t > 0:
        prev_o = out_man[t - 1]
    else:
        prev_o = tf.zeros(shape=ht.shape)
    ot = ht + tf.matmul(prev_o, w_oo)
    ot = tf.math.tanh(ot)
    out_man.append(ot)
    print("   Output (manual) :", ot.numpy())
    # The label has no placeholder, so it is printed as a plain literal.
    print("   SimpleRNN output:", output[0][t].numpy())
    print()

W_xh shape: (5, 2)
W_oo shape: (2, 2)
b_h shape: (2,)
Time step 0 =>
   Input           : [[1. 1. 1. 1. 1.]]
   Hidden          : [[0.41464037 0.96012145]]
   Output (manual) : [[0.39240566 0.74433106]]
   SimpleRNN output: [0.39240566 0.74433106]

Time step 1 =>
   Input           : [[2. 2. 2. 2. 2.]]
   Hidden          : [[0.82928073 1.9202429 ]]
   Output (manual) : [[0.80116504 0.99129474]]
   SimpleRNN output: [0.80116504 0.99129474]

Time step 2 =>
   Input           : [[3. 3. 3. 3. 3.]]
   Hidden          : [[1.243921  2.8803642]]
   Output (manual) : [[0.95468265 0.99930704]]
   SimpleRNN output: [0.95468265 0.99930704]



## 16.3

In [18]:
# Read the review table from the Drive data directory.
df = pd.read_csv(f"{colab_path}movie_data.csv", encoding="utf-8")

# pop() removes the "sentiment" column from df and returns it as the label series.
target = df.pop("sentiment")
ds_raw = tf.data.Dataset.from_tensor_slices((df.values, target.values))
# Preview: first 50 bytes of the first feature of three examples, with their labels.
for ex in ds_raw.take(3):
    review, sentiment = ex
    tf.print(review.numpy()[0][:50], sentiment)

# Shuffle once with a fixed seed; reshuffle_each_iteration=False keeps the order
# stable across iterations so the take/skip splits below do not overlap.
tf.random.set_seed(1)
ds_raw = ds_raw.shuffle(50000, reshuffle_each_iteration=False)
# First 25000 examples -> test; of the remainder, first 20000 -> train, rest -> validation.
ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_test = ds_raw.take(25000)
ds_raw_valid = ds_raw_train_valid.skip(20000)
ds_raw_train = ds_raw_train_valid.take(20000)

b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0


In [19]:
# Count how often each token occurs across the training split.
token_counts = Counter()
tokenizer = tfds.deprecated.text.Tokenizer()  # differs from the textbook: uses the tfds.deprecated.text API
for example in ds_raw_train:
    review_bytes = example[0].numpy()[0]
    tokens = tokenizer.tokenize(review_bytes)
    token_counts.update(tokens)
# Number of distinct tokens seen.
print("Vocab-size:", len(token_counts))

Vocab-size: 87007


In [27]:
# Build the text-to-integer encoder from the tokens counted on the training split.
# NOTE(review): token_counts is a Counter; presumably the encoder iterates it and
# uses its keys as the vocabulary - confirm against the TokenTextEncoder API.
encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)
# Quick sanity check: encode a short sentence and show the resulting ids.
example_str = "This is an example"
print(encoder.encode(example_str))

def encode(text_tensor, label):
    """Encode one example's text with the module-level ``encoder``.

    Args:
        text_tensor: Object whose ``.numpy()`` result holds the text at index 0.
        label: Passed through unchanged.

    Returns:
        A tuple ``(encoded_text, label)`` where ``encoded_text`` is whatever
        ``encoder.encode`` returns for the text.
    """
    return encoder.encode(text_tensor.numpy()[0]), label

def encode_map_fn(text, label):
    """Wrap ``encode`` in ``tf.py_function`` so it can be passed to ``Dataset.map``.

    ``encode`` calls ``.numpy()`` on its input, so it is routed through
    ``tf.py_function``; both outputs are declared as int64.

    Args:
        text: Text element of a dataset example.
        label: Label element of a dataset example.

    Returns:
        The result of ``tf.py_function`` for ``encode(text, label)``.
    """
    output_dtypes = (tf.int64, tf.int64)
    return tf.py_function(encode, inp=[text, label], Tout=output_dtypes)

# Apply the text encoder to each raw split (train, validation, test - in that order).
ds_train, ds_valid, ds_test = (
    split.map(encode_map_fn)
    for split in (ds_raw_train, ds_raw_valid, ds_raw_test)
)
tf.random.set_seed(1)
# Encoded reviews have different lengths; show a few.
for example in ds_train.shuffle(1000).take(5):
    print("Sequence length:", example[0].shape)

# Small demo: eight individual examples...
ds_subset = ds_train.take(8)
for example in ds_subset:
    print("Individual size:", example[0].shape)

# ...grouped into batches of four, each padded to the longest sequence in that batch.
ds_batched = ds_subset.padded_batch(4, padded_shapes=([-1], []))
for batch in ds_batched:
    print("Batch dimension:", batch[0].shape)

# Final datasets: batches of 32 with per-batch padding of the token sequences.
train_data, valid_data, test_data = (
    split.padded_batch(32, padded_shapes=([-1], []))
    for split in (ds_train, ds_valid, ds_test)
)

[232, 9, 270, 1123]
Sequence length: (24,)
Sequence length: (179,)
Sequence length: (262,)
Sequence length: (535,)
Sequence length: (130,)
Individual size: (119,)
Individual size: (688,)
Individual size: (308,)
Individual size: (204,)
Individual size: (326,)
Individual size: (240,)
Individual size: (127,)
Individual size: (453,)
Batch dimension: (4, 688)
Batch dimension: (4, 453)


In [28]:
# Minimal model holding a single Embedding layer, to inspect its output shape and
# parameter count: 100 possible ids, 6-dimensional vectors, sequences of length 20.
model = Sequential([
    Embedding(input_dim=100, output_dim=6, input_length=20, name="embed-layer"),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, 20, 6)             600       
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________
