In [2]:
from tensorflow import keras
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

In [104]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [105]:
tokenizer.texts_to_sequences(["First"])

[[20, 6, 9, 8, 3]]

In [106]:
tokenizer.sequences_to_texts([[20,6,9,8,3]])

['f i r s t']

In [107]:
max_id = len(tokenizer.word_index)
print("The number of distinct characters: ", max_id)

The number of distinct characters:  39


In [108]:
dataset_size = tokenizer.document_count
print("Total number of characters: ", dataset_size)

Total number of characters:  1115394


In [109]:
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1 #From 0 to 38, starts at 1
print(encoded)

[19  5  8 ... 20 26 10]


In [110]:
train_size = dataset_size* 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

We can't train the entire dataset for an RNN, as it will be unrolled as a single record, rather we batch it, and so we use the window()

In [111]:
n_steps = 100
window_length = n_steps + 1 #target = input shifted 1 character ahead
dataset = dataset.window(window_length, shift=1,drop_remainder=True)

In [112]:
#Lets flatten the dataset
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [113]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:,:-1], windows[:,1:]))

In [114]:
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch,depth=max_id), Y_batch))

In [121]:
dataset = dataset.prefetch(1)
dataset

<PrefetchDataset shapes: ((None, None, 39), (None, None)), types: (tf.float32, tf.int64)>

In [120]:
print(next(iter(dataset)))

(<tf.Tensor: shape=(32, 100, 39), dtype=float32, numpy=
array([[[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 1., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [1., 0.

In [122]:
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

In [123]:
model.compile(loss=keras.losses.sparse_categorical_crossentropy, optimizer=keras.optimizers.Adam())

In [124]:
history = model.fit(dataset, epochs=20)

Epoch 1/20
    422/Unknown - 93s 220ms/step - loss: 2.4561

KeyboardInterrupt: 

You can also create a stateful RNN, which takes the output of the one and uses it for the input for the next, however this would mean that your batches would need to be non overlapping chunks, also the shift should be set to the n_steps and you cannot shuffle.

In [127]:
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset = dataset.batch(1)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
dataset = dataset.prefetch(1)

In [125]:
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2, recurrent_dropout=0.2, 
                    batch_input_shape=[batch_size, None, max_id]),
    keras.layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2, recurrent_dropout=0.2, 
                    batch_input_shape=[batch_size, None, max_id]),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])

In [126]:
# We need to reset the states before we go to the beginning of the text

class ResetStatesCallback(keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs):
        self.model.reset_states()

In [128]:
model.compile(loss=keras.losses.sparse_categorical_crossentropy, optimizer=keras.optimizers.Adam())
model.fit(dataset, epochs=50, callbacks=[ResetStatesCallback()])

Epoch 1/50
      1/Unknown - 4s 4s/step

InvalidArgumentError:  Specified a list with shape [32,39] from a tensor with shape [1,39]
	 [[node sequential_1/gru_2/TensorArrayUnstack/TensorListFromTensor (defined at <ipython-input-128-94b76f3bbc7a>:2) ]] [Op:__inference_distributed_function_115817]

Function call stack:
distributed_function


In [3]:
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
print(X_train[0][:10])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]


In [4]:
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}
for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
    id_to_word[id_] = token

" ".join([id_to_word[id_] for id_ in X_train[0][:10]])

'<sos> this film was just brilliant casting location scenery story'

In [6]:
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits["train"].num_examples

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /Users/jjhira/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m




Shuffling and writing examples to /Users/jjhira/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete634J37/imdb_reviews-train.tfrecord
Shuffling and writing examples to /Users/jjhira/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete634J37/imdb_reviews-test.tfrecord
Shuffling and writing examples to /Users/jjhira/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete634J37/imdb_reviews-unsupervised.tfrecord
[1mDataset imdb_reviews downloaded and prepared to /Users/jjhira/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))

In [7]:
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, b"<br\\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

In [10]:
from collections import Counter
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

print(vocabulary.most_common()[:3])

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]


In [11]:
vocab_size = 10000
truncated_vocabulary = [word for word, count in vocabulary.most_common()[:vocab_size]]

In [12]:
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 10000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [13]:
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 18053]])>

In [14]:
def encode_words(X_batch, y_batch):
    return table.lookup(X_batch), y_batch

train_set = datasets["train"].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [18]:
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128,),
    keras.layers.Dense(1, activation=keras.activations.sigmoid)
])

In [19]:
model.compile(loss=keras.losses.binary_crossentropy, optimizer=keras.optimizers.Adam(), metrics=["accuracy"])
history = model.fit(train_set, epochs=5)

Epoch 1/5
    412/Unknown - 55s 134ms/step - loss: 0.6905 - accuracy: 0.5108

KeyboardInterrupt: 

You can also use something called tensorflow hub which has plenty of pretrained models which can then be used inside your model.

In [3]:
import tensorflow_hub as hub

model = keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1", dtype=tf.string, input_shape=[], output_shape=[50]),
    keras.layers.Dense(128, activation=keras.activations.relu),
    keras.layers.Dense(1, activation=keras.activations.sigmoid)
])

KeyboardInterrupt: 

In [None]:
model.compile(loss=keras.losses.binary_crossentropy, optimizer=keras.optimizers.Adam(), metrics=["accuracy"])

Bidirectional RNN,which can look back and forth

In [None]:
keras.layers.Bidirectional(keras.layers.GRU(10, return_sequences=True))