<a href="https://colab.research.google.com/github/krishnamani77/TF2_Notebooks/blob/master/char-rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import keras
import keras.utils

sh_url = 'https://homl.info/shakespeare'
sh_file = "sh.txt"
filepath = keras.utils.get_file(sh_file, sh_url)

Using TensorFlow backend.


Downloading data from https://homl.info/shakespeare


In [0]:
with open(filepath) as f:
  sh_txt = f.read()  

In [0]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts([sh_txt])

In [0]:
print(tokenizer.texts_to_sequences(['Hello world']))
print(tokenizer.sequences_to_texts([[7,2,12,12,4], [1,17,4,9,12,13]]))

[[7, 2, 12, 12, 4, 1, 17, 4, 9, 12, 13]]
['h e l l o', '  w o r l d']


In [0]:
max_id = len(tokenizer.word_index)
print(max_id)

39


In [0]:
print(tokenizer.word_index)

{' ': 1, 'e': 2, 't': 3, 'o': 4, 'a': 5, 'i': 6, 'h': 7, 's': 8, 'r': 9, 'n': 10, '\n': 11, 'l': 12, 'd': 13, 'u': 14, 'm': 15, 'y': 16, 'w': 17, ',': 18, 'c': 19, 'f': 20, 'g': 21, 'b': 22, 'p': 23, ':': 24, 'k': 25, 'v': 26, '.': 27, "'": 28, ';': 29, '?': 30, '!': 31, '-': 32, 'j': 33, 'q': 34, 'x': 35, 'z': 36, '3': 37, '&': 38, '$': 39}


In [0]:
import numpy as np
[encoded] = np.array(tokenizer.texts_to_sequences([sh_txt]))

In [0]:
print(encoded[10:20])

[36  2 10 24 11 22  2 20  4  9]


In [0]:
print(encoded.__class__)

<class 'numpy.ndarray'>


In [0]:
subseq = encoded[10:20]
subseq = subseq.reshape([subseq.shape[0],1])
print(tokenizer.sequences_to_texts(subseq))

['z', 'e', 'n', ':', '\n', 'b', 'e', 'f', 'o', 'r']


In [0]:
print(len(encoded))

1115394


In [0]:
dataset_size = len(encoded)
train_size = dataset_size * 90 // 100
val_size = dataset_size - train_size
X_train = encoded[:train_size]
X_val = encoded[:val_size]
print(len(X_train), len(X_val))

1003854 111540


In [0]:
n_steps = 100  # number of chars in a sequence
window_length = n_steps + 1  # target is the char just after n_steps.

In [0]:
def get_chunks(sequence, n_steps):
  # 'abcdefghijkl'  when n_steps = 2 should return ['ab','bc','cd','ef',....'jk'] and ['c','d',....'l']
  index = 0
  X = []
  y = []
  while index + n_steps <= len(sequence)-1:
    X.append(sequence[index:index+n_steps])
    y.append(sequence[index+n_steps])
    index += 1
  return X, y

In [0]:
X, y = get_chunks('abcdefghijkl', 3)
print(X, y)

['abc', 'bcd', 'cde', 'def', 'efg', 'fgh', 'ghi', 'hij', 'ijk'] ['d', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l']


In [0]:
X_train, y_train = get_chunks(X_train, 100)
X_val, y_val = get_chunks(X_val, 100)

In [0]:
print(X_train[876], y_train[876])
print(X_val[6578], y_val[6578])

[ 2  1  3  7  6  8  1 17  6  3  7 11  4 14  9  1 23  6 25  2  8 18  1  2
  9  2  1 17  2  1 22  2 19  4 15  2  1  9  5 25  2  8 24  1 20  4  9  1
  3  7  2  1 21  4 13  8  1 25 10  4 17  1  6 11  8 23  2  5 25  1  3  7
  6  8  1  6 10  1  7 14 10 21  2  9  1 20  4  9  1 22  9  2  5 13 18  1
 10  4  3  1] 6
[ 8  1  4 20  1  9  4 15  2  1  5  9  2  1  3  7  6  8  1 21  4  4 13  1
 22  2 12 12 16 18 11  5 10 13  1 16  4 14  1  3  7  2  1 15 14  3  6 10
  4 14  8  1 15  2 15 22  2  9  8 29  1 20  4  9  1  2 35  5 15  6 10  2
 11  3  7  2  6  9  1 19  4 14 10  8  2 12  8  1  5 10 13  1  3  7  2  6
  9  1 19  5] 9


In [0]:
total_chars = (len(X_train)*100+ len(y_train)+ len(X_val)*100+ len(y_val))
mem = total_chars/1024/1024/1024
print(mem)

0.10489914007484913


In [0]:
#for i in range(10):
#  print(''.join(c for c in tokenizer.sequences_to_texts(X_train[10+i].reshape([X_train[10+i].shape[0],1]))))
#  print(tokenizer.sequences_to_texts(y_train[10+i].reshape([1,1])))

In [0]:
import tensorflow as tf
def get_one_hot(X, max_id):
  return tf.one_hot(X, depth=max_id)

In [0]:
def get_batch(X,y,batch_size=32,oneHot=False,max_id=1):
  for offset in range(0, len(X), batch_size):
    if not oneHot:
      yield X[offset:offset+batch_size], y[offset:offset+batch_size]
    else:
      yield get_one_hot(X[offset:offset+batch_size], max_id=max_id), y[offset:offset+batch_size]

In [0]:
print(get_one_hot(X_train[0], max_id))

tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(100, 39), dtype=float32)


In [0]:
layer_input = keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id], dropout=0.2, recurrent_dropout=0.2)
layer_hidden1 = keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
layer_output = keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
model = keras.models.Sequential([layer_input, layer_hidden1, layer_output])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")


In [0]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_11 (GRU)                 (None, None, 128)         64512     
_________________________________________________________________
gru_12 (GRU)                 (None, None, 128)         98688     
_________________________________________________________________
time_distributed_5 (TimeDist (None, None, 39)          5031      
Total params: 168,231
Trainable params: 168,231
Non-trainable params: 0
_________________________________________________________________


In [0]:
train_generator = get_batch(X_train, y_train, batch_size=32, oneHot=True, max_id=max_id)
val_generator = get_batch(X_val, y_val, batch_size=32, oneHot=True, max_id=max_id)
#model.fit_generator(train_generator, epochs=20, validation_data=(X_val, y_val), steps_per_epoch=10, validation_steps=10)
model.fit_generator(train_generator, epochs=20, steps_per_epoch=10)

Epoch 1/20


ValueError: ignored