In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras 

In [3]:
# Read the whole corpus into memory. The context manager closes the file handle
# as soon as the read finishes, and the explicit encoding keeps the decoded
# text independent of the platform's default locale.
with open("shakespeare.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [4]:
len(text)

5445609

In [5]:
print(text[:500])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bu


In [6]:
# Sorted list of the distinct characters in the corpus; a character's position
# in this list is the integer id used for it by the lookup tables built later.
vocab = sorted(set(text))

In [7]:
len(vocab)

84

In [8]:
# Forward lookup table: each vocabulary character -> its index in `vocab`.
char_to_ind = dict(zip(vocab, range(len(vocab))))

In [9]:
char_to_ind["H"]

33

In [10]:
# Translate the corpus into integer ids, one per character, as a 1-D array.
encoded_text = np.array(list(map(char_to_ind.__getitem__, text)))

In [11]:
encoded_text[:50]

array([ 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, 12,  0,  1,  1, 31, 73, 70, 68,  1, 61, 56, 64,
       73, 60, 74, 75,  1, 58, 73, 60, 56, 75, 76, 73, 60, 74,  1, 78])

In [12]:
encoded_text.shape

(5445609,)

In [13]:
# Number of characters in each training input sequence.
seq_len = 120

# How many complete chunks of seq_len + 1 characters fit in the corpus
# (45005 for this text). These are sequences, not mini-batches.
total_seq_len = len(text)//(seq_len+1)

total_seq_len

45005

In [14]:
# Dataset over the encoded corpus, yielding one character id per element.
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [15]:
# Group consecutive ids into chunks of seq_len + 1; the extra element lets each
# chunk be split into an input and a target that are both seq_len long.
# drop_remainder=True discards the final, shorter chunk.
sequences = char_dataset.batch(batch_size=seq_len+1, drop_remainder=True)

In [16]:
def create_seq_targets(seq):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that follows
    input[i]. For example "hello my name" gives
    ("hello my nam", "ello my name").
    """
    return seq[:-1], seq[1:]

In [17]:
# Turn every chunk into an (input, target) pair shifted by one character.
dataset = sequences.map(create_seq_targets)

In [18]:
type(dataset)

tensorflow.python.data.ops.dataset_ops.MapDataset

In [19]:
# Reverse lookup as a NumPy array so that an array of ids can be used to index
# it and get the corresponding characters back in one step.
ind_to_char = np.array(vocab)
ind_to_char

array(['\n', ' ', '!', '"', '&', "'", '(', ')', ',', '-', '.', '0', '1',
       '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
       '[', ']', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
       'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
       'w', 'x', 'y', 'z', '|', '}'], dtype='<U1')

In [20]:
# Show the first two (input, target) pairs, each as raw ids and as decoded text.
for input_seq, target_seq in dataset.take(2):
    for ids in (input_seq.numpy(), target_seq.numpy()):
        print(ids)
        print("".join(ind_to_char[ids]))

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But 
[56 74  1 75 63 60  1 73 6

In [21]:
# Number of (input, target) sequence pairs per training batch.
batch_size = 128

In [22]:
# Shuffle buffer size: elements are drawn at random from a rolling buffer of
# this many sequence pairs.
buffer_size = 10000

# Build the batched training set from `sequences` rather than from `dataset`
# itself, so that re-running this cell does not batch an already-batched
# dataset. On a fresh run the result is the same pipeline as before:
# map -> shuffle -> batch, with the last incomplete batch dropped.
dataset = (
    sequences
    .map(create_seq_targets)
    .shuffle(buffer_size=buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [23]:
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>

In [24]:
# Model hyperparameters, passed to gen_model below.
# Number of distinct characters; sizes both the embedding table and the output layer.
vocab_size = len(vocab)
# Width of the embedding vector learned for each character.
embed_dim = 64
# Number of units in the GRU layer.
rnn_neurons = 1024

In [25]:
from keras.losses import sparse_categorical_crossentropy

In [26]:
def sparse_cat_loss(y_true, y_pred):
    """Sparse categorical cross-entropy for a model that outputs raw logits.

    y_true holds the integer target ids and y_pred holds the unnormalised
    scores from the final Dense layer; from_logits=True tells the loss that
    y_pred has not been passed through a softmax.
    """
    loss = sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    return loss

In [27]:
def gen_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    """Build and compile the character-level GRU model.

    Layers, in order: an Embedding mapping each of the vocab_size ids to an
    embed_dim vector, a stateful GRU with rnn_neurons units that returns its
    output at every time step, and a Dense layer producing vocab_size logits
    per step. The batch dimension is fixed to batch_size (a stateful RNN needs
    a known batch size); the sequence length is left unspecified.

    Returns the model compiled with the Adam optimizer and sparse_cat_loss.
    """
    embedding = keras.layers.Embedding(
        vocab_size, embed_dim, batch_input_shape=[batch_size, None]
    )
    gru = keras.layers.GRU(
        rnn_neurons,
        return_sequences=True,
        stateful=True,
        recurrent_initializer="glorot_uniform",
    )
    logits = keras.layers.Dense(vocab_size)

    model = keras.Sequential([embedding, gru, logits])
    model.compile("adam", loss=sparse_cat_loss)
    return model

In [28]:
model = gen_model(vocab_size = vocab_size, embed_dim = embed_dim, rnn_neurons = rnn_neurons, batch_size = batch_size)

In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 64)           5376      
_________________________________________________________________
gru (GRU)                    (128, None, 1024)         3348480   
_________________________________________________________________
dense (Dense)                (128, None, 84)           86100     
Total params: 3,439,956
Trainable params: 3,439,956
Non-trainable params: 0
_________________________________________________________________


Note: set the runtime type to GPU (in Colab: Runtime → Change runtime type) before training the model.

In [33]:
# model.fit(dataset, epochs=30)

In [32]:
# model.save("GColab_Shakespeare_Textgen.h5")