In [1]:
import numpy as np
import tensorflow as tf
import os
import keras.preprocessing.sequence
from tensorflow import keras

In [2]:
# Download the Shakespeare text from Google Cloud Storage with
# keras.utils.get_file. The returned value is kept in `file` and is what
# the next cell passes to open().
file = keras.utils.get_file('shakespeare.txt','https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [3]:
# Read the downloaded file as bytes and decode it as UTF-8 into one Python
# string. The `with` block guarantees the file handle is closed afterwards.
with open(file,'rb') as corpus_file:
  text = corpus_file.read().decode(encoding='utf-8')
print(text[:200])  # preview of the first 200 characters
print(len(text))   # total number of characters in the corpus

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you
1115394


In [4]:
vocab = sorted(set(text)) # sorted list of the unique characters in the text
char2id = {l:no for no, l in enumerate(vocab)} # lookup table: character -> integer id (its position in vocab)
id2char = np.array(vocab) # reverse lookup: indexing with an id returns the character
print(vocab[:200])
print(char2id)
print(id2char)


['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' '

In [5]:
# Encode a string as an array of integer character ids.
def text_to_int(text):
  """Map every character of `text` to its id in `char2id`.

  Returns a NumPy array holding one id per character, in the same order
  as the characters appear. A character that is not a key of `char2id`
  raises KeyError.
  """
  ids = []
  for ch in text:
    ids.append(char2id[ch])
  return np.array(ids)
def int_to_text(text):
  """Decode integer character ids back into a string.

  Args:
    text: a single id, a sequence/array of ids, or an object exposing a
      `.numpy()` method (e.g. an eager tensor) that yields such ids.

  Returns:
    The characters looked up in `id2char`, joined into one string.
  """
  # Convert tensor-like inputs to NumPy and actually use the result; inputs
  # without a `.numpy` attribute are indexed as they are. Only the missing
  # attribute is tolerated, so unrelated errors are no longer hidden.
  try:
    ids = text.numpy()
  except AttributeError:
    ids = text
  return ''.join(id2char[ids])

In [6]:
# Encode the whole text as integer character ids and preview the first 15.
text2int = text_to_int(text)
text2int[:15]

array([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0])

In [7]:
# Sanity checks: size of the id -> character table, round-trip decoding of
# the first 15 ids, and the character stored at vocabulary index 18.
print(id2char.shape)
print(int_to_text(text2int[:15]))
print(vocab[18])

(65,)
First Citizen:

F


In [8]:
# sequence_length is the number of characters in each training input.
# Chunks are made sequence_length+1 characters long so that an input and a
# target of sequence_length characters each can be sliced from one chunk.
sequence_length = 100
examples_per_epoch = len(text)//(sequence_length+1) # number of full (sequence_length+1)-character chunks in the text (not a number of epochs)

char_dataset = tf.data.Dataset.from_tensor_slices(text2int)
# print(list(char_dataset.as_numpy_iterator()))
sequence = char_dataset.batch(sequence_length+1,drop_remainder=True) # group the ids into chunks of sequence_length+1 and drop the final, shorter chunk


In [None]:
# Split one chunk into an (input, target) training pair.
def split_input(chunk):
  """Return `(chunk[:-1], chunk[1:])`.

  The first item is the chunk without its last element (model input); the
  second is the chunk without its first element (the target, i.e. the
  input shifted one position to the left).
  """
  return chunk[:-1], chunk[1:]

# Apply split_input to every chunk so each element becomes an
# (input, target) pair, then display the dataset repr to check the shapes.
dataset = sequence.map(split_input)
dataset.take(2)


<TakeDataset shapes: ((100,), (100,)), types: (tf.int64, tf.int64)>

In [None]:
# Decode and print the first two (input, target) pairs; l is the input ids
# and n the target ids of each pair.
for l,n in dataset.take(2):
  print(f'input:{int_to_text(l)}','\n')
  print(f'output:{int_to_text(n)}')

input:First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 

output:irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 
input:are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you  

output:re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


In [None]:
# Training hyperparameters and the batched training dataset
# (training itself happens in a later cell).
batch_size = 32
vocab_size = len(vocab) # number of distinct characters
rnn_units = 1024
embedding_dim = 256

# Shuffle with a buffer of 10000 elements, then group into batches of
# batch_size, dropping a final partial batch.
train_data = dataset.shuffle(10000).batch(batch_size,drop_remainder=True)

In [None]:
def create_model(vocab_size,batch_size,rnn_units,embedding_dim):
  """Build the character-level model: Embedding -> LSTM -> Dense.

  Args:
    vocab_size: number of distinct characters; the Dense layer emits one
      logit per character.
    batch_size: batch dimension fixed in the Embedding's
      `batch_input_shape` (the LSTM is created with stateful=True).
    rnn_units: number of units in the LSTM layer.
    embedding_dim: length of each embedding vector.

  Returns:
    An uncompiled `keras.models.Sequential` model.
  """
  model = keras.models.Sequential()
  # NOTE(review): the table has vocab_size+1 rows, one more than the ids
  # 0..vocab_size-1 need. Kept unchanged so weights saved from this
  # architecture still load.
  model.add(keras.layers.Embedding(vocab_size+1,embedding_dim,batch_input_shape=[batch_size,None]))
  # Honour the rnn_units argument (it used to be ignored in favour of a
  # hard-coded 1024).
  model.add(keras.layers.LSTM(rnn_units,return_sequences=True,stateful=True,recurrent_initializer='glorot_uniform'))
  model.add(keras.layers.Dense(vocab_size))
  return model

# Build the training model from the hyperparameters defined earlier and
# print its layer summary.
model = create_model(vocab_size,batch_size,rnn_units,embedding_dim)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (32, None, 256)           16896     
                                                                 
 lstm (LSTM)                 (32, None, 1024)          5246976   
                                                                 
 dense (Dense)               (32, None, 65)            66625     
                                                                 
Total params: 5,330,497
Trainable params: 5,330,497
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Display the dataset repr to check the batched element shapes.
train_data.take(2)

<TakeDataset shapes: ((32, 100), (32, 100)), types: (tf.int64, tf.int64)>

In [None]:
# Run the (still untrained) model on the first batch.
# The printed shape is (batch size, sequence length, vocabulary size):
# one vector of 65 logits for each of the 100 positions of the 32 sequences.
# `training` holds the input ids and `testing` the target ids of the batch;
# only the inputs are used here.
for training,testing in train_data.take(1):
  predicted_val = model(training)
  print(predicted_val.shape)
  print(predicted_val[0])

(32, 100, 65)
tf.Tensor(
[[-0.00114096 -0.00890215 -0.00502146 ...  0.00069315 -0.00154731
   0.00291908]
 [ 0.00216486 -0.01031527 -0.00512168 ...  0.00121373 -0.00083322
   0.00103331]
 [ 0.00048114 -0.01574672  0.00030828 ...  0.00710972 -0.00610176
   0.001842  ]
 ...
 [ 0.00932314 -0.00120466  0.00075075 ... -0.00386311 -0.00023657
   0.00653169]
 [ 0.00741927 -0.00341913 -0.00326885 ... -0.00458156  0.00500496
   0.00694083]
 [ 0.00572156 -0.00730574 -0.00156941 ...  0.00052032 -0.00093141
   0.00639114]], shape=(100, 65), dtype=float32)


In [None]:
# Sample one character id per position from the logits of the first
# sequence in the batch, then decode the sampled ids to text.
sampling_pred = tf.random.categorical(predicted_val[0],1)
print(sampling_pred)
sampled_indice = np.reshape(sampling_pred,(1,-1))[0] # flatten the column of samples into a 1-D array of ids
print(sampled_indice)
print(sampled_indice.shape)
print(int_to_text(sampled_indice))

tf.Tensor(
[[ 8]
 [24]
 [26]
 [14]
 [59]
 [16]
 [ 6]
 [59]
 [64]
 [28]
 [ 1]
 [ 5]
 [ 0]
 [59]
 [13]
 [31]
 [10]
 [62]
 [10]
 [50]
 [ 6]
 [17]
 [62]
 [57]
 [ 0]
 [14]
 [19]
 [35]
 [51]
 [48]
 [13]
 [38]
 [ 0]
 [33]
 [59]
 [ 1]
 [ 7]
 [27]
 [26]
 [ 5]
 [51]
 [56]
 [54]
 [58]
 [ 0]
 [36]
 [49]
 [42]
 [52]
 [64]
 [15]
 [33]
 [31]
 [11]
 [42]
 [17]
 [57]
 [50]
 [56]
 [36]
 [54]
 [40]
 [ 2]
 [11]
 [14]
 [48]
 [27]
 [44]
 [17]
 [45]
 [16]
 [44]
 [59]
 [15]
 [10]
 [18]
 [ 9]
 [26]
 [18]
 [15]
 [ 7]
 [32]
 [16]
 [30]
 [30]
 [38]
 [59]
 [29]
 [58]
 [45]
 [28]
 [19]
 [36]
 [10]
 [32]
 [16]
 [28]
 [17]
 [39]
 [41]], shape=(100, 1), dtype=int64)
[ 8 24 26 14 59 16  6 59 64 28  1  5  0 59 13 31 10 62 10 50  6 17 62 57
  0 14 19 35 51 48 13 38  0 33 59  1  7 27 26  5 51 56 54 58  0 36 49 42
 52 64 15 33 31 11 42 17 57 50 56 36 54 40  2 11 14 48 27 44 17 45 16 44
 59 15 10 18  9 26 18 15  7 32 16 30 30 38 59 29 58 45 28 19 36 10 32 16
 28 17 39 41]
(100,)
.LNBuD,uzP '
uAS:x:l,Exs
BGWmjAZ
Uu -ON'mrpt


In [None]:
# Training loss: sparse categorical cross-entropy computed on raw logits.
def loss(label,logit):
  """Return the sparse categorical cross-entropy of `logit` against `label`.

  `label` holds the integer target ids and `logit` the unnormalised model
  outputs, hence `from_logits=True`.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=label,
      y_pred=logit,
      from_logits=True,
  )

In [None]:
# Compile with the Adam optimizer and the `loss` function defined in this notebook.
model.compile(optimizer='adam',loss=loss)

In [None]:
# Checkpoint callback that saves only the model weights during training.
# The {epoch} placeholder in the path gives each checkpoint an epoch-specific
# name under ./training_checkpoint.

checkpoint_path = os.path.join('./training_checkpoint','ckpt_{epoch}')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,save_weights_only=True)

In [None]:
# Train for 25 epochs on the shuffled batches; the checkpoint callback
# writes the weights to ./training_checkpoint as training progresses.
model.fit(train_data,callbacks=[checkpoint_callback],epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f7620571950>

In [None]:
# Rebuild the same architecture with a batch size of 1 for generation and
# restore the trained weights from the newest checkpoint in
# ./training_checkpoint. Note that this rebinds `model`: from here on it
# refers to the batch-size-1 model, not the one that was trained.
model = create_model(vocab_size,1,rnn_units,embedding_dim)
model.load_weights(tf.train.latest_checkpoint('./training_checkpoint'))
model.build(tf.TensorShape([1,None]))

In [None]:
def generate_text(model,starting_string,number_to_generate=200,temperature=1.0):
  """Generate text by repeatedly sampling the next character from `model`.

  Args:
    model: the batch-size-1 model; called on a (1, length) batch of
      character ids and expected to return per-position logits.
    starting_string: non-empty seed text; every character must be a key
      of `char2id` (otherwise `text_to_int` raises KeyError).
    number_to_generate: how many characters to append to the seed.
    temperature: positive divisor applied to the logits before sampling;
      values below 1 make the sampling more conservative, values above 1
      more random. The default 1.0 leaves the logits unchanged.

  Returns:
    `starting_string` followed by the generated characters.

  Raises:
    ValueError: if `starting_string` is empty or `temperature` is not positive.
  """
  if not starting_string:
    # An empty seed yields an empty batch and fails later with an obscure
    # indexing error, so reject it up front.
    raise ValueError('starting_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be a positive number')

  # Encode the seed and add the batch dimension. The local is not called
  # `input`, so the builtin stays usable.
  model_input = tf.expand_dims(text_to_int(starting_string),0)
  generated_text = []
  # Clear recurrent state left over from a previous call so that each
  # generation starts fresh.
  model.reset_states()
  for _ in range(number_to_generate):
    pred = model(model_input)
    # Drop the batch dimension: one row of logits per input position.
    pred = tf.squeeze(pred,0)
    pred = pred / temperature
    # Sample an id for every position and keep the one for the last position.
    pred_ind = tf.random.categorical(pred,1)[-1,0].numpy()
    # Feed only the sampled character back in; the stateful LSTM carries
    # the context of everything seen so far.
    model_input = tf.expand_dims([pred_ind],0)
    generated_text.append(int_to_text(pred_ind))
  return (starting_string + ''.join(generated_text))


In [None]:
# Prompt for a seed string and print the text generated from it.
# Every typed character must be a key of char2id, otherwise text_to_int
# (called inside generate_text) raises KeyError.
starting_string = input('type in a word:')
print(generate_text(model,starting_string))

type in a word:kunle
kunled!

LADY CAPULET:
I would it gentlemen. More lord to Lord Hunger:
You stoop it, truly if thou darest.

KING EDWARD IV:
Take him up in Plantagenet. A plague o' both your house:
I'll look pale
To take o
