In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import tensorflow as tf
import numpy as np
import pandas as pd
import nltk
import os
import time

# Read the corpus. The encoding is passed explicitly so decoding does not depend
# on the platform's default locale, and the context manager closes the file
# handle as soon as the text has been read.
with open('/kaggle/input/shakespeare-text/text.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print(text[:200])

# Vocabulary: the sorted set of distinct characters occurring in the corpus.
vocabulary = sorted(set(text))
print('No. of unique characters: {}'.format(len(vocabulary)))

# Character -> index mapping, and the whole corpus encoded as an integer array.
char2index = {c: i for i, c in enumerate(vocabulary)}
int_text = np.array([char2index[c] for c in text])

# Index -> character mapping (a NumPy array, so it can be indexed with index arrays).
index2char = np.array(vocabulary)

# Testing: show the first (at most 65) entries of the character mapping.
# char2index was filled in vocabulary order, so slicing vocabulary visits the
# same characters in the same order as iterating the dict.
print("Character to Index: \n")
for char in vocabulary[:65]:
    print('  {:4s}: {:3d}'.format(repr(char), char2index[char]))

print("\nInput text to Integer: \n")
print('{} mapped to {}'.format(repr(text[:20]), int_text[:20]))  # repr() makes whitespace visible

seq_length = 150  # max number of characters that can be fed as a single input

# Every training example consumes seq_length + 1 characters (the input plus the
# one-step-shifted target), so the number of examples is the number of whole
# chunks of that size in the corpus, not the number of characters.
examples_per_epoch = len(text) // (seq_length + 1)

# Converts the encoded text (vector) into a stream of character indices
# Reference: https://www.tensorflow.org/api_docs/python/tf/data/Dataset
char_dataset = tf.data.Dataset.from_tensor_slices(int_text)

# Group the individual characters into sequences of seq_length + 1 (character RNN);
# a trailing chunk shorter than that is dropped.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Testing: first ten characters of the stream
print("Character Stream: \n")
for i in char_dataset.take(10):
    print(index2char[i.numpy()])

# Testing: first ten sequences, decoded back to text
print("\nSequence: \n")
for i in sequences.take(10):
    print(repr(''.join(index2char[i.numpy()])))  # repr() keeps newlines visible instead of rendering them

def create_input_target_pair(chunk):
    """Split one chunk into an (input, target) pair shifted by one position.

    Args:
        chunk: A sliceable sequence of items.

    Returns:
        A tuple ``(input_text, target_text)`` where ``input_text`` is the chunk
        without its last item and ``target_text`` is the chunk without its
        first item, so ``target_text[k]`` is the item following ``input_text[k]``.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk in `sequences` into an (input, target) pair.
dataset = sequences.map(create_input_target_pair)

# Testing: decode and display the first pair
for input_example, target_example in dataset.take(1):
    input_chars = index2char[input_example.numpy()]
    target_chars = index2char[target_example.numpy()]
    print('Input data: ', repr(''.join(input_chars)))
    print('Target data:', repr(''.join(target_chars)))

# Creating batches
BATCH_SIZE = 64

# Size of the buffer used to shuffle the dataset
# Reference: https://stackoverflow.com/questions/46444018/meaning-of-buffer-size-in-dataset-map-dataset-prefetch-and-dataset-shuffle
BUFFER_SIZE = 10000

# Shuffle the pairs, then group them into fixed-size batches
# (an incomplete final batch is dropped).
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

# Model hyperparameters
vocab_size = len(vocabulary)
embedding_dim = 256
rnn_units = 1024


def build_model_lstm(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level LSTM network.

    The model is a ``tf.keras.Sequential`` stack of three layers, in order:
    an Embedding layer, a stateful LSTM that returns its full output
    sequence, and a Dense layer created without an explicit activation.

    Args:
        vocab_size: Passed as the Embedding input dimension and as the number
            of Dense units.
        embedding_dim: Passed as the Embedding output dimension.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Batch size fixed via ``batch_input_shape``; the sequence
            length is left as ``None``.

    Returns:
        The constructed (not compiled) ``tf.keras.Sequential`` model.
    """
    embedding_layer = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    lstm_layer = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    output_layer = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding_layer, lstm_layer, output_layer])

# Reference for theory: https://jhui.github.io/2017/03/15/RNN-LSTM-GRU/

# Build the model with the training batch size.
lstm_model = build_model_lstm(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

# Testing: run one batch through the untrained model and check the output shape
expected_shape = (BATCH_SIZE, seq_length, vocab_size)
for input_example_batch, target_example_batch in dataset.take(1):
    example_prediction = lstm_model(input_example_batch)
    assert example_prediction.shape == expected_shape, "Shape error"

# Draw one sample per row of the first sequence's predictions, then drop the
# trailing sample axis and convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_prediction[0], num_samples=1),
    axis=-1,
).numpy()

def loss(labels, logits):
    """Sparse categorical cross-entropy between labels and raw model outputs.

    Args:
        labels: Target values, forwarded as the first argument of
            ``sparse_categorical_crossentropy``.
        logits: Model outputs, forwarded as the second argument and treated
            as logits (``from_logits=True``).

    Returns:
        Whatever ``tf.keras.losses.sparse_categorical_crossentropy`` returns
        for these arguments.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
    return crossentropy

#Loss Function reference: https://www.dlology.com/blog/how-to-use-keras-sparse_categorical_crossentropy/

# Loss of the untrained model on the example batch
example_loss = loss(target_example_batch, example_prediction)
print("Prediction shape: ", example_prediction.shape)
print("Loss:      ", example_loss.numpy().mean())

lstm_model.compile(optimizer='adam', loss=loss)

# Save the model weights through a checkpoint callback; "{epoch}" in the file
# name is filled in by the callback.
lstm_dir_checkpoints = './training_checkpoints_LSTM'
checkpoint_prefix = os.path.join(lstm_dir_checkpoints, "checkpt_{epoch}")  # name
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

EPOCHS = 50  # increase the number of epochs for better results (lower loss)

history = lstm_model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

# Look the latest checkpoint up once and reuse it. tf.train.latest_checkpoint
# returns None when the directory holds no checkpoint, so fail with a clear
# message here instead of passing None on to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(lstm_dir_checkpoints)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; train the model before restoring weights.".format(
            lstm_dir_checkpoints
        )
    )
print("Restoring weights from:", latest_checkpoint)

# Rebuild the same architecture with batch size 1 and load the trained weights
# into it.
lstm_model = build_model_lstm(vocab_size, embedding_dim, rnn_units, batch_size=1)
lstm_model.load_weights(latest_checkpoint)
lstm_model.build(tf.TensorShape([1, None]))

lstm_model.summary()

def generate_text(model, start_string, num_generate=1000, temperature=0.5):
    """Generate text one character at a time from a seed string.

    The seed is encoded with ``char2index`` and fed to ``model``; after that,
    each sampled character is fed back as the next single-character input. The
    model's states are reset once before generation starts.

    Args:
        model: Callable model exposing ``reset_states()``. Its output must have
            a leading batch dimension of size 1, because that dimension is
            squeezed away before sampling.
        start_string: Non-empty seed text; every character must be a key of
            ``char2index``.
        num_generate: Number of characters to generate (default 1000).
        temperature: Positive divisor applied to the predictions before
            sampling (default 0.5). Low temperatures result in more
            predictable text, higher temperatures in more surprising text.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character that is not in
            ``char2index``.
    """
    # An empty seed would feed a zero-length sequence to the model and leave
    # nothing to sample from, so reject it up front with a clear message.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Report every out-of-vocabulary character at once instead of failing on
    # the first one with a bare KeyError.
    unknown_chars = sorted({s for s in start_string if s not in char2index})
    if unknown_chars:
        raise KeyError(
            "start_string contains characters outside the vocabulary: {}".format(unknown_chars)
        )

    # Vectorise the seed and add a batch dimension of size 1.
    input_eval = tf.expand_dims([char2index[s] for s in start_string], 0)

    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Scale by the temperature, sample one id per row from a categorical
        # distribution, and keep the sample drawn for the last row.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The sampled character becomes the next input; the model is called
        # again without resetting its states.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(index2char[predicted_id])

    return start_string + ''.join(text_generated)

# Prediction with user input: read a seed string, generate text from it with
# the restored model, and print the result.
lstm_test = input("Enter your starting string: ")
generated_text = generate_text(lstm_model, start_string=lstm_test)
print(generated_text)

/kaggle/input/shakespeare-text/text.txt
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you
No. of unique characters: 65
Character to Index: 

  '\n':   0
  ' ' :   1
  '!' :   2
  '$' :   3
  '&' :   4
  "'" :   5
  ',' :   6
  '-' :   7
  '.' :   8
  '3' :   9
  ':' :  10
  ';' :  11
  '?' :  12
  'A' :  13
  'B' :  14
  'C' :  15
  'D' :  16
  'E' :  17
  'F' :  18
  'G' :  19
  'H' :  20
  'I' :  21
  'J' :  22
  'K' :  23
  'L' :  24
  'M' :  25
  'N' :  26
  'O' :  27
  'P' :  28
  'Q' :  29
  'R' :  30
  'S' :  31
  'T' :  32
  'U' :  33
  'V' :  34
  'W' :  35
  'X' :  36
  'Y' :  37
  'Z' :  38
  'a' :  39
  'b' :  40
  'c' :  41
  'd' :  42
  'e' :  43
  'f' :  44
  'g' :  45
  'h' :  46
  'i' :  47
  'j' :  48
  'k' :  49
  'l' :  50
  'm' :  51
  'n' :  52
  'o' :  53
  'p' :  54
  'q' :  55
  'r' :  56
  's' :  57
  't' :  58


Enter your starting string:  romeo


romeos cannot oppress the
courtesy, and they are croop'dou do so increasy
Than can my banisher.

PETRUCHIO:
Whose word is this forlight story to be,
The late of England, there they mut and bid
The priest is not in the mutiate of the house,
When I have might have hid more resolved with
the sharp to the angel, which may make her the deed
As throne and holds upon him as the morn;
For she is secure a stranger, and most accursed
And all the justice of your mistress ere have done
But darkness in this place of hell his works it is.

NORTHUMBERLAND:
Nor I.

CLARENCE:
We have anoney bore me?

BIONDELLO:
When we breathed depated the morning dried.
And, here's no more of them all; then she may, and such as you
Have the nightingale: it was too sudden'd with the night,
When calm death in heaven.

HENRY BOLINGBROKE:
Cousin, stand from me, sir; and therefore fire
Priers of the mind to play at the goose?

MERCUTIO:
Why, sir, I spake like not of many of mine: then, as I love myself,
That I might steal 