# Text Generation with Neural Networks

## Imports

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, GRU
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import TensorBoard
import time

## Get Text Data

This is the text we'll use as a basis for our generations: let's try to generate 'Shakespearean' texts.

This text is from Shakespeare's Sonnet 1. It's one of the 154 sonnets written by William Shakespeare that were first published in 1609. This particular sonnet, like many others, discusses themes of beauty, procreation, and the transient nature of life, urging the beautiful to reproduce so their beauty can live on through their offspring.

In [None]:
# Location of the training corpus (a relative path, resolved against the
# current working directory).
path_to_file = 'shakespeare.txt'
# Read the whole corpus inside a context manager so the file handle is closed
# deterministically, and pin the encoding so the decoded characters (and hence
# the vocabulary built from them) do not depend on the platform's locale.
with open(path_to_file, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
# Preview the beginning of the corpus: a header line, a blank line, then the
# first 600 characters of the text.
print('First 600 chars: \n', text[:600], sep='\n')

## Preparing textual data

We need to encode our data to give the model a proper numerical representation of our text.

In [None]:
# Collect the distinct characters that occur in the text and sort them, so
# every character has a stable position in the resulting list.
vocab = sorted(set(text))
# vocab  (uncomment to inspect)

### Text Vectorization

In [None]:
# Encoder lookup: maps every character of the vocabulary to a unique integer
# (its position in `vocab`). It is used to turn raw text into integer ids.
char_to_int = {char: index for index, char in enumerate(vocab)}
# char_to_int  (uncomment to inspect)

In [None]:
# Decoder lookup: an array whose position i holds the character assigned to
# integer i, so indexing it with a predicted id gives the character back.
int_to_char = np.asarray(vocab)
# int_to_char  (uncomment to inspect)

In [None]:
# Encode the whole corpus: look up the integer id of each character, in order,
# and pack the ids into one NumPy array aligned position-for-position with
# `text`.
encoded_text = np.array(
    [char_to_int[char] for char in text]
)
# encoded_text  (uncomment to inspect)

## Creating Training Batches

Training batches are a way of dividing the dataset into smaller, manageable groups of data points that are fed into a machine learning model during the training process.

In [None]:
# Number of input characters in a single training example.
seq_len = 120
# How many complete chunks of (seq_len + 1) characters fit in the corpus; each
# chunk carries one extra character beyond seq_len.
total_num_seq = len(text) // (seq_len + 1)

# Stream the encoded corpus one character id at a time ...
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)
# ... and group the ids into chunks of seq_len + 1, discarding a short tail.
sequences = char_dataset.batch(seq_len + 1, drop_remainder=True)

In [None]:
def create_seq_targets(seq):
    """Split one chunk into a model input and its next-item target.

    Args:
        seq: A sliceable sequence (here, one chunk of encoded character ids).

    Returns:
        A tuple ``(input_txt, target_txt)``: ``input_txt`` is ``seq`` without
        its last element and ``target_txt`` is ``seq`` without its first
        element, so each target item is the successor of the input item at
        the same position.
    """
    return seq[:-1], seq[1:]

In [None]:
# Turn every chunk in `sequences` into an (input, target) pair.
dataset = sequences.map(create_seq_targets)

In [None]:
# Number of (input, target) pairs per training batch.
batch_size = 128
# Size of the buffer the dataset shuffles within.
buffer_size = 10000

# Shuffle the examples first, then group them into batches.
dataset = dataset.shuffle(buffer_size)
# drop_remainder=True discards a final partial batch so every batch holds
# exactly batch_size examples.
dataset = dataset.batch(batch_size, drop_remainder=True)

## Creating the GRU Model

In [None]:
# Length of the vocabulary in chars (number of distinct characters in the text)
vocab_size = len(vocab)
# The embedding dimension (size of the vector each character id is mapped to)
embed_dim = 64
# Number of RNN units
# NOTE(review): 1026 is an unusual width (1024 is the common choice) — confirm
# it is intentional; changing it alters the weight shapes of saved models.
rnn_neurons = 1026

In [None]:
def sparse_cat_loss(y_true, y_pred):
    """Sparse categorical cross-entropy computed on raw logits.

    Args:
        y_true: Integer class ids (the target character ids).
        y_pred: Model outputs; treated as unnormalised logits because
            ``from_logits=True`` is passed.

    Returns:
        Whatever ``sparse_categorical_crossentropy`` returns for the inputs.
    """
    return sparse_categorical_crossentropy(
        y_true,
        y_pred,
        from_logits=True,
    )

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import InputLayer, Embedding, GRU, Dense

def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    """Build and compile the character-level GRU model.

    The network is: fixed-batch-size input -> embedding -> stateful GRU that
    returns the full sequence -> dense layer with one output per vocabulary
    entry. It is compiled with the Adam optimizer and ``sparse_cat_loss``.

    Args:
        vocab_size: Number of distinct character ids; used as the embedding's
            input dimension and as the width of the output layer.
        embed_dim: Size of each character embedding vector.
        rnn_neurons: Number of units in the GRU layer.
        batch_size: Fixed batch size baked into the input shape; the sequence
            length is left unspecified (``None``).

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        # The batch size is fixed in the input shape because the GRU below
        # is stateful.
        InputLayer(batch_shape=(batch_size, None)),
        Embedding(input_dim=vocab_size, output_dim=embed_dim),
        GRU(
            rnn_neurons,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        ),
        # No activation is passed here; the loss is configured for logits.
        Dense(vocab_size),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss=sparse_cat_loss)
    return model

# Build the training model with the hyperparameters defined earlier and print
# a per-layer overview of it.
model = create_model(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    rnn_neurons=rnn_neurons,
    batch_size=batch_size,
)
model.summary()


## Instance of the Model

In [None]:
# Create the model instance used for training. This rebinds `model`, so any
# model object previously stored under that name is replaced by a new one.
model = create_model(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    rnn_neurons=rnn_neurons,
    batch_size=batch_size
)


In [None]:
# TensorBoard
# Directory that receives the training logs.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# and is writable in the environment where the notebook runs.
log_dir = "/phoenix/tensorboard/tensorlogs"
# Callback that writes logs to log_dir during training; histogram_freq=1
# requests histogram computation every epoch.
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

## Training the model

In [None]:
# Sanity check: push a single batch through the model and inspect the shape of
# what comes out.
for input_example_batch, target_example_batch in dataset.take(1):

    # Forward pass on one batch taken from the dataset
    example_batch_predictions = model(input_example_batch)

    # Show the output dimensions next to their expected meaning
    print(example_batch_predictions.shape, " <=== (batch_size, sequence_length, vocab_size)")

In [None]:
# Draw one character id per time step from the logits of the first sequence
# in the example batch.
sampled_indices = tf.random.categorical(
    example_batch_predictions[0],
    num_samples=1,
)
# Drop the trailing length-1 axis (one sample per step) so the result is a
# flat NumPy array of ids rather than a column of single-element rows.
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [None]:
# Number of passes over the training dataset.
epochs = 20
# Train the model on the batched dataset, logging through the TensorBoard
# callback.
model.fit(dataset,epochs=epochs, callbacks=[tensorboard_callback])

## Saving the Model

In [None]:
# File name used when saving the model and when loading its weights back.
model_name = 'tf_rnn_model.h5'

In [None]:
# Persist the trained model to the relative `models/` directory.
model.save(f'models/{model_name}') 

## Load Model

In [None]:
# Rebuild the same architecture with a batch size of 1, so a single seed
# sequence can be fed to the model at generation time.
model = create_model(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    rnn_neurons=rnn_neurons,
    batch_size=1,
)

# Copy the trained weights from the saved file into the new model.
model.load_weights(f'models/{model_name}')

# Build for one sequence of unspecified length.
model.build(tf.TensorShape([1, None]))

## Generating Predictions

In [None]:
def generate_text(model, start_seed="The ", gen_size=100, temp=1.0):
    """Generate text one character at a time, starting from a seed string.

    The encoded seed is fed to the model once; afterwards only the most
    recently sampled character is fed back in. This relies on ``model``
    keeping its recurrent state between calls (a stateful RNN accepting a
    batch of one sequence). The state is NOT reset here, so reset it before
    calling if the output must not depend on earlier calls.

    Args:
        model: Callable mapping a ``(1, time_steps)`` batch of character ids
            to logits of shape ``(1, time_steps, vocab_size)``.
        start_seed: Non-empty string that starts the generated text. Every
            character must be a key of ``char_to_int``.
        gen_size: Number of characters to generate after the seed.
        temp: Sampling temperature, must be greater than 0. The logits are
            divided by it before sampling, so values below 1 sharpen the
            distribution and values above 1 flatten it.

    Returns:
        ``start_seed`` followed by the ``gen_size`` generated characters.

    Raises:
        ValueError: If ``start_seed`` is empty or ``temp`` is not positive.
        KeyError: If ``start_seed`` contains a character that is missing from
            ``char_to_int``.
    """
    # Fail early with a clear message: an empty seed gives the model nothing
    # to predict from, and a non-positive temperature would produce invalid
    # (inf/NaN or sign-flipped) logits.
    if not start_seed:
        raise ValueError("start_seed must contain at least one character")
    if temp <= 0:
        raise ValueError("temp must be greater than 0")

    # Encode the seed and add the batch axis: shape (1, len(start_seed)).
    input_eval = tf.expand_dims([char_to_int[s] for s in start_seed], 0)
    text_generated = []

    for _ in range(gen_size):
        predictions = model(input_eval)
        # Remove the batch axis -> (time_steps, vocab_size).
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temp
        # Sample one id per time step and keep the one for the last step.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        # Only the new character is fed next; earlier context lives in the
        # model's recurrent state.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(int_to_char[predicted_id])

    return start_seed + ''.join(text_generated)


#### Resetting the model's recurrent states before generating text

In [None]:
# Clear the recurrent state of every layer that keeps one, so generation does
# not start from state left behind by earlier calls to the model.
for layer in model.layers:
    # Layers without a reset_states method have nothing to clear.
    if not hasattr(layer, 'reset_states'):
        continue
    layer.reset_states()


#### Generating a text with 1000 chars starting with word 'Love'

In [None]:
# Generate 1000 characters continuing from the seed "Love " (default
# temperature) and print the seed together with the generated text.
print(generate_text(model, start_seed="Love ", gen_size=1000))