# Generating Text with Neural Networks

Repository Link: https://github.com/just-ahh/AI-A.github.io.git


# Getting the Data

In [1]:
import tensorflow as tf
# Download the Shakespeare corpus with Keras' download helper; get_file returns
# the local path of the downloaded file.
shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Read the whole file into a single string, in its original form and order.
# The encoding is passed explicitly so the result does not depend on the
# platform's default text encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare


The cell above downloads the data. The text contains over 1 million individual characters, including letters, numbers, punctuation and other special characters.

In [2]:
# Preview the first 80 characters of the raw text. This is data exploration only
# and is not part of the machine-learning pipeline.
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


1. This code prints the first 80 characters of the data set. The text has not been shuffled yet and includes punctuation, spaces and line breaks. The slice `[:80]` selects the characters at indices 0-79, which makes 80 characters in total.

2. The text that is printed is from Shakespeare's Coriolanus; it is the first two lines of Act 1, Scene 1. Information taken from Washington State University (https://public.wsu.edu/~delahoyd/shakespeare/coriolanus1.html#:~:text=%22Before%20we%20proceed%20any%20further,being%20warehoused%20at%20unfair%20prices)

# Preparing the Data

In [3]:
# Character-level tokenizer: split="character" turns every character into its
# own token, and standardize="lower" lower-cases the text first, so the
# vocabulary contains no capital letters.
text_vec_layer = tf.keras.layers.TextVectorization(
    standardize="lower",
    split="character",
)

# Build the vocabulary from the full corpus.
text_vec_layer.adapt([shakespeare_text])
# Encode the corpus as integer token IDs. The input is a batch holding a single
# string, so [0] selects the encoded sequence for that string.
encoded = text_vec_layer([shakespeare_text])[0]

Below is the printed output of the layer, used to check the shape of the encoded data, the integer IDs assigned to the characters, and the number of characters encoded.

In [4]:
# Show the encoded tensor (IDs, shape and dtype). Note that this runs the
# vectorization layer over the whole text again just for display.
print(text_vec_layer([shakespeare_text]))

tf.Tensor([[21  7 10 ... 22 28 12]], shape=(1, 1115394), dtype=int64)


In [5]:
# Drop tokens 0 (pad) and 1 (unknown), which we will not use, by shifting every
# ID down by 2. The shifted IDs are derived directly from the vectorizer output
# rather than with `encoded -= 2`, so re-running this cell gives the same
# result instead of subtracting 2 a second time.
encoded = text_vec_layer([shakespeare_text])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2  # number of distinct chars = 39
dataset_size = len(encoded)  # total number of chars = 1,115,394

The code below prints the number of unique characters (39) and the total number of characters available (1,115,394).

In [6]:
# Print the number of distinct character tokens and the total number of encoded characters.
print(n_tokens, dataset_size)

39 1115394


In [7]:
#Moving the text to a dataset that can train the model.

def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn one long token sequence into batches of (input, target) windows.

    Every window holds ``length + 1`` consecutive tokens and consecutive windows
    start one token apart. Each batch of windows is split into the first
    ``length`` tokens (inputs) and the same tokens shifted one step ahead
    (targets), so the model learns to predict the next token at every position.

    Args:
        sequence: The token IDs to slice into windows (in this notebook, a
            slice of the 1-D ``encoded`` tensor).
        length: Number of input tokens per window.
        shuffle: Whether to shuffle the windows before batching.
        seed: Seed passed to the shuffle step; only used when ``shuffle`` is true.
        batch_size: Number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` batches.
    """
    window_size = length + 1  # one extra token so the targets can be shifted by one

    def split_input_target(batch):
        # Inputs drop the last token; targets drop the first one.
        return batch[:, :-1], batch[:, 1:]

    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=1, drop_remainder=True)
    # window() produces a dataset of datasets; batching each inner dataset to
    # its full size flattens it into one tensor per window.
    windows = windows.flat_map(lambda inner: inner.batch(window_size))
    if shuffle:
        windows = windows.shuffle(100_000, seed=seed)
    batched = windows.batch(batch_size)
    return batched.map(split_input_target).prefetch(1)

In [8]:
# Split the encoded text by position into training, validation and test sets:
#   - training:   characters before 1,000,000 (shuffled)
#   - validation: characters 1,000,000 to 1,060,000
#   - test:       characters 1,060,000 to the end (1,115,394)
# Neither the validation nor the test set is shuffled.
length = 100  # number of input characters in each window
train_end = 1_000_000
valid_end = 1_060_000

tf.random.set_seed(42)
train_set = to_dataset(encoded[:train_end], length=length, shuffle=True,
                       seed=42)
valid_set = to_dataset(encoded[train_end:valid_end], length=length)
test_set = to_dataset(encoded[valid_end:], length=length)

# Building and Training the Model

In [9]:
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU

# Model structure:
#   - Embedding: maps each of the n_tokens character IDs to a 16-dimensional vector
#   - GRU: 128 units, returning an output at every time step
#   - Dense: softmax over the n_tokens possible next characters at every time step
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16))
model.add(tf.keras.layers.GRU(128, return_sequences=True))
model.add(tf.keras.layers.Dense(n_tokens, activation="softmax"))

# Compile the model (not the data): the targets are integer character IDs, hence
# the sparse categorical cross-entropy loss; accuracy is tracked as a metric.
model.compile(optimizer="nadam", loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

# Save the model only when the validation accuracy improves.
# NOTE(review): the extension-less path matches the logged run, which wrote a
# SavedModel directory; newer Keras releases may require a ".keras" suffix — confirm.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    filepath="my_shakespeare_model",
    monitor="val_accuracy",
    save_best_only=True,
)

# Train for 10 epochs on the training set, evaluating on the validation set
# after each epoch.
history = model.fit(
    train_set,
    validation_data=valid_set,
    epochs=10,
    callbacks=[model_ckpt],
)

Epoch 1/10
  31247/Unknown - 2174s 69ms/step - loss: 1.3902 - accuracy: 0.5744



INFO:tensorflow:Assets written to: my_shakespeare_model\assets


INFO:tensorflow:Assets written to: my_shakespeare_model\assets


Epoch 2/10



INFO:tensorflow:Assets written to: my_shakespeare_model\assets


INFO:tensorflow:Assets written to: my_shakespeare_model\assets


Epoch 3/10
Epoch 4/10



INFO:tensorflow:Assets written to: my_shakespeare_model\assets


INFO:tensorflow:Assets written to: my_shakespeare_model\assets


Epoch 5/10



INFO:tensorflow:Assets written to: my_shakespeare_model\assets


INFO:tensorflow:Assets written to: my_shakespeare_model\assets


Epoch 6/10



INFO:tensorflow:Assets written to: my_shakespeare_model\assets


INFO:tensorflow:Assets written to: my_shakespeare_model\assets


Epoch 7/10



INFO:tensorflow:Assets written to: my_shakespeare_model\assets


INFO:tensorflow:Assets written to: my_shakespeare_model\assets


Epoch 8/10



INFO:tensorflow:Assets written to: my_shakespeare_model\assets


INFO:tensorflow:Assets written to: my_shakespeare_model\assets


Epoch 9/10



INFO:tensorflow:Assets written to: my_shakespeare_model\assets


INFO:tensorflow:Assets written to: my_shakespeare_model\assets


Epoch 10/10



INFO:tensorflow:Assets written to: my_shakespeare_model\assets


INFO:tensorflow:Assets written to: my_shakespeare_model\assets




Each epoch took around 40 minutes; the total time to train was about six and a half hours.
Both the training accuracy and the validation accuracy increased over the run, and the loss decreased as the model trained.
The cell below wraps the text vectorization layer, the token-ID shift and the trained model into a single model that takes raw text as input.

In [10]:
# End-to-end model that accepts raw strings: vectorize the text, shift the IDs
# down by 2 (the same shift applied to the training data), then run the trained
# character model.
shakespeare_model = tf.keras.Sequential()
shakespeare_model.add(text_vec_layer)
shakespeare_model.add(tf.keras.layers.Lambda(lambda X: X - 2))  # no <PAD> or <UNK> tokens
shakespeare_model.add(model)

# Generating Text

In [11]:
# Predict next-character probabilities for the prompt. [0, -1] keeps the first
# (and only) sequence in the batch and its last time step, i.e. the
# distribution for the character that follows the prompt.
y_proba = shakespeare_model.predict(["To be or not to b"])[0, -1]
y_pred = tf.argmax(y_proba)  # choose the most probable character ID
# +2 undoes the ID shift applied before the model, mapping the ID back to an
# index in the vectorizer's vocabulary.
text_vec_layer.get_vocabulary()[y_pred + 2]



'e'

The cell above feeds the model a sample prompt and uses it to predict what the next letter will be. The phrase comes from Hamlet, Act 3, Scene 1. (Info at: https://www.rsc.org.uk/shakespeare-learning-zone/hamlet/language/to-be-or-not-to-be#:~:text=To%20be%20or%20not%20to%20be&text=Hamlet%20is%20in%20a%20state,or%20end%20them%20by%20dying )



In [12]:
# Toy demonstration of random sampling: tf.random.categorical is given
# log-probabilities and returns randomly drawn class indices (0, 1 or 2 here).
log_probas = tf.math.log([[0.5, 0.4, 0.1]])  # probas = 50%, 40%, and 10%
tf.random.set_seed(42)
tf.random.categorical(log_probas, num_samples=8)  # draw 8 samples

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[0, 1, 0, 2, 1, 0, 0, 1]], dtype=int64)>

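The cell above draws 8 random samples from a distribution over three classes with probabilities 50%, 40% and 10%. Nothing is shuffled; each sample is a class index (0, 1 or 2), and the more probable classes are drawn more often.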

In [13]:
def next_char(text, temperature=1):
    """Sample the next character for ``text`` from the model's prediction.

    Args:
        text: The prompt string passed to ``shakespeare_model``.
        temperature: Positive number that divides the log-probabilities before
            sampling. Values below 1 favour the most likely characters; values
            above 1 make the choice more random.

    Returns:
        The sampled character, looked up in the vectorizer's vocabulary.

    Raises:
        ValueError: If ``temperature`` is not greater than zero. Dividing by
            zero would produce infinite/NaN logits, and a negative value would
            invert the distribution.
    """
    # `not (t > 0)` also rejects NaN, which compares false with everything.
    if not temperature > 0:
        raise ValueError(f"temperature must be positive, got {temperature!r}")
    # [0, -1:] keeps the last time step of the only sequence while retaining a
    # leading axis of size 1, giving the 2-D input the sampler is called with.
    # verbose=0 avoids printing one progress line per generated character.
    y_proba = shakespeare_model.predict([text], verbose=0)[0, -1:]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
    # +2 undoes the ID shift applied before the model.
    return text_vec_layer.get_vocabulary()[char_id + 2]

In [14]:
def extend_text(text, n_chars=50, temperature=1):
    """Extend ``text`` by ``n_chars`` generated characters (50 by default).

    Characters are generated one at a time: each new character is sampled with
    ``next_char`` from the text produced so far and then appended to it.

    Args:
        text: The starting prompt.
        n_chars: How many characters to append.
        temperature: Sampling temperature forwarded to ``next_char``.

    Returns:
        The prompt followed by the generated characters.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated

In [15]:
# Reset TensorFlow's global random seed before generating text by random sampling.
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU

In [16]:
# Very low temperature: dividing the log-probabilities by 0.01 makes the most
# likely character dominate, so the generated text is the "safest" continuation.
print(extend_text("To be or not to be", temperature=0.01))

To be or not to be a strange
and so much shall be so with his father


This creates a string of 68 characters (the 18-character prompt plus 50 generated characters), counting spaces and the line break. The output is clear and makes sense.

In [17]:
# Temperature 1 leaves the log-probabilities unscaled, so characters are sampled
# directly from the model's predicted distribution.
print(extend_text("To be or not to be", temperature=1))

To be or not to be broke a grumio!
what's my instruction own thing
a


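Raising the temperature to 1 makes the sampling more random and the output less coherent. The string is again 68 characters long (the 18-character prompt plus 50 generated characters, counting spaces and line breaks), but the sentence makes no sense, nor is it from Shakespeare's works. Yet it is spaced correctly and has mostly sensible punctuation. Note that the generated text is all lower case, because the text was lower-cased during vectorization; the only capital letter comes from the prompt.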

In [18]:
# Temperature 100 divides the log-probabilities by 100, which flattens the
# distribution so that all characters become almost equally likely.
print(extend_text("To be or not to be", temperature=100))

To be or not to bef ,mt'&o3f:ady-$
wh!nse?pws3ert--vgerdjw?c-y-ewxnj


With the temperature at 100, the output breaks down completely. The added characters are no longer spaced correctly, nor do they resemble English words. The punctuation and special characters appear everywhere they shouldn't. This defeats the model's job of predicting and generating text, but it is to be expected: at such a high temperature the rescaled probabilities are nearly uniform, so characters are picked almost at random regardless of how well the model has been trained.