In [1]:
# loading libraries for data manipulation
import numpy as np
import pandas as pd

# loading libraries for data visualization
import matplotlib.pyplot as plt
from plotnine import *
from PIL import Image

# import tensorflow and keras packages
import tensorflow as tf
from tensorflow import keras

# let's also include different Models, Layers directly from keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,LSTM,Embedding,Input,GRU

# use requests package to download some text
import requests

import warnings
warnings.filterwarnings('ignore')

Let's train a sequential model using the text from Alice in Wonderland. Project Gutenberg website hosts text versions of many classics. Feel free to donwload another if you want to. 

In [2]:
# url to Alice in Wonderland in text form
url = "https://gutenberg.org/cache/epub/11/pg11.txt"
text = requests.get(url).text

text = text[1451:] # exclude metadata
print(f"Length of text: {len(text)} characters")

Length of text: 166223 characters


In [3]:
# let's print the first 500 characters from text 
print(text[:500])

CHAPTER I.
Down the Rabbit-Hole


Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into
the book her sister was reading, but it had no pictures or
conversations in it, “and what is the use of a book,” thought Alice
“without pictures or conversations?”

So she was considering in her own mind (as well as she could, for the
hot day made her feel very sleepy and stupid), whether the pleasure of
making a 


Models do not understand text like we do. We will need to create some mapping from text to integers to then pass that along to the model.

In [4]:
# lowercase the text
text = text.lower()

# create a list of all characters in the text
chars = sorted(set(text))

print("Characters in the text:",chars,"\n")

# we will create two look up dictionaries 
## char2idx: maps each character to a unique integer (ID)
## idx2char: maps integer IDs back to characters
char2idx = {c: i for i, c in enumerate(chars)}
idx2char = {i: c for i, c in enumerate(chars)}

print("char2idx",char2idx,"\n")
print("idx2char",idx2char)

Characters in the text: ['\n', '\r', ' ', '!', '$', '%', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ù', '—', '‘', '’', '“', '”', '•', '™'] 

char2idx {'\n': 0, '\r': 1, ' ': 2, '!': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '[': 27, ']': 28, '_': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55, 'ù': 56, '—': 57, '‘': 58, '’': 59, '“': 60, '”': 61, '•': 62, '™': 63} 

idx2char {0: '\n', 1: '\r', 2: ' ', 3: '!', 4: '$', 5: '%', 6: "'", 7: 

Now we can convert the entire text into a series of integers.

In [5]:
text_as_int = np.array([char2idx[c] for c in text], dtype=np.int32)

print("Characters in the text:",len(text_as_int))
print(text_as_int[:500])

Characters in the text: 166223
[32 37 30 45 49 34 47  2 38 12  1  0 33 44 52 43  2 49 37 34  2 47 30 31
 31 38 49 11 37 44 41 34  1  0  1  0  1  0 30 41 38 32 34  2 52 30 48  2
 31 34 36 38 43 43 38 43 36  2 49 44  2 36 34 49  2 51 34 47 54  2 49 38
 47 34 33  2 44 35  2 48 38 49 49 38 43 36  2 31 54  2 37 34 47  2 48 38
 48 49 34 47  2 44 43  2 49 37 34  1  0 31 30 43 40 10  2 30 43 33  2 44
 35  2 37 30 51 38 43 36  2 43 44 49 37 38 43 36  2 49 44  2 33 44 24  2
 44 43 32 34  2 44 47  2 49 52 38 32 34  2 48 37 34  2 37 30 33  2 45 34
 34 45 34 33  2 38 43 49 44  1  0 49 37 34  2 31 44 44 40  2 37 34 47  2
 48 38 48 49 34 47  2 52 30 48  2 47 34 30 33 38 43 36 10  2 31 50 49  2
 38 49  2 37 30 33  2 43 44  2 45 38 32 49 50 47 34 48  2 44 47  1  0 32
 44 43 51 34 47 48 30 49 38 44 43 48  2 38 43  2 38 49 10  2 60 30 43 33
  2 52 37 30 49  2 38 48  2 49 37 34  2 50 48 34  2 44 35  2 30  2 31 44
 44 40 10 61  2 49 37 44 50 36 37 49  2 30 41 38 32 34  1  0 60 52 38 49
 37 44 50 49  2 45 3

Now let's define the input for the model. This model will predict the **next character** given an input and not the next word. We will create our inputs to be 100 characters long. Think of this as a time window with 100 steps. 

In [6]:
# sequence length
seq_length = 100

# use sequence length to calculate number of sequences we can produce
examples_per_epoch = len(text_as_int) // (seq_length + 1)

print(examples_per_epoch,"sequences in the input")

1645 sequences in the input


Next, we will use tensorflow's from_tensor_slices function to create a stream of sequences. 


Text: "alice in wonderland"


↓


Integer IDs: [1, 12, 9, 3, 5, ...]


↓


Dataset from tensor slices:


  [1] → [12] → [9] → [3] → [5] ...

In [8]:
# create a dataset where each element is a single character (integer-version)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# a sequence is then a group of these characters 
# + 1 so that we take 100 characters as input and predict the character shifted by 1
# drop_remainder to drop a sequence if it's length is below our desired length
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)


2025-11-05 17:59:10.643026: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-11-05 17:59:10.643357: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-11-05 17:59:10.643758: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-11-05 17:59:10.644306: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-11-05 17:59:10.644840: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [9]:
# print the first 10 characters in the data
for i, item in enumerate(char_dataset.take(10)):
    print(item.numpy())

# print the first sequence 
for i, item in enumerate(sequences.take(1)):
    print(item.numpy())


32
37
30
45
49
34
47
2
38
12
[32 37 30 45 49 34 47  2 38 12  1  0 33 44 52 43  2 49 37 34  2 47 30 31
 31 38 49 11 37 44 41 34  1  0  1  0  1  0 30 41 38 32 34  2 52 30 48  2
 31 34 36 38 43 43 38 43 36  2 49 44  2 36 34 49  2 51 34 47 54  2 49 38
 47 34 33  2 44 35  2 48 38 49 49 38 43 36  2 31 54  2 37 34 47  2 48 38
 48 49 34 47  2]


2025-11-05 17:59:13.046413: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-11-05 17:59:13.055873: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Next, we can define a function that creates our dataset of sequences. 

In [10]:
#   input_text (first 100 chars)
#   target_text (the next 100 chars, shifted by one position)
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

# apply the function to sequences
dataset = sequences.map(split_input_target)

In [11]:
for input_example, target_example in dataset.take(1):
    print("Input shape:", input_example.shape)
    print("Target shape:", target_example.shape)
    print("First input example (as IDs):", input_example[0].numpy())
    print("First target example (as IDs):", target_example[0].numpy())

Input shape: (100,)
Target shape: (100,)
First input example (as IDs): 32
First target example (as IDs): 37


2025-11-05 18:06:38.671094: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [12]:
BATCH_SIZE = 64 # how many sequences the model sees at once
BUFFER_SIZE = 10000 # controls randomness of shuffle

# shuffle randomly picks elements from a buffer of size 10000 - large buffer = more random shuffling
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

We have now created a dataset where each sequence is 100 characters long and the target for that sequence is also 100 characters long shifted by 1 character. We have also shuffled the input to the model to add some randomness. Note that buffer size if larger than the dataset size means an ideal situation for random selection. 

Let's also define the parameters for our network. While converting characters into integers was the first step, these integer IDs are arbitrary and do not truly represent characters and how they should be represented. The Embedding layer will allow us to learn the relationship between characters. This is much better than one-hot encoding. So as part of predicting a sequence of characters, our model will also learn to better represent each character. 

In [13]:
# define hyperparameters for the network
vocab_size = len(chars)   # number of unique characters
embedding_dim = 256       # dimensions of character embeddings
rnn_units = 512           # LSTM hidden units

model = Sequential([
    Input(shape=(None,)), # None makes the model general to different sizes of inputs
    Embedding(vocab_size, embedding_dim), # add an Embedding layer to convert integer 
                                          #representation of characters into vector representation
    LSTM(rnn_units, return_sequences=True), # return output at each time step
    Dropout(0.2),
    Dense(vocab_size) # output is a probability distribution across all characters
])

model.compile(
    optimizer='adam',
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True) # sparse categories so apply softmax to get probability
)

In [14]:
# train model
history = model.fit(dataset, epochs=20,verbose=1)

Epoch 1/20


2025-11-05 18:14:55.000585: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 56ms/step - loss: 3.3526
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 141ms/step - loss: 2.7700
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 122ms/step - loss: 2.4588
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 127ms/step - loss: 2.3161
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 134ms/step - loss: 2.2087
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 145ms/step - loss: 2.1140
Epoch 7/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 143ms/step - loss: 2.0359
Epoch 8/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 116ms/step - loss: 1.9656
Epoch 9/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 128ms/step - loss: 1.8981
Epoch 10/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 129ms/step - loss: 1.8432
E

In [15]:
model.summary()

Once our network is trained, we can now make predictions from it. We will define a function that takes in a starting sequence and then predicts what the next character should be. That then becomes the input again and we can keep predicting the next character to build sentences. 

Temperature regulates how conservative or random the prediction should be. Predictions are the raw predictions from the model which are to passed to a softmax function to calculate probabilities. By dividing the logits with temperature, we can change the shape of the probability distribution. 
- T = 1.0: no change 
- T < 1.0: model is more predictable/confident - largest logits become more prominent
- T > 1.0: model is more random/creative - flattens distribution

In [16]:
def generate_text(model,starting_seq,num_generate=1000,temperature=1.0):
    input_eval = [char2idx[c] for c in starting_seq.lower()] # convert input chars to ints
    input_eval = tf.expand_dims(input_eval, 0)  # add batch dimension for tf

    generated_text = [] # here we will store the predicted characters

    for i in range(num_generate):
        predictions = model.predict(input_eval, verbose=0)
        predictions = tf.squeeze(predictions, 0) # remove batch dimension

        # apply temperature
        predictions = predictions / temperature

        # get the predicted character for each time step
        # but we only need the very last predicted character (-1)
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        
        # now this predicted character becomes the new input to make the next prediction
        input_eval = tf.expand_dims([predicted_id], 0)

        generated_text.append(idx2char[predicted_id]) # add prediction to list
    
    return starting_seq + ''.join(generated_text)

In [17]:
generate_text(model,"alice ",5,1.0)

'alice wase '

In [18]:
generate_text(model,"alice ",5,5.0)

"alice !g$v'"

In [19]:
generate_text(model,"alice ",5,0.1)

'alice whe t'

In [20]:
print(generate_text(model, "alice ", num_generate=1000, temperature=0.5))

alice athere tit at st n binid ang fored alit s theshe bone thineanithe anine ge tithed he lle but s winad t s t at t t alenth ithe me ag arathe ag tingre t talin therus byot de an at mure are t lat ak t ath t t blld ast t t t whes athe t t t d t inthe wenid ory are it t he arory s the them tarouco t d the t t oront ing.
ary t ares aby arengutherary are tre in lid sere walenouthe t be ingred t t nt care t t t t d of as t ast therend whe tus the s t as t inon’s a the t t ined t t whe ang thery t tenge angre th t t t ared igrent a are sinon whe  anout t an wrare a angerere te iches s tin t id the ananothe an anous t the sed t d mathed ore t anongren s aste t s t okede ant ind _ cure t aralinanore y t t and are aged angre ale an t ong t an t t whe ctont win ant t an, t t t y it s aling are tore t alithe t _ i t tinoherere in ary t thend t d an ite wanthedeve ares or it t t withed a t t sh an by an t shed analot t an the t ait ig ble t ale wharare t blle thit _ blind ines s t t ithe an t a

Let's now train a GRU using the same setup. 

In [None]:
# define hyperparameters for the network
vocab_size = len(chars)   # number of unique characters
embedding_dim = 256       # dimensions of character embeddings
rnn_units = 512           # LSTM hidden units

# the dropout here is within the GRU layer call
# dropout will randomly dropinput features at each time step
# recurrent_dropout will randomly drop hidden states from being passed to the next one
model = Sequential([
    Input(shape=(None,)), # None makes the model general to different sizes of inputs
    Embedding(vocab_size, embedding_dim), # add an Embedding layer to convert integer 
                                          #representation of characters into vector representation
    GRU(rnn_units, return_sequences=True,dropout=0.3,recurrent_dropout=0.3), 
    Dense(vocab_size) # output is a probability distribution across all characters
])

model.compile(
    optimizer='adam',
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True) # sparse categories so apply softmax to get probability
)

In [None]:
# train model
history = model.fit(dataset, epochs=20,verbose=1)

In [None]:
model.summary()

In [None]:
generate_text(model,"alice ",5,1.0)

In [None]:
generate_text(model,"alice ",5,5.0)

In [None]:
generate_text(model,"alice ",5,0.1)

In [None]:
print(generate_text(model, "alice ", num_generate=1000, temperature=0.5))