In [1]:
# loading libraries for data manipulation
import numpy as np
import pandas as pd

# loading libraries for data visualization
import matplotlib.pyplot as plt
from plotnine import *
from PIL import Image

# import tensorflow and keras packages
import tensorflow as tf
from tensorflow import keras

# let's also include different Models, Layers directly from keras
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense,Dropout,LSTM,Embedding,Input,GRU

# use requests package to download some text
import requests

import warnings
warnings.filterwarnings('ignore')

In [2]:
# url to Romeo and Juliet in text form
url = "https://gutenberg.org/cache/epub/1513/pg1513.txt"
text = requests.get(url).text

# clean text
text = text[text.find("Chapter I.]")+10:text.find("*** END OF THE PROJECT")] # exclude metadata
text = text.lower()
print(f"Length of text: {len(text)} characters")

Length of text: 148586 characters


In [3]:
# identify unique words in text
words = text.split()
print(f"Total words: {len(words)}")

Total words: 26093


In [4]:
# generate the two dictionaries
vocab = sorted(set(words))
print(f"Unique words: {len(vocab)}")

word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for i, w in enumerate(vocab)}

Unique words: 5775


In [7]:
text_as_int = np.array([word2idx[w] for w in words], dtype=np.int32)
print("First 20 encoded words:", text_as_int[:20])

First 20 encoded words: [1507 2120 1504 3446 4060  220 2579 4931 1504 2511 1842 4869 5246 3446
  245  248 2468 4869 5196 4609]


In [8]:
seq_length = 30
examples_per_epoch = len(text_as_int) // (seq_length + 1)
print(f"Number of sequences: {examples_per_epoch}")

Number of sequences: 841


In [9]:
word_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = word_dataset.batch(seq_length + 1, drop_remainder=True)

In [9]:
# print the first words characters in the data
for i, item in enumerate(word_dataset.take(10)):
    print(item.numpy())

# print the first sequence
for i, item in enumerate(sequences.take(1)):
    print(item.numpy())

1507
2120
1504
3446
4060
220
2579
4931
1504
2511
[1507 2120 1504 3446 4060  220 2579 4931 1504 2511 1842 4869 5246 3446
  245  248 2468 4869 5196 4609  220]


In [10]:
#   input_text (first 30 chars)
#   target_text (the next 30 chars, shifted by one position)
def split_input_target(chunk):
    input_seq = chunk[:-1]
    target_seq = chunk[1:]
    return input_seq, target_seq

# apply the function to sequences
dataset = sequences.map(split_input_target)

In [11]:
for input_example, target_example in dataset.take(1):
    print("Input shape:", input_example.shape)
    print("Target shape:", target_example.shape)
    print("First input example (as IDs):", input_example[0].numpy())
    print("First target example (as IDs):", target_example[0].numpy())

Input shape: (20,)
Target shape: (20,)
First input example (as IDs): 1507
First target example (as IDs): 2120


In [12]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [15]:
# define hyperparameters for the network
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 512

model = Sequential([
    Input(shape=(None,)),
    Embedding(vocab_size, embedding_dim),
    LSTM(rnn_units, return_sequences=True),
    LSTM(rnn_units, return_sequences=True),   # new layer
    Dropout(0.2),
    Dense(vocab_size)
])

model.compile(
    optimizer='adam',
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True)
)

In [16]:
# train model
history = model.fit(dataset, epochs=20,verbose=1)


Epoch 1/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 3s/step - loss: 8.3786
Epoch 2/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 3s/step - loss: 7.1990
Epoch 3/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 3s/step - loss: 7.0193
Epoch 4/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3s/step - loss: 6.9842
Epoch 5/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 3s/step - loss: 6.9305
Epoch 6/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 3s/step - loss: 6.8900
Epoch 7/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3s/step - loss: 6.7877
Epoch 8/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3s/step - loss: 6.7198
Epoch 9/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 3s/step - loss: 6.6780
Epoch 10/20
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 3s/step - loss: 6.6123
Epoch 11/

In [17]:
model.summary()

In [18]:
def generate_text(model, start_seq, num_generate=50, temperature=1.0):
    # Tokenize the starting sequence into words
    input_eval = [word2idx.get(w, 0) for w in start_seq.lower().split()]
    input_eval = tf.expand_dims(input_eval, 0)

    generated_words = []

    for _ in range(num_generate):
        predictions = model.predict(input_eval, verbose=0)
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature

        predicted_id = tf.random.categorical(predictions[-1:], num_samples=1)[0, 0].numpy()

        input_eval = tf.expand_dims([predicted_id], 0)
        generated_words.append(idx2word[predicted_id])

    return start_seq + ' ' + ' '.join(generated_words)

In [19]:
generate_text(model, "romeo", 40, temperature=0.8)

'romeo meanest this foot bride! earthen pardon compare and men. romeo. that eye; and scene case mov’d? vitae. county or gone! than in make how and company: o’clock than sir. stick beats satisfied. sin, eyes, her child than will bed read'

In [20]:
output = generate_text(model, "juliet", num_generate=1000, temperature=0.5)
output = output.split(".")
for sentence in output:
    print(sentence)

juliet to a lawrence
 and thou art, lets dovehouse beauties: or the father highway doves peace, nurse
 and montague
 and this mother romeo
 the sentence course; full for and and the will the lawrence
 the bed
 report
 grudge the hand
 the lawrence
 the hurt and my minute in my lawrence
 the man and a door, clouds, gentler virtue and in and i i will the tomb; leg taker the man fares that in the hall, youthful good lawrence
 and for to a errand
 augmenting and a will and the man with and be an old lady and and am nurse
 have thou lawrence
 this will the man and and and the lawrence
 with i are and the much
 the my lawrence
 heaven for with love, and romeo, her a hath and my night, the man of the lawrence
 the letter’s that the lawrence
 and the thing gracious judgment and with to these love juliet, of and and in have this traces, robes brain
 that in and to my mother?’ ancestors i a man i have a fair flesh
 the sentence chinks
 fish and thou to a foul nuptial dried and the time her fearf