In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
text = open('shakespeare.txt', 'r').read()
print('Length of text: {} characters'.format(len(text)))

Length of text: 5445609 characters


In [3]:
vocab = sorted(set(text))

In [4]:
for pair in enumerate(vocab):
    print(pair) 

(0, '\n')
(1, ' ')
(2, '!')
(3, '"')
(4, '&')
(5, "'")
(6, '(')
(7, ')')
(8, ',')
(9, '-')
(10, '.')
(11, '0')
(12, '1')
(13, '2')
(14, '3')
(15, '4')
(16, '5')
(17, '6')
(18, '7')
(19, '8')
(20, '9')
(21, ':')
(22, ';')
(23, '<')
(24, '>')
(25, '?')
(26, 'A')
(27, 'B')
(28, 'C')
(29, 'D')
(30, 'E')
(31, 'F')
(32, 'G')
(33, 'H')
(34, 'I')
(35, 'J')
(36, 'K')
(37, 'L')
(38, 'M')
(39, 'N')
(40, 'O')
(41, 'P')
(42, 'Q')
(43, 'R')
(44, 'S')
(45, 'T')
(46, 'U')
(47, 'V')
(48, 'W')
(49, 'X')
(50, 'Y')
(51, 'Z')
(52, '[')
(53, ']')
(54, '_')
(55, '`')
(56, 'a')
(57, 'b')
(58, 'c')
(59, 'd')
(60, 'e')
(61, 'f')
(62, 'g')
(63, 'h')
(64, 'i')
(65, 'j')
(66, 'k')
(67, 'l')
(68, 'm')
(69, 'n')
(70, 'o')
(71, 'p')
(72, 'q')
(73, 'r')
(74, 's')
(75, 't')
(76, 'u')
(77, 'v')
(78, 'w')
(79, 'x')
(80, 'y')
(81, 'z')
(82, '|')
(83, '}')


In [5]:
char2idx = {u:i for i, u in enumerate(vocab)} # Create a mapping from unique characters to indices
idx2char = np.array(vocab)

In [6]:
encoded_text = np.array([char2idx[c] for c in text]) # Convert the entire text from characters to indices

In [7]:
print(text[:500])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bu


In [8]:
line = """
 From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,"""
print('Length of line: {} characters'.format(len(line))) # On average, a line has 40 characters, so a sequence length of 120 is reasonable


Length of line: 133 characters


In [9]:
seq_length = 120
total_num_seq = len(text) // (seq_length + 1)
print('Total number of sequences: {}'.format(total_num_seq))

Total number of sequences: 45005


In [10]:
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text) # Create a dataset from the text
for item in char_dataset.take(500):
    print(idx2char[item.numpy()])



 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1


 
 
F
r
o
m
 
f
a
i
r
e
s
t
 
c
r
e
a
t
u
r
e
s
 
w
e
 
d
e
s
i
r
e
 
i
n
c
r
e
a
s
e
,


 
 
T
h
a
t
 
t
h
e
r
e
b
y
 
b
e
a
u
t
y
'
s
 
r
o
s
e
 
m
i
g
h
t
 
n
e
v
e
r
 
d
i
e
,


 
 
B
u
t
 
a
s
 
t
h
e
 
r
i
p
e
r
 
s
h
o
u
l
d
 
b
y
 
t
i
m
e
 
d
e
c
e
a
s
e
,


 
 
H
i
s
 
t
e
n
d
e
r
 
h
e
i
r
 
m
i
g
h
t
 
b
e
a
r
 
h
i
s
 
m
e
m
o
r
y
:


 
 
B
u
t
 
t
h
o
u
 
c
o
n
t
r
a
c
t
e
d
 
t
o
 
t
h
i
n
e
 
o
w
n
 
b
r
i
g
h
t
 
e
y
e
s
,


 
 
F
e
e
d
'
s
t
 
t
h
y
 
l
i
g
h
t
'
s
 
f
l
a
m
e
 
w
i
t
h
 
s
e
l
f
-
s
u
b
s
t
a
n
t
i
a
l
 
f
u
e
l
,


 
 
M
a
k
i
n
g
 
a
 
f
a
m
i
n
e
 
w
h
e
r
e
 
a
b
u
n
d
a
n
c
e
 
l
i
e
s
,


 
 
T
h
y
 
s
e
l
f
 
t
h
y
 
f
o
e
,
 
t
o
 
t
h
y
 
s
w
e
e
t
 
s
e
l
f
 
t
o
o
 
c
r
u
e
l
:


 
 
T
h
o
u
 
t
h
a
t
 
a
r
t
 
n
o
w
 
t
h
e
 
w
o
r
l
d
'
s
 
f
r
e
s
h
 
o
r
n
a
m
e
n
t
,


 
 
A
n
d
 
o
n
l
y
 
h
e
r
a
l
d
 
t
o
 
t
h
e
 
g
a
u
d
y
 
s
p
r
i
n
g
,


 
 
W
i
t
h
i
n
 
t
h
i
n
e
 
o
w
n
 
b
u


In [11]:
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True) # Create sequences of length 121
def create_seq_targets(seq): # Create input and target sequences 
    input_txt = seq[:-1] # For instance, if the sequence is 'hello', the input sequence is 'hell' and the target sequence is 'ello'
    target_txt = seq[1:] # The target sequence is the input sequence shifted by one character
    return input_txt, target_txt

In [12]:
dataset = sequences.map(create_seq_targets) # Apply the function to the sequences
for input_txt, target_txt in dataset.take(1):
    print(input_txt.numpy())
    print(''.join(idx2char[input_txt.numpy()]))
    print('\n')
    print(target_txt.numpy())
    print(''.join(idx2char[target_txt.numpy()]))

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But


[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But 


In [13]:
batch_size = 128 # Define the batch size 

In [14]:
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True) # Shuffle the dataset and create batches
vocab_size = len(vocab) # Define the vocabulary size

In [15]:
dataset

<_BatchDataset element_spec=(TensorSpec(shape=(128, 120), dtype=tf.int64, name=None), TensorSpec(shape=(128, 120), dtype=tf.int64, name=None))>

In [16]:
vocab_size = len(vocab) # Define the vocabulary size
embed_dim = 64 # Define the embedding dimension, which is the number of dimensions for the word vectors
rnn_neurons = 1026 # Define the number of RNN neurons 

In [17]:
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [18]:
def sparse_cat_loss(y_true, y_pred): # Define the loss function
    return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)

In [19]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    model = tf.keras.Sequential()
    model.add(Embedding(vocab_size, embed_dim, input_shape=[None]))  # Usando input_shape ao invés de batch_input_shape
    model.add(GRU(rnn_neurons, return_sequences=True, recurrent_initializer='glorot_uniform'))  # Removido stateful=True
    model.add(Dense(vocab_size))
    model.compile('adam', loss=sparse_cat_loss)
    return model


In [20]:
model = create_model(vocab_size=vocab_size, 
                     embed_dim=embed_dim, 
                     rnn_neurons=rnn_neurons, 
                     batch_size=batch_size) # Create the model

  super().__init__(**kwargs)


In [21]:
model.summary() # Over 3 million parameters

In [22]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch) # Pass the input batch to the model and get the predictions
    print(example_batch_predictions.shape, '# (batch_size, sequence_length, vocab_size)')

(128, 120, 84) # (batch_size, sequence_length, vocab_size)


In [23]:
sample_indicies = tf.random.categorical(example_batch_predictions[0], num_samples=1) # Sample from the output distribution to get the predicted character

In [24]:
sample_indicies = tf.squeeze(sample_indicies, axis=-1).numpy() # Remove the last dimension
print(sample_indicies)

[70 76 73 31  6 37  0  0 47 63 59 43 46 74 16 63 30 63 76 42 56 64 44 51
 15 27 42 48 67 32  2 58  4 18 57 48 44 19 57 34 15 44 24 27 83  3 12 13
 75  3  8 71 57 22 31 16 54 23 11 43 77 82 10 36  7 14 21 72 53 81 75 46
 41 17 83 81 67 18 54 59 62 25 32  0 40  9 39 59 23 58 23 83 58 15 16 49
  9 32 39 74 10 74 55 23  1 56 42 83 10 28 14 48 69 39 57 76 10 68 38 42]


In [25]:
idx2char[sample_indicies] # Random characters, since the model has not been trained yet

array(['o', 'u', 'r', 'F', '(', 'L', '\n', '\n', 'V', 'h', 'd', 'R', 'U',
       's', '5', 'h', 'E', 'h', 'u', 'Q', 'a', 'i', 'S', 'Z', '4', 'B',
       'Q', 'W', 'l', 'G', '!', 'c', '&', '7', 'b', 'W', 'S', '8', 'b',
       'I', '4', 'S', '>', 'B', '}', '"', '1', '2', 't', '"', ',', 'p',
       'b', ';', 'F', '5', '_', '<', '0', 'R', 'v', '|', '.', 'K', ')',
       '3', ':', 'q', ']', 'z', 't', 'U', 'P', '6', '}', 'z', 'l', '7',
       '_', 'd', 'g', '?', 'G', '\n', 'O', '-', 'N', 'd', '<', 'c', '<',
       '}', 'c', '4', '5', 'X', '-', 'G', 'N', 's', '.', 's', '`', '<',
       ' ', 'a', 'Q', '}', '.', 'C', '3', 'W', 'n', 'N', 'b', 'u', '.',
       'm', 'M', 'Q'], dtype='<U1')

In [26]:
epochs = 30 # Define the number of epochs
model.fit(dataset, epochs=epochs) # Train the model

Epoch 1/30
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m374s[0m 1s/step - loss: 2.8455
Epoch 2/30
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 1s/step - loss: 1.6191
Epoch 3/30
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 1s/step - loss: 1.3757
Epoch 4/30
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m365s[0m 1s/step - loss: 1.2770
Epoch 5/30
[1m351/351[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m365s[0m 1s/step - loss: 1.2231
Epoch 6/30
[1m 57/351[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m1:08:29[0m 14s/step - loss: 1.2000

KeyboardInterrupt: 

In [None]:
def generate_text(model, start_seed, gen_size=500, temp=1.0): # Generate text using the trained model
    num_generate = gen_size
    input_eval = [char2idx[s] for s in start_seed]
    input_eval = tf.expand_dims(input_eval, 0)
    text_generated = []
    temperature = temp
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])
    return (start_seed + ''.join(text_generated))

In [None]:
print(generate_text(model, 'JULIET', gen_size=1000)) # Generate text using the trained model

ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input [[[57. 66. 57. 67.  1.  1.]]]. Expected shape (None, None), but input has incompatible shape (1, 1, 6)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(1, 1, 6), dtype=int32)
  • training=False
  • mask=None