# Actividad - Redes Neuronales Recurrentes
Karla González Sánchez | A01541526

Martes 7 de noviembre de 2023

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os

### Descarga de datos

In [None]:
# Open and read "darwin.txt", the text corpus to be analysed; the whole file is
# decoded as UTF-8 and loaded into memory as a single string.
# NOTE(review): the path points into a Google Drive folder under /content/drive —
# presumably Drive must be mounted in Colab before this cell runs; confirm.
with open("/content/drive/MyDrive/Colab Notebooks/NLP/darwin.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted so
# that the index later assigned to each character is deterministic.
vocab = list(set(text))
vocab.sort()

# Report the corpus length and the vocabulary that was found.
print('Longitud del texto:        {} carácteres'.format(len(text)))
print('El texto está compuesto de estos {} carácteres:'.format(len(vocab)))
print(vocab)

Longitud del texto:        971489 carácteres
El texto está compuesto de estos 106 carácteres:
['\n', '\x0c', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '°', '¾', 'Æ', '×', 'ä', 'æ', 'ë', 'ö', 'ü', 'œ', '—', '‘', '’', '“', '”', '•', '™', '√', 'ﬀ', 'ﬁ', 'ﬂ', 'ﬃ']


### Tokenización inversa del vocabulario

In [None]:
# Forward lookup table: character -> integer index (its position in the sorted vocab).
char2idx = {char: idx for idx, char in enumerate(vocab)}
# Inverse lookup: kept as a NumPy array so that a whole array of indices can be
# decoded in a single indexing step (idx2char[ids]).
idx2char = np.array(vocab)

# Print the complete character -> index mapping. Dicts preserve insertion order,
# so the entries come out in vocabulary order.
for char, idx in char2idx.items():
    print('  {:4s}: {:3d},'.format(repr(char), idx))

  '\n':   0,
  '\x0c':   1,
  ' ' :   2,
  '!' :   3,
  '"' :   4,
  '#' :   5,
  '$' :   6,
  '%' :   7,
  '&' :   8,
  "'" :   9,
  '(' :  10,
  ')' :  11,
  '*' :  12,
  ',' :  13,
  '-' :  14,
  '.' :  15,
  '/' :  16,
  '0' :  17,
  '1' :  18,
  '2' :  19,
  '3' :  20,
  '4' :  21,
  '5' :  22,
  '6' :  23,
  '7' :  24,
  '8' :  25,
  '9' :  26,
  ':' :  27,
  ';' :  28,
  '?' :  29,
  'A' :  30,
  'B' :  31,
  'C' :  32,
  'D' :  33,
  'E' :  34,
  'F' :  35,
  'G' :  36,
  'H' :  37,
  'I' :  38,
  'J' :  39,
  'K' :  40,
  'L' :  41,
  'M' :  42,
  'N' :  43,
  'O' :  44,
  'P' :  45,
  'Q' :  46,
  'R' :  47,
  'S' :  48,
  'T' :  49,
  'U' :  50,
  'V' :  51,
  'W' :  52,
  'X' :  53,
  'Y' :  54,
  'Z' :  55,
  '[' :  56,
  ']' :  57,
  'a' :  58,
  'b' :  59,
  'c' :  60,
  'd' :  61,
  'e' :  62,
  'f' :  63,
  'g' :  64,
  'h' :  65,
  'i' :  66,
  'j' :  67,
  'k' :  68,
  'l' :  69,
  'm' :  70,
  'n' :  71,
  'o' :  72,
  'p' :  73,
  'q' :  74,
  'r' :  75,
  's' :  7

### Convertir texto a números

In [None]:
# Encode the whole corpus as a NumPy array of vocabulary indices, one per character.
# A character missing from char2idx would raise KeyError; none can be missing here
# because the vocabulary was built from this same text.
text_as_int = np.array([char2idx[c] for c in text])

# Sanity check: show the first 50 characters next to their integer encoding.
print ('texto: {}'.format(repr(text[:50])))
print ('{}'.format(repr(text_as_int[:50])))

texto: 'The Project Gutenberg eBook of On the Origin of Sp'
array([49, 65, 62,  2, 45, 75, 72, 67, 62, 60, 77,  2, 36, 78, 77, 62, 71,
       59, 62, 75, 64,  2, 62, 31, 72, 72, 68,  2, 72, 63,  2, 44, 71,  2,
       77, 65, 62,  2, 44, 75, 66, 64, 66, 71,  2, 72, 63,  2, 48, 73])


### Preparar los datos

In [None]:
# Dataset that yields the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Number of characters in each model input sequence.
seq_length = 100

# Group the character stream into chunks of seq_length + 1 ids: the extra id lets
# each chunk be split into an input and a target shifted by one position.
# drop_remainder=True discards the final chunk if it is shorter than that.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [None]:
# Decode the first ten chunks back to text and print them with repr() so that
# newlines and other whitespace characters stay visible.
for chunk_ids in sequences.take(10):
    decoded_chunk = ''.join(idx2char[chunk_ids.numpy()])
    print(repr(decoded_chunk))

'The Project Gutenberg eBook of On the Origin of Species by Means of Natural Selection\n    This ebook '
'is for the use of anyone anywhere in the United States and most other parts of the world at no\n    co'
'st and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the t'
'erms\n    of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If'
' you are not\n    located in the United States, you will have to check the laws of the country where y'
'ou are located before\n    using this eBook.\n\n    Title: On the Origin of Species by Means of Natural '
'Selection\n\n         Author: Charles Darwin\n\n    Release date: September 25, 2007 [eBook #22764]\n     '
'       Most recently updated: March 12, 2021\n\n    Language: English\n\n\n    *** START OF THE PROJECT GU'
'TENBERG EBOOK ON THE ORIGIN OF SPECIES BY MEANS OF\n                                 NATURAL SELECTION'
' ***\n\n\n\n\n There are several editions of 

In [None]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element of ``chunk`` except the last; the target is every
    element except the first, so ``target[i]`` is the element that follows
    ``input[i]`` in the original chunk.

    Args:
        chunk: any sliceable sequence (here, a tensor of character ids).

    Returns:
        A tuple ``(input_text, target_text)``.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) training pair.
dataset = sequences.map(split_input_target)

In [None]:
# Decode the first (input, target) pair back to text to check that the target is
# the input shifted forward by one character.
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'The Project Gutenberg eBook of On the Origin of Species by Means of Natural Selection\n    This ebook'
Target data: 'he Project Gutenberg eBook of On the Origin of Species by Means of Natural Selection\n    This ebook '


In [None]:
# Number of sequences per training batch.
BATCH_SIZE = 64

# Number of elements held in the shuffle buffer that tf.data samples from.
BUFFER_SIZE = 10000

# Shuffle the (input, target) pairs and group them into batches. drop_remainder=True
# drops the last incomplete batch, so every batch has exactly BATCH_SIZE sequences
# (the training model is built with that fixed batch size).
# NOTE: this rebinds `dataset` from itself, so re-running the cell without
# re-running the previous ones would batch the already batched dataset again.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

print (dataset)

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>


### Construir modelo

In [None]:
# One entry per distinct character: size of the embedding table and of the output layer.
vocab_size = len(vocab)
# Dimensionality of the character embedding vectors.
embedding_dim = 256
# Number of units in the LSTM layer.
rnn_units = 1024

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Build the character-level language model: Embedding -> LSTM -> Dense.

  Args:
    vocab_size: number of distinct characters; used as the embedding input
      dimension and as the number of output scores per time step.
    embedding_dim: size of each character embedding vector.
    rnn_units: number of units in the LSTM layer.
    batch_size: fixed batch size baked into the model's input shape. The LSTM
      is created with stateful=True, so a model built for one batch size cannot
      be fed batches of another size.

  Returns:
    An uncompiled tf.keras.Sequential model. Because the LSTM returns the full
    sequence and the Dense layer has no activation, the output holds vocab_size
    raw scores (logits) for every time step.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])  # None: variable sequence length
  recurrent = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [None]:
# Training model, built for batches of BATCH_SIZE sequences. It uses the shared
# vocab_size constant (rather than recomputing len(vocab)) so that it is
# guaranteed to match the model rebuilt later for text generation, which loads
# these weights and is also built from vocab_size.
model = build_model(
  vocab_size=vocab_size,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           27136     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 106)           108650    
                                                                 
Total params: 5382762 (20.53 MB)
Trainable params: 5382762 (20.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Entrenamiento

In [None]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  Args:
    labels: integer class ids (here, the target character ids).
    logits: unnormalised scores produced by the model; from_logits=True tells
      the loss to apply the softmax itself.

  Returns:
    The loss tensor returned by tf.keras.losses.sparse_categorical_crossentropy.
  """
  per_step_loss = tf.keras.losses.sparse_categorical_crossentropy(
      labels,
      logits,
      from_logits=True)
  return per_step_loss

In [None]:
# Configure training: Adam optimizer with the logits-based loss function `loss`.
model.compile(optimizer='adam', loss=loss)

In [None]:
# Directory where the training checkpoints are written (relative to the working directory).
checkpoint_dir = './training_checkpoints'

# Checkpoint file name pattern; the "{epoch}" placeholder is filled in by Keras.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights (not the full model) during training.
# NOTE(review): no save frequency is given, so this presumably writes one
# checkpoint per epoch (the Keras default) — confirm.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
# Number of full passes over the training dataset.
EPOCHS=50
# Train the model, saving weights through the checkpoint callback; the object
# returned by fit() is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### Generación de texto

In [None]:
# Locate the most recent checkpoint written during training.
# tf.train.latest_checkpoint returns None when the directory holds no checkpoint,
# so fail early with an explicit message instead of passing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; train the model before generating text."
        .format(checkpoint_dir))

# Rebuild the same architecture with batch_size=1: the LSTM is stateful, so the
# batch size is fixed at build time and text is generated one sequence at a time.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

### Temperatura de 0.5

In [None]:
def generate_text(model, start_string, temperature=0.5, num_generate=500):
  """Generate text character by character, seeded with ``start_string``.

  Args:
    model: the trained model built with batch size 1; it is called on a
      (1, sequence_length) tensor of character ids and must expose
      reset_states().
    start_string: non-empty seed text. Every character must be a key of
      char2idx.
    temperature: positive scaling factor applied to the logits before
      sampling. Values below 1 sharpen the distribution (more predictable
      text); values above 1 flatten it (more surprising text).
    num_generate: number of characters to generate after the seed.

  Returns:
    ``start_string`` followed by the ``num_generate`` generated characters.

  Raises:
    ValueError: if ``start_string`` is empty or ``temperature`` is not positive.
    KeyError: if ``start_string`` contains a character missing from char2idx.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Encode the seed and add a batch dimension: shape (1, len(start_string)).
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)
  text_generated = []

  # Start from a clean recurrent state so earlier calls do not leak into this one.
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)

    # Drop the batch dimension: one row of logits per time step.
    predictions = tf.squeeze(predictions, 0)

    # Scale the logits, then sample one id per time step; only the sample for
    # the last time step ([-1, 0]) is the newly generated character.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Feed only the new character back in; the stateful LSTM carries the
    # context of everything seen so far.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [None]:
# Generate and print a sample seeded with the word "species".
print(generate_text(model, start_string=u"species"))

species, or of any recognisable degree of
utility of constitution, with which the great diﬃculty lies in the working of a perfectly ecies; and will then rank when the requirements of natural selection consider which is very great. If we have seen in the case of animals with simple in a few days, they could not be found on the heath. The eﬀect on the
insect at a quicker rate that only a single case, that of working
or sterile ants. How the workers or species are rendered to a large extent deﬁned and dis


In [None]:
# Generate and print a sample seeded with the word "animals".
print(generate_text(model, start_string=u"animals"))

animals, it
seems to me that
the theory of natural selection may be extended. In Europe we have the plainest evidence in great fossiliferous formations were deposited during periods of subsidence. These periods will have been ample figh in homologous organs, the other forms in the same manner; then, on the plants of the same variety may be said to have been recorded. These facts can be
explained, as follows, on the view of descent with modification.
It is commonly adapted to each other. It is a case of


### Temperatura de 0.1

In [None]:
def generate_text(model, start_string, temperature=0.1, num_generate=500):
  """Generate text character by character, seeded with ``start_string``.

  NOTE: this notebook defines generate_text more than once; running this cell
  replaces any earlier definition, and this one defaults to a temperature of 0.1.

  Args:
    model: the trained model built with batch size 1; it is called on a
      (1, sequence_length) tensor of character ids and must expose
      reset_states().
    start_string: non-empty seed text. Every character must be a key of
      char2idx.
    temperature: positive scaling factor applied to the logits before
      sampling. Values below 1 sharpen the distribution (more predictable
      text); values above 1 flatten it (more surprising text).
    num_generate: number of characters to generate after the seed.

  Returns:
    ``start_string`` followed by the ``num_generate`` generated characters.

  Raises:
    ValueError: if ``start_string`` is empty or ``temperature`` is not positive.
    KeyError: if ``start_string`` contains a character missing from char2idx.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Encode the seed and add a batch dimension: shape (1, len(start_string)).
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)
  text_generated = []

  # Start from a clean recurrent state so earlier calls do not leak into this one.
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)

    # Drop the batch dimension: one row of logits per time step.
    predictions = tf.squeeze(predictions, 0)

    # Scale the logits, then sample one id per time step; only the sample for
    # the last time step ([-1, 0]) is the newly generated character.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Feed only the new character back in; the stateful LSTM carries the
    # context of everything seen so far.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [None]:
# Generate and print a sample seeded with the word "species".
print(generate_text(model, start_string=u"species"))

species of a genus having
descended from a common parent and many individuals will have had a single species of bat having been developed into trees, &c.,—seem to me to accord better with
the view of oceanic islands, whilst the most
isolated islands possessed by the Melipona,
and in the period
muse insects in the waters of the sea. If two great regions had
been for a long period favourable conditions, is innately variable. That is, the colour was more closely related to
each other, and
with the several


In [None]:
# Generate and print a sample seeded with the word "animals".
print(generate_text(model, start_string=u"animals"))

animals, it
seems to me well enabled the same species in Switzerland, I can see no limit to this power, in
species, that mammals have not been able to migrate to
the stamens of the same species. I should never have expeared within the same class, that we might be led to look at
these facts as necessarily accumulated at wide and true time of descent. These cirripedes can act on each shoulder in comparison with the same species when self-fertilised, sometimes depends.
The practicate seems to have been fo


### Temperatura de 1

In [None]:
def generate_text(model, start_string, temperature=1.0, num_generate=500):
  """Generate text character by character, seeded with ``start_string``.

  NOTE: this notebook defines generate_text more than once; running this cell
  replaces any earlier definition, and this one defaults to a temperature of 1.

  Args:
    model: the trained model built with batch size 1; it is called on a
      (1, sequence_length) tensor of character ids and must expose
      reset_states().
    start_string: non-empty seed text. Every character must be a key of
      char2idx.
    temperature: positive scaling factor applied to the logits before
      sampling. Values below 1 sharpen the distribution (more predictable
      text); values above 1 flatten it (more surprising text). At 1 the
      logits are sampled unchanged.
    num_generate: number of characters to generate after the seed.

  Returns:
    ``start_string`` followed by the ``num_generate`` generated characters.

  Raises:
    ValueError: if ``start_string`` is empty or ``temperature`` is not positive.
    KeyError: if ``start_string`` contains a character missing from char2idx.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Encode the seed and add a batch dimension: shape (1, len(start_string)).
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)
  text_generated = []

  # Start from a clean recurrent state so earlier calls do not leak into this one.
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)

    # Drop the batch dimension: one row of logits per time step.
    predictions = tf.squeeze(predictions, 0)

    # Scale the logits, then sample one id per time step; only the sample for
    # the last time step ([-1, 0]) is the newly generated character.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Feed only the new character back in; the stateful LSTM carries the
    # context of everything seen so far.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [None]:
# Generate and print a sample seeded with the word "species".
print(generate_text(model, start_string=u"species"))

species, so that they have changed hind legs, and
even in several quarters of the world, we cannot my, make err from the waters.
Hence, we can here give only the fore-less not like the planer only with skin the female to the male mammals, from the Project Gutenberg™ electronic work, yet keep further known so little, and their
luxerens being riped in any
degree perfect and described in a few classes; a        [379]
migrate no Project Gutenberg™ e cells of the earth. But the use of any one; according to 


In [None]:
# Generate and print a sample seeded with the word "animals".
print(generate_text(model, start_string=u"animals"))

animals,
it is well known, fear to three twigming in each sub-varying group having generally
varied and were applicable to its seasons. But the importance of the cref he treather modiﬁcation and leaving ﬂoating produce hybrids.
These cases on the opposite sides of almost every quantity; and the research of the two areas, and we can understand how it is that all the forms of life, and which had subsequently better adapted to the inhabitants of other and distant sea-shell Fertility. I have         s he
w
