# LSTM: Aplicacion para determinar la proxima palabra.
Este cuaderno esta basado en
[este video](https://www.youtube.com/watch?v=Zn22qt7j2dM).

Bajar los datos de
[Kaggle](https://www.kaggle.com/datasets/adangonzalez/sherlock-holmes-txt)






In [1]:
# import libraries
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import os

In [4]:
# mount gdrive from google
from google.colab import drive
drive.mount('/content/gdrive')



Mounted at /content/gdrive


In [5]:
!cd /root
!mkdir -p ~/.kaggle

In [6]:
# si kaggle no esa debe hacer pip install kaggle
import shutil
shutil.move("kaggle.json", "/root/.kaggle")

'/root/.kaggle/kaggle.json'

In [7]:
!chmod 600 /root/.kaggle/kaggle.json

In [8]:
!kaggle datasets download -d adangonzalez/sherlock-holmes-txt

Dataset URL: https://www.kaggle.com/datasets/adangonzalez/sherlock-holmes-txt
License(s): unknown
Downloading sherlock-holmes-txt.zip to /content
  0% 0.00/221k [00:00<?, ?B/s]
100% 221k/221k [00:00<00:00, 86.0MB/s]


In [9]:
import zipfile
import os

# replace dataset.zip with the actual name of the downloaded zip file
with zipfile.ZipFile('sherlock-holmes-txt.zip', 'r') as zip_ref:
    zip_ref.extractall("/content")


In [10]:
# open the file
file = open("Sherlock.txt", "r", encoding="utf8")
file # QC

<_io.TextIOWrapper name='Sherlock.txt' mode='r' encoding='utf8'>

In [11]:
lines = []
# read all the lines
for i in file:
    lines.append(i)

lines[:20] #QC

['The Adventures of Sherlock Holmes\n',
 '\n',
 'by Arthur Conan Doyle\n',
 '\n',
 '\n',
 'Contents\n',
 '\n',
 '   I.     A Scandal in Bohemia\n',
 '   II.    The Red-Headed League\n',
 '   III.   A Case of Identity\n',
 '   IV.    The Boscombe Valley Mystery\n',
 '   V.     The Five Orange Pips\n',
 '   VI.    The Man with the Twisted Lip\n',
 '   VII.   The Adventure of the Blue Carbuncle\n',
 '   VIII.  The Adventure of the Speckled Band\n',
 '   IX.    The Adventure of the Engineer’s Thumb\n',
 '   X.     The Adventure of the Noble Bachelor\n',
 '   XI.    The Adventure of the Beryl Coronet\n',
 '   XII.   The Adventure of the Copper Beeches\n',
 '\n']

In [12]:
data = ""
for i in lines:
    data=' '.join(lines)

data[:100] # QC

'The Adventures of Sherlock Holmes\n \n by Arthur Conan Doyle\n \n \n Contents\n \n    I.     A Scandal in B'

In [13]:
# process data, remove backslashes and other things
data1 = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data1[:1000] # QC

'The Adventures of Sherlock Holmes  by Arthur Conan Doyle   Contents     I.     A Scandal in Bohemia    II.    The Red-Headed League    III.   A Case of Identity    IV.    The Boscombe Valley Mystery    V.     The Five Orange Pips    VI.    The Man with the Twisted Lip    VII.   The Adventure of the Blue Carbuncle    VIII.  The Adventure of the Speckled Band    IX.    The Adventure of the Engineer’s Thumb    X.     The Adventure of the Noble Bachelor    XI.    The Adventure of the Beryl Coronet    XII.   The Adventure of the Copper Beeches     I. A SCANDAL IN BOHEMIA   I.  To Sherlock Holmes she is always _the_ woman. I have seldom heard him mention her under any other name. In his eyes she eclipses and predominates the whole of her sex. It was not that he felt any emotion akin to love for Irene Adler. All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. He was, I take it, the most perfect reasoning and observing machine that the wor

In [14]:
# remover espacions multiples con "split" y "join"
data2 = data1.split() # parte todas las palabras en un lista
data2[:20]

['The',
 'Adventures',
 'of',
 'Sherlock',
 'Holmes',
 'by',
 'Arthur',
 'Conan',
 'Doyle',
 'Contents',
 'I.',
 'A',
 'Scandal',
 'in',
 'Bohemia',
 'II.',
 'The',
 'Red-Headed',
 'League',
 'III.']

In [15]:
data3=' '.join(data2)
data3[:1000]

'The Adventures of Sherlock Holmes by Arthur Conan Doyle Contents I. A Scandal in Bohemia II. The Red-Headed League III. A Case of Identity IV. The Boscombe Valley Mystery V. The Five Orange Pips VI. The Man with the Twisted Lip VII. The Adventure of the Blue Carbuncle VIII. The Adventure of the Speckled Band IX. The Adventure of the Engineer’s Thumb X. The Adventure of the Noble Bachelor XI. The Adventure of the Beryl Coronet XII. The Adventure of the Copper Beeches I. A SCANDAL IN BOHEMIA I. To Sherlock Holmes she is always _the_ woman. I have seldom heard him mention her under any other name. In his eyes she eclipses and predominates the whole of her sex. It was not that he felt any emotion akin to love for Irene Adler. All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position. He 

In [16]:
# tokenize data
import itertools
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data3])
word_index = tokenizer.word_index
dict(itertools.islice(word_index.items(), 10))

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'i': 5,
 'a': 6,
 '”': 7,
 'in': 8,
 'that': 9,
 'it': 10}

In [17]:
# creamos sequencia de datos
sequence_data = tokenizer.texts_to_sequences([data3])[0]
sequence_data[:10]

[1, 1406, 4, 132, 34, 48, 698, 4604, 4605, 1844]

In [18]:
len(sequence_data)

107995

In [19]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

8643


In [20]:
# creamos secuencias de 4 palabras.
# la idea es que X es un vector de 3 palabras y "y" es la salida

In [21]:
sequence=[]
for i in range(3, len(sequence_data)):
    words=sequence_data[i-3:i+1]
    sequence.append(words)

len(sequence)

107992

In [22]:
type(sequence)

list

In [23]:
sequence = np.array(sequence)
sequence[:10]


array([[   1, 1406,    4,  132],
       [1406,    4,  132,   34],
       [   4,  132,   34,   48],
       [ 132,   34,   48,  698],
       [  34,   48,  698, 4604],
       [  48,  698, 4604, 4605],
       [ 698, 4604, 4605, 1844],
       [4604, 4605, 1844,    5],
       [4605, 1844,    5,    6],
       [1844,    5,    6,  850]])

In [24]:
# definamos los input/output pairs
X=[]
y=[]
for i in sequence:
    X.append(i[0:3])
    y.append(i[3])

X[:10] # QC

[array([   1, 1406,    4]),
 array([1406,    4,  132]),
 array([  4, 132,  34]),
 array([132,  34,  48]),
 array([ 34,  48, 698]),
 array([  48,  698, 4604]),
 array([ 698, 4604, 4605]),
 array([4604, 4605, 1844]),
 array([4605, 1844,    5]),
 array([1844,    5,    6])]

In [25]:
y[:10] # QC

[132, 34, 48, 698, 4604, 4605, 1844, 5, 6, 850]

In [26]:
type(X)

list

In [27]:
# convertir listas a arreglos
X = np.array(X)
y = np.array(y)

# convert y to categorical
y = to_categorical(y, num_classes=vocab_size)
y[0] # QC

array([0., 0., 0., ..., 0., 0., 0.])

In [28]:
len(y[0])

8643

In [29]:
max(y[0])

1.0

In [30]:
np.argmax(y[0])

132

In [31]:
y[0][132]

1.0

In [32]:
# build the LSTM model
nunits=1000
sequence_length=3
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=10, input_shape=(sequence_length,)))
model.add(LSTM(nunits, return_sequences=True))
model.add(LSTM(nunits))
# model.add(Dense(units=nunits, activation='relu'))
model.add(Dense(units=vocab_size, activation='softmax'))
model.summary()

  super().__init__(**kwargs)


In [33]:
# compile and train
from tensorflow.keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint("next_word.keras", monitor='loss', verbose=1, save_best_only=True)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))

In [34]:
model.fit(X, y, epochs=2, batch_size=16, callbacks=[checkpoint])

Epoch 1/2
[1m6750/6750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 6.5714
Epoch 1: loss improved from inf to 6.26436, saving model to next_word.keras
[1m6750/6750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 6ms/step - loss: 6.5714
Epoch 2/2
[1m6743/6750[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - loss: 5.5759
Epoch 2: loss improved from 6.26436 to 5.56826, saving model to next_word.keras
[1m6750/6750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 6ms/step - loss: 5.5759


<keras.src.callbacks.history.History at 0x7ead086b43a0>

In [41]:
# testing
def predict_next_word(model, tokenizer, text):
    seq = tokenizer.texts_to_sequences([text])[0]
    seq = np.array(seq)
    predict =  np.argmax(model.predict(seq.reshape(1, -1)))
    pred  = model.predict(seq.reshape(1,-1))[0]

    pred_word = tokenizer.index_word[np.argmax(pred)]
    return pred_word

In [47]:
predict_next_word(model, tokenizer, 'with a few')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


'minutes'

Tres redes importantes recurrentes:

1. RNN
2. LSTM
3. GRU: Gated Recurrent Inits (next class).