In [1]:
import tensorflow as tf
import numpy as np
import collections
from keras.api.layers import LSTM, Dense

In [2]:
def read_data(fname):
    """Load a text file and return its whitespace-separated tokens.

    Parameters
    ----------
    fname : str or path-like
        Path of the text file to read. It is opened in text mode with the
        platform's default encoding.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding every token in file order.
    """
    with open(fname) as handle:
        # str.split() with no argument splits on any run of whitespace,
        # newlines included, so the whole file is tokenised in one call.
        tokens = handle.read().split()
    return np.array(tokens)

In [3]:
def build_dataset(words):
    """Build word <-> id lookup tables ranked by descending word frequency.

    Parameters
    ----------
    words : iterable of hashable
        The corpus tokens (e.g. the array returned by ``read_data``).

    Returns
    -------
    tuple[dict, dict]
        ``(word2id, id2word)``. Id 0 is the most frequent word, id 1 the next
        one, and so on; words with equal counts keep first-seen order. Both
        dictionaries are empty when ``words`` is empty.
    """
    ranked = collections.Counter(words).most_common()
    word2id = {word: idx for idx, (word, _freq) in enumerate(ranked)}
    # Invert once, after the forward map is complete. Rebuilding the inverse
    # inside the loop was quadratic in the vocabulary size and left `id2word`
    # unbound (UnboundLocalError) for an empty corpus.
    id2word = {idx: word for word, idx in word2id.items()}
    return word2id, id2word

In [4]:
# Context length: number of consecutive words used to predict the next one.
timestep = 3

# Tokenise the corpus and echo it so the raw input is visible in the notebook.
data = read_data('toto.txt')
print(data)

# Vocabulary lookups in both directions (word -> id and id -> word).
w2i, i2w = build_dataset(data)
vocab_size = len(w2i)

['long' 'ago,' 'the' 'mice' 'had' 'a' 'general' 'council' 'to' 'consider'
 'what' 'measures' 'they' 'could' 'take' 'to' 'outwit' 'their' 'common'
 'enemy,' 'the' 'cat.' 'some' 'said' 'this,' 'and' 'some' 'said' 'that'
 'but' 'at' 'last' 'a' 'young' 'mouse' 'got' 'up' 'and' 'said' 'he' 'had'
 'a' 'proposal' 'to' 'make,' 'which' 'he' 'thought' 'would' 'meet' 'the'
 'case.' 'you' 'will' 'all' 'agree,' 'said' 'he,' 'that' 'our' 'chief'
 'danger' 'consists' 'in' 'the' 'sly' 'and' 'treacherous' 'manner' 'in'
 'which' 'the' 'enemy' 'approaches' 'us.' 'now,' 'if' 'we' 'could'
 'receive' 'some' 'signal' 'of' 'her' 'approach,' 'we' 'could' 'easily'
 'escape' 'from' 'her.' 'i' 'venture,' 'therefore,' 'to' 'propose' 'that'
 'a' 'small' 'bell' 'be' 'procured,' 'and' 'attached' 'by' 'a' 'ribbon'
 'round' 'the' 'neck' 'of' 'the' 'cat.' 'by' 'this' 'means' 'we' 'should'
 'always' 'know' 'when' 'she' 'was' 'about,' 'and' 'could' 'easily'
 'retire' 'while' 'she' 'was' 'in' 'the' 'neighborhood.' 'this' '

In [5]:
# Sliding-window samples: each input is `timestep` consecutive word ids and
# its label is the id of the word that immediately follows that window.
window_starts = range(len(data) - timestep)
X = [
    [w2i[word] for word in data[start:start + timestep]]
    for start in window_starts
]
Y = [w2i[data[start + timestep]] for start in window_starts]

In [10]:
# Integer-encode the whole corpus, one id per word, in reading order.
encoded_data = [w2i[token] for token in data]

# Inputs drop the final word (nothing follows it); targets start `timestep`
# words in, so Y[i] is the word right after the window that begins at X[i].
X, Y = encoded_data[:-1], encoded_data[timestep:]
X_training_np, y_training_np = np.asarray(X), np.asarray(Y)

# Keras cuts the input array into overlapping windows of length `timestep`
# and pairs the window starting at index i with y_training_np[i].
train_data = tf.keras.preprocessing.timeseries_dataset_from_array(
    X_training_np,
    y_training_np,
    sequence_length=timestep,
    sampling_rate=1,
)

In [11]:
# Next-word model: two stacked LSTMs followed by a dense layer that emits one
# unnormalised score per vocabulary word.
model = tf.keras.Sequential([
    # Declare the input explicitly. Passing `input_shape=` to the first layer
    # is deprecated in Keras 3 and triggers a UserWarning; an Input object is
    # the recommended way to fix the shape of a Sequential model.
    tf.keras.Input(shape=(timestep, 1)),
    LSTM(512, return_sequences=True),    # hand the full sequence to the next LSTM
    LSTM(512, return_sequences=False),   # keep only the last step's output
    Dense(vocab_size),                   # no activation: raw scores per word
])
model.summary()

In [12]:
# Targets are integer word ids and the final Dense layer applies no softmax,
# hence sparse categorical cross-entropy configured with from_logits=True.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
model.fit(train_data, epochs=500)

Epoch 1/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 47ms/step - accuracy: 0.0149 - loss: 4.7595  
Epoch 2/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.0529 - loss: 4.5469 
Epoch 3/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.0529 - loss: 4.5042 
Epoch 4/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.0719 - loss: 4.4661
Epoch 5/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.0856 - loss: 4.4205 
Epoch 6/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.0693 - loss: 4.3368
Epoch 7/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.0756 - loss: 4.2400
Epoch 8/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.0920 - loss: 4.1207
Epoch 9/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x2a6ad816890>

In [15]:
def encode(sent):
    """Convert a sentence into a single-sample batch of word ids.

    Parameters
    ----------
    sent : str
        Whitespace-separated words. Every word must be a key of the global
        ``w2i`` and there must be exactly ``timestep`` of them; otherwise the
        dictionary lookup or the reshape raises.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, timestep, 1)`` holding the word ids.
    """
    word_ids = np.array([w2i[word] for word in sent.split()])
    return word_ids.reshape(1, timestep, 1)
# Ask the model for the most likely next word after each three-word prompt.
for prompt in ("had a general", "a general council"):
    pred = model.predict(encode(prompt))
    # The index of the highest score is the predicted word id.
    pred_word = i2w[np.argmax(pred)]
    print(pred_word)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step
council
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
to
