In [1]:
import tensorflow as tf
import numpy as np
import collections
from keras.api.layers import LSTM, Dense

In [5]:
def read_data(fname):
    """Read a text file and return its whitespace-separated tokens.

    Args:
        fname: Path of the text file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        A 1-D ``np.ndarray`` containing every token of the file in reading
        order. Blank lines contribute no tokens.
    """
    with open(fname) as source:
        # Each line is stripped and split on runs of whitespace; the tokens of
        # all lines are concatenated in order.
        tokens = [token for row in source for token in row.strip().split()]
    return np.array(tokens)

In [6]:
def build_dataset(words):
    """Build the vocabulary lookup tables for a token sequence.

    Ids are assigned in the order returned by ``Counter.most_common()``, so the
    most frequent word gets id 0, the next one id 1, and so on.

    Args:
        words: Iterable of hashable tokens (e.g. the array returned by
            ``read_data``). May be empty.

    Returns:
        A ``(word2id, id2word)`` tuple: ``word2id`` maps each distinct token to
        its integer id and ``id2word`` is the inverse mapping. Both are empty
        dicts when ``words`` is empty.
    """
    word2id = {}
    for word, _freq in collections.Counter(words).most_common():
        word2id[word] = len(word2id)
    # Build the inverse map once, after the loop: doing it per iteration is
    # quadratic and leaves the name unbound when there are no words at all.
    id2word = {idx: word for word, idx in word2id.items()}
    return word2id, id2word

In [9]:
# Load the corpus as a flat array of whitespace-separated tokens. The path is
# relative to the notebook's working directory.
data = read_data('AliceInTheDreamWorld.txt')
print(data)
# Vocabulary lookups: w2i maps token -> integer id, i2w is the inverse mapping.
w2i, i2w = build_dataset(data)
# Number of distinct tokens; later used as the width of the model's output layer.
vocab_size = len(w2i)
# Number of consecutive tokens the model sees when predicting the next one.
timestep = 3

['in' 'a' 'small' ... 'walking' 'beside' 'her.']


In [10]:
# Manual sliding-window construction: every run of `timestep` consecutive token
# ids becomes one input row, and the id of the token that follows it is the label.
# NOTE: X and Y are reassigned in the following cell before being used.
X, Y = [], []
for i in range(timestep, len(data)):
    window = data[i - timestep:i]
    X.append([w2i[word] for word in window])
    Y.append(w2i[data[i]])

In [11]:
# Replace every token of the corpus by its integer id.
encoded_data = [w2i[word] for word in data]

# timeseries_dataset_from_array pairs the window starting at index i with
# targets[i]. The window covers X[i : i + timestep], so its label must be the
# token at position i + timestep, i.e. the target series is the corpus shifted
# by `timestep`.
X = encoded_data[:-1]
Y = encoded_data[timestep:]
X_training_np = np.array(X)
y_training_np = np.array(Y)

train_data = tf.keras.preprocessing.timeseries_dataset_from_array(
    data=X_training_np,
    targets=y_training_np,
    sequence_length=timestep,
    sampling_rate=1,
)

In [12]:
# Two stacked LSTM layers followed by a linear projection onto the vocabulary.
# The input shape is declared with an explicit Input object as the first entry:
# passing `input_shape=` to a layer inside a Sequential model is deprecated in
# Keras 3 and emits a UserWarning.
model = tf.keras.Sequential([
    # Each sample is `timestep` steps with a single feature (the token id).
    tf.keras.Input(shape=(timestep, 1)),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(512, return_sequences=True),
    LSTM(512, return_sequences=False),
    # No activation: the layer outputs one raw score (logit) per vocabulary entry.
    Dense(vocab_size),
])
model.summary()

  super().__init__(**kwargs)


In [13]:
# The final Dense layer has no activation, so the loss is configured with
# from_logits=True; labels are integer token ids, hence the sparse variant.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
model.fit(train_data, epochs=500)

Epoch 1/500
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 64ms/step - accuracy: 0.0418 - loss: 6.4848
Epoch 2/500
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - accuracy: 0.0547 - loss: 5.9309
Epoch 3/500
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - accuracy: 0.0559 - loss: 5.7888
Epoch 4/500
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - accuracy: 0.0594 - loss: 5.6264
Epoch 5/500
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 0.0588 - loss: 5.4254
Epoch 6/500
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step - accuracy: 0.0671 - loss: 5.2042
Epoch 7/500
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 71ms/step - accuracy: 0.0721 - loss: 5.0014
Epoch 8/500
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - accuracy: 0.0768 - loss: 4.8137
Epoch 9/500
[1m21/21[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x274a47a7c10>

In [25]:
# Persist the trained model to 'train.keras' in the current working directory.
model.save('train.keras')

In [27]:
def encode(sent):
    """Turn a sentence into a single model input batch.

    Args:
        sent: String containing exactly ``timestep`` whitespace-separated words,
            all of which must be present in the ``w2i`` vocabulary.

    Returns:
        An integer ``np.ndarray`` of shape ``(1, timestep, 1)`` holding the
        vocabulary ids of the words.

    Raises:
        KeyError: If any word is missing from ``w2i``.
        ValueError: If the sentence does not contain exactly ``timestep`` words.
    """
    tokens = sent.split()

    # Vocabulary check first, so a sentence with both problems still reports
    # the unknown word (the same precedence as a plain dict lookup would give).
    unknown = [w for w in tokens if w not in w2i]
    if unknown:
        raise KeyError(f"words not in the training vocabulary: {unknown}")

    # Fail with an explicit message instead of an opaque reshape error.
    if len(tokens) != timestep:
        raise ValueError(
            f"expected exactly {timestep} whitespace-separated words, "
            f"got {len(tokens)}: {sent!r}"
        )

    return np.array([w2i[w] for w in tokens]).reshape(1, timestep, 1)
# Predict the word that follows a three-word prompt: the model returns one row
# of scores over the vocabulary, and the highest-scoring id is mapped back to
# its word.
pred = model.predict(encode("in a small"))
pred_word = i2w[pred.argmax()]
print(pred_word)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
village
