In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense



In [2]:
# Toy training corpus: one short sentence per entry. The tokenizer and the
# n-gram training samples below are derived entirely from these strings.
corpus = [
    "I love deep learning",
    "I love natural language processing",
    "deep learning is fun",
    "language models are powerful",
    "I don't like icecream",
]

In [5]:
# Build the word -> integer-id vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size plus one: ids start at 1, and id 0 is used as the padding
# value by pad_sequences further down.
total_words = 1 + len(tokenizer.word_index)

In [10]:
# Expand every encoded sentence into all of its prefixes of length >= 2.
# Each prefix is one training sample: its last token is the word to predict
# and the tokens before it are the context.
input_sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(encoded) + 1)
]

input_sequences

[[1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2],
 [1, 2, 6],
 [1, 2, 6, 5],
 [1, 2, 6, 5, 7],
 [3, 4],
 [3, 4, 8],
 [3, 4, 8, 9],
 [5, 10],
 [5, 10, 11],
 [5, 10, 11, 12]]

In [11]:
# The longest n-gram fixes the common width of every sample.
max_seq_len = max(map(len, input_sequences))

# Left-pad shorter n-grams with zeros ("pre" is the pad_sequences default) so
# the target word always sits in the last column.
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding="pre")

In [14]:
# Split each padded row: everything but the final column is the context (X),
# the final column is the id of the word to predict (y).
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the targets to match the softmax output trained with
# categorical_crossentropy.
y = to_categorical(y, num_classes=total_words)


array([[ 0,  0,  0,  1],
       [ 0,  0,  1,  2],
       [ 0,  1,  2,  3],
       [ 0,  0,  0,  1],
       [ 0,  0,  1,  2],
       [ 0,  1,  2,  6],
       [ 1,  2,  6,  5],
       [ 0,  0,  0,  3],
       [ 0,  0,  3,  4],
       [ 0,  3,  4,  8],
       [ 0,  0,  0,  5],
       [ 0,  0,  5, 10],
       [ 0,  5, 10, 11]], dtype=int32)

In [15]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable across "Restart Kernel -> Run All".
tf.keras.utils.set_random_seed(42)

# Next-word classifier: embed the context tokens, summarise them with an LSTM,
# then emit a probability for every vocabulary id.
model = Sequential([
    # An explicit Input layer declares the context length (max_seq_len - 1
    # tokens). It replaces Embedding's `input_length` argument, which is
    # deprecated in Keras 3.
    tf.keras.Input(shape=(max_seq_len - 1,)),
    Embedding(input_dim=total_words, output_dim=10),
    LSTM(64),
    Dense(total_words, activation='softmax')
])
# categorical_crossentropy matches the one-hot targets built with to_categorical.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


2025-05-12 12:45:47.605537: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2025-05-12 12:45:47.605646: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2025-05-12 12:45:47.605665: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2025-05-12 12:45:47.605704: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-12 12:45:47.605724: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [17]:
# Train on the full n-gram set; `history` keeps the per-epoch loss/accuracy.
# verbose=1 prints a progress line for each of the 200 epochs.
history = model.fit(
    X,
    y,
    epochs=200,
    verbose=1,
)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [20]:
seed_text = "I hate"
next_word = 1  # number of words to generate after the seed text

for _ in range(next_word):
    # Encode the running text. Words that are not in the tokenizer vocabulary
    # (e.g. "hate" for this corpus) are dropped by texts_to_sequences, so only
    # known words condition the prediction.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Pad/truncate to the context length the model was trained on.
    token_list = pad_sequences([token_list], maxlen=max_seq_len - 1)
    predicted_probs = model.predict(token_list, verbose=0)
    predicted_index = int(np.argmax(predicted_probs, axis=-1)[0])

    # index_word is the tokenizer's id -> word mapping, so no linear scan over
    # word_index is needed. Id 0 is padding and has no entry.
    predicted_word = tokenizer.index_word.get(predicted_index)
    if predicted_word is None:
        # The text would not change, so every further step would repeat the
        # same prediction; stop generating.
        break
    seed_text += " " + predicted_word

print("Next word predictions: ", seed_text)


Next work predictions:  I love natural
