In [17]:
# source text
data = """The Grand Canyon is a steep-sided canyon carved by the Colorado River in Arizona, United States. The Grand Canyon is 277 miles long, up to 18 miles wide and attains a depth of over a mile.

The canyon and adjacent rim are contained within Grand Canyon National Park, the Kaibab National Forest, Grand Canyon–Parashant National Monument, the Hualapai Indian Reservation, the Havasupai Indian Reservation and the Navajo Nation. President Theodore Roosevelt was a major proponent of preservation of the Grand Canyon area and visited it on numerous occasions to hunt and enjoy the scenery.

Nearly two billion years of Earth's geological history have been exposed as the Colorado River and its tributaries cut their channels through layer after layer of rock while the Colorado Plateau was uplifted. While some aspects about the history of incision of the canyon are debated by geologists, several recent studies support the hypothesis that the Colorado River established its course through the area about 5 to 6 million years ago. Since that time, the Colorado River has driven the down-cutting of the tributaries and retreat of the cliffs, simultaneously deepening and widening the canyon.

For thousands of years, the area has been continuously inhabited by Native Americans, who built settlements within the canyon and its many caves. The Pueblo people considered the Grand Canyon a holy site, and made pilgrimages to it. The first European known to have viewed the Grand Canyon was García López de Cárdenas from Spain, who arrived in 1540."""


In [18]:
import nltk


In [19]:
# prepare the dataset
nltk.download('punkt_tab')

nltk.download('punkt')
nltk.download('stopwords')
all_sentences = nltk.sent_tokenize(data)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop words
from nltk.corpus import stopwords
for i in range(len(all_words)):
  all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
all_words

[['The',
  'Grand',
  'Canyon',
  'steep-sided',
  'canyon',
  'carved',
  'Colorado',
  'River',
  'Arizona',
  ',',
  'United',
  'States',
  '.'],
 ['The',
  'Grand',
  'Canyon',
  '277',
  'miles',
  'long',
  ',',
  '18',
  'miles',
  'wide',
  'attains',
  'depth',
  'mile',
  '.'],
 ['The',
  'canyon',
  'adjacent',
  'rim',
  'contained',
  'within',
  'Grand',
  'Canyon',
  'National',
  'Park',
  ',',
  'Kaibab',
  'National',
  'Forest',
  ',',
  'Grand',
  'Canyon–Parashant',
  'National',
  'Monument',
  ',',
  'Hualapai',
  'Indian',
  'Reservation',
  ',',
  'Havasupai',
  'Indian',
  'Reservation',
  'Navajo',
  'Nation',
  '.'],
 ['President',
  'Theodore',
  'Roosevelt',
  'major',
  'proponent',
  'preservation',
  'Grand',
  'Canyon',
  'area',
  'visited',
  'numerous',
  'occasions',
  'hunt',
  'enjoy',
  'scenery',
  '.'],
 ['Nearly',
  'two',
  'billion',
  'years',
  'Earth',
  "'s",
  'geological',
  'history',
  'exposed',
  'Colorado',
  'River',
  'tributa

In [21]:
!pip install gensim
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=1)



In [24]:
word2vec.wv.most_similar('Grand')


[('Nearly', 0.3710095286369324),
 ('hunt', 0.2133701741695404),
 ('García', 0.20292851328849792),
 ('National', 0.20262862741947174),
 ('widening', 0.18730488419532776),
 ('billion', 0.18335269391536713),
 ('layer', 0.17477752268314362),
 ('Navajo', 0.17114494740962982),
 ('Colorado', 0.17063814401626587),
 ('incision', 0.1615745723247528)]

In [26]:
word2vec.wv.most_similar('incision')

[('Colorado', 0.17787475883960724),
 ('Reservation', 0.17240816354751587),
 ('Grand', 0.1615746021270752),
 ('United', 0.1582937091588974),
 ('wide', 0.15795394778251648),
 ('Park', 0.14983715116977692),
 ('viewed', 0.1467556208372116),
 ('settlements', 0.14104360342025757),
 ('Since', 0.13654309511184692),
 ('Plateau', 0.1346345990896225)]

**Prediction Using LSTM**

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [28]:
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]

# determine the vocubulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 144


In [29]:
import numpy as np
# create word -> word sequence

sequences = list()
for i in range(1,len(encoded)):
  sequence = encoded[i-1:i+1]
  sequences.append(sequence)
print('Total Sequence: %d' % len(sequences))
# split into X and y elements
sequences = np.array(sequences)
X , y = sequences[:,0] , sequences[:,1]


Total Sequence: 249


In [30]:
y = to_categorical(y, num_classes=vocab_size)
# define model
model = Sequential()
model.add(Embedding(vocab_size, 10))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))

# compile
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# force to show the tranable params without building
model.build(input_shape=(None, 1))
model.summary()



In [31]:
model.fit(X,y, epochs=200 ,verbose=2)

Epoch 1/200
8/8 - 2s - 307ms/step - accuracy: 0.0643 - loss: 4.9694
Epoch 2/200
8/8 - 0s - 9ms/step - accuracy: 0.1165 - loss: 4.9645
Epoch 3/200
8/8 - 0s - 8ms/step - accuracy: 0.1084 - loss: 4.9604
Epoch 4/200
8/8 - 0s - 9ms/step - accuracy: 0.1084 - loss: 4.9555
Epoch 5/200
8/8 - 0s - 9ms/step - accuracy: 0.1084 - loss: 4.9504
Epoch 6/200
8/8 - 0s - 18ms/step - accuracy: 0.1084 - loss: 4.9443
Epoch 7/200
8/8 - 0s - 9ms/step - accuracy: 0.1084 - loss: 4.9369
Epoch 8/200
8/8 - 0s - 8ms/step - accuracy: 0.1084 - loss: 4.9287
Epoch 9/200
8/8 - 0s - 11ms/step - accuracy: 0.1084 - loss: 4.9182
Epoch 10/200
8/8 - 0s - 16ms/step - accuracy: 0.1084 - loss: 4.9063
Epoch 11/200
8/8 - 0s - 9ms/step - accuracy: 0.1084 - loss: 4.8922
Epoch 12/200
8/8 - 0s - 17ms/step - accuracy: 0.1245 - loss: 4.8760
Epoch 13/200
8/8 - 0s - 9ms/step - accuracy: 0.1245 - loss: 4.8553
Epoch 14/200
8/8 - 0s - 8ms/step - accuracy: 0.1245 - loss: 4.8313
Epoch 15/200
8/8 - 0s - 8ms/step - accuracy: 0.1245 - loss: 4.803

<keras.src.callbacks.history.History at 0x7931564e6510>

In [32]:
import numpy as np

def generate_seq(model, tokenizer, seed_text, n_words):
    in_text, result = seed_text, seed_text

    for _ in range(n_words):
        # Encode text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]

        if not encoded:  # Check if token exists
            print("Warning: Word not found in tokenizer vocabulary.")
            break

        # Reshape for model input
        encoded = np.array(encoded).reshape(1, -1)

        # Predict next word index
        yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)[0]

        # Find corresponding word
        out_word = next((word for word, index in tokenizer.word_index.items() if index == yhat), None)

        if not out_word:  # Stop if prediction fails
            print("Warning: No word found for predicted index.")
            break

        # Append to result
        in_text, result = out_word, result + ' ' + out_word
    return result


In [33]:
print(generate_seq(model ,tokenizer, 'incision', 40))

incision of the grand canyon is 277 miles wide and its many caves the grand canyon is 277 miles wide and its many caves the grand canyon is 277 miles wide and its many caves the grand canyon is 277 miles
