In [1]:
# source text: a short multi-sentence passage about NLP used as the corpus for every later cell.
# Each line of the literal ends with an explicit "\n" escape *and* a physical line break,
# so sentences are separated by two newlines, and continuation lines carry their leading
# indentation inside the string.
data = """Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.\n
             Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural-language generation.\n
             Natural language processing has its roots in the 1950s.\n
             Already in 1950, Alan Turing published an article titled Computing Machinery and Intelligence which proposed what is now called the Turing test as a criterion of intelligence, a task that involves the automated interpretation and generation of natural language, but at the time not articulated as a problem separate from artificial intelligence.\n
             The following is a list of some of the most commonly researched tasks in natural language processing.\n
             Some of these tasks have direct real-world applications, while others more commonly serve as subtasks that are used to aid in solving larger tasks.\n
             Though natural language processing tasks are closely intertwined, they can be subdivided into categories for convenience.\n"""


In [2]:
import nltk


In [3]:
# prepare the dataset
from nltk.corpus import stopwords

# Fetch the sentence/word tokenizer models and the stop-word list used below.
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')

# Split the corpus into sentences; each sentence is tokenized separately below.
all_sentences = nltk.sent_tokenize(data)

# Removing Stop words
# Load the stop-word list once into a set: the original re-read the list for every
# token, and a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# One list of tokens per sentence. The membership test is case-sensitive and
# punctuation tokens are not filtered, so they remain in the result.
all_words = [
    [w for w in nltk.word_tokenize(sent) if w not in english_stopwords]
    for sent in all_sentences
]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Display the per-sentence token lists that remain after stop-word removal.
all_words

[['Natural',
  'language',
  'processing',
  '(',
  'NLP',
  ')',
  'subfield',
  'linguistics',
  ',',
  'computer',
  'science',
  ',',
  'artificial',
  'intelligence',
  'concerned',
  'interactions',
  'computers',
  'human',
  'language',
  ',',
  'particular',
  'program',
  'computers',
  'process',
  'analyze',
  'large',
  'amounts',
  'natural',
  'language',
  'data',
  '.'],
 ['Challenges',
  'natural',
  'language',
  'processing',
  'frequently',
  'involve',
  'speech',
  'recognition',
  ',',
  'natural',
  'language',
  'understanding',
  ',',
  'natural-language',
  'generation',
  '.'],
 ['Natural', 'language', 'processing', 'roots', '1950s', '.'],
 ['Already',
  '1950',
  ',',
  'Alan',
  'Turing',
  'published',
  'article',
  'titled',
  'Computing',
  'Machinery',
  'Intelligence',
  'proposed',
  'called',
  'Turing',
  'test',
  'criterion',
  'intelligence',
  ',',
  'task',
  'involves',
  'automated',
  'interpretation',
  'generation',
  'natural',
  'lang

In [5]:
!pip install gensim
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=1)



In [6]:
# Nearest neighbours of 'processing' in the learned embedding space, as (word, similarity) pairs.
word2vec.wv.most_similar('processing')


[('science', 0.253825843334198),
 ('Challenges', 0.20102696120738983),
 ('linguistics', 0.19542384147644043),
 ('amounts', 0.17556427419185638),
 ('.', 0.17191971838474274),
 ('Turing', 0.15155328810214996),
 ('list', 0.1496729850769043),
 ('following', 0.14837966859340668),
 ('human', 0.14774440228939056),
 ('Computing', 0.14511805772781372)]

In [7]:
# Nearest neighbours of 'language' in the learned embedding space, as (word, similarity) pairs.
word2vec.wv.most_similar('language')

[('recognition', 0.19066877663135529),
 ('data', 0.18839259445667267),
 ('serve', 0.18468110263347626),
 ('NLP', 0.16145464777946472),
 ('roots', 0.16034404933452606),
 ('automated', 0.1600670963525772),
 ('Machinery', 0.1595340073108673),
 ('science', 0.13824574649333954),
 ('separate', 0.1350008249282837),
 ('speech', 0.12905026972293854)]

**Prediction Using LSTM**

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [9]:
# Fit a word-level tokenizer on the corpus, then map the whole text to integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]

# Determine the vocabulary size: one slot per known word plus one extra,
# since the Keras Tokenizer documents index 0 as reserved (never assigned to a word).
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 108


In [10]:
import numpy as np
# Build (current word -> next word) training pairs from every two consecutive ids.
sequences = [encoded[pos - 1:pos + 1] for pos in range(1, len(encoded))]
print('Total Sequence: %d' % len(sequences))

# Convert to a 2-column array: column 0 is the input id, column 1 the target id.
sequences = np.array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]


Total Sequence: 173


In [11]:
from keras.utils import set_random_seed

# Seed Python, NumPy and the backend so weight initialisation and training are repeatable.
set_random_seed(42)

# One-hot encode the target ids. They are taken from `sequences` instead of from `y`
# itself so that re-running this cell does not one-hot-encode an already encoded array.
y = to_categorical(sequences[:, 1], num_classes=vocab_size)

# define model: word id -> 10-d embedding -> LSTM(50) -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(vocab_size, 10))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))

# compile
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# Build the model explicitly so that summary() can report the trainable parameter
# counts before the first fit call; input_shape=(None, 1) means any batch size
# with one token id per sample.
model.build(input_shape=(None, 1))
model.summary()



In [12]:
# Train for 200 epochs; verbose=2 prints one summary line per epoch.
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200
6/6 - 5s - 760ms/step - accuracy: 0.0173 - loss: 4.6821
Epoch 2/200
6/6 - 1s - 91ms/step - accuracy: 0.0983 - loss: 4.6787
Epoch 3/200
6/6 - 0s - 53ms/step - accuracy: 0.1272 - loss: 4.6758
Epoch 4/200
6/6 - 0s - 37ms/step - accuracy: 0.1329 - loss: 4.6729
Epoch 5/200
6/6 - 0s - 48ms/step - accuracy: 0.1329 - loss: 4.6699
Epoch 6/200
6/6 - 0s - 58ms/step - accuracy: 0.1272 - loss: 4.6665
Epoch 7/200
6/6 - 0s - 76ms/step - accuracy: 0.1272 - loss: 4.6628
Epoch 8/200
6/6 - 0s - 33ms/step - accuracy: 0.1329 - loss: 4.6589
Epoch 9/200
6/6 - 0s - 28ms/step - accuracy: 0.1329 - loss: 4.6540
Epoch 10/200
6/6 - 0s - 56ms/step - accuracy: 0.1329 - loss: 4.6490
Epoch 11/200
6/6 - 0s - 53ms/step - accuracy: 0.1329 - loss: 4.6429
Epoch 12/200
6/6 - 0s - 49ms/step - accuracy: 0.1445 - loss: 4.6360
Epoch 13/200
6/6 - 0s - 42ms/step - accuracy: 0.1503 - loss: 4.6282
Epoch 14/200
6/6 - 0s - 40ms/step - accuracy: 0.1561 - loss: 4.6188
Epoch 15/200
6/6 - 0s - 59ms/step - accuracy: 0.1561 - l

<keras.src.callbacks.history.History at 0x79315ea6b2d0>

In [13]:
import numpy as np

def generate_seq(model, tokenizer, seed_text, n_words):
    """Generate up to ``n_words`` words after ``seed_text`` by greedy prediction.

    At each step the current input text is encoded with ``tokenizer``, passed to
    ``model.predict``, and the arg-max index of the prediction is mapped back to
    a word. That word is appended to the result and becomes the next input.

    Args:
        model: Object exposing ``predict(array, verbose=0)``; it is called with
            an integer array of shape (1, n_tokens).
        tokenizer: Object exposing ``texts_to_sequences`` and ``word_index``
            (a mapping of word -> integer index).
        seed_text: Text that starts the generation and prefixes the result.
        n_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, joined by single spaces.
        Generation stops early, after printing a warning, when the current
        input yields no known token or the predicted index maps to no word.
    """
    # Invert the vocabulary once, instead of scanning word_index linearly for
    # every generated word. setdefault keeps the first word seen for an index,
    # matching a first-match scan.
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        index_to_word.setdefault(index, word)

    in_text = seed_text
    generated = []

    for _ in range(n_words):
        # Encode text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]

        if not encoded:  # None of the input words is in the vocabulary
            print("Warning: Word not found in tokenizer vocabulary.")
            break

        # Reshape to a single-sample batch for the model
        encoded = np.array(encoded).reshape(1, -1)

        # Predict next word index (greedy: highest-scoring class)
        yhat = int(np.argmax(model.predict(encoded, verbose=0), axis=-1)[0])

        # Find corresponding word
        out_word = index_to_word.get(yhat)

        if not out_word:  # Stop if the index has no word attached
            print("Warning: No word found for predicted index.")
            break

        # Only the newly predicted word is fed back as the next input
        generated.append(out_word)
        in_text = out_word

    return ' '.join([seed_text] + generated)


In [15]:
# Generate 40 words starting from the seed word 'subfield' and print the result.
print(generate_seq(model, tokenizer, 'subfield', 40))

subfield of natural language processing tasks are closely intertwined they can be subdivided into categories for convenience interpretation and artificial intelligence the turing published an article titled computing machinery and artificial intelligence the turing published an article titled computing machinery and
