#### 1. Importing regular python libraries

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize,sent_tokenize
import pandas as pd
import string

#### 2. Get the Data

In [2]:
!dir data

 Volume in drive C has no label.
 Volume Serial Number is 9CDE-5548

 Directory of C:\Users\User\Desktop\CHHABI\NLP\Next_Word_Generator\data

02/06/2025  11:12 AM    <DIR>          .
02/12/2025  11:14 PM    <DIR>          ..
12/13/2023  03:33 PM           170,568 next_word_predictor.txt
               1 File(s)        170,568 bytes
               2 Dir(s)  91,084,763,136 bytes free


In [3]:
with open("data/next_word_predictor.txt","r") as f:
    data = f.readlines()

#### 2.1 Tokenize the words and Assign each unique word with index

In [4]:
main_data = " ".join(i.replace("\n"," ").strip() for i in data)

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([main_data])
tokens_index = tokenizer.word_index
len(tokenizer.word_index)

4993

#### 2.2 Generate the sentence tokens and convert text or sentences to sequences

In [6]:
# ##### first way
# sentence_tokens = sent_tokenize(main_data)
# sentence_tokens

def text_to_sequence(input_text,tokens_index):
    return [tokens_index[word] for word in input_text.split() if word in tokens_index]

### second way
input_sequence = []
for sentence in main_data.split('.'):
    sentence_token = text_to_sequence(sentence , tokens_index)   #### now replace all the words of the sentence with the token index
    for i in range(1,len(sentence_token)):  ##### 2.3 Now convert the data into input and output form like n-grams
        input_sequence.append(sentence_token[:i+1])

In [7]:
input_sequence

[[155, 21],
 [155, 21, 2368],
 [155, 21, 2368, 1549],
 [155, 21, 2368, 1549, 8],
 [155, 21, 2368, 1549, 8, 1],
 [155, 21, 2368, 1549, 8, 1, 422],
 [155, 21, 2368, 1549, 8, 1, 422, 692],
 [155, 21, 2368, 1549, 8, 1, 422, 692, 2],
 [155, 21, 2368, 1549, 8, 1, 422, 692, 2, 3],
 [155, 21, 2368, 1549, 8, 1, 422, 692, 2, 3, 2369],
 [155, 21, 2368, 1549, 8, 1, 422, 692, 2, 3, 2369, 1550],
 [155, 21, 2368, 1549, 8, 1, 422, 692, 2, 3, 2369, 1550, 2370],
 [155, 21, 2368, 1549, 8, 1, 422, 692, 2, 3, 2369, 1550, 2370, 1],
 [155, 21, 2368, 1549, 8, 1, 422, 692, 2, 3, 2369, 1550, 2370, 1, 423],
 [155, 21, 2368, 1549, 8, 1, 422, 692, 2, 3, 2369, 1550, 2370, 1, 423, 4],
 [155, 21, 2368, 1549, 8, 1, 422, 692, 2, 3, 2369, 1550, 2370, 1, 423, 4, 1],
 [155,
  21,
  2368,
  1549,
  8,
  1,
  422,
  692,
  2,
  3,
  2369,
  1550,
  2370,
  1,
  423,
  4,
  1,
  1142],
 [155,
  21,
  2368,
  1549,
  8,
  1,
  422,
  692,
  2,
  3,
  2369,
  1550,
  2370,
  1,
  423,
  4,
  1,
  1142,
  491],
 [66, 47],
 [66,

#### 2.3 In Machine Learning the input data size should be equal so we need to add padding to the each input_sequence to make the equal size

In [8]:
#### For this we need to the maximum length of the input sequence
max_len = max([len(sequence) for sequence in input_sequence])

In [9]:
max_len

53

In [10]:
import tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequence = pad_sequences(input_sequence, maxlen = max_len, padding = 'pre')

In [11]:
padded_input_sequence.shape

(17371, 53)

In [12]:
padded_input_sequence

array([[   0,    0,    0, ...,    0,  155,   21],
       [   0,    0,    0, ...,  155,   21, 2368],
       [   0,    0,    0, ...,   21, 2368, 1549],
       ...,
       [   0,    0,    0, ..., 2331,  290,   19],
       [   0,    0,    0, ...,  290,   19,   54],
       [   0,    0,    0, ...,   19,   54, 1535]], dtype=int32)

##### 2.3.1 Split the data into input and output

In [13]:
X = padded_input_sequence[:, : -1]

In [14]:
y = padded_input_sequence[:,-1]

##### Since our text generator model is multi class classifier we convert the y (output) to the One Hot Encoder

In [15]:
y.shape

(17371,)

In [16]:
X.shape

(17371, 52)

In [17]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y, num_classes=4994)

In [18]:
y.shape

(17371, 4994)

#### 3. Now Create the Model Architecture

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [32]:
model = Sequential()
model.add(Embedding(4994, 100, input_shape=(52,)))  # Define input shape explicitly
model.add(LSTM(150))
model.add(Dense(4994, activation='softmax'))

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [33]:
model.summary()

### Tomorrow write the model in pytorch and rewatch the youtube video and understands the model architecture