In [43]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [10]:
with open('next_word_predictor.txt','r') as file:
    data = file.read()


In [11]:
data

'The sun was shining brightly in the clear blue sky, and a gentle breeze rustled the leaves of the tall trees. People were out enjoying the beautiful weather, some sitting in the park, others taking a leisurely stroll along the riverbank. Children were playing games, and laughter filled the air.\n\nAs the day turned into evening, the temperature started to drop, and the sky transformed into a canvas of vibrant colors. Families gathered for picnics, and the smell of barbecues wafted through the air. It was a perfect day for a picnic by the lake.\n\nIn the distance, you could hear the sound of live music coming from a local band, and people began to gather around the stage to enjoy the performance. The atmosphere was electric, and the music had everyone swaying to the beat.\n\nAs the stars began to twinkle in the night sky, the crowd grew even larger, and the festivities continued well into the night. It was a day filled with joy, laughter, and memories that would last a lifetime.\n\n\nT

In [12]:
def punc_seperator(data):
    return [words.lower() for words in data.split(" ") if words not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [28]:
data_cleaned = punc_seperator(data)
data_cleaned = " ".join(data_cleaned)

In [29]:
data_cleaned

'the sun was shining brightly in the clear blue sky, and a gentle breeze rustled the leaves of the tall trees. people were out enjoying the beautiful weather, some sitting in the park, others taking a leisurely stroll along the riverbank. children were playing games, and laughter filled the air.\n\nas the day turned into evening, the temperature started to drop, and the sky transformed into a canvas of vibrant colors. families gathered for picnics, and the smell of barbecues wafted through the air. it was a perfect day for a picnic by the lake.\n\nin the distance, you could hear the sound of live music coming from a local band, and people began to gather around the stage to enjoy the performance. the atmosphere was electric, and the music had everyone swaying to the beat.\n\nas the stars began to twinkle in the night sky, the crowd grew even larger, and the festivities continued well into the night. it was a day filled with joy, laughter, and memories that would last a lifetime.\n\n\nt

In [30]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data_cleaned])
total_words = len(tokenizer.word_index)+1


In [31]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'i': 6,
 'you': 7,
 'in': 8,
 'is': 9,
 'monica': 10,
 'it': 11,
 'with': 12,
 'ross': 13,
 'that': 14,
 'rachel': 15,
 'for': 16,
 'chandler': 17,
 'this': 18,
 'on': 19,
 'joey': 20,
 'was': 21,
 'oh': 22,
 'phoebe': 23,
 'are': 24,
 'all': 25,
 'as': 26,
 'what': 27,
 'be': 28,
 'like': 29,
 'no': 30,
 "it's": 31,
 "i'm": 32,
 'her': 33,
 'they': 34,
 'just': 35,
 'from': 36,
 'okay': 37,
 'not': 38,
 'so': 39,
 'my': 40,
 'have': 41,
 'me': 42,
 'where': 43,
 'know': 44,
 'she': 45,
 'we': 46,
 'out': 47,
 'well': 48,
 'their': 49,
 'can': 50,
 'at': 51,
 'he': 52,
 'yeah': 53,
 'your': 54,
 'about': 55,
 'but': 56,
 'its': 57,
 'up': 58,
 "don't": 59,
 'text': 60,
 'scene': 61,
 'by': 62,
 'do': 63,
 'an': 64,
 'or': 65,
 'were': 66,
 'there': 67,
 'if': 68,
 'uh': 69,
 'look': 70,
 'life': 71,
 'through': 72,
 'into': 73,
 'him': 74,
 'his': 75,
 "you're": 76,
 'hey': 77,
 'how': 78,
 'right': 79,
 'think': 80,
 'time': 81,
 'no

In [58]:
input_sequence=[]
for word in data_cleaned.split('\n'):
    token_list = tokenizer.texts_to_sequences([word])[0]
    for i in range(1,len(token_list)):
        n_grams = token_list[:i+1]
        input_sequence.append(n_grams)


word is the sun was shining brightly in the clear blue sky, and a gentle breeze rustled the leaves of the tall trees. people were out enjoying the beautiful weather, some sitting in the park, others taking a leisurely stroll along the riverbank. children were playing games, and laughter filled the air.
[1, 155, 21, 2368, 1549, 8, 1, 422, 692, 215, 2, 3, 2369, 1550, 2370, 1, 423, 4, 1, 1142, 491, 84, 66, 47, 1143, 1, 693, 1144, 98, 575, 8, 1, 576, 492, 493, 3, 1145, 2371, 694, 1, 2372, 1146, 66, 494, 2373, 2, 368, 243, 1, 115]
word is 
[]
word is as the day turned into evening, the temperature started to drop, and the sky transformed into a canvas of vibrant colors. families gathered for picnics, and the smell of barbecues wafted through the air. it was a perfect day for a picnic by the lake.
[26, 1, 244, 1147, 73, 1551, 1, 1552, 873, 5, 695, 2, 1, 215, 696, 73, 3, 874, 4, 99, 697, 1553, 875, 16, 2374, 2, 1, 876, 4, 2375, 1554, 72, 1, 115, 11, 21, 3, 698, 244, 16, 3, 2376, 62, 1, 2377]


In [40]:
input_sequence

[[1, 155],
 [1, 155, 21],
 [1, 155, 21, 2368],
 [1, 155, 21, 2368, 1549],
 [1, 155, 21, 2368, 1549, 8],
 [1, 155, 21, 2368, 1549, 8, 1],
 [1, 155, 21, 2368, 1549, 8, 1, 422],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692, 215],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692, 215, 2],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692, 215, 2, 3],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692, 215, 2, 3, 2369],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692, 215, 2, 3, 2369, 1550],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692, 215, 2, 3, 2369, 1550, 2370],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692, 215, 2, 3, 2369, 1550, 2370, 1],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692, 215, 2, 3, 2369, 1550, 2370, 1, 423],
 [1,
  155,
  21,
  2368,
  1549,
  8,
  1,
  422,
  692,
  215,
  2,
  3,
  2369,
  1550,
  2370,
  1,
  423,
  4],
 [1,
  155,
  21,
  2368,
  1549,
  8,
  1,
  422,
  692,
  215,
  2,
  3,
  2369,
  1550,
  2370,
  1,
  423,
  4,
  1],
 [1,
  155,
  21,
  2368,
  1549,
  8

In [44]:
max_seq_len = max(len(X) for X in input_sequence)
input_sequence = np.array(pad_sequences(input_sequence,maxlen=max_seq_len))

In [46]:
X,y = input_sequence[:,:-1],input_sequence[:,-1]
y = tf.keras.utils.to_categorical(y,num_classes=total_words)

In [47]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [48]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,GRU,Dropout

In [52]:
model = Sequential()
model.add(Embedding(total_words,100,input_length=max_seq_len-1))
model.add(GRU(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(100))
model.add(Dense(total_words,activation='softmax'))

In [53]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [55]:
print("TensorFlow version:", tf.__version__)
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))
print("GPUs:", tf.config.list_physical_devices('GPU'))

TensorFlow version: 2.15.0
Num GPUs Available: 0
GPUs: []


In [None]:
history = model.fit(X_train,y_train,
                    epochs=50,
                    validation_data = (X_test,y_test),
                    verbose=1)

In [None]:
model.save('gru.h5')

In [None]:
import pickle
with open('optimizer.pickle','wb') as file:
  pickle.dump(tokenizer,file)