In [1]:
import datetime
import pickle

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import regex as re

In [2]:
def file_to_sentence_list(file_path, encoding='utf-8'):
    """Read a text file and split its contents into sentences.

    A sentence boundary is any run of whitespace that immediately follows
    one of the terminators ``.``, ``!`` or ``?``. The terminator stays
    attached to the sentence it ends.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Passed explicitly
            so the result does not depend on the platform's locale default.

    Returns:
        A list of non-empty sentences with surrounding whitespace removed.
        An empty or whitespace-only file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        text = file.read()

    # The lookbehind keeps the terminator with the preceding sentence and
    # consumes only the whitespace that separates two sentences.
    candidates = re.split(r'(?<=[.!?])\s+', text)

    # Strip each candidate once, then drop the ones that end up empty.
    stripped = (candidate.strip() for candidate in candidates)
    return [sentence for sentence in stripped if sentence]

In [3]:
# Load the training corpus and split it into sentences; each sentence is
# later tokenized and expanded into training sequences.
file_path = 'pizza.txt'
text_data = file_to_sentence_list(file_path)

In [4]:
# Fit a word-level vocabulary on the corpus sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
# Word indices start at 1, so add one slot for index 0, which is the value
# used for padding further down.
total_words = len(tokenizer.word_index) + 1

In [5]:
# Inspect the word -> integer index mapping learned by the tokenizer.
tokenizer.word_index

{'the': 1,
 'and': 2,
 'pizza': 3,
 'of': 4,
 'a': 5,
 'to': 6,
 'in': 7,
 'has': 8,
 'its': 9,
 'for': 10,
 'with': 11,
 'it': 12,
 'that': 13,
 'is': 14,
 'as': 15,
 'culinary': 16,
 'from': 17,
 'become': 18,
 'their': 19,
 'have': 20,
 'on': 21,
 'flavors': 22,
 'cheese': 23,
 'toppings': 24,
 'also': 25,
 'delivery': 26,
 'food': 27,
 'people': 28,
 'world': 29,
 'traditional': 30,
 'made': 31,
 'experience': 32,
 'our': 33,
 'pizzerias': 34,
 'dish': 35,
 'diverse': 36,
 'like': 37,
 'crust': 38,
 'delight': 39,
 'symbol': 40,
 'pizzas': 41,
 'more': 42,
 'making': 43,
 'or': 44,
 'iconic': 45,
 'creativity': 46,
 'cultural': 47,
 'italy': 48,
 'an': 49,
 'combinations': 50,
 'ancient': 51,
 'who': 52,
 'ingredients': 53,
 'we': 54,
 'this': 55,
 'style': 56,
 'home': 57,
 'indulgence': 58,
 'beyond': 59,
 'global': 60,
 'inspired': 61,
 'options': 62,
 'those': 63,
 'not': 64,
 'but': 65,
 'together': 66,
 'allowing': 67,
 'just': 68,
 'comfort': 69,
 'local': 70,
 'may': 71,
 '

In [6]:
# For every sentence, collect each leading prefix of its token ids that is
# at least two tokens long. The last token of a prefix later becomes the
# prediction target and the tokens before it become the model input.
input_sequences = []

for sentence in text_data:
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(
        encoded[:end] for end in range(2, len(encoded) + 1)
    )
        
        

In [7]:
# Inspect the generated prefix sequences (lists of word indices).
input_sequences

[[3, 1],
 [3, 1, 233],
 [3, 1, 233, 2],
 [3, 1, 233, 2, 45],
 [3, 1, 233, 2, 45, 35],
 [3, 1, 233, 2, 45, 35, 13],
 [3, 1, 233, 2, 45, 35, 13, 8],
 [3, 1, 233, 2, 45, 35, 13, 8, 114],
 [3, 1, 233, 2, 45, 35, 13, 8, 114, 234],
 [3, 1, 233, 2, 45, 35, 13, 8, 114, 234, 2],
 [3, 1, 233, 2, 45, 35, 13, 8, 114, 234, 2, 235],
 [3, 1, 233, 2, 45, 35, 13, 8, 114, 234, 2, 235, 74],
 [3, 1, 233, 2, 45, 35, 13, 8, 114, 234, 2, 235, 74, 115],
 [3, 1, 233, 2, 45, 35, 13, 8, 114, 234, 2, 235, 74, 115, 236],
 [3, 1, 233, 2, 45, 35, 13, 8, 114, 234, 2, 235, 74, 115, 236, 14],
 [3, 1, 233, 2, 45, 35, 13, 8, 114, 234, 2, 235, 74, 115, 236, 14, 5],
 [3, 1, 233, 2, 45, 35, 13, 8, 114, 234, 2, 235, 74, 115, 236, 14, 5, 116],
 [3, 1, 233, 2, 45, 35, 13, 8, 114, 234, 2, 235, 74, 115, 236, 14, 5, 116, 6],
 [3,
  1,
  233,
  2,
  45,
  35,
  13,
  8,
  114,
  234,
  2,
  235,
  74,
  115,
  236,
  14,
  5,
  116,
  6,
  1],
 [3,
  1,
  233,
  2,
  45,
  35,
  13,
  8,
  114,
  234,
  2,
  235,
  74,
  115,
  23

In [8]:
# Left-pad every prefix with zeros up to the length of the longest one so
# the sequences can be stacked into a single 2-D array.
max_seq_length = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_seq_length, padding='pre')
)

# All columns but the last are the model input; the last column is the
# word to predict.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the targets over the full vocabulary.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [9]:
# Inspect the padded model inputs (each row is a prefix without its final token).
X

array([[  0,   0,   0, ...,   0,   0,   3],
       [  0,   0,   0, ...,   0,   3,   1],
       [  0,   0,   0, ...,   3,   1, 233],
       ...,
       [  0,   0,   0, ..., 684, 685,   4],
       [  0,   0,   0, ..., 685,   4,  19],
       [  0,   0,   0, ...,   4,  19,  72]], dtype=int32)

In [10]:
# Next-word classifier: word embeddings -> LSTM -> softmax over the vocabulary.
# The input length is one less than max_seq_length because the final token
# of every padded sequence is held out as the target.
model = Sequential([
    Embedding(total_words, 10, input_length=max_seq_length - 1),
    LSTM(128),
    Dense(total_words, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 39, 10)            6870      
                                                                 
 lstm (LSTM)                 (None, 128)               71168     
                                                                 
 dense (Dense)               (None, 687)               88623     
                                                                 
Total params: 166661 (651.02 KB)
Trainable params: 166661 (651.02 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
# Train on the prefix -> next-word pairs for 5 epochs with progress output.
model.fit(X, y, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x178779150>

In [None]:
seed_text = "Become a large pizza with"
next_words = 1

# Greedy generation: encode the running text, pad it to the model's input
# length, pick the most probable next word and append it to the text.
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences(
        [token_list],
        maxlen=max_seq_length-1,
        padding='pre'
    )

    predictions = model.predict(token_list)
    # A single sequence was submitted, so take the argmax over its one row.
    pred_index = int(np.argmax(predictions[0]))

    # Class 0 is the padding slot and has no entry in index_word; stop
    # generating instead of failing with a KeyError if it wins the argmax.
    pred_word = tokenizer.index_word.get(pred_index)
    if pred_word is None:
        break

    seed_text += " " + pred_word

print("Next predicted words: ", seed_text)

In [13]:
# Persist the trained model together with the tokenizer it was trained with.
# Both files share ONE timestamp so a model can be matched to its tokenizer,
# and the stamp avoids ':' and spaces so the names are valid on every
# common filesystem.
run_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S.%f")

model.save("model_{0}.h5".format(run_stamp))

with open("tokenizer_{0}.pickle".format(run_stamp), 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

  saving_api.save_model(


In [None]:
# Show the padded sequence length computed during preprocessing.
max_seq_length

In [None]:
# Recover max_seq_length from the model alone: the network's input length is
# max_seq_length - 1, so add one. Reading model.input_shape avoids the
# legacy node-indexed get_output_at() call on the first layer.
model.input_shape[1] + 1