In [2]:
from numpy import array
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.models import load_model
import keras.utils as ku 
import datetime
import re
# Turn on memory growth for every visible GPU so TensorFlow allocates GPU
# memory on demand. Iterating (instead of indexing [0]) keeps this cell
# runnable on CPU-only machines, where the list is empty, and applies the same
# setting to all GPUs on multi-GPU machines.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

# Load the raw recipe export. The context manager closes the file handle even
# if reading fails; the forward-slash path resolves on Windows as well as on
# POSIX systems (the previous backslash path only worked on Windows).
with open("Database/bread.txt") as corpus_file:
    file_content = corpus_file.read()

# Surround full stops and line breaks with spaces so that "." and "\n" end up
# as separate whitespace-delimited items in the text.
file_content = file_content.replace(".", " . ")
file_content = file_content.replace("\n", " \n ")

# Use regex to take off some meta-lines. Because of the padding above, lines
# now carry extra leading spaces, so the patterns below begin with spaces too.
file_content = re.sub(r'^   NYC.*\n?', '', file_content, flags=re.MULTILINE)
file_content = re.sub(r'^ ====.*\n?', '', file_content, flags=re.MULTILINE)
file_content = re.sub(r'^ \*\*.*\n?', '', file_content, flags=re.MULTILINE)

# Split on the export header so that each entry holds the text between two
# headers (the first entry is whatever precedes the first header).
lines = file_content.split("@@@@@ Now You're Cooking! Export Format")
print(len(lines))
print(lines[:2])

998
['', ' \n  \n 100% Whole Wheat Bread \n  \n breads \n  \n 2/3 cup water \n 1 tablespoon sugar \n 2/3 cup shortening \n 1/2 cup molasses \n 12 cup whole wheat flour \n 3 pkg yeast \n 8 cup scalded milk \n 1 cup sugar \n 2 tablespoon salt \n  \n Dissolve yeast in 2/3 c water while your milk is cooling .   Dissolve 1 \n cup sugar in the hot milk .   Stir all ingredients in large bowl, turn \n out and knead about 5 minutes, adding flour if needed .   Knead about 5 \n minutes .  Let rise until doubled in bulk, about 1 1/2 to 2 hours .  \n Knead down and shape into 6 loaves, let rise until doubled in pans .  \n Bake at 375 degrees F .  for 40 minutes .   Turn out on wire rack and let \n cool to cold before slicing, if you can .  NOTE: Raisins and/or walnuts \n can be added for a change .  Also this bread freezes well .  \n  \n Yield: 6 servings \n  \n  \n  \n ']


Using TensorFlow backend.


Agora utilizamos `Tokenizer()` para gerar uma lista de palavras, onde cada palavra é um id. Lembrando que precisamos fazer uma outra lista com todas as combinações das formações das frases, já que queremos prever a próxima palavra com base nas N palavras anteriores

In [3]:
# Characters removed from the text before tokenizing. Note that "." and the
# newline character are not in this list, so they are not filtered out.
PUNCTUATION_FILTERS = '!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t'

# Shared tokenizer: fitted by get_sequence_of_tokens and reused at generation time.
tokenizer = Tokenizer(num_words=1000, filters=PUNCTUATION_FILTERS)


def get_sequence_of_tokens(corpus, janela=40):
    """Fit the module-level ``tokenizer`` on ``corpus`` and build n-gram sequences.

    For every text in ``corpus`` and every position from the second token
    onwards, one sequence is emitted that ends at that position and contains
    at most ``janela`` tokens (the trailing window). The last token of each
    sequence is the prediction target; the preceding ones are its context.

    Args:
        corpus: Iterable of texts (one string per document).
        janela: Maximum length of each emitted sequence, target included.
            Defaults to 40, the value that used to be hard-coded.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences``
        is a list of token-id lists and ``total_words`` is
        ``len(tokenizer.word_index) + 1``.

    Raises:
        ValueError: If ``janela`` is smaller than 2, since a sequence needs at
            least one context token plus the target.

    Note:
        This mutates the module-level ``tokenizer`` by fitting it on ``corpus``.
    """
    if janela < 2:
        raise ValueError("janela must be at least 2 (one context token plus the target)")

    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    ## convert data to sequence of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        # `end` is the exclusive end index of the n-gram. Clamping the start at
        # 0 covers both the growing prefix and the sliding-window case.
        for end in range(2, len(token_list) + 1):
            input_sequences.append(token_list[max(0, end - janela):end])
    return input_sequences, total_words

# Build the n-gram sequences from the split texts, then preview the first ten.
inp_sequences, total_words = get_sequence_of_tokens(lines)
inp_sequences[:10]

[[1, 1],
 [1, 1, 910],
 [1, 1, 910, 166],
 [1, 1, 910, 166, 160],
 [1, 1, 910, 166, 160, 33],
 [1, 1, 910, 166, 160, 33, 1],
 [1, 1, 910, 166, 160, 33, 1, 1],
 [1, 1, 910, 166, 160, 33, 1, 1, 32],
 [1, 1, 910, 166, 160, 33, 1, 1, 32, 1],
 [1, 1, 910, 166, 160, 33, 1, 1, 32, 1, 1]]

In [3]:
#print("word_index : ",tokenizer.word_index)
#print(len(tokenizer.word_index))

### Tá vendo que ali em cima os vetores têm dimensões todas diferentes? Precisamos padronizar isso, fazendo com que todos tenham as mesmas dimensões

In [4]:
# Left-pad every n-gram up to the length of the longest one so that all rows
# share the same width.
max_sequence_len = max(map(len, inp_sequences))
print(max_sequence_len)
padded_sequences = pad_sequences(inp_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences = np.array(padded_sequences)

40


### O último número é sempre função dos números anteriores

In [5]:
# Number of classes for the one-hot targets and the softmax layer. It is read
# from the tokenizer instead of repeating the literal 1000, so the two values
# cannot drift apart. The tokenizer's `num_words` cap (not the full
# `word_index` size) is the right bound because the tokenizer only emits ids
# below `num_words`.
qtd_de_palavrinhas = tokenizer.num_words
print(qtd_de_palavrinhas)

1000


In [6]:
# Every column except the last is the context; the last column is the word id
# to predict.
X = input_sequences[:, :-1]
y_denso = input_sequences[:, -1]

# One-hot encode the target ids.
y = ku.to_categorical(y_denso, num_classes=qtd_de_palavrinhas)

# Preview, in order: inputs, dense targets, one-hot targets.
for preview in (X[:3], y_denso[:3], y[:3]):
    print(preview)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   1]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   1   1]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    1   1 910]]
[  1 910 166]
[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [7]:
# Number of context tokens per sample (width of the 2-D input matrix X).
tamanho_entrada = X.shape[-1]
print(tamanho_entrada)

39


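O layer `Embedding` é usado para transformar cada palavra (que agora é um número, atribuído pelo `Tokenizer` de acordo com sua frequência) em uma representação vetorial densa, aprendida durante o treinamento. A ideia é que palavras que sejam usadas em contextos parecidos acabem com vetores parecidos: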

> words that have similar context will have similar meanings

Esse embedding é bonitão porque já arruma a matriz no formato 3D que o LSTM precisa como input, então não preciso me preocupar com isso.

After we’ve done the file reading, we will create the actual input for the Network. We’re gonna use Keras to create and train our Network, so we must convert the data into this form: **(number_of_sequences, length_of_sequence, number_of_features)**.

In [8]:
# Embedding width is the fourth root of the vocabulary size, rounded.
embedding_dim = round(qtd_de_palavrinhas ** 0.25)

# Token ids -> embeddings -> LSTM -> softmax over the vocabulary.
model = Sequential([
    Embedding(qtd_de_palavrinhas, embedding_dim, input_length=tamanho_entrada),
    LSTM(100),
    Dense(qtd_de_palavrinhas, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 39, 6)             6000      
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               42800     
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              101000    
Total params: 149,800
Trainable params: 149,800
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Train on the full data set; no validation data is passed.
# NOTE(review): verbose=5 is outside the documented 0/1/2 values — confirm the
# intended logging level.
model.fit(
    X,
    y,
    batch_size=300,
    epochs=200,
    verbose=5,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.callbacks.History at 0x22e178e03c8>

In [4]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` with ``next_words`` greedily predicted words.

    On every step the current text is tokenized with the module-level
    ``tokenizer``, left-padded/truncated to ``max_sequence_len - 1`` ids, and
    fed to ``model``; the id with the highest predicted score is mapped back
    to its word and appended to the text.

    Args:
        seed_text: Initial text to continue.
        next_words: Number of words to append.
        model: Object exposing ``predict(x, verbose=0)`` that returns one row
            of per-class scores for each input row.
        max_sequence_len: Sequence length used at training time, target
            included; the model input is one shorter.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    # Build the id -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # argmax over the class scores replaces the deprecated
        # model.predict_classes, which newer Keras releases no longer provide.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        # Ids without a word (e.g. the padding id 0) contribute an empty string.
        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text.title()

In [11]:
# Generate 100 words starting from the seed word "milk".
generated_recipe = generate_text("milk", 100, model, max_sequence_len)
print(generated_recipe)

Milk 
 
 
 
 Breads 
 
 1 Cup Water 
 1 2 Cup Sugar 
 1 Teaspoon Salt 
 1 2 Cup Butter 
 1 2 Cup Sugar 
 1 2 Cup Butter 
 1 2 Cup Sugar 
 1 Teaspoon Salt 
 1 2 Teaspoon Cinnamon 
 1 2 Cup Butter 
 1 Cup Chopped Nuts 
 1 Cup Chopped Nuts 
 
 Combine Flour Baking Powder Salt And Salt In A Large Bowl . Add 
 Flour Mixture . Add To Dry Ingredients . Stir In The Flour 
 Mixture . Stir In Enough Of The Flour


In [12]:
# Write the model to "model.h5" in the current working directory.
model.save("model.h5")
print("Saved model to disk")

Saved model to disk


In [7]:
import pickle

# Bundle what is needed to reuse the model later: the saved model's file name,
# the fitted tokenizer, and the sequence length used for padding.
net_output = {
    "model_name": "model.h5",
    "tokenizer": tokenizer,
    "max_sequence_len": max_sequence_len,
}

# The context manager flushes and closes the file even if pickling raises.
with open('net_data.pkl', 'wb') as output:
    pickle.dump(net_output, output)