In [12]:
# Load the corpus and lowercase it to shrink the vocabulary the network must learn.
# The context manager closes the file handle as soon as the text has been read.
with open(r"wonderland.txt", encoding='utf-8') as corpus_file:
    raw_data = corpus_file.read()
# One list entry per line of the file; later cells tokenise the corpus line by line.
raw_data = raw_data.lower().split('\n')

In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Shared notebook-level tokenizer: it is fitted inside get_sequence_of_tokens and
# reused by generate_text to encode the seed text and to map indices back to words.
kToken = Tokenizer()

def get_sequence_of_tokens(raw_data):
    """Fit the shared tokenizer on ``raw_data`` and build n-gram prefixes.

    Each line is encoded with the notebook-level ``kToken``; for a line of
    k tokens, every prefix of length 2..k is emitted as one training
    sequence. Lines that encode to fewer than two tokens contribute nothing.

    Returns:
        (input_sequences, vocab_size): the list of prefix lists, and the
        number of entries in ``kToken.word_index`` plus one.
    """
    kToken.fit_on_texts(raw_data)

    prefixes = []
    for text in raw_data:
        encoded = kToken.texts_to_sequences([text])[0]
        # Prefix lengths start at 2 so each sequence has at least one
        # context token followed by one target token.
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))

    return prefixes, len(kToken.word_index) + 1
# Build the n-gram training prefixes and the vocabulary size from the corpus lines.
in_seq,vocab_size=get_sequence_of_tokens(raw_data)
# Preview the first 20 prefixes as the cell output.
in_seq[:20]

[[48, 1302],
 [48, 1302, 248],
 [48, 1302, 248, 342],
 [48, 1302, 248, 342, 10],
 [48, 1302, 248, 342, 10, 481],
 [48, 1302, 248, 342, 10, 481, 59],
 [48, 1302, 248, 342, 10, 481, 59, 815],
 [48, 1302, 248, 342, 10, 481, 59, 815, 816],
 [22, 443],
 [22, 443, 31],
 [22, 443, 31, 24],
 [22, 443, 31, 24, 1],
 [22, 443, 31, 24, 1, 151],
 [22, 443, 31, 24, 1, 151, 6],
 [22, 443, 31, 24, 1, 151, 6, 704],
 [22, 443, 31, 24, 1, 151, 6, 704, 1006],
 [22, 443, 31, 24, 1, 151, 6, 704, 1006, 19],
 [22, 443, 31, 24, 1, 151, 6, 704, 1006, 19, 49],
 [22, 443, 31, 24, 1, 151, 6, 704, 1006, 19, 49, 817],
 [22, 443, 31, 24, 1, 151, 6, 704, 1006, 19, 49, 817, 3]]

In [30]:
# pad the input sequences generated in previous step.
import numpy
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


def generate_padded_sequence(input_sequences, num_classes=None):
    """Pad the n-gram sequences and split them into predictors and labels.

    Every sequence is pre-padded to the length of the longest one. The last
    token of each padded row becomes the label (one-hot encoded); the tokens
    before it form the predictor row.

    Args:
        input_sequences: non-empty list of token-id lists.
        num_classes: width of the one-hot label vectors. When omitted, the
            notebook-level ``vocab_size`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        (predictors, labels, max_seq_len): the padded array without its last
        column, the one-hot encoding of that last column, and the length of
        the longest input sequence.

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences must contain at least one sequence")
    if num_classes is None:
        # Backward-compatible fallback to the global set by the tokenising cell.
        num_classes = vocab_size

    max_seq_len = max(len(x) for x in input_sequences)
    padded = numpy.array(pad_sequences(
        input_sequences, maxlen=max_seq_len, padding='pre'))
    # create predictors and labels
    predictors, labels = padded[:, :-1], padded[:, -1]
    labels = to_categorical(labels, num_classes=num_classes)
    return predictors, labels, max_seq_len


# Pad the n-gram prefixes and split them into model inputs and one-hot next-word labels.
predictors, labels, max_seq_len = generate_padded_sequence(in_seq)

In [None]:
# Model definition, kept for reference only. Do not execute this cell: the model is loaded from textPredictor.h5 in the next cell instead.
# import tensorflow as tf

# regressor = tf.keras.models.Sequential()
# regressor.add(tf.keras.layers.Embedding(
#     vocab_size, 100, input_length=max_seq_len-1))
# regressor.add(tf.keras.layers.Bidirectional(
#     tf.keras.layers.LSTM(150, return_sequences=True)))
# regressor.add(tf.keras.layers.Dropout(0.4))
# regressor.add(tf.keras.layers.LSTM(50))
# regressor.add(tf.keras.layers.Dense(vocab_size/2,activation='relu',kernel_regularizer=tf.keras.regularizers.l2(0.01)))
# regressor.add(tf.keras.layers.Dense(vocab_size,activation='softmax'))
# regressor.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
# regressor.summary()

In [80]:
# Reload the previously saved model and continue training it on the prepared data.
import tensorflow as tf
regressor=tf.keras.models.load_model("textPredictor.h5")
# 300 further epochs on the padded predictors/labels; `history` holds fit()'s return value.
history=regressor.fit(predictors,labels,epochs=300,verbose=1)
# Saves to the same path the model was loaded from.
regressor.save("textPredictor.h5")
print("Model saved")

Model saved


In [33]:
def generate_text(seed_text, next_words, regressor, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the model's top-scoring word.

    Args:
        seed_text: starting text; it is re-encoded with the notebook-level
            ``kToken`` on every step.
        next_words: number of prediction steps to run.
        regressor: object whose ``predict(batch, verbose=0)`` returns one row
            of scores per input row.
        max_sequence_len: the encoded text is padded to
            ``max_sequence_len - 1`` tokens before prediction.

    Returns:
        The seed text followed by the predicted words, each preceded by a
        single space.
    """
    # Invert word -> index once, instead of scanning the vocabulary on every step.
    index_to_word = {index: word for word, index in kToken.word_index.items()}

    for _ in range(next_words):
        token_list = kToken.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = regressor.predict(token_list, verbose=0)
        # Greedy choice: the highest-scoring index of the single output row.
        predicted_index = int(numpy.argmax(predicted, axis=1)[0])

        # An index with no vocabulary entry (such as the padding value 0) adds
        # nothing, so skip it rather than appending a stray space.
        output_word = index_to_word.get(predicted_index, "")
        if output_word:
            seed_text += " " + output_word
    return seed_text


In [37]:
import tensorflow as tf
# Load the saved model from disk.
regressor = tf.keras.models.load_model("textPredictor.h5")
# Generate 10 more words after the seed text; max_seq_len comes from the padding cell.
# NOTE(review): "ust" in the seed looks like a truncated "just" -- confirm it is intentional.
print(generate_text("ust in time ust in time There was nothing so VERY remarkable", 10, regressor, max_seq_len))


ust in time ust in time There was nothing so VERY remarkable in a little nervous about the project gutenberg tm electronic
