[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/jkanclerz/analiza-dokumentow/blob/main/50--text-generation.ipynb)

In [None]:
!wget https://wolnelektury.pl/media/book/txt/quo-vadis.txt -O quo_vadis.txt

In [None]:
text = open('quo_vadis.txt').read()

### Tokenize and Clean Text

In [10]:
def separate_punc(doc_text):
    return [token.lower() for token in doc_text.split() if token not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [11]:
d = open('quo_vadis.txt').read()
tokens = separate_punc(d)

In [16]:
tokens[:4]

['henryk', 'sienkiewicz', 'quo', 'vadis']

In [17]:
len(tokens)

172371

In [18]:
len(tokens) / 25

6894.84

## Create Sequences of Tokens

In [19]:
train_len = 25+1 # 50 training words , then one target word

# Empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)

In [20]:
' '.join(text_sequences[0])

'henryk sienkiewicz quo vadis isbn 978-83-288-2832-2 rozdział pierwszy petroniusz obudził się zaledwie koło południa i jak zwykle, zmęczony bardzo. poprzedniego dnia był na uczcie u nerona,'

In [21]:
' '.join(text_sequences[1])

'sienkiewicz quo vadis isbn 978-83-288-2832-2 rozdział pierwszy petroniusz obudził się zaledwie koło południa i jak zwykle, zmęczony bardzo. poprzedniego dnia był na uczcie u nerona, która'

In [22]:
' '.join(text_sequences[2])

'quo vadis isbn 978-83-288-2832-2 rozdział pierwszy petroniusz obudził się zaledwie koło południa i jak zwykle, zmęczony bardzo. poprzedniego dnia był na uczcie u nerona, która przeciągnęła'

In [23]:
len(text_sequences)

172345

# Keras

### Keras Tokenization

In [24]:
from keras.preprocessing.text import Tokenizer

SyntaxError: invalid syntax (pywrap_tensorflow_internal.py, line 114)

In [None]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
sequences[0]

In [None]:
tokenizer.index_word

In [None]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

In [None]:
tokenizer.word_counts

In [None]:
vocabulary_size = len(tokenizer.word_counts)

### Convert to Numpy Matrix

In [None]:
import numpy as np

In [None]:
sequences = np.array(sequences)

In [None]:
sequences

# Creating an LSTM based model

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [None]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 25, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

### Train / Test Split

In [None]:
from keras.utils import to_categorical

In [None]:
sequences

In [None]:
# First 49 words
sequences[:,:-1]

In [None]:
# last Word
sequences[:,-1]

In [None]:
X = sequences[:,:-1]

In [None]:
y = sequences[:,-1]

In [None]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [None]:
seq_len = X.shape[1]

In [None]:
seq_len

### Training the Model

In [None]:
# define model
model = create_model(vocabulary_size+1, seq_len)

In [None]:
from pickle import dump,load

In [None]:
# fit model
model.fit(X, y, batch_size=128, epochs=300,verbose=1)

In [None]:
# save the model to file
model.save('epochBIG.h5')
# save the tokenizer
dump(tokenizer, open('epochBIG', 'wb'))

# Generating New Text

In [None]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [None]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        pred_word = tokenizer.index_word[pred_word_ind] 
        input_text += ' ' + pred_word
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

In [None]:
text_sequences[0]

In [None]:
import random
random.seed(101)
random_pick = random.randint(0,len(text_sequences))

In [None]:
random_seed_text = text_sequences[random_pick]

In [None]:
random_seed_text

In [None]:
seed_text = ' '.join(random_seed_text)

In [None]:
seed_text

In [None]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)