# Character-by-character text generation: Leo Tolstoy

In [2]:
from keras.callbacks import *
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional
from keras.optimizers import *
import numpy as np
import random
import sys
import io

# Load the corpus: read the whole file, then lower-case it and surround every
# newline with spaces.
with io.open('data/voina-i-mir.txt', encoding='utf-8') as f:
    text = f.read()
text = text.lower().replace('\n', ' \n ')

# Keep only the span that begins at the first 'часть первая' marker and ends
# just before the first 'часть четвертая' marker (str.index raises ValueError
# if either marker is missing).
start, finish = text.index('часть первая'), text.index('часть четвертая')
text = text[start:finish]

# Show the first 500 characters as the cell output.
text[:500]

Corpus length: 3726491
Text example:  часть первая. 
  
  
 i. 
  
  
 — eh bien, mon prince. gênes et lucques ne sont plus que des apanages, des поместья, de la famille buonaparte. non, je vous préviens, que si vous ne me dites pas, que nous avons la guerre, si vous vous permettez encore de pallier toutes les infamies, toutes les atrocités de cet antichrist (ma parole, j’y crois) — je ne vous connais plus, vous n’êtes plus mon ami, vous n’êtes plus мой верный раб, comme vous dites.1 ну, здравствуйте, здравствуйте. je vois que je vo


In [3]:

# Build the character vocabulary and the two lookup tables (char -> index and
# index -> char) used for one-hot encoding and decoding.
chars = sorted(set(text))
print('Total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Cut the text into semi-redundant sequences of `maxlen` characters, taking a
# new window every `step` characters; the character right after each window is
# the prediction target.
maxlen = 30
step = 3
sentences = []
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences))

print('Vectorization...')
# One-hot encode inputs as (n_sequences, maxlen, n_chars) and targets as
# (n_sequences, n_chars). The builtin `bool` is used as dtype because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
print('Done!')

Total chars: 106
Number of sequences: 392711
Vectorization...
Done!


In [8]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature rescaling.

    The probabilities are turned into log-probabilities, divided by
    ``temperature``, re-normalised with a softmax, and a single index is
    drawn from the resulting distribution with ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Rescaling factor. Values below 1 sharpen the
            distribution, values above 1 flatten it. ``0`` selects the most
            probable index deterministically (greedy decoding) instead of
            dividing by zero.

    Returns:
        The sampled index (a NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')

    if temperature == 0:
        # Limit case: dividing by zero would produce inf/NaN and make
        # multinomial() fail, so fall back to the arg-max.
        return np.argmax(preds)

    # log(0) = -inf is expected for zero-probability entries and is mapped
    # back to 0 by exp(), so silence only that warning.
    with np.errstate(divide='ignore'):
        preds = np.log(preds) / temperature

    # Subtract the maximum before exponentiating so that very small
    # temperatures do not underflow every term to 0 (which would yield 0/0).
    exp_preds = np.exp(preds - np.max(preds))
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
  

def on_epoch_end(epoch, _):
    """Write sample generated text to ``example_file`` at the end of an epoch.

    Used as the ``on_epoch_end`` hook of a Keras ``LambdaCallback``. A random
    window of ``maxlen`` characters from ``text`` is picked as the seed, and
    for each diversity (sampling temperature) 400 characters are generated
    from that same seed and written to ``example_file``.

    Relies on the module-level names ``text``, ``maxlen``, ``chars``,
    ``char_indices``, ``indices_char``, ``model``, ``example_file`` and
    ``sample``.

    Args:
        epoch: Index of the epoch that just finished.
        _: Logs dictionary passed by Keras; unused.
    """
    print('\n----- Generating text after Epoch: ' + str(epoch), file=example_file)

    start_index = random.randint(0, len(text) - maxlen - 1)
    seed = text[start_index: start_index + maxlen]
    print('----- Generating with seed: "' + seed + '"', file=example_file)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- Diversity:' + str(diversity), file=example_file)

        # Restart from the original seed for every diversity so the samples
        # are comparable; otherwise each temperature would continue from the
        # tail of the previous one and repeat all earlier output.
        sentence = seed
        example_file.write(sentence)

        for _step in range(400):
            # One-hot encode the current window of maxlen characters.
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            # Slide the window one character forward.
            sentence = sentence[1:] + next_char

            example_file.write(next_char)
            example_file.flush()

        print(file=example_file)

In [12]:
print('Building model...')
model = Sequential()
# The input shape is declared on the Bidirectional wrapper, which is the first
# layer of the Sequential model, rather than on the wrapped LSTM.
model.add(Bidirectional(LSTM(256), input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')
print('Done!')

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
# min_lr must be lower than the optimizer's starting learning rate (Adam's
# default is 0.001); with min_lr=0.001 the callback could never reduce it.
# NOTE(review): with patience=5 and epochs=5 the reduction still cannot
# trigger in this run -- lower patience or raise epochs if it is wanted.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=1e-5)


print('Fitting model...')
# `example_file` is read as a global by on_epoch_end; the context manager
# guarantees it is flushed and closed even if training is interrupted.
with open('output/leo_tolstoy_char-by_generation.txt', 'w', encoding='utf-8') as example_file:
    history = model.fit(x, y,
                        batch_size=1024,
                        epochs=5,
                        callbacks=[print_callback, reduce_lr])

Building model...
Done!
Fitting model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
 76800/392711 [====>.........................] - ETA: 41s - loss: 2.3010

KeyboardInterrupt: 