# Word-by text generation Leo Tolstoy


In [1]:
import sys
import random
import numpy as np

from keras.callbacks import *
from keras.models import Sequential
from keras.layers import Dense, Dropout, Bidirectional, LSTM, Embedding
from keras.optimizers import *


f = open('data/voina-i-mir.txt', encoding='utf-8')
text = f.read().lower().replace('\n', ' \n ')
    
start = text.index('часть первая')
finish = text.index('часть четвертая')

text = text[start:finish]

print(text[:500])

Using TensorFlow backend.


часть первая. 
  
  
 i. 
  
  
 — eh bien, mon prince. gênes et lucques ne sont plus que des apanages, des поместья, de la famille buonaparte. non, je vous préviens, que si vous ne me dites pas, que nous avons la guerre, si vous vous permettez encore de pallier toutes les infamies, toutes les atrocités de cet antichrist (ma parole, j’y crois) — je ne vous connais plus, vous n’êtes plus mon ami, vous n’êtes plus мой верный раб, comme vous dites.1 ну, здравствуйте, здравствуйте. je vois que je vo


 ## Get sentences

In [2]:
tmp = text.split()
text_in_words = []

for word in tmp:
    if word[-1] in [',', '.', '(', ')', '\"', '\'', '!', '?']:
        text_in_words.append(word[:-1])
        text_in_words.append(word[-1])
    else:
        text_in_words.append(word)
        
print('Corpus length in words: ',len(text_in_words))

Corpus length in words:  223410


## Fit Word2Vec

In [3]:
sentences = []
text_in_sentences = text.split('.')
for i in range(len(text_in_sentences)):
    sentences.append(text_in_sentences[i].split())

s = 0   
    
for sentence in sentences:
    s += len(sentence)
    
avg_sent_len = int(s/len(sentences))
       
print('Average sentence length in words (including punctuation): ', avg_sent_len)

Average sentence length in words (including punctuation):  14


In [4]:
w2v_data = []
for i in range(len(text_in_words)-avg_sent_len):
    w2v_data.append(text_in_words[i:i+avg_sent_len])
    
w2v_data[0] 

['часть',
 'первая',
 '.',
 'i',
 '.',
 '—',
 'eh',
 'bien',
 ',',
 'mon',
 'prince',
 '.',
 'gênes',
 'et']

In [5]:
%%time
import multiprocessing
from gensim.models import Word2Vec, word2vec

#w2v_model = Word2Vec(w2v_data, size=avg_sent_len, window=3, min_count=1, iter=100, workers=multiprocessing.cpu_count()) 
w2v_model = Word2Vec.load('tolstoy.w2v')
#w2v_model.save('tolstoy.w2v')

Wall time: 1.75 s


In [6]:
for x in w2v_model.wv.similar_by_word('князь', topn=4):
    print(x)

('василий', 0.9729055762290955)
('андрей', 0.97160404920578)
('ростов', 0.8767209053039551)
('борис', 0.8717873692512512)


In [7]:
predtrained_weights = w2v_model.wv.vectors
vocab_size, emdedding_size = w2v_model.wv.vectors.shape

## Tokenize

In [8]:
tokenized_seq = [[w2v_model.wv.vocab[word].index for word in w2v_data[i]] for i in range(len(w2v_data))]

x_seq = []
y_seq = []

for seq in tokenized_seq:
    x_seq.append(seq[:-1])
    y_seq.append(np.array(seq[-1])) 

In [9]:
print(x_seq[0], '->', y_seq[0])  
print(' '.join([w2v_model.wv.index2word[i] for i in x_seq[0]]), '->', w2v_model.wv.index2word[y_seq[0]]) 

[608, 1661, 1, 4260, 1, 3, 2978, 741, 0, 153, 740, 1, 12004] -> 109
часть первая . i . — eh bien , mon prince . gênes -> et


In [10]:
x_seq = np.array(x_seq)
y_seq = np.array(y_seq) 

#y_seq = np.expand_dims(y_seq, axis=1)
y_seq.shape

(223396,)

# Build and fit model

### Define print callback

In [14]:
def sample(preds, temperature=1.0):
    if temperature <= 0:
        return np.argmax(preds)
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_next(text, num_generated=10):
    word_idxs = [w2v_model.wv.vocab[word].index for word in text.lower().split()]
    for i in range(num_generated):
        prediction = model.predict(x=np.array(word_idxs))
        idx = sample(prediction[-1], temperature=0.7)
        word_idxs.append(idx)
    return ' '.join(w2v_model.wv.index2word[idx] for idx in word_idxs)

def on_epoch_end(epoch, _):
    example_file.write('\nGenerating text after epoch: %d' % epoch)
    texts = [
        'андрей пришел за мной',
        'война началась',
        'pardon'
    ]
    for text in texts:
        sample = generate_next(text)
        example_file.write('\n %s... -> %s' % (text, sample))
        print('\n %s... -> %s' % (text, sample))

In [15]:
from keras.layers import Flatten

model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=emdedding_size,
                    weights=[predtrained_weights]))

model.add(Bidirectional(LSTM(units=128)))

model.add(Dense(units=vocab_size, activation='linear'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [16]:
example_file = open('output/leo_tolstoy_word-by_generation.txt', 'w', encoding='utf-8')
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=0.001)

history = model.fit(x_seq, y_seq,
                   epochs=10,
                   batch_size=128,
                   callbacks=[print_callback, reduce_lr],
                   verbose=1)

Epoch 1/10


  """



 андрей пришел за мной... -> андрей пришел за мной . . . . . . . . . .

 война началась... -> война началась . . . . . . . . . .

 pardon... -> pardon . . . . . . . . . .
Epoch 2/10
 30976/223396 [===>..........................] - ETA: 8:11 - loss: 10.3551

KeyboardInterrupt: 