In [1]:
'''
#Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

import json
import re

Using TensorFlow backend.


In [2]:
BEFORE_START = '\0'
AFTER_END = '\1'
def preproc(s):
    s = BEFORE_START + s + AFTER_END
    s = re.sub(r'\[.*\|.*\]', ' ', s)
    s = re.sub(r'[«»]', '"', s)
    return s.lower()

SEQ_LENGTH = 40

In [3]:
def read_vk_json(s, post_likes_min=0):
    with open(s, 'r') as f:
        posts = json.load(f)

    texts = []
    for p in posts:
        ptext = p['text'].strip()
        if len(ptext) > SEQ_LENGTH and p['likesCount'] >= post_likes_min:
            texts.append(ptext)

        for c in p['comments']:
            ctext = c['text'].strip()
            if c['likesCount'] > 0 and len(ctext) > SEQ_LENGTH:
                texts.append(ctext)

    return [preproc(x) for x in texts]

In [4]:
kalik_texts = read_vk_json('data/kalikfan.json')
print('Number of Kalik samples:', len(kalik_texts))

Number of Kalik samples: 1795


In [5]:
jumoreski_texts = read_vk_json('data/jumoreski.json', 1600)
print('Number of Jumoreski samples:', len(jumoreski_texts))

Number of Jumoreski samples: 8788


In [6]:
texts = kalik_texts # + jumoreski_texts
print('texts:', len(texts))

texts: 1795


In [7]:
chars = sorted(list(set(''.join(texts))))
print('total chars before dropping rare:', len(chars))

total chars before dropping rare: 878


In [8]:
nums = {}
for c in chars:
    for t in texts:
        if c in t:
            if c not in nums:
                nums[c] = 0
            nums[c] += 1
            
MIN_OCCURANCES = 7

for c in chars:
    if nums[c] < MIN_OCCURANCES:
        texts = list(map(lambda t: t.replace(c, ''), texts))
            
nums = {}
for c in chars:
    for t in texts:
        if c in t:
            if c not in nums:
                nums[c] = 0
            nums[c] += 1
            
sorted( ((v,k) for k,v in nums.items()), reverse=False)

[(7, '☀'),
 (7, '⛵'),
 (7, '🌳'),
 (7, '🍁'),
 (7, '🍃'),
 (7, '🍋'),
 (7, '🍓'),
 (7, '🏭'),
 (7, '🐎'),
 (7, '🐩'),
 (7, '👁'),
 (7, '👫'),
 (7, '👷'),
 (7, '💁'),
 (7, '💊'),
 (7, '💕'),
 (7, '💰'),
 (7, '😙'),
 (7, '😚'),
 (7, '😣'),
 (7, '🙎'),
 (7, '🚘'),
 (7, '🚶'),
 (7, '🤞'),
 (7, '🤰'),
 (7, '🥕'),
 (7, '🧞'),
 (7, '🧠'),
 (8, '/'),
 (8, '🇧'),
 (8, '🇴'),
 (8, '🌑'),
 (8, '🏋'),
 (8, '👈'),
 (8, '👠'),
 (8, '👪'),
 (8, '💄'),
 (8, '💎'),
 (8, '💜'),
 (8, '🔫'),
 (8, '🔮'),
 (8, '🗯'),
 (8, '😒'),
 (8, '😖'),
 (8, '🤖'),
 (8, '🤥'),
 (8, '🤷'),
 (8, '🥺'),
 (8, '🦍'),
 (9, '_'),
 (9, 'v'),
 (9, '⛄'),
 (9, '🌞'),
 (9, '🍒'),
 (9, '🎤'),
 (9, '🐲'),
 (9, '💅'),
 (9, '💔'),
 (9, '💙'),
 (9, '💧'),
 (9, '💫'),
 (9, '😥'),
 (9, '😮'),
 (9, '😻'),
 (9, '🤑'),
 (9, '🤟'),
 (9, '🥝'),
 (10, '6'),
 (10, '8'),
 (10, 'w'),
 (10, '‼'),
 (10, '⚡'),
 (10, '🇪'),
 (10, '🇭'),
 (10, '🍆'),
 (10, '🍺'),
 (10, '🏃'),
 (10, '🐔'),
 (10, '🐰'),
 (10, '👵'),
 (10, '👸'),
 (10, '🗣'),
 (10, '😴'),
 (10, '🙉'),
 (10, '🙋'),
 (10, '🤐'),
 (10, '🤫'),
 (10, '🤭'),
 (10, '🥰'),

In [9]:
chars = sorted(list(set(''.join(texts))))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 346


In [10]:
# cut the text in semi-redundant sequences of maxlen characters
sentences = []
next_chars = []

for text in texts:
    t = [BEFORE_START] * (SEQ_LENGTH - 1) + list(text) + [AFTER_END] * SEQ_LENGTH
    for i in range(1, len(t) - SEQ_LENGTH, 1):
        sentences.append(t[i:i + SEQ_LENGTH])
        next_chars.append(t[i + SEQ_LENGTH])

# for i in range(0, len(text) - maxlen, step):
#     sentences.append(text[i: i + maxlen])
#     next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

nb sequences: 402837


In [11]:
print('Vectorization...')
x = np.zeros((len(sentences), SEQ_LENGTH, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [12]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
# model.add(LSTM(32, input_shape=(SEQ_LENGTH, len(chars)), return_sequences=True))
# model.add(LSTM(32, input_shape=(SEQ_LENGTH, len(chars)), return_sequences=True))
model.add(LSTM(256))
model.add(Dense(len(chars), activation='softmax'))

Build model...


In [13]:
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [14]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.random.choice(np.arange(0, len(chars)), 1, False, preds)[0]

In [18]:
def on_epoch_end(epoch, _):
#     if epoch % 10 != 0:
#         return
    # Function invoked at end of each epoch. Prints generated text.
    print()
    # print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - SEQ_LENGTH - 1)
    
    diversity = 0.2
    
    # print('----- diversity:', diversity)

    generated = ''
    sentence = ''.join([BEFORE_START] * SEQ_LENGTH)

    finished_well = False
    for i in range(400):
        x_pred = np.zeros((1, SEQ_LENGTH, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]
        if next_char == AFTER_END:
            finished_well = True
            break

        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()
    if not finished_well:
        sys.stdout.write('@@@')
        sys.stdout.flush()
    print()

In [19]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [None]:
model.fit(x, y,
          batch_size=2048,
          epochs=60,
          callbacks=[print_callback])

Epoch 1/60

 это же настоящие русский кальянчик на кальянную придумывали на кальянчиком и без кальянчиком на кальянную и дымовая в дымном станем и дыма и дыма и на кальянную мало кальянчиком и кальянчик на кальянную и на кальянчиком на кальянную и на кальянную и на двойное яблоко 🍎🍏 и кальянчик на просто на кальянчиком и кальянчика на кальянчиком и под какой кальянчик по полной забивочки по полной кралечку по@@@
Epoch 2/60

а по сука в долгом кальянную просто просто под рукой кальянную с великим дымного дымного 💨 💨 💨 💨 💨 💨 💨 💨 💨 💨 💨 и вечером дымного дымного дымного дымного настроения по полном старовой кальяновой по кайфу по полном напасными кальянчика с просто на старовой полного кальянчика с под сказали как такой кальяном по семе на своим дымных дымком 💨 💨 💨 💨 💨 💨 💨 💨 на делает как понимают по кальянную и на старо@@@
Epoch 3/60

пока мы с кальянной запасиков и подымить кальянчиком и сделал кальянщика и покурим кальянчиком подымить по полной кальянбусик на кальянбусик и себе после по