In [3]:
import os

import random
import numpy as np

from sklearn.model_selection import train_test_split

from keras.models import Model 
from keras.layers.recurrent import LSTM
from keras.layers import Input, Dense, Dropout
from keras.callbacks import ModelCheckpoint,Callback

from utils import sample_multinomial

In [4]:
# Read the whole training corpus into memory as a single string. The context
# manager closes the file handle, and 'r' is a valid read mode (the previous
# 'ra' mode string is rejected by Python 3's open()).
with open(os.path.join("data", "devfest_presentation.txt"), 'r') as corpus_file:
    data = corpus_file.read()
# data = data[:1000]

# Vocabulary: every distinct character in the corpus. Sorting makes the
# character <-> index mapping identical on every run, whereas plain set
# iteration order may differ between interpreter sessions.
labels = sorted(set(data))

max_len = 20              # number of characters in one input window
nr_classes = len(labels)  # size of the character vocabulary

# Lookup tables between characters and their integer class ids.
text2label = {t: i for i, t in enumerate(labels)}
label2text = {i: t for i, t in enumerate(labels)}

In [5]:
step = 5

# Slide a window of max_len characters over the corpus, advancing `step`
# characters at a time; the character right after each window is its target.
window_starts = range(0, len(data) - max_len, step)
inputs = [data[start:start + max_len] for start in window_starts]
outputs = [data[start + max_len] for start in window_starts]

# One-hot encode: X is (examples, max_len, vocabulary), y is (examples, vocabulary).
X = np.zeros((len(inputs), max_len, len(labels)))
y = np.zeros((len(inputs), len(labels)))

for row, (window, target) in enumerate(zip(inputs, outputs)):
    for pos, char in enumerate(window):
        X[row, pos, text2label[char]] = 1
    y[row, text2label[target]] = 1

# Hold out 20% of the examples for validation; the fixed random_state keeps
# the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8,
    random_state=1234)

In [6]:
# Character-level network: a window of one-hot encoded characters goes in,
# a softmax distribution over the next character comes out.
text_input = Input(shape=(max_len, nr_classes))

# The first LSTM returns its output at every time step so that the second
# LSTM receives a sequence; the second one returns only its final output.
first_lstm_out = LSTM(256, return_sequences=True)(text_input)
first_lstm_out = Dropout(0.2)(first_lstm_out)
second_lstm_out = LSTM(256, return_sequences=False)(first_lstm_out)
second_lstm_out = Dropout(0.2)(second_lstm_out)

# One output unit per character class.
text_output = Dense(nr_classes, activation='softmax')(second_lstm_out)

char_rnn_model = Model(text_input, text_output)
char_rnn_model.compile(loss='categorical_crossentropy', optimizer='adam')

In [7]:
def generate_text(seed, length, temperature):
    """Sample text from ``char_rnn_model`` one character at a time.

    Args:
        seed: Starting text. Every character must be a key of ``text2label``
            (otherwise a ``KeyError`` is raised). Only the last ``max_len``
            characters are fed to the model, so the seed may be longer than
            ``max_len``.
        length: Total number of characters in the generated text, seed
            included. If the seed is already at least this long, nothing is
            generated.
        temperature: Positive sampling temperature. Values below 1 sharpen
            the predicted distribution, values above 1 flatten it.

    Returns:
        The seed followed by the sampled characters, prefixed with a newline
        and a space.
    """
    full_text = seed

    # The model input holds exactly max_len characters; keep only the tail of
    # the seed so that a long seed cannot index past the end of the window.
    current_text = full_text[-max_len:]

    # Generate as many characters as are missing to reach `length`.
    for _ in range(max(0, length - len(seed))):

        # One-hot encode the current window; unused trailing positions of a
        # short window stay all-zero.
        current_text_vec = np.zeros((1, max_len, len(labels)))
        for t, char in enumerate(current_text):
            current_text_vec[0, t, text2label[char]] = 1.

        next_letter_probs = char_rnn_model.predict(current_text_vec, verbose=0)[0]

        # Work in float64: renormalising in lower precision can leave a sum
        # that np.random.choice rejects as "probabilities do not sum to 1".
        next_letter_probs = np.asarray(next_letter_probs, dtype=np.float64)

        # Temperature scaling in log space. Zero probabilities become -inf
        # (and exp(-inf) == 0 again below), so the divide warning is silenced.
        with np.errstate(divide='ignore'):
            scaled_logits = np.log(next_letter_probs) / temperature
        # Subtracting the maximum keeps exp() from underflowing everywhere
        # at low temperatures; it cancels out in the normalisation.
        scaled_logits = scaled_logits - np.max(scaled_logits)
        next_letter_probs = np.exp(scaled_logits)
        next_letter_probs = next_letter_probs / np.sum(next_letter_probs)

        next_letter = np.random.choice(len(next_letter_probs), p=next_letter_probs)

        full_text += label2text[next_letter]
        current_text = full_text[-max_len:]

    return "\n %s" % full_text

class GenerateTextCallback(Callback):
    """Keras callback that prints a sampled text at the end of every epoch.

    The stored ``seed``, ``length`` and ``temperature`` are passed unchanged
    to ``generate_text`` each time an epoch finishes.
    """

    def __init__(self, temperature, seed, length):
        super(GenerateTextCallback, self).__init__()

        self.temperature = temperature
        self.seed = seed
        self.length = length

    def on_epoch_end(self, epoch, logs=None):
        """Print a text sample; ``epoch`` and ``logs`` are not used."""
        # `logs` defaults to None rather than a shared mutable dict.
        # print(...) with a single argument behaves identically on Python 2
        # and Python 3, unlike the Python-2-only print statement.
        print(generate_text(self.seed, self.length, self.temperature))

In [8]:
# Callback that prints a text sampled from the model after each epoch.
text_generation = GenerateTextCallback(
    temperature=0.5,
    seed="data scientist",
    length=200,
)

# Train on the training split and evaluate on the validation split.
char_rnn_model.fit(
    X_train,
    y_train,
    validation_data=[X_valid, y_valid],
    batch_size=128,
    nb_epoch=100,
    callbacks=[text_generation],
)

KeyboardInterrupt: 

In [None]:
# Sample with a low temperature for more conservative text. The parenthesised
# print form runs on both Python 2 and Python 3.
print(generate_text("data scientist", 200, 0.25))

