In [2]:
import random
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.models import model_from_json
from keras.callbacks import Callback

# Apply matplotlib's bundled 'fivethirtyeight' style sheet to any figures drawn later.
plt.style.use('fivethirtyeight')

In [3]:
def sample(preds, temp=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    The probabilities are converted to log-space, divided by ``temp``,
    re-normalised with a softmax, and a single index is drawn from the
    resulting distribution.

    Args:
        preds: 1-D array-like of non-negative probabilities (for example a
            softmax output). Zero entries are allowed and are never selected.
        temp: Sampling temperature; must be > 0. Values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        The sampled index (a NumPy integer, as returned by ``np.argmax``).

    Raises:
        ValueError: If ``temp`` is not positive, or if ``preds`` contains no
            positive entry (there would be nothing to sample from).
    """
    if temp <= 0:
        raise ValueError("temp must be positive, got {}".format(temp))

    preds = np.asarray(preds).astype('float64')
    if not np.any(preds > 0):
        raise ValueError("preds must contain at least one positive probability")

    # log(0) == -inf is intended here (that class ends up with probability 0),
    # so suppress the divide-by-zero warning rather than flooding the output.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temp

    # Shift by the maximum before exponentiating. Without this, a small
    # temperature makes every exp() underflow to 0 and the normalisation
    # below becomes 0/0 = NaN.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [4]:
# Jokes with fewer characters than this are discarded before training.
MIN_JOKE_LENGTH = 20

# Download the joke dataset and normalise the text (lower-case, no
# leading/trailing whitespace).
df = pd.read_json('https://github.com/taivop/joke-dataset/blob/master/wocka.json?raw=true')
df["body"] = df["body"].str.lower().str.strip()

print("Length of dataframe before = {}".format(len(df)))
# Collect index *labels* (not positions) of the too-short jokes: DataFrame.drop
# is label-based, so positional counters are only correct when the index
# happens to be a default 0..n-1 range.
joke_lengths = df["body"].str.len()
bad_indices = df.index[joke_lengths < MIN_JOKE_LENGTH].tolist()
df = df.drop(index=bad_indices)
print("Length of dataframe after = {}".format(len(df)))

step = 3
# The window length is capped at 50 characters and shrunk to the shortest
# remaining joke, then reduced by `step`.
maxlen = min([50] + df["body"].str.len().tolist())
maxlen = maxlen - step
print("Maxlen: {}".format(maxlen))

Length of dataframe before = 10019
Length of dataframe after = 9985
Maxlen: 17


In [5]:
# Slide a window of `maxlen` characters over every joke (advancing `step`
# characters at a time); each window is one training input and the character
# immediately after it is the target.
sentences = []
next_chars = []
unique_chars = set()
print("Creating sentence and next_chars arrays...")
for joke in df["body"].values:
    for start in range(0, len(joke) - maxlen, step):
        sentences.append(joke[start: start + maxlen])
        next_chars.append(joke[start + maxlen])
    # Accumulate the vocabulary in a set instead of a list that is
    # de-duplicated afterwards.
    unique_chars.update(joke)

print("Number of sequences: {}".format(len(sentences)))

chars = sorted(unique_chars)
print("Number of unique characters: {}".format(len(chars)))
# NOTE: this repeats the message printed at the start of the cell; it is kept
# byte-identical so the recorded cell output still matches.
print("Creating sentence and next_chars arrays...\n")

# Map each character to its position in `chars` in a single pass
# (list.index inside a comprehension is quadratic in the vocabulary size).
char_indices = {char: index for index, char in enumerate(chars)}

print("Vectorization...")
# One-hot encode inputs and targets. np.bool_ is used because the bare
# `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool_)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool_)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
print("Finished.")

Creating sentence and next_chars arrays...
Number of sequences: 2016104
Number of unique characters: 149
Creating sentence and next_chars arrays...

Vectorization...
Finished.


In [12]:
class GenerateText(Callback):
    """Keras callback that prints a sampled text continuation after each epoch.

    Starting from a fixed seed string, the callback repeatedly one-hot encodes
    the current character window, asks the model being trained for the
    next-character distribution, samples a character from it, and slides the
    window forward by that character.

    Args:
        chars: Sequence of vocabulary characters; position == class index.
        maxlen: Number of time steps in the model's input window.
        char_indices: Mapping from character to its index in ``chars``.
    """

    def __init__(self, chars, maxlen, char_indices):
        super(GenerateText, self).__init__()
        self.chars = chars
        self.maxlen = maxlen
        self.char_indices = char_indices
        # Seed text every generation run starts from.
        self.generated_text = 'what do you call '

    def on_epoch_end(self, epoch, logs=None):
        """Generate and print 400 characters using the model being trained.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dict supplied by Keras (unused).
        """
        print("--- Generating with seed: '" + self.generated_text + "'")
        # Work on a local copy so every epoch starts from the same seed, and
        # keep at most the last `maxlen` characters so the window always fits
        # the (1, maxlen, vocab) input tensor.
        window = self.generated_text[-self.maxlen:]
        for _ in range(400):
            sampled = np.zeros((1, self.maxlen, len(self.chars)))
            for t, char in enumerate(window):
                sampled[0, t, self.char_indices[char]] = 1

            # Use the model Keras attached to this callback instead of
            # relying on a global variable named `model`.
            preds = self.model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, 0.5)
            next_char = self.chars[next_index]

            # Feed the sampled character back in; without this every
            # prediction is conditioned on the unchanged seed.
            window = (window + next_char)[-self.maxlen:]

            sys.stdout.write(next_char)
        sys.stdout.flush()
        print("\n")

In [14]:
# Two stacked LSTM layers followed by a softmax over the character vocabulary.
model = Sequential()
# Only the first layer declares the input shape: (time steps, vocabulary size).
model.add(LSTM(64, input_shape=(maxlen, len(chars)), return_sequences=True))
# No input_shape here: this layer consumes the sequence output of the layer
# above, and the previous `(64,)` did not describe that 3-D input anyway.
model.add(LSTM(128))
model.add(Dense(len(chars), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

text_generation = GenerateText(chars=chars, maxlen=maxlen, char_indices=char_indices)

# range(1, 60) yields 59 passes (1..59). Each pass is a separate one-epoch
# fit() call, so the callback prints a text sample after every pass.
for epoch in range(1, 60):
    print("Epoch: {}".format(epoch))
    model.fit(x, y, epochs=1, batch_size=128, callbacks=[text_generation])
    print("")

Epoch: 1
Epoch 1/1
--- Generating with seed: 'what do you call '
attitoiattdthttayciaitottfihwwctaobtfyyatmgrhoadtaitsdatowttioswttaiabttmiiooayamswtgiiitaayoatotatiaatatgatattgoti"tsasmaiaasaaatdatmcpttttbwastiitaodoaitttattsoadtttataabasdsiastdistiiatyaadttgtyiattatamttsttisttttsttadaiiaootttitatstcothtlcttatsomystagstatatittsostctatiaattwttrattotdatmbaahbaiehtatitisatgaawdtsaattasateatwhttapttatytbajgtsttattdtteittaatwitiawaahohtatttataaastothadaadtttitaitas


Epoch: 2
Epoch 1/1
--- Generating with seed: 'what do you call '
ttattittthtthtttttataiaiatatttthatattatiataattahatttaatttatttttyatooatttaataataoaayatataohbttbitabittotytahthaattatthtttttttattaatatwatdtaattttaaataatoayatabaawattwtatttaaattathtthaaotttyaaayatoaoatfttaotaottaayatatthttattitaaboottttittttttthaoaoaataatyataayattatttttthttttaaattyitttabttaathaatattdtetataatttytotatttattattaittatttataaataatattaattottaattttaaaotottatttbatattytthtataotaaotttotwtttttath


Epoch: 3
Epoch 1/1
--- Generating with seed: 'what do you call '

KeyboardInterrupt: 