In [3]:
import random
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.models import model_from_json
from keras.callbacks import Callback

plt.style.use('fivethirtyeight')

In [4]:
def sample(preds, temp=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temp
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [5]:
df = pd.read_json('https://github.com/taivop/joke-dataset/blob/master/wocka.json?raw=true')
df["body"] = df["body"].str.lower().str.strip()

bad_indices = []
print("Length of dataframe before = {}".format(len(df)))
for i in range(len(df)):
    joke = df["body"].iloc[i]
    if len(joke) < 20:
        bad_indices.append(i)
df = df.drop(bad_indices, axis='rows')
print("Length of dataframe after = {}".format(len(df)))

Length of dataframe before = 10019
Length of dataframe after = 9985


In [6]:
maxlen = 60
step = 3

sentences_text = []
print("Creating sentence and next_chars arrays...")
for joke in df["body"].values:
    sentences_text.append(joke)

sentences = []
next_chars = []
chars = []
text = "\n".join(sentences_text)
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])

print("Number of sequences: {}".format(len(sentences)))

chars = sorted(list(set(text)))
print("Number of unique characters: {}".format(len(chars)))
print("Creating sentence and next_chars arrays...\n")
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

print("Vectorization...")
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
print("Finished.")


Creating sentence and next_chars arrays...
Number of sequences: 2072677
Number of unique characters: 149
Creating sentence and next_chars arrays...

Vectorization...
Finished.


In [7]:
class GenerateText(Callback):
    def __init__(self, chars, maxlen, char_indices):
        self.chars = chars
        self.maxlen = maxlen
        self.char_indices = char_indices
        self.generated_text = 'what do you call '
        
    def on_epoch_end(self, epoch, logs={}):
        print("--- Generating with seed: '" + self.generated_text + "'")
        for i in range(400):
            sampled = np.zeros((1, self.maxlen, len(self.chars)))
            for t, char in enumerate(self.generated_text):
                sampled[0, t, self.char_indices[char]] = 1

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, 0.5)
            next_char = self.chars[next_index]

            sys.stdout.write(next_char)
        print("\n")

In [8]:
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

text_generation = GenerateText(chars=chars, maxlen=maxlen, char_indices=char_indices)

for epoch in range(1,60):
    print("Epoch: {}".format(epoch))
    model.fit(x, y, epochs=1, batch_size=128, callbacks=[text_generation])
    print("")

Epoch: 1
Epoch 1/1
--- Generating with seed: 'what do you call '
    i    s     as     s     i       ets  s    ssiel sss.    n  . s  s        e    :  s    s          s . t    n     s   ss       s  ei          o      in t      y           s                      n ts      .a   a"le s       t      s : s         s   sb   , ns        .    s   " i sl    ys     s . as ln  s ?         s s     y        ,s  n  s    sss    


Epoch: 2
Epoch 1/1
--- Generating with seed: 'what do you call '
i     b s, .  niis  i  z s s  is   s s    s ss i,sai ss ss a a s:s ss  a           .i s s i  i s s    s,,s ssss s   s si, si    : s :  i s     s  ais,     ssiss  s n ssisis s l  i    i as  ,s   sssss  s    i s  sta   ss s s  ,  ,s:   s  ssssi,           iss  z ,  l ,ic   s y s ss a ia s  sssss  sss ssr .s  ! i  sss as   isss sssss  ssi a s  si  s s    s 


Epoch: 3
Epoch 1/1
--- Generating with seed: 'what do you call '
 st ss              .  e            o     s l a    b  s s  ,     s ss     sr  s   s  s a a a

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



--- Generating with seed: 'what do you call '
!     ,. .s   .ss   .         .. s s              s         ?,?sl,      ,    s   .    .    .. ?   ,   ". !    .            s'           . s     .          '" s,.s.,       s     .           , s.   s          ! .   s.         .               ss .  .        ,.   .      ,   .       s -  .   s .  '   .         s                  s      s          as     s    


Epoch: 12
Epoch 1/1
 185088/2072677 [=>............................] - ETA: 22:20 - loss: 1.4537

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



--- Generating with seed: 'what do you call '
      s   . s  s  "  s a s    s .       ,        s  s       s  ,     , .               l   s  '   "                ,        ."              s,  s           s  .          .  ..s             a          s               s?   s  s  , ,  s' a   .,   ss          s      s        s      .s?       s"      . .         l    r.      s    ss.       "      s  s.           s s  ,      s  ,  s  s   a           


Epoch: 14
Epoch 1/1
 443008/2072677 [=====>........................] - ETA: 19:23 - loss: 1.4398

KeyboardInterrupt: 