In [2]:
import pandas as pd
import numpy as np
import json
import string
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import os
import time
import random
import sys
import io

from nltk.corpus import stopwords
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [3]:
with open('jokes/joke-dataset/stupidstuff.json') as json_data:
    ss_df = pd.DataFrame(json.load(json_data,))

with open('jokes/joke-dataset/reddit_jokes.json') as json_data:
    reddit_df = pd.DataFrame(json.load(json_data,))

with open('jokes/joke-dataset/wocka.json') as json_data:
    wocka_df = pd.DataFrame(json.load(json_data,))

pd.set_option('display.max_colwidth', -1)
    
ss_df.drop(['category', 'id'], axis=1, inplace=True)
ss_df.rename(index=str, columns={'body': 'joke', 'rating': 'score'}, inplace=True)

reddit_df['joke'] = reddit_df['title'] + " " + reddit_df['body']
reddit_df.drop(['body', 'id', 'title'], axis=1, inplace=True)

wocka_df.drop(['category', 'id', 'title'], axis=1, inplace=True)
wocka_df.rename(index=str, columns={'body': 'joke'}, inplace=True)

In [4]:
df = pd.concat([ss_df, reddit_df])
df = df.sample(frac=1).reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [5]:
df.head()

Unnamed: 0,joke,score
0,"Living next to a golf course A little old lady was walking down the street dragging two large plastic garbage bags behind her. One of the bags was ripped and every once in awhile a $20 bill fell out onto the sidewalk.\n\nNoticing this, a policeman stopped her, and said, ""Ma'am, there are $20 bills falling out of that bag.""\n\n""Oh, really? Darn it!"" said the little old lady. ""I'd better go back and see if I can find them. Thanks for telling me, Officer.""\n\n""Well, now, not so fast,"" said the cop. "" Where did you get all that money? You didn't steal it, did you?""\n\n""Oh, no, no"", said the old lady. ""You see, my back yard is right next\nto a golf course. A lot of golfers come and pee through a knot hole in\nmy fence, right into my flower garden. It used to really tick me off.\nKills the flowers, you know. Then I thought, 'why not make the best of\nit?' So, now, I stand behind the fence by the knot hole, real quiet,\nwith my hedge clippers. Every time some guy sticks his thing through my fence, I surprise him, grab hold of it and say, 'O.K., buddy! Give me $20 or off it comes!'""\n\n""Well, that seems only fair,"" said the cop, laughing .""OK. Good luck! Oh, by the way, what's in the other bag?"" he asked.\n\nShe replied, ""Not everybody pays.""",140.0
1,What do Mathematicians use to fry their food? Euler butter.,0.0
2,So one man walks into a bar and the other one ducks.,0.0
3,What's the difference between a tire and 365 used condoms? Ones a goodyear and the others a great year.,12.0
4,What's the difference between a government bond and a man? *The bond matures.*,0.0


In [4]:
num_words = [len(row['joke'].split()) for _, row in df.iterrows()]                
max_chars_per_joke = max([len(joke) for joke in df['joke']])
num_jokes = df['joke'].count()

print("number of samples: ", num_jokes)
print("median words: ", np.median(num_words))
print("average words: ", np.average(num_words))
print("chars in longest joke: ", max_chars_per_joke)

number of samples:  198326
median words:  18.0
average words:  47.74331151740064
chars in longest joke:  39743


In [6]:
printable = set(string.printable)
data = ''
joke_list = []
for joke in df['joke']:
    filtered_joke = filter(lambda x: x in printable, joke)
    temp_joke = ""
    for c in filtered_joke:
        data += c
        temp_joke += c
    joke_list.append(temp_joke)
        
vocab = sorted(set(data))
vocab_size = len(vocab)

# Creating a mapping from unique characters to indices
char2idx = {c:i for i, c in enumerate(vocab)}
idx2char = {i:c for i, c in enumerate(vocab)}

In [7]:
#set of characters. change range to see n unique chars
print('{')
for char, _ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\t':   0,
  '\n':   1,
  '\r':   2,
  ' ' :   3,
  '!' :   4,
  '"' :   5,
  '#' :   6,
  '$' :   7,
  '%' :   8,
  '&' :   9,
  "'" :  10,
  '(' :  11,
  ')' :  12,
  '*' :  13,
  '+' :  14,
  ',' :  15,
  '-' :  16,
  '.' :  17,
  '/' :  18,
  '0' :  19,
  ...
}


In [8]:
maxlen = 100
step = 1
seqs = []
next_chars = []
for i in range(0, (len(data)//50) - maxlen, step): #only 1 million for now
    seqs.append(data[i: i + maxlen])
    next_chars.append(data[i + maxlen])
print('nb sequences:', len(seqs))

nb sequences: 1016755


In [10]:
seqs[1]

'iving next to a golf course A little old lady was walking down the street dragging two large plastic'

In [11]:
seqs_int = []
for seq in seqs[:1000000]: 
    temp = []
    for c in seq:
        temp.append(char2idx[c])
    seqs_int.append(temp)

In [12]:
seqs_int[0]

[47,
 76,
 89,
 76,
 81,
 74,
 3,
 81,
 72,
 91,
 87,
 3,
 87,
 82,
 3,
 68,
 3,
 74,
 82,
 79,
 73,
 3,
 70,
 82,
 88,
 85,
 86,
 72,
 3,
 36,
 3,
 79,
 76,
 87,
 87,
 79,
 72,
 3,
 82,
 79,
 71,
 3,
 79,
 68,
 71,
 92,
 3,
 90,
 68,
 86,
 3,
 90,
 68,
 79,
 78,
 76,
 81,
 74,
 3,
 71,
 82,
 90,
 81,
 3,
 87,
 75,
 72,
 3,
 86,
 87,
 85,
 72,
 72,
 87,
 3,
 71,
 85,
 68,
 74,
 74,
 76,
 81,
 74,
 3,
 87,
 90,
 82,
 3,
 79,
 68,
 85,
 74,
 72,
 3,
 83,
 79,
 68,
 86,
 87,
 76]

In [8]:
print('Vectorization...')
#for every sequence, length of maxlen, length of vocab
#boolean matrix where True if char appears in sequence, false otherwise
x = np.zeros((len(seqs), maxlen, len(vocab)), dtype=np.bool)
#boolean matrix, label corresponding to appropriate x input
y = np.zeros((len(seqs), len(vocab)), dtype=np.bool)
for i, seq in enumerate(seqs):
    for t, char in enumerate(seq):
        x[i, t, char2idx[char]] = 1
    y[i, char2idx[next_chars[i]]] = 1

Vectorization...


In [9]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(vocab))))
model.add(Dense(len(vocab), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...
Instructions for updating:
Colocations handled automatically by placer.


In [None]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(data) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = data[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(vocab)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char2idx[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = idx2char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [None]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])