In [1]:
import pandas as pd
import numpy as np
import json
import string
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import re
import time

from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras import optimizers

from keras.backend import clear_session
from keras.models import model_from_json
from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard
from keras import backend as K

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
K.set_session(session)

Using TensorFlow backend.


In [2]:
with open('jokes/joke-dataset/stupidstuff.json') as json_data:
    ss_df = pd.DataFrame(json.load(json_data,))

with open('jokes/joke-dataset/reddit_jokes.json') as json_data:
    reddit_df = pd.DataFrame(json.load(json_data,))

with open('jokes/joke-dataset/wocka.json') as json_data:
    wocka_df = pd.DataFrame(json.load(json_data,))

pd.set_option('display.max_colwidth', -1)
    
ss_df.drop(['category', 'id', 'rating'], axis=1, inplace=True)
ss_df.rename(index=str, columns={'body': 'joke'}, inplace=True)

reddit_df['joke'] = reddit_df['title'] + " " + reddit_df['body']
reddit_df.drop(['body', 'id', 'title', 'score'], axis=1, inplace=True)

wocka_df.drop(['category', 'id', 'title'], axis=1, inplace=True)
wocka_df.rename(index=str, columns={'body': 'joke'}, inplace=True)

In [3]:
df = pd.concat([ss_df, reddit_df, wocka_df])
df = df.sample(frac=1).reset_index(drop=True)

In [4]:
#num_words = [len(row['joke'].split()) for _, row in df.iterrows()]                
#num_jokes = df['joke'].count()

#print("number of samples: ", num_jokes)
#print("median words: ", np.median(num_words))
#print("average words: ", np.average(num_words))

In [5]:
max_words_per_joke = 75

data = [re.split("\s", joke) for joke in df['joke']]
data = [item for item in data if len(item) < max_words_per_joke]

In [29]:
TOP_K = 5000

data = data[:50000]

tokenizer = Tokenizer(num_words=TOP_K, oov_token="<unkown>")
tokenizer.fit_on_texts(data)

unflattened_sequences = [tokenizer.texts_to_sequences(x) for x in data]

sequences = []
for sequence in unflattened_sequences:
    flattened = [num for sublist in sequence for num in sublist]
    sequences.append(flattened)

In [30]:
maxlen = max([len(sequence) for sequence in sequences])
vocab_size = len(set([elem for item in sequences for elem in item]))
embedding_dim = 128
rnn_units = 32
print("Max sequence length: ", maxlen)
print("Vocab size: ", vocab_size)
print("rnn units: ", rnn_units)
print("embedding dimension: ", embedding_dim)

sequences = np.array(sequences)
padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding="pre")

X, y = padded_sequences[:,:-1], padded_sequences[:,-1]

Max sequence length:  71
Vocab size:  2934
rnn units:  32
embedding dimension:  128


In [31]:
def build_model(vocab_size, embedding_dim, rnn_units, activation):
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=maxlen-1),
        LSTM(rnn_units),
        Dense(vocab_size, activation=activation)
        ])
    return model

In [32]:
#try relu activation
model = build_model(vocab_size, embedding_dim, rnn_units, "relu") 
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 70, 128)           375552    
_________________________________________________________________
lstm_3 (LSTM)                (None, 32)                20608     
_________________________________________________________________
dense_3 (Dense)              (None, 2934)              96822     
Total params: 492,982
Trainable params: 492,982
Non-trainable params: 0
_________________________________________________________________
None


In [33]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
#learning_rate = 0.001
#clip_norm = 1.0
#clip_value = 0.5
#epsilon = 1e-04

#adam = optimizers.Adam(lr=learning_rate, clipnorm=clip_norm, clipvalue=clip_value, epsilon=epsilon)
#model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [35]:
#model.save_weights("weights/word-level-lstm-v1/weights.h5")

In [36]:
fpath = "weights/word-level-lstm-v1/weights.hdf5"
checkpoint = ModelCheckpoint(fpath, monitor='val_acc', verbose=2, save_best_only=True, mode='max')
tb = TensorBoard(log_dir="tensorboard-logs/{}".format("lstm v1"))
callback_lst = [checkpoint, tb]

In [37]:
BATCH_SIZE = 64
EPOCHS = 10

model.fit(X, y, batch_size=BATCH_SIZE, validation_split=.2, epochs=EPOCHS, callbacks=callback_lst, verbose=1)

Train on 800 samples, validate on 200 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.00000, saving model to weights/word-level-lstm-v1/weights.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.00000
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.00000
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.00000
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.00000
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.00000
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.00000
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.00000
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.00000
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.00000


<keras.callbacks.History at 0x7f85d4692c88>

In [23]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=1)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [28]:
print (generate_text("I was shocked", 5, model, maxlen))

I Was Shocked     
