In [1]:
import pandas as pd
import numpy as np
import json
import string
import tensorflow as tf
import re
import time

from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

from keras.backend import clear_session
from keras.models import load_model
from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard
from keras import backend as K

# Let TensorFlow grow its GPU memory allocation on demand rather than
# reserving it all up front, then make Keras use that session.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
session = tf.Session(config=config)
K.set_session(session)

Using TensorFlow backend.


In [2]:
# Load the jokes dataset from the working directory.
with open('shortjokes.csv') as file:
    df = pd.read_csv(file)

# Show full joke text instead of truncating long cells.
# NOTE(review): pandas >= 1.0 deprecates -1 here in favour of None; switch once
# the environment's pandas version is confirmed.
pd.set_option('display.max_colwidth', -1)

# Rename without mutating in place. The index is rebuilt by reset_index below,
# so it does not need converting here.
df = df.rename(columns={'Joke': 'joke'})

# Shuffle the rows with a fixed seed so a fresh "Restart & Run All" reproduces
# the same ordering (and therefore the same downstream samples).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [3]:
def clean_text(text):
    """Normalise one joke: strip ASCII punctuation, lowercase, drop non-ASCII.

    The steps run in this order on purpose: lowercasing happens before the
    ASCII filter, so characters whose lowercase form contains ASCII letters
    keep those letters.

    Args:
        text: The raw joke string.

    Returns:
        The cleaned, lowercase, ASCII-only string.
    """
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    lowered = text.translate(punctuation_stripper).lower()
    # Round-trip through bytes; anything outside ASCII is silently discarded.
    ascii_only = lowered.encode("utf8").decode("ascii", 'ignore')
    return ascii_only

# Drop missing jokes first. Series.count() ignores NaN, so using it as a
# positional slice bound on the unfiltered column could both include NaN rows
# (which clean_text cannot iterate) and cut off valid rows at the end.
jokes = df['joke'].dropna()

# Number of jokes to train on; lower this for a quicker experiment.
jokes_to_use = len(jokes)

data = [clean_text(x) for x in jokes.iloc[:jokes_to_use]]

In [4]:
# Vocabulary cap: the tokenizer only emits ids for the most frequent words.
TOP_K = 10000

# tokenization
tokenizer = Tokenizer(num_words=TOP_K)
tokenizer.fit_on_texts(data)

# word_index lists every distinct word in the corpus, but with num_words set
# texts_to_sequences only produces ids below TOP_K (id 0 is reserved for
# padding). Size the model's vocabulary by what can actually appear so the
# embedding and output layers are not inflated with unreachable classes.
total_words = min(TOP_K, len(tokenizer.word_index) + 1)

def get_sequence_of_tokens(data):
    """Encode each text in ``data`` as a list of integer token ids.

    Uses the module-level ``tokenizer``, which must already be fitted.

    Args:
        data: Iterable of cleaned text lines.

    Returns:
        A list with one list of token ids per input line, in input order.
    """
    # One batched call yields the same per-line id lists as encoding each
    # line on its own.
    return tokenizer.texts_to_sequences(data)

# Encode every cleaned joke as a list of integer token ids.
input_sequences = get_sequence_of_tokens(data)

In [9]:
def generate_padded_sequences(input_sequences):
    """Left-pad the token sequences and split them into inputs and labels.

    Each sequence is padded on the left with zeros to the length of the longest
    sequence. The final token of every padded row becomes the label and the
    preceding tokens become the model input. Labels stay as integer ids
    (no one-hot encoding) for use with a sparse categorical loss, which is
    far more memory efficient.

    Sequences with no tokens are skipped: after padding they would be all
    zeros, producing a training sample whose target is the padding id.

    Args:
        input_sequences: List of lists of integer token ids.

    Returns:
        Tuple ``(X, y, max_sequence_len)`` where ``X`` has shape
        ``(n_samples, max_sequence_len - 1)``, ``y`` has shape ``(n_samples,)``
        and ``max_sequence_len`` is the padded length including the label.

    Raises:
        ValueError: If there is no non-empty sequence to build samples from.
    """
    usable_sequences = [seq for seq in input_sequences if len(seq) > 0]
    if not usable_sequences:
        raise ValueError("input_sequences contains no non-empty sequence")

    max_sequence_len = max(len(seq) for seq in usable_sequences)
    # pad_sequences already returns a 2-D numpy array.
    padded = pad_sequences(usable_sequences, maxlen=max_sequence_len, padding='pre')

    X, y = padded[:, :-1], padded[:, -1]
    return X, y, max_sequence_len

# Build the (context, target) arrays from the encoded jokes.
X, y, max_sequence_len = generate_padded_sequences(input_sequences)

# Hold out 20% of the samples. The fixed random_state makes the split itself
# deterministic for a given X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [None]:
def create_model(max_sequence_len, total_words, embedding_dim, rnn_units, activation):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> single LSTM -> Dense over the vocabulary.

    Args:
        max_sequence_len: Padded sequence length including the label token;
            the network itself consumes ``max_sequence_len - 1`` tokens.
        total_words: Vocabulary size, used for both the embedding input
            dimension and the number of output units.
        embedding_dim: Size of each word-embedding vector.
        rnn_units: Number of units in the LSTM layer.
        activation: Activation of the output Dense layer.

    Returns:
        A compiled ``Sequential`` model using sparse categorical cross-entropy
        (integer labels), the Adam optimizer and an accuracy metric.
    """
    context_len = max_sequence_len - 1

    layers = [
        # Token ids -> dense vectors
        Embedding(total_words, embedding_dim, input_length=context_len),
        # Sequence encoder
        LSTM(rnn_units),
        # One output unit per vocabulary entry
        Dense(total_words, activation=activation),
    ]
    model = Sequential(layers)

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Model hyperparameters, passed straight through to create_model.
# embedding_dim: size of each word-embedding vector.
embedding_dim = 10
# rnn_units: number of units in the LSTM layer.
rnn_units = 100
# activation: activation of the final Dense (output) layer.
activation = 'softmax'

# Build the network and print its layer-by-layer summary.
model = create_model(max_sequence_len, total_words, embedding_dim, rnn_units, activation)
model.summary()

In [None]:
# Checkpointing: write the model to `fpath` each time the training loss
# reaches a new minimum.
fpath = "weights/model-v5.hdf5"
checkpoint = ModelCheckpoint(
    filepath=fpath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=2,
)

# TensorBoard logging, one sub-directory per run name.
run_name = "word level lstm v5"
tb = TensorBoard(log_dir="../mville-insights/tensorboard-logs/{}".format(run_name))

callback_lst = [checkpoint, tb]

In [None]:
EPOCHS = 5

# Train on the training split only. Fitting on the full X, y would feed the
# held-out X_test / y_test samples to the model and make any later evaluation
# on them meaningless. Keras additionally carves the last 5% of the training
# split off as a per-epoch validation set. Batch size is left at the Keras
# default.
history = model.fit(X_train, y_train,
                    epochs=EPOCHS,
                    verbose=1,
                    validation_split=0.05,
                    callbacks=callback_lst)

In [7]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    At each step the current text is encoded with the module-level
    ``tokenizer``, left-padded to the model's input length, and the
    highest-probability word id is appended as text.

    Args:
        seed_text: Text to start generating from.
        next_words: Maximum number of words to append.
        model: Trained Keras model that maps a padded id sequence to a
            probability distribution over word ids.
        max_sequence_len: Padded training sequence length including the label;
            the model input length is ``max_sequence_len - 1``.

    Returns:
        The seed plus generated words, title-cased.
    """
    # Invert the vocabulary once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # argmax over the predicted distribution is what predict_classes
        # returned; predict_classes itself is gone from newer Keras releases.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        output_word = index_to_word.get(predicted)
        if output_word is None:
            # Padding id (0) or an id with no word: nothing can be appended,
            # so the input would be unchanged and every later step would
            # repeat the same prediction. Stop instead of adding blank words.
            break
        seed_text += " " + output_word
    return seed_text.title()

In [13]:
# Replace the in-memory model with one previously saved to disk.
# NOTE(review): this loads model-v2, while the checkpoint callback in this
# notebook writes weights/model-v5.hdf5 - confirm v2 is the intended file and
# that it was trained with the same tokenizer and sequence length.
model = load_model("weights/model-v2.hdf5")

In [18]:
# Extend the seed "The" by five predicted words and print the result.
print(generate_text("The", 5, model, max_sequence_len))

The It It It It It
