In [2]:
import pandas as pd
import numpy as np
import json
import string
import tensorflow as tf
import re
import time

from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

from keras.backend import clear_session
from keras.models import load_model
from keras.callbacks import ModelCheckpoint
from keras.callbacks import TensorBoard
from keras import backend as K

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
K.set_session(session)

Using TensorFlow backend.


In [3]:
with open('shortjokes.csv') as file:
    df = pd.read_csv(file)
    
pd.set_option('display.max_colwidth', -1)
df.rename(index=str, columns={'Joke': 'joke'}, inplace=True)
df = df.sample(frac=1).reset_index(drop=True)

In [4]:
def clean_text(text):
    text = "".join(c for c in text if c not in string.punctuation).lower()
    text = text.encode("utf8").decode("ascii", 'ignore')
    return text 

jokes_to_use = df['joke'].count()

data = [clean_text(x) for x in df['joke'][:jokes_to_use]]

In [4]:
TOP_K = 10000

tokenizer = Tokenizer(num_words=TOP_K)
# tokenization
tokenizer.fit_on_texts(data)
total_words = len(tokenizer.word_index) + 1

def get_sequence_of_tokens(data):
    # convert data to sequence of tokens 
    input_sequences = []
    for line in data:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences

input_sequences = get_sequence_of_tokens(data)

In [5]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    X, y = input_sequences[:,:-1], input_sequences[:,-1]
    #commented out to use sparse, which is more memory efficient
    #y = ku.to_categorical(y, num_classes=total_words)
    return X, y, max_sequence_len

X, y, max_sequence_len = generate_padded_sequences(input_sequences)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [6]:
def create_model(max_sequence_len, total_words, embedding_dim, rnn_units, activation):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(rnn_units))
    #model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation=activation))
    
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #categorical_crossentropy
    
    return model

In [7]:
embedding_dim = 10
rnn_units = 100
activation = 'softmax'

model = create_model(max_sequence_len, total_words, embedding_dim, rnn_units, activation)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 92, 10)            760910    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_1 (Dense)              (None, 76091)             7685191   
Total params: 8,490,501
Trainable params: 8,490,501
Non-trainable params: 0
_________________________________________________________________


In [None]:
fpath = "weights/model-v2.hdf5"
checkpoint = ModelCheckpoint(fpath, monitor='loss', verbose=2, save_best_only=True, mode='min')
#checkpoint = ModelCheckpoint(fpath, monitor='val_loss', verbose=2, mode='min')
tb = TensorBoard(log_dir="../mville-insights/tensorboard-logs/{}".format("word level lstm v2"))
callback_lst = [checkpoint, tb]

In [None]:
EPOCHS = 5
BATCH_SIZE = 50

model.fit(X, y,
          epochs=EPOCHS,
          batch_size=BATCH_SIZE,
          verbose=1,
          validation_split=0.05,
          callbacks=callback_lst)

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text.title()

In [None]:
model = load_model("weights/model.hdf5")

In [None]:
print(generate_text("Breakups are worst", 5, model, max_sequence_len))