In [3]:
import pandas as pd
import numpy as np
import json
import string
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import os
import time
from nltk.corpus import stopwords

tf.enable_eager_execution()

In [4]:
with open('jokes/joke-dataset/stupidstuff.json') as json_data:
    ss_df = pd.DataFrame(json.load(json_data,))

with open('jokes/joke-dataset/reddit_jokes.json') as json_data:
    reddit_df = pd.DataFrame(json.load(json_data,))

with open('jokes/joke-dataset/wocka.json') as json_data:
    wocka_df = pd.DataFrame(json.load(json_data,))

pd.set_option('display.max_colwidth', -1)
    
ss_df.drop(['category', 'id'], axis=1, inplace=True)
ss_df.rename(index=str, columns={'body': 'joke', 'rating': 'score'}, inplace=True)

reddit_df['joke'] = reddit_df['title'] + " " + reddit_df['body']
reddit_df.drop(['body', 'id', 'title'], axis=1, inplace=True)

wocka_df.drop(['category', 'id', 'title'], axis=1, inplace=True)
wocka_df.rename(index=str, columns={'body': 'joke'}, inplace=True)

In [5]:
reddit_df.loc[reddit_df['score'] == 0, ['score']] = 1
reddit_df.loc[(reddit_df['score'] > 0) & (reddit_df['score'] < 3), ['score']] = 2
reddit_df.loc[(reddit_df['score'] >= 3) & (reddit_df['score'] < 16), ['score']] = 3
reddit_df.loc[(reddit_df['score'] >= 16) & (reddit_df['score'] < 50), ['score']] = 4
reddit_df.loc[(reddit_df['score'] >= 50) & (reddit_df['score'] < 50000), ['score']] = 5

df = pd.concat([ss_df, reddit_df])
df.round({'score': 0})
df['score'] =  df['score'].apply(np.int64)
df = df.sample(frac=1).reset_index(drop=True)
#df['joke'].str.lower()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [None]:
num_words = [len(row['joke'].split()) for _, row in df.iterrows()]
print("median words: ", np.median(num_words))
print("average words: ", np.average(num_words))

In [6]:
printable = set(string.printable)
data = ''
for joke in df['joke']:
    filtered_joke = filter(lambda x: x in printable, joke)
    for s in filtered_joke:
        data += s
vocab = sorted(set(data))
data_size, vocab_size = len(data), len(vocab)
#print('total chars:', len(chars))

# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in data])

In [9]:
text_as_int.shape

(50842779,)

In [6]:
#set of characters. change range to see n unique chars
print('{')
for char,_ in zip(char2idx, range(50)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\t':   0,
  '\n':   1,
  '\r':   2,
  ' ' :   3,
  '!' :   4,
  '"' :   5,
  '#' :   6,
  '$' :   7,
  '%' :   8,
  '&' :   9,
  "'" :  10,
  '(' :  11,
  ')' :  12,
  '*' :  13,
  '+' :  14,
  ',' :  15,
  '-' :  16,
  '.' :  17,
  '/' :  18,
  '0' :  19,
  '1' :  20,
  '2' :  21,
  '3' :  22,
  '4' :  23,
  '5' :  24,
  '6' :  25,
  '7' :  26,
  '8' :  27,
  '9' :  28,
  ':' :  29,
  ';' :  30,
  '<' :  31,
  '=' :  32,
  '>' :  33,
  '?' :  34,
  '@' :  35,
  'A' :  36,
  'B' :  37,
  'C' :  38,
  'D' :  39,
  'E' :  40,
  'F' :  41,
  'G' :  42,
  'H' :  43,
  'I' :  44,
  'J' :  45,
  'K' :  46,
  'L' :  47,
  'M' :  48,
  'N' :  49,
  ...
}


In [10]:
# The maximum length sentence we want for a single input in characters
seq_length = 200
examples_per_epoch = len(data)//seq_length

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

#for i in char_dataset.take(5):
#    print(idx2char[i.numpy()])

#convert individual characters to sequences
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

#for item in sequences.take(5):
#    print(repr(''.join(idx2char[item.numpy()])))

In [11]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

In [12]:
#print some examples
for input_example, target_example in dataset.take(1):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Instructions for updating:
Colocations handled automatically by placer.
Input data:  'Why did the melons have a big wedding? Well, they cantaloupeWhat feels good in the west but feels co'
Target data: 'hy did the melons have a big wedding? Well, they cantaloupeWhat feels good in the west but feels cot'


In [13]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 58 ('W')
  expected output: 75 ('h')
Step    1
  input: 75 ('h')
  expected output: 92 ('y')
Step    2
  input: 92 ('y')
  expected output: 3 (' ')
Step    3
  input: 3 (' ')
  expected output: 71 ('d')
Step    4
  input: 71 ('d')
  expected output: 76 ('i')


In [14]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

#dataset

In [15]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units,
                             return_sequences=True,
                             stateful=True,
                             recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
        ])
    return model

In [64]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 256

model = build_model(
    vocab_size = vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)



In [65]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 98) # (batch_size, sequence_length, vocab_size)


In [66]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (64, None, 64)            6272      
_________________________________________________________________
lstm_6 (LSTM)                (64, None, 64)            33024     
_________________________________________________________________
dense_6 (Dense)              (64, None, 98)            6370      
Total params: 45,666
Trainable params: 45,666
Non-trainable params: 0
_________________________________________________________________


In [67]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 '" said the cop. \nThe priests look at each other, nod then the driver says to the cop "alright, we\'ll'

Next Char Predictions: 
 "s1).NT\nXk.E>dMS$$l0x?N3W\rfaQBYICxb.@ehCaa(3S$zyQW&RWs\rBN^da2< (t@E&L#=2%U8ddRAKZ$=Y-'\t]f,8\n\tEcm#d.&_"


In [68]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 98)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.586261


In [69]:
model.compile(optimizer='adam', loss=loss)

In [70]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [71]:
EPOCHS=2
history = model.fit(dataset, epochs=EPOCHS, steps_per_epoch=examples_per_epoch, callbacks=[checkpoint_callback])

Epoch 1/2

Consider using a TensorFlow optimizer from `tf.train`.
Instructions for updating:
Use tf.train.CheckpointManager to manage checkpoints rather than manually editing the Checkpoint proto.


In [88]:
def generate_text(model, start_string):
    # Evaluation step (generating text using the learned model)

    # Number of characters to generate
    num_generate = 1000

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    print(tf.shape(input_eval))
    input_eval = tf.expand_dims(input_eval, 0)
    print(tf.shape(input_eval))

    # Empty string to store our results
    text_generated = []

    # Low temperatures results in more predictable text.
    # Higher temperatures results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1.0

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to predict the word returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted word as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [89]:
print(generate_text(model, start_string=u"                                                                "))

tf.Tensor([64], shape=(1,), dtype=int32)
tf.Tensor([ 1 64], shape=(2,), dtype=int32)


ValueError: Tensor's shape (64, 64, 64) is not compatible with supplied shape [Dimension(64), Dimension(1), 64]