In [1]:
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, Dropout, Reshape, LSTM, TimeDistributed
from keras.optimizers import Adam
from keras import callbacks
from keras.utils import to_categorical
from keras.models import load_model
import json
from keras.callbacks import *

Using TensorFlow backend.


Before start go to **Edit->Notebook setting->Hardware accelerator** and set GPU

In [2]:
def generator(features, labels, batch_size, symbols):
    # Create empty arrays to contain batch of features and labels#
    idx = np.arange(len(features))
    np.random.shuffle(idx)
    offset = 0
    while True:
        if offset + batch_size > len(idx):
            np.random.shuffle(idx)
            offset = 0

        # choose random index in features
        index = idx[offset: offset + batch_size]
        batch_features = to_categorical(
            features[index], num_classes=symbols)
        batch_labels = to_categorical(
            labels[index], num_classes=symbols)
        offset += batch_size
        yield batch_features, batch_labels

Here we load train data from git

In [6]:
!git clone -l -s git://github.com/taivop/joke-dataset jokes_repo
with open('jokes_repo/reddit_jokes.json', 'r') as f:
        data = json.loads(f.read())
        print('\n\nSuccesfull opend')

fatal: destination path 'jokes_repo' already exists and is not an empty directory.


Succesfull opend


In [7]:
jokes = []
for j in data:
  content = j['title'] + ' ' + j['body']
  content = ''.join(map(lambda c: c if ord(c) < 128 else ' ', content))
  content = content.replace('\n', ' ')
  content = content.strip()
  content = ' '.join(content.split(' '))
  if len(content) > 200:
    continue
  jokes.append(content)
print('jokes: ', len(jokes))

jokes:  147811


Observing joke format

In [8]:
jokes[0]

'I hate how you cant even say black paint anymore Now I have to say "Leroy can you please paint the fence?"'

In [9]:
everything = sorted(set(''.join(jokes)))
everything = filter(lambda c: ord(c) < 128, everything)
char_map = {}
reverse_char_map = {}
for i, c in enumerate(everything):
  char_map[c] = i + 2
  reverse_char_map[i + 2] = c
symbols = len(char_map) + 2
print('symbols: ', symbols)

symbols:  102


In [10]:
features = list(
    map(lambda j: np.array([1] + list(map(lambda c: char_map.get(c, char_map[' ']), j)) + [0], dtype='int8'),
        jokes))

largest = max(map(len, features))
x = np.zeros([len(features), largest], dtype='int8')
y = np.zeros([len(features), largest], dtype='int8')

for i, f in enumerate(features):
  x[i, :len(f)] = f
  y[i, :len(f) - 1] = f[1:]

idx = np.arange(len(x))
np.random.shuffle(idx)

validation_split = 0.2
val_samples = int(validation_split * len(x))

val_idx = idx[:val_samples]
val_x = to_categorical(x[val_idx], num_classes=symbols)
val_y = to_categorical(y[val_idx], num_classes=symbols)

train_idx = idx[val_samples:]
x = x[train_idx]
y = y[train_idx]

batch_size = 64

[Here](https://medium.com/@mukesh.kumar43585/model-checkpoint-google-colab-and-drive-as-persistent-storage-for-long-training-runs-e35ffa0c33d9)  I read how to save model in Colab

In [16]:
from google.colab import drive
drive.mount("/content/gdrive")

ModuleNotFoundError: No module named 'google.colab'

In [17]:
filepath="/content/gdrive/My Drive/BJokesNN/epochs:{epoch:03d}-val_acc:{categorical_accuracy:.3f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='categorical_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

Let us go use free GPU

In [None]:
model = Sequential()
model.add(LSTM(1024, return_sequences=True, input_shape=[largest, symbols]))
model.add(Dropout(0.2))
model.add(LSTM(1024, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(1024, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(1024, return_sequences=True))
model.add(Dropout(0.2))
model.add(TimeDistributed(Dense(symbols, activation='softmax')))
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['categorical_accuracy'])
print(model.summary())

model.fit_generator(
    generator(x, y, batch_size, symbols), steps_per_epoch=len(x) // batch_size,
    epochs=20, shuffle=False, validation_data=(val_x  , val_y),
    callbacks=callbacks_list, verbose=1
    )
print("Whoho done")

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 202, 1024)         4616192   
_________________________________________________________________
dropout_1 (Dropout)          (None, 202, 1024)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 202, 1024)         8392704   
_________________________________________________________________
dropout_2 (Dropout)          (None, 202, 1024)         0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 202, 1024)         8392704   
_________________________________________________________________
dropout_3 (Dropout)          (None, 202, 1024)         0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 202, 1024)        