In [2]:
from __future__ import print_function
import numpy as np
import tensorflow as tf

import time
import os
from six.moves import cPickle
import codecs
import collections

import argparse
import csv
import re

In [3]:
data_dir = 'data'# data directory containing scripts.csv
input_encoding = None 
log_dir = 'logs'# directory containing tensorboard logs
save_dir = 'save' # directory to store checkpointed models

### Load the script data
Parse all the scripts to a dictionary based on the character value 

In [4]:
input_file = os.path.join(data_dir, "scripts.csv")
vocab_file = os.path.join(data_dir, "vocab.pkl")

In [5]:
dialogue_dict = {}
vocab_data = []

input_data = csv.DictReader(open(input_file))
for row in input_data:
    if row["Character"] in dialogue_dict:
        dialogue_dict[row["Character"]].append(row["Dialogue"].lower())
    else:
        dialogue_dict[row["Character"]] = [row["Dialogue"].lower()]
    
    vocab_data.extend(row["Dialogue"].lower().split())


### Build vocabulary 

In [6]:
# count the number of words
word_counts = collections.Counter(vocab_data)

# Mapping from index to word : that's the vocabulary
vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary_inv = list(sorted(vocabulary_inv))

# Mapping from word to index
vocab = {x: i for i, x in enumerate(vocabulary_inv)}
words = [x[0] for x in word_counts.most_common()]

vocab_size = len(words)
print(f"Vocabulary Size: {vocab_size}")

#Save the vovabulary file
with open(vocab_file, 'wb') as f:
    cPickle.dump((words, vocab, vocabulary_inv), f)

Vocabulary Size: 45050


### Training Data 
Create training data (x and y) for each character based on the vocabulary

In [7]:
JERRY_data = []
for i in dialogue_dict["JERRY"]:
    JERRY_data.extend(i.split())
    
#create sequences
sequences_step = 1
seq_length = 20
sequences = []
next_words = []
for i in range(0, len(JERRY_data) - seq_length, sequences_step):
    sequences.append(JERRY_data[i: i + seq_length])
    next_words.append(JERRY_data[i + seq_length])

print('nb sequences:', len(sequences))

nb sequences: 147369


In [8]:
def batch_generator(X_train, Y_train, epochs, batch_size):
    for _ in range(epochs):
        cursor = 0
        while cursor + batch_size < len(X_train):
            x_batch = X_train[cursor:cursor+batch_size]
            y_batch = Y_train[cursor:cursor+batch_size]
            
            x = np.zeros((len(x_batch), seq_length, vocab_size), dtype=np.bool)
            y = np.zeros((len(y_batch), vocab_size), dtype=np.bool)
            for i, sentence in enumerate(x_batch):
                for j, word in enumerate(sentence):
                    x[i, j, vocab[word]] = 1
                y[i, vocab[y_batch[i]]] = 1
            yield x, y
            cursor += batch_size
            

### Define the LSTM model 

In [9]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.metrics import categorical_accuracy
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

def bidirectional_lstm_model(seq_length, vocab_size):
    print('Build LSTM model.')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vocab_size)))
    model.add(Dropout(0.6))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='loss')]
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    return model

Using TensorFlow backend.


In [10]:
rnn_size = 256 # size of RNN
learning_rate = 0.001 #learning rate

model = bidirectional_lstm_model(seq_length, vocab_size)
model.summary()

Build LSTM model.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 512)               92788736  
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 45050)             23110650  
_________________________________________________________________
activation (Activation)      (None, 45050)             0         
Total params: 115,899,386
Trainable params: 115,899,386
Non-trainable params: 0
_________________________________________________________________


### Training the dataset

In [12]:
batch_size = 128 # minibatch size
num_epochs = 50 # number of epochs

callbacks=[EarlyStopping(patience=4, monitor='loss'),
           ModelCheckpoint(filepath=save_dir + "/" + 'my_model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',\
                           monitor='loss', verbose=0, mode='auto', period=2)]
#fit the model
history = model.fit(batch_generator(sequences, next_words, num_epochs, batch_size),
                 batch_size=batch_size,
                 steps_per_epoch = np.floor(len(sequences)/batch_size),
                 epochs=num_epochs,
                 callbacks=callbacks)

#save the model
md.save(save_dir + "/" + 'my_model_generate_sentences.h5')

Epoch 1/50
Epoch 2/50


KeyError: 'val_loss'