In [1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
import time
import os
from six.moves import cPickle
import codecs
import collections
import argparse
import csv
import re

In [2]:
# Output location: checkpointed models are stored here.
save_dir = 'save'
# Input location: the directory that contains scripts.csv.
data_dir = 'data'

### Load the script data
Parse all the scripts into a dictionary keyed by character.

In [3]:
# Pickled vocabulary (written later) and the raw script table, both under data_dir.
vocab_file = os.path.join(data_dir, "vocab.pkl")
input_file = os.path.join(data_dir, "scripts.csv")

In [4]:
# Parse the csv file into a dictionary keyed by the lower-cased character name;
# each value is that character's list of "name: dialogue\n" lines in file order.
dialogue_dict = {}

# The context manager closes the csv file as soon as parsing finishes.
with open(input_file) as csv_file:
    input_data = csv.DictReader(csv_file)
    for row in input_data:
        speaker = row["Character"].lower()
        dialogue = speaker + ": " + row["Dialogue"].lower() + "\n"
        # setdefault creates the list on a speaker's first line, then appends.
        dialogue_dict.setdefault(speaker, []).append(dialogue)

In [5]:
# Rank the characters by number of spoken lines and show the top six.
dialogue_freq = {name: len(lines) for name, lines in dialogue_dict.items()}

highest_freq = collections.Counter(dialogue_freq).most_common(6)
highest_freq

[('jerry', 14786),
 ('george', 9708),
 ('elaine', 7984),
 ('kramer', 6664),
 ('newman', 641),
 ('morty', 505)]

There are 4 major characters in the TV series; the other characters have far fewer lines than they do.

### Build vocabulary 
Build the vocabulary (the word_to_index mapping) from all the words that appear in the scripts.csv file, and save it to the vocab.pkl file.

In [5]:
# The vocabulary is built from all the tokens that occur in scripts.csv

# Replace punctuation with placeholder tokens, split the text into a list of tokens,
# and append <PAD> to the end of each line of dialogue
def tokenizer(text):
    """Split `text` into tokens, turning punctuation into placeholder tokens.

    Each listed punctuation mark (and the newline) is replaced by its
    space-padded ``||Name||`` placeholder, the result is split on whitespace,
    and a single ``<PAD>`` token is appended at the end.

    Args:
        text: the string to tokenize.

    Returns:
        list of str: the tokens, always ending with ``'<PAD>'``.
    """
    punctuation_tokens = {
        '.': '||Period||',
        ',': '||Comma||',
        '"': '||Quotation_Mark||',
        ';': '||Semicolon||',
        '!': '||Exclamation_Mark||',
        '?': '||Question_Mark||',
        '(': '||Left_Parentheses||',
        ')': '||Right_Parentheses||',
        '-': '||Dash||',
        '\n': '||Return||',
    }
    padding = ['<PAD>']

    # No placeholder contains a punctuation key, so one translate pass gives
    # the same text as replacing the keys one after another.
    table = str.maketrans(
        {symbol: ' {} '.format(tag) for symbol, tag in punctuation_tokens.items()}
    )
    return text.translate(table).split() + padding

In [6]:
# Flatten the tokens of every dialogue line, character by character, into one list.
vocabulary = []

for character, dialogue_lst in dialogue_dict.items():
    vocabulary.extend(
        token for dialogue in dialogue_lst for token in tokenizer(dialogue)
    )

In [10]:
# Tally how often each token occurs, most frequent first.
word_counts = collections.Counter(vocabulary).most_common()

# The distinct tokens in two orderings: by frequency and alphabetically.
words = [entry[0] for entry in word_counts]
words_lst = sorted(words)

# Lookup tables between tokens and their alphabetical positions.
index_to_word = dict(enumerate(words_lst))
word_to_index = {word: position for position, word in index_to_word.items()}

vocab_size = len(words)
print(f"Vocabulary Size: {vocab_size}")

# Save the vocabulary file: the counts plus both lookup tables.
with open(vocab_file, 'wb') as f:
    cPickle.dump((word_counts, word_to_index, index_to_word), f)

Vocabulary Size: 21397


### Prepare Training Data for individual character
Create training data (x and y) for each character based on the vocabulary

In [9]:
## Write the dialogue of one specific character to its own script file
def get_script(character):
    """Write every line of `character` to ``<data_dir>/<character>_script.txt``.

    Args:
        character: key into the module-level `dialogue_dict`.

    Raises:
        KeyError: if `character` has no entry in `dialogue_dict`.
    """
    # Look the lines up before opening the file so an unknown character does
    # not leave an empty script file behind.
    lines = dialogue_dict[character]
    path = os.path.join(data_dir, character + "_script.txt")
    # The context manager closes the file even if a write fails.
    with open(path, "w") as file:
        file.writelines(lines)

In [10]:
# Write one script file for each of the four major characters.
main_characters = [
    'jerry',
    'george',
    'elaine',
    'kramer',
]
for c in main_characters:
    get_script(c)

In [9]:
def load_data(data_path):
    """Read a script file and return its contents as one flat list of tokens.

    Each line of the file is passed through `tokenizer` and the resulting
    tokens are concatenated in file order.

    Args:
        data_path: path of the text file to read.

    Returns:
        list of str: all tokens of the file.
    """
    training_data = []

    # The context manager closes the file once all lines have been tokenized.
    with open(data_path, "r") as file:
        for line in file:
            training_data.extend(tokenizer(line))

    return training_data

In [10]:
def prepare_training_data(data, seq_length, sequences_step=1):
    """Cut a token list into (input window, next token) training pairs.

    Args:
        data: sequence of tokens.
        seq_length: number of tokens in each input window.
        sequences_step: distance between the starts of consecutive windows.
            Defaults to 1, i.e. every possible window is used.

    Returns:
        (X_train, Y_train): X_train[k] is a slice of `seq_length` tokens and
        Y_train[k] is the token that immediately follows it. Both lists are
        empty when `data` has no more than `seq_length` tokens.
    """
    window_starts = range(0, len(data) - seq_length, sequences_step)
    X_train = [data[start: start + seq_length] for start in window_starts]
    Y_train = [data[start + seq_length] for start in window_starts]

    print('Total sequences:', len(X_train))
    return X_train, Y_train
    

In [11]:
def batch_generator(X_train, Y_train, epochs, batch_size):
    """Yield one-hot encoded (x, y) mini-batches for `epochs` passes over the data.

    Reads the module-level `seq_length`, `vocab_size` and `word_to_index`.

    Args:
        X_train: list of token sequences.
        Y_train: list of target tokens, aligned with `X_train`.
        epochs: number of passes over the data.
        batch_size: number of samples per yielded batch; must be positive.

    Yields:
        (x, y): boolean arrays of shape (batch_size, seq_length, vocab_size)
        and (batch_size, vocab_size). Only full batches are produced, so each
        pass yields len(X_train) // batch_size batches.

    Raises:
        ValueError: if `batch_size` is not positive.
    """
    if batch_size <= 0:
        # A non-positive batch size would never advance the cursor.
        raise ValueError("batch_size must be a positive integer")

    for _ in range(epochs):
        cursor = 0
        # `<=` keeps the final full batch when the sample count is an exact
        # multiple of batch_size, so every pass yields
        # len(X_train) // batch_size batches.
        while cursor + batch_size <= len(X_train):
            x_batch = X_train[cursor:cursor+batch_size]
            y_batch = Y_train[cursor:cursor+batch_size]

            # Builtin bool: the `np.bool` alias no longer exists in current NumPy.
            x = np.zeros((len(x_batch), seq_length, vocab_size), dtype=bool)
            y = np.zeros((len(y_batch), vocab_size), dtype=bool)
            for i, sentence in enumerate(x_batch):
                for j, word in enumerate(sentence):
                    x[i, j, word_to_index[word]] = 1
                y[i, word_to_index[y_batch[i]]] = 1
            yield x, y
            cursor += batch_size
            

### Prepare Training data for inter character dialogue

In [9]:
# Build the training data set for the four major characters, keeping the lines
# in the order in which they occur in the dialogue
main_characters = ['jerry', 'george', 'elaine', 'kramer']

def get_dialogue():
    """Write the lines of the main characters to ``main_character_script.txt``.

    Reads `input_file` row by row and copies every row whose lower-cased
    "Character" value is in `main_characters` to the script file under
    `data_dir`, as one "name: dialogue" line, preserving the csv order.
    """
    path = os.path.join(data_dir, "main_character_script.txt")
    # Both files are closed by the context manager, even if a row fails.
    with open(input_file) as csv_file, open(path, "w") as script_file:
        for row in csv.DictReader(csv_file):
            speaker = row["Character"].lower()
            if speaker in main_characters:
                script_file.write(speaker + ": " + row["Dialogue"].lower() + "\n")

In [None]:
# Write the combined script file of the four main characters.
get_dialogue()

### Define the LSTM model 

In [12]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.metrics import categorical_accuracy
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

def bidirectional_lstm_model(seq_length, vocab_size):
    """Build and compile a single-layer bidirectional LSTM classifier.

    Args:
        seq_length: number of time steps in each input sequence.
        vocab_size: size of the last input dimension and of the softmax output.

    Returns:
        A compiled Keras ``Sequential`` model using categorical cross-entropy
        loss, the Adam optimizer and the categorical-accuracy metric.
    """
    rnn_size = 256  # units of the LSTM that the Bidirectional wrapper duplicates
    learning_rate = 0.001

    print('Build LSTM model.')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"), input_shape=(seq_length, vocab_size)))
    model.add(Dropout(0.2))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))

    # Passed positionally: the keyword is `lr` in older Keras releases and
    # `learning_rate` in newer ones, but it is Adam's first parameter under
    # either name.
    optimizer = Adam(learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    return model

Using TensorFlow backend.


### Training Per character dataset
Train a separate model for each of the main characters.

In [13]:
import time
# Training hyper-parameters shared by every run below.
seq_length = 20   # tokens in each input window
num_epochs = 40   # maximum number of epochs
batch_size = 128  # minibatch size

def model_training(data_path, character=None):
    """Train a bidirectional LSTM on one script file; save checkpoints and history.

    Reads the module-level `seq_length`, `vocab_size`, `batch_size`,
    `num_epochs` and `save_dir`.

    Args:
        data_path: path of the script text file to train on.
        character: label used in the checkpoint and history file names.
            Defaults to the base name of `data_path` without its extension
            and without a trailing "_script"
            (e.g. "data/jerry_script.txt" -> "jerry").

    Side effects:
        Creates `save_dir` if it does not exist, writes model checkpoints
        into it, and pickles ``(history.history, training_time)`` to
        ``model_history_<character>_<batch_size>.pkl`` in the same directory.
    """
    if character is None:
        # Derive the label from the file name instead of relying on whatever
        # a module-level `character` variable happens to hold.
        stem = os.path.splitext(os.path.basename(data_path))[0]
        suffix = "_script"
        character = stem[:-len(suffix)] if stem.endswith(suffix) else stem

    # Checkpoints and the history pickle both need the directory to exist.
    os.makedirs(save_dir, exist_ok=True)

    model = bidirectional_lstm_model(seq_length, vocab_size)
    model.summary()

    X_train, Y_train = prepare_training_data(load_data(data_path), seq_length)

    callbacks=[EarlyStopping(patience=2, monitor='loss'),
               ModelCheckpoint(filepath=save_dir + "/" + f'model_lstm_{character}_{batch_size}' + '.{epoch:02d}-{loss:.2f}.hdf5',
                               monitor='loss', verbose=0, mode='auto', period=2)]
    #fit the model
    str_time = time.time()
    history = model.fit(batch_generator(X_train, Y_train, num_epochs, batch_size),
                     batch_size=batch_size,
                     # integer count of full batches per pass over the data
                     steps_per_epoch = len(X_train) // batch_size,
                     epochs=num_epochs,
                     callbacks=callbacks)
    end_time = time.time()
    training_time = end_time - str_time

    #save the model history
    with open(save_dir + "/" + f'model_history_{character}_{batch_size}.pkl', 'wb') as file_pi:
        cPickle.dump((history.history, training_time), file_pi)
    


In [14]:
# Train one model per main character on that character's own script file.
main_characters = ['jerry', 'george', 'elaine', 'kramer']
# The loop variable is named `character` so the script path built below always
# belongs to the character announced in the progress message.
for character in main_characters:
    print(f"Model Training with {character} script")
    path = os.path.join(data_dir, character + "_script.txt")
    model_training(path)

Model Training with george script
Build LSTM model.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 512)               44347392  
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 21397)             10976661  
_________________________________________________________________
activation (Activation)      (None, 21397)             0         
Total params: 55,324,053
Trainable params: 55,324,053
Non-trainable params: 0
_________________________________________________________________
Total sequences: 170682
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 1

### Train inter-character dialogue 

In [None]:
# Name this run explicitly: a `character` value left over from an earlier cell
# must not end up in the output file names of the combined model.
character = "main_character"
path = os.path.join(data_dir, "main_character_script.txt")
model_training(path)