In [48]:
import os
import pickle
import numpy as np
import pandas as pd
from collections import Counter

### Data Preprocessing

In [49]:
def load_data(path):
    """
    Read a text file and hand back its full contents.

    :param path: Location of the file to read
    :return: The complete file contents as a single string
    """
    source = os.path.join(path)
    with open(source, "r") as handle:
        return handle.read()

def preprocess_and_save_data(dataset_path, token_lookup, create_lookup_tables):
    """
    Tokenize a text dataset, map every word to an integer id and pickle the
    result to 'preprocess.p' in the current working directory.

    Relies on the module-level SPECIAL_WORDS dict, which must be defined
    before this function is called.

    :param dataset_path: Path of the raw text file (read with load_data)
    :param token_lookup: Callable returning a dict {punctuation: token}
    :param create_lookup_tables: Callable taking a list of words and returning
        a tuple of dicts (vocab_to_int, int_to_vocab)
    :return: None
    """
    text = load_data(dataset_path)

    # Ignore notice, since we don't use it for analysing the data
    # (the first 81 characters of the file are dropped)
    text = text[81:]

    # Surround each punctuation token with spaces so split() isolates it
    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    # Lowercasing happens after the replacement, so the tokens are lowercased too
    text = text.lower()
    text = text.split()

    vocab_to_int, int_to_vocab = create_lookup_tables(text + list(SPECIAL_WORDS.values()))
    int_text = [vocab_to_int[word] for word in text]

    # Context manager guarantees the file is flushed and closed
    with open('preprocess.p', 'wb') as out_file:
        pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), out_file)

def load_preprocess():
    """
    Load the preprocessed data written by preprocess_and_save_data.

    :return: The object stored in 'preprocess.p'; preprocess_and_save_data
        writes the tuple (int_text, vocab_to_int, int_to_vocab, token_dict)
    """
    # NOTE: pickle can execute arbitrary code while loading; only open a
    # 'preprocess.p' that was produced locally by this notebook.
    with open('preprocess.p', mode='rb') as in_file:
        return pickle.load(in_file)

In [50]:
# Load the script table; the first CSV column is used as the row index.
scripts = pd.read_csv('./data/scripts.csv', index_col=0)
scripts.head()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
0,JERRY,Do you know what this is all about? Do you kno...,1.0,S01E01,1.0
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1.0,S01E01,1.0
2,GEORGE,Are you through?,1.0,S01E01,1.0
3,JERRY,"You do of course try on, when you buy?",1.0,S01E01,1.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1.0,S01E01,1.0


In [51]:
# Count the non-null Dialogue entries per Character and show the ten largest counts.
scripts[['Character', 'Dialogue']].groupby(['Character']).count().sort_values(by='Dialogue',ascending=False).head(10)

Unnamed: 0_level_0,Dialogue
Character,Unnamed: 1_level_1
JERRY,14786
GEORGE,9708
ELAINE,7983
KRAMER,6664
NEWMAN,640
MORTY,505
HELEN,471
FRANK,436
SUSAN,379
[Setting,293


**The main characters are JERRY (14786 lines of dialogue), GEORGE (9708), ELAINE (7983) and KRAMER (6664). Every other character has at least 10x fewer lines (the next one, NEWMAN, has only 640). To preserve the characteristic voice of each role, we will train a separate RNN model for each of them.**

In [52]:
def convert_to_role_dialogue(r):
    """
    Format one (character, dialogue) pair as '<character in lowercase>: <dialogue>'.

    :param r: Indexable whose item 0 is the character name and item 1 the dialogue text
    :return: The combined string
    """
    speaker = r[0].lower()
    spoken = r[1]
    return speaker + ": " + spoken

In [53]:
# Build '<character in lowercase>: <dialogue>' for every row with vectorised
# string operations (same format as convert_to_role_dialogue).
# scripts['char_dial'] = scripts[['Character', 'Dialogue']].apply(convert_to_role_dialogue)
scripts['char_dial'] = scripts['Character'].str.lower()+': '+scripts['Dialogue']
scripts.head()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,char_dial
0,JERRY,Do you know what this is all about? Do you kno...,1.0,S01E01,1.0,jerry: Do you know what this is all about? Do ...
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1.0,S01E01,1.0,"jerry: (pointing at Georges shirt) See, to me,..."
2,GEORGE,Are you through?,1.0,S01E01,1.0,george: Are you through?
3,JERRY,"You do of course try on, when you buy?",1.0,S01E01,1.0,"jerry: You do of course try on, when you buy?"
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1.0,S01E01,1.0,"george: Yes, it was purple, I liked it, I dont..."


In [54]:
# scripts[['char_dial']].to_csv(r'./data/test.txt', header=None, index=None, sep='\n', mode='a')

In [8]:
# Write one plain-text script file per main character.
for i in ['jerry', 'george', 'elaine', 'kramer']:
    # Rows spoken by this character (the Character column is upper-case)
    temp = scripts[scripts.Character == i.upper()]
    print(temp.head())
    # mode='w' instead of 'a': re-running the cell regenerates the file rather
    # than appending a duplicate copy of every line to the previous run's output.
    temp[['char_dial']].to_csv(r'./data/{i}_script.txt'.format(i=i), header=None, index=None, sep='\n', mode='w')

  Character                                           Dialogue  EpisodeNo  \
0     JERRY  Do you know what this is all about? Do you kno...        1.0   
1     JERRY  (pointing at Georges shirt) See, to me, that b...        1.0   
3     JERRY             You do of course try on, when you buy?        1.0   
5     JERRY                               Oh, you dont recall?        1.0   
7     JERRY  Well, senator, Id just like to know, what you ...        1.0   

     SEID  Season                                          char_dial  
0  S01E01     1.0  jerry: Do you know what this is all about? Do ...  
1  S01E01     1.0  jerry: (pointing at Georges shirt) See, to me,...  
3  S01E01     1.0      jerry: You do of course try on, when you buy?  
5  S01E01     1.0                        jerry: Oh, you dont recall?  
7  S01E01     1.0  jerry: Well, senator, Id just like to know, wh...  
   Character                                           Dialogue  EpisodeNo  \
2     GEORGE                     

In [93]:
# Despite its name, data_dir holds the path of a single text file (it is
# passed to load_data below). Swap in the commented line to use one
# character's script instead of the full corpus.
# data_dir = './data/{i}_script.txt'.format(i='kramer')
data_dir = './data/Seinfeld_Scripts.txt'
print(data_dir)

./data/Seinfeld_Scripts.txt


In [94]:
# Read the corpus and derive a few summary figures before printing them
text = load_data(data_dir)
view_line_range = (30, 50)

lines = text.split('\n')
word_count_line = [len(line.split()) for line in lines]

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len(set(text.split()))))
print('Number of lines: {}'.format(len(lines)))
print('Average number of words in each line: {}'.format(np.average(word_count_line)))

# Show a sample slice of the corpus
print()
print('The lines {} to {}:'.format(*view_line_range))
print('\n'.join(lines[slice(*view_line_range)]))

Dataset Stats
Roughly the number of unique words: 46367
Number of lines: 109233
Average number of words in each line: 5.544240293684143

The lines 30 to 50:
george: wait a second, wait a second, what coming in, what woman is coming in? 

jerry: i told you about laura, the girl i met in michigan? 

george: no, you didnt! 

jerry: i thought i told you about it, yes, she teaches political science? i met her the night i did the show in lansing... 

george: ha. 

jerry: (looks in the creamer) theres no milk in here, what... 

george: wait wait wait, what is she... (takes the milk can from jerry and puts it on the table) what is she like? 

jerry: oh, shes really great. i mean, shes got like a real warmth about her and shes really bright and really pretty and uh... the conversation though, i mean, it was... talking with her is like talking with you, but, you know, obviously much better. 

george: (smiling) so, you know, what, what happened? 

jerry: oh, nothing happened, you know, but is was

In [95]:
def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    # Sorted so that ids are deterministic for a given set of tokens
    vocab = sorted(set(text))
    print('total chars: ', len(vocab))
    vocab_to_int = {token: index for index, token in enumerate(vocab)}
    int_to_vocab = {index: token for index, token in enumerate(vocab)}
    # Return exactly the two dicts promised by the docstring; the previous
    # `chars, (char_indices, indices_char)` shape broke callers that unpack
    # `vocab_to_int, int_to_vocab` and then index vocab_to_int by word.
    return vocab_to_int, int_to_vocab

In [96]:
def token_lookup():
    """
    Build the punctuation-to-token mapping used during tokenization.
    :return: Dict whose keys are punctuation symbols and whose values are the tokens replacing them
    """
    symbols = ('.', ',', '"', ';', '!', '?', '(', ')', '-', '\n')
    tokens = (
        '||Period||',
        '||Comma||',
        '||Quotation_Mark||',
        '||Semicolon||',
        '||Exclamation_Mark||',
        '||Question_Mark||',
        '||Left_Parentheses||',
        '||Right_Parentheses||',
        '||Dash||',
        '||Return||',
    )
    return dict(zip(symbols, tokens))

In [97]:
# Extra vocabulary entries: preprocess_and_save_data appends these values to
# the word list before building the lookup tables.
SPECIAL_WORDS = {'PADDING': '<PAD>'}
# Word-level preprocessing is currently disabled; the cells below work on
# individual characters instead.
# preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

In [98]:
# Character-level vocabulary: every distinct character of the corpus in sorted
# order, plus the two mappings between characters and their positions.
vocabulary = sorted(set(text))
char_to_indices = {char: index for index, char in enumerate(vocabulary)}
indices_to_char = {index: char for index, char in enumerate(vocabulary)}

In [99]:
# Length of each input window and the stride between consecutive windows
max_length = 20
steps = 5

# Slide a window over the corpus: each window is an input sample and the
# character immediately following it is the target.
sentences = []
next_chars = []
for i in range(0, len(text) - max_length, steps):
    sentences.append(text[i: i + max_length])
    next_chars.append(text[i + max_length])

# One-hot encode inputs and targets as boolean arrays:
#   X[sample, position, char_index], y[sample, char_index]
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError on current releases).
X = np.zeros((len(sentences), max_length, len(vocabulary)), dtype = bool)
y = np.zeros((len(sentences), len(vocabulary)), dtype = bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_indices[char]] = 1
    # The target depends only on the sample, so it is set once per window
    # instead of once per character.
    y[i, char_to_indices[next_chars[i]]] = 1

In [100]:
def prepare_train(text, max_length, step, vocab=None, char_index=None):
    """
    Build one-hot encoded training arrays for next-character prediction.

    A window of `max_length` characters is slid over `text` with stride
    `step`; each window is one input sample and the character right after
    it is the target.

    :param text: Corpus as a single string
    :param max_length: Number of characters per input window
    :param step: Stride between the start positions of consecutive windows
    :param vocab: Optional sequence of all characters; defaults to the
        module-level `vocabulary`
    :param char_index: Optional dict mapping each character to its column;
        defaults to the module-level `char_to_indices`
    :return: Tuple (X, y) of boolean arrays with shapes
        (n_samples, max_length, len(vocab)) and (n_samples, len(vocab))
    """
    # Fall back to the notebook globals so existing three-argument calls keep working
    if vocab is None:
        vocab = vocabulary
    if char_index is None:
        char_index = char_to_indices

    sentences = []
    next_chars = []
    # Use the `step` argument; the previous version read the global `steps`
    # and silently ignored the value passed by the caller.
    for i in range(0, len(text) - max_length, step):
        sentences.append(text[i: i + max_length])
        next_chars.append(text[i + max_length])

    # Builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24
    X = np.zeros((len(sentences), max_length, len(vocab)), dtype = bool)
    y = np.zeros((len(sentences), len(vocab)), dtype = bool)

    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, char_index[char]] = 1
        # One target per window, so set it outside the per-character loop
        y[i, char_index[next_chars[i]]] = 1
    return X, y

## Build GRU model

In [101]:
from __future__ import absolute_import, division, print_function, unicode_literals
  
import numpy as np
import tensorflow as tf
  
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, GRU
  
from keras.optimizers import RMSprop
  
from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint
from keras.callbacks import ReduceLROnPlateau
import random
import sys

In [102]:
# Initializing the GRU network
# Initializing the GRU network
model = Sequential()

# Recurrent layer: 128 GRU units reading windows of max_length one-hot encoded characters
model.add(GRU(128, input_shape =(max_length, len(vocabulary))))

# Dense layer producing one score per vocabulary character
model.add(Dense(len(vocabulary)))

# Softmax turns the scores into a probability distribution over the next character
model.add(Activation('softmax'))

# Defining the optimizing function.
# `learning_rate` is the supported keyword; the older `lr` alias is deprecated
# and is rejected by recent Keras releases.
optimizer = RMSprop(learning_rate = 0.01)

# Configuring the model for training
model.compile(loss ='categorical_crossentropy', optimizer = optimizer)

In [103]:
# Helper function to sample an index from a probability array
def sample_index(preds, temperature = 1.0):
    """
    Draw one index at random from a probability vector after temperature scaling.

    The probabilities are re-weighted as exp(log(p) / temperature) and
    re-normalised, so a temperature below 1 sharpens the distribution and a
    temperature above 1 flattens it.

    :param preds: Array-like of probabilities, one per candidate index
    :param temperature: Scaling factor applied to the log-probabilities
    :return: The sampled index
    """
    # Work in float64 before renormalising
    probabilities = np.asarray(preds).astype('float64')

    # Temperature scaling in log space, then back to a normalised distribution
    rescaled = np.exp(np.log(probabilities) / temperature)
    probabilities = rescaled / np.sum(rescaled)

    # A single multinomial trial yields a one-hot row; its argmax is the drawn index
    draw = np.random.multinomial(1, probabilities, 1)
    return np.argmax(draw)

In [104]:
# Helper function to generate text after the end of each epoch
def on_epoch_end(epoch, logs):
    """
    Print sample text generated by the current model for several diversities.

    Reads the module-level `text`, `max_length`, `vocabulary`,
    `char_to_indices`, `indices_to_char` and `model`.

    :param epoch: Index of the epoch that just finished
    :param logs: Metrics dict supplied by the caller (unused)
    """
    print()
    print('----- Generating text after Epoch: % d' % epoch)

    # One random seed window, shared by every diversity setting
    start_index = random.randint(0, len(text) - max_length - 1)
    seed = text[start_index: start_index + max_length]

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        # Rolling window of the last max_length characters fed to the model
        window = seed
        for _ in range(400):
            # One-hot encode the current window
            x_pred = np.zeros((1, max_length, len(vocabulary)))
            for t, char in enumerate(window):
                x_pred[0, t, char_to_indices[char]] = 1.

            # Predict the next-character distribution and sample from it
            preds = model.predict(x_pred, verbose = 0)[0]
            next_char = indices_to_char[sample_index(preds, diversity)]

            # Slide the window forward by one character
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
# Wrap on_epoch_end in a Keras LambdaCallback so that sample text is
# generated and printed at the end of every training epoch
print_callback = LambdaCallback(on_epoch_end = on_epoch_end)

In [105]:
# Checkpoint callback: monitors the training loss and, with
# save_best_only = True and mode = 'min', writes the model to `filepath`
# only when the loss reaches a new minimum
# filepath = "weights_{i}.hdf5".format(i='kramer')
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor ='loss',verbose = 1, save_best_only = True,mode ='min')

In [106]:
# Learning-rate schedule callback: when the training loss stops improving
# (patience = 1), multiply the learning rate by 0.2, never going below 0.001
reduce_alpha = ReduceLROnPlateau(monitor ='loss', factor = 0.2,patience = 1, min_lr = 0.001)
# All callbacks passed to model.fit
callbacks = [print_callback, checkpoint, reduce_alpha]

In [107]:
# Training the GRU model on the one-hot encoded windows (X) and next-character
# targets (y) for 40 epochs in batches of 128, with the callbacks defined above
model.fit(X, y, batch_size = 128, epochs = 40, callbacks = callbacks)

Epoch 1/40

----- Generating text after Epoch:  0
----- diversity: 0.2
----- Generating with seed: "i'm lost now, kramer"
i'm lost now, kramer! 

jerry: (loughing and here hard the carend and the changer and what are you should had the care and her down and her and you know you have to see hor and what is the manding to the come in the come in the caren and i have to me her hard in the come grander there and here word hore who hard the man and what happened? 

jerry: hey, i have the changer and her like a her hard hard hor my and what a
----- diversity: 0.5
----- Generating with seed: "i'm lost now, kramer"
i'm lost now, kramer! what do you gotta have who don't want this got the lovernis hard and word end you have are word the raching are your loughting as it of it stunfriend) what? 

jerry: hey, you're nothing. 

jerry: i'm hor and the hard the one a word somework. 

george: well, this is no shruper with the word your my who hom. 

jerry: you gotta got the little from the mane of who 

<tensorflow.python.keras.callbacks.History at 0x7fc9c6063fd0>

In [85]:
def generate_text(length, diversity):
    """
    Generate text with the trained model, starting from a random corpus window.

    Reads the module-level `text`, `max_length`, `vocabulary`,
    `char_to_indices`, `indices_to_char` and `model`.

    :param length: Number of characters to generate after the seed
    :param diversity: Temperature passed to sample_index
    :return: The seed window followed by the generated characters
    """
    # Random seed window taken from the corpus
    start_index = random.randint(0, len(text) - max_length - 1)
    window = text[start_index: start_index + max_length]
    pieces = [window]

    for _ in range(length):
        # One-hot encode the current window
        x_pred = np.zeros((1, max_length, len(vocabulary)))
        for t, char in enumerate(window):
            x_pred[0, t, char_to_indices[char]] = 1.

        # Predict the next-character distribution and sample from it
        preds = model.predict(x_pred, verbose = 0)[0]
        next_char = indices_to_char[sample_index(preds, diversity)]

        # Record the character and slide the window forward
        pieces.append(next_char)
        window = window[1:] + next_char
    return ''.join(pieces)

In [86]:
# Generate 50 characters after a random seed window, sampling at diversity 1.0
print(generate_text(50, 1.0))

the moron in the blubed fro.
kramer: What are you like the push thent.


## Clean up the code 