In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import csv
import sys
import nltk

### Data Preprocessing

In [2]:
def load_data(path):
    """
    Load Dataset from File
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()

    return data

def preprocess_and_save_data(dataset_path, token_lookup, create_lookup_tables):
    """
    Preprocess Text Data
    """
    text = load_data(dataset_path)
    
    # Ignore notice, since we don't use it for analysing the data
    text = text[81:]

    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    text = text.lower()
    text = text.split()

    vocab_to_int, int_to_vocab = create_lookup_tables(text + list(SPECIAL_WORDS.values()))
    int_text = [vocab_to_int[word] for word in text]
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), open('preprocess.p', 'wb'))

def load_preprocess():
    """
    Load the Preprocessed Training data and return them in batches of <batch_size> or less
    """
    return pickle.load(open('preprocess.p', mode='rb'))   

In [3]:
scripts = pd.read_csv('./data/scripts.csv', index_col=0)
scripts.head()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
0,JERRY,Do you know what this is all about? Do you kno...,1.0,S01E01,1.0
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1.0,S01E01,1.0
2,GEORGE,Are you through?,1.0,S01E01,1.0
3,JERRY,"You do of course try on, when you buy?",1.0,S01E01,1.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1.0,S01E01,1.0


In [4]:
scripts[['Character', 'Dialogue']].groupby(['Character']).count().sort_values(by='Dialogue',ascending=False).head(10)

Unnamed: 0_level_0,Dialogue
Character,Unnamed: 1_level_1
JERRY,14786
GEORGE,9708
ELAINE,7983
KRAMER,6664
NEWMAN,640
MORTY,505
HELEN,471
FRANK,436
SUSAN,379
[Setting,293


In [5]:
def convert_to_role_dialogue(r):
    return r[0].lower()+": "+ r[1]

In [6]:
scripts['char_dial'] = scripts['Character'].str.lower()+': '+scripts['Dialogue']
scripts.head()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,char_dial
0,JERRY,Do you know what this is all about? Do you kno...,1.0,S01E01,1.0,jerry: Do you know what this is all about? Do ...
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1.0,S01E01,1.0,"jerry: (pointing at Georges shirt) See, to me,..."
2,GEORGE,Are you through?,1.0,S01E01,1.0,george: Are you through?
3,JERRY,"You do of course try on, when you buy?",1.0,S01E01,1.0,"jerry: You do of course try on, when you buy?"
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1.0,S01E01,1.0,"george: Yes, it was purple, I liked it, I dont..."


In [7]:
scripts['answer_character'] = scripts['Character'].shift(-1)
scripts.head()

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season,char_dial,answer_character
0,JERRY,Do you know what this is all about? Do you kno...,1.0,S01E01,1.0,jerry: Do you know what this is all about? Do ...,JERRY
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1.0,S01E01,1.0,"jerry: (pointing at Georges shirt) See, to me,...",GEORGE
2,GEORGE,Are you through?,1.0,S01E01,1.0,george: Are you through?,JERRY
3,JERRY,"You do of course try on, when you buy?",1.0,S01E01,1.0,"jerry: You do of course try on, when you buy?",GEORGE
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1.0,S01E01,1.0,"george: Yes, it was purple, I liked it, I dont...",JERRY


In [8]:
summary = scripts[['Character', 'answer_character', 'Dialogue']].groupby(['Character', 'answer_character']).count().reset_index()

In [9]:
summary.head()

Unnamed: 0,Character,answer_character,Dialogue
0,"""SALMAN""",BETSY (to Aunt May),1
1,"""SALMAN""",KRAMER,2
2,"""SALMAN"" (shakes Kramer's hand)",KRAMER,1
3,% A night at the Improv. Jerry receives some d...,JERRY,1
4,"% Apparently, a movie that can be interpreted ...",KRAMER,1


In [10]:
df = summary[summary.Character.isin(['JERRY', 'GEORGE', 'ELAINE', 'KRAMER'])][summary.answer_character.isin(['JERRY', 'GEORGE', 'ELAINE', 'KRAMER'])]
df

  """Entry point for launching an IPython kernel.


Unnamed: 0,Character,answer_character,Dialogue
946,ELAINE,ELAINE,204
974,ELAINE,GEORGE,1123
1027,ELAINE,JERRY,3298
1071,ELAINE,KRAMER,889
1784,GEORGE,ELAINE,1134
1821,GEORGE,GEORGE,265
1864,GEORGE,JERRY,4287
1908,GEORGE,KRAMER,948
2752,JERRY,ELAINE,3262
2805,JERRY,GEORGE,4278


In [11]:
all_d = df[['Character', 'Dialogue']].groupby(['Character']).sum().reset_index()
all_d.head()

Unnamed: 0,Character,Dialogue
0,ELAINE,5514
1,GEORGE,6634
2,JERRY,10826
3,KRAMER,4725


In [12]:
prob = pd.merge(all_d, df, on=['Character'])
prob

Unnamed: 0,Character,Dialogue_x,answer_character,Dialogue_y
0,ELAINE,5514,ELAINE,204
1,ELAINE,5514,GEORGE,1123
2,ELAINE,5514,JERRY,3298
3,ELAINE,5514,KRAMER,889
4,GEORGE,6634,ELAINE,1134
5,GEORGE,6634,GEORGE,265
6,GEORGE,6634,JERRY,4287
7,GEORGE,6634,KRAMER,948
8,JERRY,10826,ELAINE,3262
9,JERRY,10826,GEORGE,4278


In [13]:
prob['probability'] = prob['Dialogue_y']*1.0/prob['Dialogue_x']
prob

Unnamed: 0,Character,Dialogue_x,answer_character,Dialogue_y,probability
0,ELAINE,5514,ELAINE,204,0.036997
1,ELAINE,5514,GEORGE,1123,0.203663
2,ELAINE,5514,JERRY,3298,0.598114
3,ELAINE,5514,KRAMER,889,0.161226
4,GEORGE,6634,ELAINE,1134,0.170938
5,GEORGE,6634,GEORGE,265,0.039946
6,GEORGE,6634,JERRY,4287,0.646216
7,GEORGE,6634,KRAMER,948,0.1429
8,JERRY,10826,ELAINE,3262,0.301312
9,JERRY,10826,GEORGE,4278,0.39516


In [14]:
for i in ['jerry', 'george', 'elaine', 'kramer']:
    temp = scripts[scripts.Character == i.upper()]
    temp[['char_dial']].to_csv(r'./data/{i}_script.txt'.format(i=i), header=None, index=None, sep='\n', mode='a')

In [49]:
csv.field_size_limit(sys.maxsize)

episode = 3
line_text = 2
speaker = 1
d = {}

name = "./data/scripts.csv"
# train_speaker = open("./data/train.txt", "w")
# test_speaker = open("./data/test.txt", "w")
# dev_speaker = open("./data/dev.es", "w")
train_jerry = open("./data/train.txt", "w")
test_jerry = open("./data/test.txt", "w")
dev_jerry = open("./data/dev.txt", "w")

f = open(name)

reader = csv.reader(f)
next(reader)
prevRow = next(reader)

# speaker_vocab = {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3}
jerry_vocab = {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, '<PAD>': 4}
data = {"src_word2id": speaker_vocab, "tgt_word2id": jerry_vocab}
speaker_count = 1
jerry_count = 1

for currRow in reader:
    if currRow[speaker] == "JERRY" and prevRow[episode] == currRow[episode]:
        if int(float(currRow[episode])) <= 15:
#             train_speaker.write(prevRow[line_text] + "\n")
            train_jerry.write(currRow[line_text] + "\n")
        elif int(float(currRow[episode])) <= 19:
#             test_speaker.write(prevRow[line_text] + "\n")
            test_jerry.write(currRow[line_text] + "\n")
        else:
#             dev_speaker.write(prevRow[line_text] + "\n")
            dev_jerry.write(currRow[line_text] + "\n")
    for word in prevRow[line_text].split(" "):
        if word not in speaker_vocab.keys():
            speaker_vocab[word] = speaker_count
            speaker_count += 1
    for word in currRow[line_text].split(" "):
        if word not in jerry_vocab.keys():
            jerry_vocab[word] = jerry_count
            jerry_count += 1
    prevRow = currRow

## Training Process

In [8]:
# data_dir = './data/Seinfeld_Scripts.txt'
# data_dir = './data/elaine_script.txt'
data_dir = './data/train.txt'
print(data_dir)

./data/train.txt


In [9]:
text = load_data(data_dir)

In [10]:
view_line_range = (30, 50)

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len({word: None for word in text.split()})))

lines = text.split('\n')
print('Number of lines: {}'.format(len(lines)))
word_count_line = [len(line.split()) for line in lines]
print('Average number of words in each line: {}'.format(np.average(word_count_line)))

print()
print('The lines {} to {}:'.format(*view_line_range))
print('\n'.join(text.split('\n')[view_line_range[0]:view_line_range[1]]))

Dataset Stats
Roughly the number of unique words: 15631
Number of lines: 10240
Average number of words in each line: 10.0705078125

The lines 30 to 50:
This is insane. You know, I dont even know where shes staying! She, shes not gonna call me, this is unbelievable.
No no no, dont interrupt the cycle. The machine is working, it, it knows what its doing. Just let it finish.
You, you cant overdry.
Same as you cant overwet. You see, once something is wet, its wet. Same thing with death. Like once you die youre dead, right? Lets say you drop dead and I shoot you. Youre not gonna die again, youre already dead. You cant overdie, you cant overdry.
How could she not tell me where she was staying?
Laundry day is the only exciting day in the life of clothes. It is...yknow, think about it. The washing machine is the nightclub of clothes. You know, its dark, theres bubbles happening, theyre all kinda dancing around in there- shirt grabs the underwear, Cmon babe, lets go. You come by, you open up th

In [11]:
def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    chars = sorted(list(set(text))) # getting all unique chars
    print('total chars: ', len(chars))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    return chars, char_indices, indices_char

In [12]:
def token_lookup():
    """
    Generate a dict to turn punctuation into a token.
    :return: Tokenized dictionary where the key is the punctuation and the value is the token
    """
    return {
        '.': '||Period||',
        ',': '||Comma||',
        '"': '||Quotation_Mark||',
        ';': '||Semicolon||',
        '!': '||Exclamation_Mark||',
        '?': '||Question_Mark||',
        '(': '||Left_Parentheses||',
        ')': '||Right_Parentheses||',
        '-': '||Dash||',
        '\n': '||Return||'
        }

In [13]:
SPECIAL_WORDS = {'PADDING': '<PAD>'}

In [14]:
vocabulary, char_to_indices, indices_to_char = create_lookup_tables(text)

total chars:  87


In [15]:
max_length = 20
steps = 5
sentences = []
next_chars = []
for i in range(0, len(text) - max_length, steps):
    sentences.append(text[i: i + max_length])
    next_chars.append(text[i + max_length])
      
# Hot encoding each character into a boolean vector
  
# Initializing a matrix of boolean vectors with each column representing
# the hot encoded representation of the character
X = np.zeros((len(sentences), max_length, len(vocabulary)), dtype = np.bool)
y = np.zeros((len(sentences), len(vocabulary)), dtype = np.bool)
  
# Placing the value 1 at the appropriate position for each vector
# to complete the hot-encoding process
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_indices[char]] = 1
        y[i, char_to_indices[next_chars[i]]] = 1

In [16]:
from __future__ import absolute_import, division, print_function, unicode_literals
  
import numpy as np
import tensorflow as tf
  
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, GRU
  
from keras.optimizers import RMSprop, Adam
  
from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint
from keras.callbacks import ReduceLROnPlateau
from keras.metrics import categorical_accuracy
import random
import sys

Using TensorFlow backend.


In [17]:
# Helper function to sample an index from a probability array
def sample_index(preds, temperature = 1.0):
# temperature determines the freedom the function has when generating text
  
    # Converting the predictions vector into a numpy array
    preds = np.asarray(preds).astype('float64')
  
    # Normalizing the predicitons array
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
  
    # The main sampling step. Creates an array of probablities signifying
    # the probability of each character to be the next character in the 
    # generated text
    probas = np.random.multinomial(1, preds, 1)
  
    # Returning the character with maximum probability to be the next character
    # in the generated text
    return np.argmax(probas)


# Helper function to generate text after the end of each epoch
def on_epoch_end(epoch, logs):
    print()
    print('----- Generating text after Epoch: % d' % epoch)
  
    # Choosing a random starting index for the text generation
    start_index = random.randint(0, len(text) - max_length - 1)
  
    # Sampling for different values of diversity
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)
  
        generated = ''
  
        # Seed sentence
        sentence = text[start_index: start_index + max_length]
  
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
  
        for i in range(400):
            # Initializing the predicitons vector
            x_pred = np.zeros((1, max_length, len(vocabulary)))
  
            for t, char in enumerate(sentence):
                x_pred[0, t, char_to_indices[char]] = 1.
  
            # Making the predictions for the next character
            preds = model.predict(x_pred, verbose = 0)[0]
  
            # Getting the index of the most probable next character
            next_index = sample_index(preds, diversity)
  
            # Getting the most probable next character using the mapping built
            next_char = indices_to_char[next_index]
  
            # Building the generated text
            generated += next_char
            sentence = sentence[1:] + next_char
  
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
        
def GRU_model(max_length, vocabulary):
    learning_rate = 0.01
    model = Sequential()
    # Defining the cell type
    model.add(GRU(128, input_shape =(max_length, len(vocabulary))))

    # Defining the densely connected Neural Network layer
    model.add(Dense(len(vocabulary)))

    # Defining the activation function for the cell
    model.add(Activation('softmax'))

    # Defining the optimizing function
#     optimizer = Adam(lr=learning_rate)
    # Defining the optimizing function
    optimizer = RMSprop(lr = learning_rate)
    
    # Configuring the model for training
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    return model

In [60]:
character = 'jerry'
print_callback = LambdaCallback(on_epoch_end = on_epoch_end)
filepath = "weights_{c}.hdf5".format(c=character)
checkpoint = ModelCheckpoint(filepath, monitor ='loss',verbose = 1, save_best_only = True,mode ='min')
reduce_alpha = ReduceLROnPlateau(monitor ='loss', factor = 0.2, patience = 1, min_lr = 0.001)
callbacks = [print_callback, checkpoint, reduce_alpha]

model = GRU_model(max_length, vocabulary)
model.fit(X, y, batch_size = 128, epochs = 40, callbacks = callbacks)

Epoch 1/40

----- Generating text after Epoch:  0
----- diversity: 0.2
----- Generating with seed: "ke I don't even know"
ke I don't even know what is they cares and to she was the mant to my on they really want to he don't know what to you don't know what is they car to you don't know what to you don't know what in they want to you not to be they really arame.
Now.
Hey, you don't know.
I don't know what and they really and they really and you don't know what are you don't know what in they want to my waster they cares they bett to she 
----- diversity: 0.5
----- Generating with seed: "ke I don't even know"
ke I don't even know.
Well you don't know and to do you didn't pitting to in your and to her come on they radiry core in of hare
and bad to she was has buck.
(one there)
Hey, you dennd that and they care and they never on my arry back to you bat to now carsto ma.
Hey, you don't know are you don't know hare do yin.
Hey. I'll dend they to sary in that what in they right and they cant 

<tensorflow.python.keras.callbacks.History at 0x7f33b40b9190>

In [18]:
model.summary()

NameError: name 'model' is not defined

In [6]:
data_dir = './data/dev.txt'
print(data_dir)
text = load_data(data_dir)

./data/dev.txt


In [22]:
def load_model(weight_file):
    # Load the network weights 
    model = GRU_model(20, vocabulary)
    model.load_weights(weight_file)
    model.summary()
    
    return model

In [23]:
model = load_model('weights_jerry.hdf5')

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, 128)               83328     
_________________________________________________________________
dense (Dense)                (None, 87)                11223     
_________________________________________________________________
activation (Activation)      (None, 87)                0         
Total params: 94,551
Trainable params: 94,551
Non-trainable params: 0
_________________________________________________________________


In [24]:
def generate_text(length, diversity):
    # Get random starting text
    start_index = random.randint(0, len(text) - max_length - 1)
  
    # Defining the generated text
    generated = ''
    sentence = text[start_index: start_index + max_length]
    reference = text[start_index+ max_length: start_index+ max_length+length]
    generated += sentence
  
    # Generating new text of given length
    for i in range(length):
  
            # Initializing the predicition vector
            x_pred = np.zeros((1, max_length, len(vocabulary)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_to_indices[char]] = 1.
  
            # Making the predicitons
            preds = model.predict(x_pred, verbose = 0)[0]
  
            # Getting the index of the next most probable index
            next_index = sample_index(preds, diversity)
  
            # Getting the most probable next character using the mapping built
            next_char = indices_to_char[next_index]
  
            # Generating new text
            generated += next_char
            sentence = sentence[1:] + next_char
    return generated, reference

In [43]:
print(generate_text(20, 0.2))

('with all do respect for this my face any', "I would think it's r")


In [26]:
def BLEU_score(simulation_time, prediction_length):
    avg_BLEU_1 = 0
    avg_BLEU_4 = 0
    length = 20
    diversity=0.2
    for i in range(simulation_time):
        reference, prediction = generate_text(length, diversity)
        BLEU_1 = nltk.translate.bleu_score.sentence_bleu([reference], prediction, weights=(1,0,0,0))
        BLEU_4 = nltk.translate.bleu_score.sentence_bleu([reference], prediction)
        
        avg_BLEU_1 += BLEU_1
        avg_BLEU_4 += BLEU_4
        
    avg_BLEU_1 = avg_BLEU_1 / simulation_time
    avg_BLEU_4 = avg_BLEU_4 / simulation_time
    
    print(f"Cumulative 1-gram: {avg_BLEU_1}")
    print(f"Cumulative 4-gram: {avg_BLEU_4}")

In [27]:
BLEU_score(100, 20)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Cumulative 1-gram: 0.2624819812758244
Cumulative 4-gram: 0.03859844449749971


In [71]:
# def remove_special_chara(text):
#     SPECIAL_WORDS = {'PADDING': '<PAD>'}
#     token_dict = {
#             '.': '||Period||',
#             ',': '||Comma||',
#             '"': '||Quotation_Mark||',
#             ';': '||Semicolon||',
#             '!': '||Exclamation_Mark||',
#             '?': '||Question_Mark||',
#             '(': '||Left_Parentheses||',
#             ')': '||Right_Parentheses||',
#             '-': '||Dash||',
#             '\n': '||Return||'
#             }
#     for key, token in token_dict.items():
#             text = text.replace(" {}".format(token), key)
    
#     text = text.replace('<PAD>', '\n')
    
#     return text

In [69]:
# from rouge_metric import PyRouge

# def Rouge_score(simulation_time, prediction_length):
#     length = 20
#     diversity=0.2
#     for i in range(simulation_time):
#         reference, prediction = generate_text(length, diversity)
#         reference = remove_special_chara(" " + " ".join(reference))
#         prediction = remove_special_chara(" " + " ".join(prediction))

#         rouge = PyRouge(rouge_n=(1, 2), rouge_l=True, rouge_w=True,
#                     rouge_w_weight=1.2, rouge_s=True, rouge_su=True, skip_gap=4)
#         scores = rouge.evaluate([prediction], [[reference]])
#         print(scores)

In [70]:
# Rouge_score(1, 200)

In [36]:
# count the number of words
word_counts = Counter(text).most_common()
word_counts

[(' ', 92908),
 ('e', 46205),
 ('t', 37300),
 ('o', 35463),
 ('a', 30439),
 ('n', 25845),
 ('h', 24605),
 ('i', 23821),
 ('s', 21142),
 ('r', 18997),
 ('l', 16411),
 ('u', 14578),
 ('.', 13042),
 ('d', 12681),
 ('y', 11811),
 ('g', 10514),
 ('\n', 10239),
 ('m', 9583),
 ('w', 8923),
 ('c', 7630),
 (',', 6995),
 ('k', 6048),
 ("'", 5738),
 ('p', 5616),
 ('f', 5588),
 ('I', 5481),
 ('b', 5269),
 ('?', 4445),
 ('v', 3426),
 ('W', 2517),
 ('Y', 1942),
 ('T', 1705),
 ('(', 1590),
 (')', 1588),
 ('H', 1510),
 ('!', 1431),
 ('S', 1347),
 ('A', 1127),
 ('O', 1084),
 ('N', 1027),
 ('-', 895),
 ('G', 772),
 ('j', 725),
 ('B', 674),
 ('E', 631),
 ('C', 612),
 ('M', 565),
 ('K', 505),
 ('x', 493),
 ('L', 488),
 ('D', 476),
 ('J', 408),
 ('"', 404),
 ('P', 385),
 ('R', 356),
 ('z', 340),
 ('F', 244),
 ('U', 220),
 ('q', 190),
 ('V', 120),
 ('0', 71),
 ('1', 61),
 ('5', 61),
 ('*', 56),
 (';', 49),
 ('9', 43),
 ('2', 37),
 ('8', 35),
 ('3', 33),
 ('[', 31),
 (']', 30),
 ('4', 26),
 ('7', 19),
 ('6',

In [37]:
from collections import Counter
unigram_prob = {}
word_counts = Counter(text).most_common()
for w in word_counts:
    unigram_prob[w[0]] = w[1]

def perplexity_score(simulation_time, prediction_length):
    score = 0.0
    length = 20
    diversity=0.2
    for i in range(simulation_time):
        reference, prediction = generate_text(length, diversity)
        l = 0.0
        for w in prediction:
            l += np.log2(unigram_prob[w])
        l = l / prediction_length
        score += np.power(2, -l)
    score /= simulation_time
    
    return score

In [38]:
perplexity_score(100, 20)

4.482171403098487e-05