# TV Script Generation
This notebook generates new [Simpsons](https://en.wikipedia.org/wiki/The_Simpsons) TV scripts with a recurrent neural network (RNN), trained on part of the [Simpsons dataset](https://www.kaggle.com/wcukierski/the-simpsons-by-the-data) of scripts from 27 seasons. The resulting network generates a new TV script for a scene at [Moe's Tavern](https://simpsonswiki.com/wiki/Moe's_Tavern).

In [1]:
import helper

# Location of the raw Moe's Tavern transcript used by the rest of the notebook.
data_dir = './data/simpsons/moes_tavern_lines.txt'
# Load the corpus and drop its first 81 characters in one step
# (presumably a header/notice at the top of the file -- TODO confirm).
text = helper.load_data(data_dir)[81:]

## Explore the Data


In [3]:
# Half-open range of corpus lines to preview at the end of this cell.
view_sentence_range = (0, 10)

import numpy as np

# --- Vocabulary size (whitespace-delimited, punctuation still attached) ---
unique_words = set(text.split())
print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len(unique_words)))

# --- Scene statistics: scenes are separated by a blank line ---
scenes = text.split('\n\n')
print('Number of scenes: {}'.format(len(scenes)))
sentence_count_scene = [scene.count('\n') for scene in scenes]
mean_sentences_per_scene = np.average(sentence_count_scene)
print('Average number of sentences in each scene: {}'.format(mean_sentences_per_scene))

# --- Line statistics: every newline-delimited line inside every scene ---
sentences = [sentence for scene in scenes for sentence in scene.split('\n')]
print('Number of lines: {}'.format(len(sentences)))
word_count_sentence = [len(sentence.split()) for sentence in sentences]
mean_words_per_line = np.average(word_count_sentence)
print('Average number of words in each line: {}'.format(mean_words_per_line))

# --- Preview the requested slice of raw lines ---
preview_start, preview_stop = view_sentence_range
print()
print('The sentences {} to {}:'.format(preview_start, preview_stop))
print('\n'.join(text.split('\n')[preview_start:preview_stop]))

Dataset Stats
Roughly the number of unique words: 11492
Number of scenes: 262
Average number of sentences in each scene: 15.251908396946565
Number of lines: 4258
Average number of words in each line: 11.50164396430249

The sentences 0 to 10:

Moe_Szyslak: (INTO PHONE) Moe's Tavern. Where the elite meet to drink.
Bart_Simpson: Eh, yeah, hello, is Mike there? Last name, Rotch.
Moe_Szyslak: (INTO PHONE) Hold on, I'll check. (TO BARFLIES) Mike Rotch. Mike Rotch. Hey, has anybody seen Mike Rotch, lately?
Moe_Szyslak: (INTO PHONE) Listen you little puke. One of these days I'm gonna catch you, and I'm gonna carve my name on your back with an ice pick.
Moe_Szyslak: What's the matter Homer? You're not your normal effervescent self.
Homer_Simpson: I got my problems, Moe. Give me another one.
Moe_Szyslak: Homer, hey, you should not drink to forget your problems.
Barney_Gumble: Yeah, you should only drink to enhance your social skills.



## Implementing Preprocessing Functions

### Lookup Table


In [4]:
import numpy as np

def create_lookup_tables(text):
    """
    Build the two vocabulary lookup tables for a tokenized corpus.

    :param text: Sequence of word tokens (duplicates allowed, may be empty)
    :return: Tuple (vocab_to_int, int_to_vocab) where vocab_to_int maps each
             distinct token to an id in 0..len(vocab)-1 and int_to_vocab is
             the exact inverse mapping
    """
    # Sort the vocabulary so ids are reproducible. Iterating a bare set of
    # strings depends on per-process hash randomization, which would give a
    # different word->id mapping on every run and silently invalidate any
    # model trained against an earlier mapping.
    vocab = sorted(set(text))

    vocab_to_int = {word: index for index, word in enumerate(vocab)}
    int_to_vocab = dict(enumerate(vocab))

    return vocab_to_int, int_to_vocab

### Tokenize Punctuation

In [5]:
def token_lookup():
    """
    Return the punctuation-to-token table used during preprocessing.

    :return: Dict mapping each punctuation symbol (and the newline character)
             to a '||Name||' placeholder string
    """
    # (symbol, placeholder) pairs, kept in the order they are inserted.
    punctuation_tokens = [
        ('.', '||Period||'),
        (',', '||Comma||'),
        ('"', '||Quotation_Mark||'),
        (';', '||Semicolon||'),
        ('!', '||Exclamation_Mark||'),
        ('?', '||Question_Mark||'),
        ('(', '||Left_Parentheses||'),
        (')', '||Right_Parentheses||'),
        ('--', '||Dash||'),
        ('\n', '||Return||'),
    ]
    return dict(punctuation_tokens)

In [6]:
# Preprocess Training, Validation, and Testing Data
# Passes the corpus path together with the punctuation-token table builder and
# the vocabulary builder defined above to the project helper.
# NOTE(review): judging by its name the helper tokenizes the corpus and
# persists the result for later cells -- confirm in helper.py.
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

In [7]:

import helper
import numpy as np
# Reload the four preprocessing artifacts used by the rest of the notebook:
# the encoded corpus, both vocabulary lookup tables and the punctuation table.
# NOTE(review): presumably these are the values written by
# helper.preprocess_and_save_data -- confirm in helper.py.
int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

## Build the Neural Network


In [8]:
def get_inputs():
    """
    Create the placeholders that are fed on every training step.

    :return: Tuple (input, targets, learning_rate): two int32 placeholders of
             shape [None, None] named 'input' and 'target', and a float32
             placeholder named 'learning_rate'
    """
    input_ph = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input')
    targets_ph = tf.placeholder(dtype=tf.int32, shape=[None, None], name='target')
    learning_rate_ph = tf.placeholder(dtype=tf.float32, name='learning_rate')
    return input_ph, targets_ph, learning_rate_ph

### Build RNN Cell

In [9]:
def get_init_cell(batch_size, rnn_size, keep_prob=0.5, rnn_layers=1):
    """
    Create the stacked LSTM cell and its named zero state.

    :param batch_size: Batch size used to size the zero state
    :param rnn_size: Number of units in each LSTM layer
    :param keep_prob: Output keep probability for the dropout wrapper
    :param rnn_layers: Number of stacked LSTM layers
    :return: Tuple (cell, initial_state); initial_state is wrapped in
             tf.identity so it can be fetched by the name 'initial_state'

    NOTE(review): keep_prob is handed straight to DropoutWrapper, so with the
    plain float default dropout is part of the graph whenever it runs,
    including after the graph is reloaded for generation. Pass a tensor or
    placeholder here if dropout must be switched off at inference time.
    """
    def make_layer():
        # One independent LSTM cell (with output dropout) per call.
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

    # Build a fresh cell for every layer. `[cell] * rnn_layers` would place
    # the same cell object in each slot, which makes the layers share one set
    # of weights or raises a variable-scope/shape error in TF 1.x.
    cell = tf.contrib.rnn.MultiRNNCell([make_layer() for _ in range(rnn_layers)])

    # Name the zero state so it can be looked up after the graph is reloaded.
    initial_state = tf.identity(cell.zero_state(batch_size, tf.float32), name='initial_state')
    return cell, initial_state

### Word Embedding

In [10]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Look up trainable embeddings for a tensor of word ids.

    :param input_data: Tensor of word ids
    :param vocab_size: Number of rows in the embedding matrix
    :param embed_dim: Number of columns in the embedding matrix
    :return: Result of tf.nn.embedding_lookup on input_data
    """
    # Embedding matrix initialised uniformly in [-1, 1).
    initial_values = tf.random_uniform([vocab_size, embed_dim], -1, 1)
    embedding_matrix = tf.Variable(initial_values)
    return tf.nn.embedding_lookup(embedding_matrix, input_data)

### Build RNN

In [11]:
def build_rnn(cell, inputs):
    """
    Run `cell` over `inputs` with tf.nn.dynamic_rnn.

    No initial_state is passed to dynamic_rnn here, only dtype=tf.float32.

    :param cell: RNN cell to unroll
    :param inputs: Input tensor for the RNN (e.g. embedded word ids)
    :return: Tuple (outputs, final_state); final_state is wrapped in
             tf.identity so it can be fetched by the name 'final_state'
    """
    rnn_outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    named_state = tf.identity(last_state, name='final_state')
    return rnn_outputs, named_state

### Build the Neural Network

In [12]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """
    Assemble embedding -> RNN -> linear projection.

    :param cell: RNN cell passed to build_rnn
    :param rnn_size: Accepted for interface compatibility; not used here
    :param input_data: Tensor of word ids
    :param vocab_size: Vocabulary size (embedding rows and output width)
    :param embed_dim: Embedding dimension
    :return: Tuple (logits, final_state)
    """
    embedded_words = get_embed(input_data, vocab_size, embed_dim)
    rnn_outputs, final_state = build_rnn(cell, embedded_words)
    # activation_fn=None keeps the output layer linear (raw logits).
    logits = tf.contrib.layers.fully_connected(
        rnn_outputs, vocab_size, activation_fn=None)
    return logits, final_state

In [13]:
def get_batches(int_text, batch_size, seq_length):
    """
    Split the encoded corpus into full (input, target) batches.

    Targets are the inputs shifted one word to the left; the very last target
    wraps around to the first input word.

    :param int_text: Sequence of word ids
    :param batch_size: Number of sequences per batch
    :param seq_length: Number of words per sequence
    :return: NumPy array of shape (n_batches, 2, batch_size, seq_length),
             where index 0 on the second axis is the input and 1 the target
    :raises ValueError: if int_text is too short to fill a single batch
    """
    words_per_batch = batch_size * seq_length

    # Number of full batches; the trailing words that do not fill one are dropped.
    n_batches = len(int_text) // words_per_batch
    if n_batches == 0:
        raise ValueError(
            'int_text has {} words, fewer than one full batch of {}'.format(
                len(int_text), words_per_batch))

    x = np.array(int_text[:n_batches * words_per_batch])
    # Shift left by one with wrap-around. Slicing int_text[1:end + 1] comes up
    # one element short when len(int_text) is an exact multiple of
    # words_per_batch, which used to make the reshape below fail.
    y = np.roll(x, -1)

    # One row per sequence stream, then cut column-wise into n_batches pieces.
    x_batches = np.split(x.reshape(batch_size, -1), n_batches, 1)
    y_batches = np.split(y.reshape(batch_size, -1), n_batches, 1)

    return np.array(list(zip(x_batches, y_batches)))

## Neural Network Training
### Hyperparameters

In [14]:
# --- Training schedule ---
num_epochs = 50              # iterations of the outer training loop
batch_size = 512             # sequences per batch
learning_rate = 0.01         # value fed for the learning rate on every step

# --- Model dimensions ---
rnn_size = 512               # RNN size
embed_dim = 256              # embedding dimension size
seq_length = 10              # words per training sequence

# --- Logging ---
show_every_n_batches = 25    # print the training loss every n batches

## Train

In [43]:

# Cut the encoded corpus into (input, target) batch pairs once, up front.
batches = get_batches(int_text, batch_size, seq_length)

# train_graph, input_text, targets, lr, initial_state, final_state, cost,
# train_op and save_dir are not defined in this cell; they must already exist
# from earlier cells of the notebook.
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        # Re-evaluate the initial-state tensor at the start of every epoch,
        # feeding the first input batch.
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            # Feed the state returned by the previous step back in.
            # NOTE(review): build_rnn in this notebook does not pass an
            # initial state to dynamic_rnn -- confirm that the fed value
            # actually reaches the RNN.
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show every <show_every_n_batches> batches
            # (counted across epochs, so the printed batch indices shift from
            # one epoch to the next).
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

    # Save Model
    # Runs once, after the last epoch, while the session is still open.
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/13   train_loss = 8.822
Epoch   0 Batch    2/13   train_loss = 6.803
Epoch   0 Batch    4/13   train_loss = 6.167
Epoch   0 Batch    6/13   train_loss = 6.038
Epoch   0 Batch    8/13   train_loss = 5.874
Epoch   0 Batch   10/13   train_loss = 5.593
Epoch   0 Batch   12/13   train_loss = 5.598
Epoch   1 Batch    1/13   train_loss = 5.320
Epoch   1 Batch    3/13   train_loss = 5.207
Epoch   1 Batch    5/13   train_loss = 5.145
Epoch   1 Batch    7/13   train_loss = 5.082
Epoch   1 Batch    9/13   train_loss = 4.987
Epoch   1 Batch   11/13   train_loss = 4.991
Epoch   2 Batch    0/13   train_loss = 4.881
Epoch   2 Batch    2/13   train_loss = 4.832
Epoch   2 Batch    4/13   train_loss = 4.676
Epoch   2 Batch    6/13   train_loss = 4.702
Epoch   2 Batch    8/13   train_loss = 4.645
Epoch   2 Batch   10/13   train_loss = 4.547
Epoch   2 Batch   12/13   train_loss = 4.561
Epoch   3 Batch    1/13   train_loss = 4.397
Epoch   3 Batch    3/13   train_loss = 4.321
Epoch   3 

Epoch  28 Batch    2/13   train_loss = 1.151
Epoch  28 Batch    4/13   train_loss = 1.222
Epoch  28 Batch    6/13   train_loss = 1.169
Epoch  28 Batch    8/13   train_loss = 1.152
Epoch  28 Batch   10/13   train_loss = 1.129
Epoch  28 Batch   12/13   train_loss = 1.111
Epoch  29 Batch    1/13   train_loss = 1.156
Epoch  29 Batch    3/13   train_loss = 1.112
Epoch  29 Batch    5/13   train_loss = 1.102
Epoch  29 Batch    7/13   train_loss = 1.114
Epoch  29 Batch    9/13   train_loss = 1.100
Epoch  29 Batch   11/13   train_loss = 1.116
Epoch  30 Batch    0/13   train_loss = 1.051
Epoch  30 Batch    2/13   train_loss = 1.076
Epoch  30 Batch    4/13   train_loss = 1.179
Epoch  30 Batch    6/13   train_loss = 1.091
Epoch  30 Batch    8/13   train_loss = 1.091
Epoch  30 Batch   10/13   train_loss = 1.078
Epoch  30 Batch   12/13   train_loss = 1.111
Epoch  31 Batch    1/13   train_loss = 1.087
Epoch  31 Batch    3/13   train_loss = 1.055
Epoch  31 Batch    5/13   train_loss = 1.059
Epoch  31 

## Save Parameters


In [44]:

# Save parameters for checkpoint
# Hands the sequence length and the checkpoint path to the project helper as
# one tuple. NOTE(review): presumably read back before generation so the saved
# model can be located -- confirm in helper.py.
helper.save_params((seq_length, save_dir))

## Implement Generate Functions
### Get Tensors


In [16]:
def get_tensors(loaded_graph):
    """
    Fetch the tensors needed for generation from a loaded graph.

    :param loaded_graph: Object exposing get_tensor_by_name (a restored graph)
    :return: Tuple (input, initial_state, final_state, probs) looked up by
             the names 'input:0', 'initial_state:0', 'final_state:0', 'probs:0'
    """
    tensor_names = ('input:0', 'initial_state:0', 'final_state:0', 'probs:0')
    return tuple(loaded_graph.get_tensor_by_name(name) for name in tensor_names)


### Choose Word


In [17]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the most probable next word (greedy choice).

    :param probabilities: Array-like of word probabilities; the position of
                          its largest value (over the flattened array) is used
    :param int_to_vocab: Dict mapping word ids to words
    :return: The word whose id is the argmax of probabilities
    """
    best_id = np.asarray(probabilities).argmax()
    return int_to_vocab[best_id]


## Generate TV Script

In [48]:
# Number of words to generate after the prime word.
gen_length = 200
# homer_simpson, moe_szyslak, or barney_gumble
# (the prime word is looked up in vocab_to_int, so it must match the
# vocabulary's spelling; the generated scripts show lower-cased names)
prime_word = 'moe_szyslak'


# load_dir is not defined in this cell; it must already exist from an earlier
# cell and point at the checkpoint written during training.
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup
    gen_sentences = [prime_word + ':']
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input: ids of the most recent words, at most seq_length of them
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})

        # NOTE(review): this indexes the FIRST axis of `probabilities` with the
        # last time step. If `probs` has shape (batch, seq, vocab) the intended
        # slice would be probabilities[0][dyn_seq_length-1] -- confirm against
        # the cell that builds the graph.
        pred_word = pick_word(probabilities[dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)

    # Remove tokens: turn the lower-cased placeholder tokens back into
    # punctuation, dropping the space that preceded each token.
    tv_script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        tv_script = tv_script.replace(' ' + token.lower(), key)
    # Drop the space left after a newline or an opening parenthesis.
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')

    print(tv_script)

INFO:tensorflow:Restoring parameters from ./save
moe_szyslak:(uneasy) i gotta admit, moe!
homer_simpson: what's the matter, homer?
homer_simpson:(explaining) part of my new bartender.
homer_simpson:(shaking head) no, i won't accept that beer.
moe_szyslak:(incredulous) homer? you got it. i know why i made a mistake i just wanted a bottle like a pal outta.
moe_szyslak:(worried) oh, i don't think maybe i work for a dude.(chuckle) do i look at this bad guy. eh, i'm a little more sensitivity right on, i'll / i do to this bar.
homer_simpson:(on phone) boy, i can make it happen?
moe_szyslak:(terrified) what the?
moe_szyslak: hey, homer. look at me. i'm attracted to another woman and then tonight. but here's one that's even more spellbinding. once there!
moe_szyslak:" no acronyms!" had off to inspired."
carl_carlson: is that a lot of things you do?
lenny_leonard: say, homer. no doubt
