# TV Script Generation

In [8]:
# Load the raw script corpus through the project-local `helper` module.
import helper
# Path is relative to the notebook's working directory.
data_dir = './data/Seinfeld_Scripts.txt'
# `text` is split with str.split() in the next cell, so it is treated as one big string.
text = helper.load_data(data_dir)

In [9]:
# (start, stop) indices of the lines echoed at the bottom of this cell
view_line_range = (0, 10)

import numpy as np

# Split into lines once; the same list feeds the counts and the preview below.
lines = text.split('\n')
word_count_line = [len(line.split()) for line in lines]

print('Dataset Stats')
# "Roughly": whitespace splitting leaves punctuation attached to the words.
print('Roughly the number of unique words: {}'.format(len(set(text.split()))))
print('Number of lines: {}'.format(len(lines)))
print('Average number of words in each line: {}'.format(np.average(word_count_line)))

print()
print('The lines {} to {}:'.format(*view_line_range))
print('\n'.join(lines[slice(*view_line_range)]))

Dataset Stats
Roughly the number of unique words: 46367
Number of lines: 109233
Average number of words in each line: 5.544240293684143

The lines 0 to 10:
jerry: do you know what this is all about? do you know, why were here? to be out, this is out...and out is one of the single most enjoyable experiences of life. people...did you ever hear people talking about we should go out? this is what theyre talking about...this whole thing, were all out now, no one is home. not one person here is home, were all out! there are people trying to find us, they dont know where we are. (on an imaginary phone) did you ring?, i cant find him. where did he go? he didnt tell me where he was going. he must have gone out. you wanna go out you get ready, you pick out the clothes, right? you take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...then youre standing around, what do you do? you go we gotta be getting back. once youre out, you wanna get back! y

## Lookup Table

In [10]:
from collections import Counter

def create_lookup_tables(text):
    """
    Build the two vocabulary dictionaries used to encode and decode words.

    Ids are assigned by descending frequency, so the most common word gets
    id 0. Words with equal counts keep the order in which they first appear.

    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    # most_common() with no argument returns every word, highest count first
    ranked_words = [word for word, _ in Counter(text).most_common()]

    vocab_to_int = dict(zip(ranked_words, range(len(ranked_words))))
    int_to_vocab = dict(enumerate(ranked_words))
    return (vocab_to_int, int_to_vocab)

## Tokenizing Punctuation

In [11]:
def token_lookup():
    """
    Build the punctuation-to-token mapping.

    A fresh dict is returned on every call.

    :return: Dict whose keys are punctuation symbols and whose values are the
             placeholder tokens that stand in for them
    """
    punctuation_tokens = (
        ('.', '<PERIOD>'),
        (',', '<COMMA>'),
        ('"', '<QUOTATION_MARK>'),
        (';', '<SEMICOLON>'),
        ('!', '<EXCLAMATION_MARK>'),
        ('?', '<QUESTION_MARK>'),
        ('(', '<LEFT_PAREN>'),
        (')', '<RIGHT_PAREN>'),
        ('-', '<DASH>'),
        ('\n', '<NEW_LINE>'),
    )
    return dict(punctuation_tokens)

## Pre-processing the data and saving it

In [12]:
# Pass the corpus path and the two functions defined above (as callables, not
# their results) to the project helper. Judging by its name it preprocesses
# the text and writes the result to disk; the helper source is not visible here.
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

Loading the saved pre-processed data

In [13]:
# load_preprocess() yields four values, unpacked in this order. `int_text` is
# what batch_data() later receives as its word-id sequence; presumably these are
# the artefacts saved by preprocess_and_save_data above (TODO confirm in helper).
int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

---
## Building the Neural Network

In [14]:
import torch

# Check for a GPU
# `train_on_gpu` is a notebook-wide flag: RNN.init_hidden, forward_back_prop and
# generate all read it to decide whether to call .cuda().
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print('No GPU found.')

No GPU found.


In [15]:
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size):
    """
    Batch the neural network data using DataLoader

    Every window of `sequence_length` consecutive word ids becomes one feature
    row and the id that immediately follows the window becomes its target.
    Before windowing, `words` is truncated to a multiple of `batch_size`.

    :param words: The word ids of the TV scripts (any sliceable int sequence)
    :param sequence_length: The sequence length of each batch
    :param batch_size: The size of each batch; the number of sequences in a batch
    :return: Shuffling DataLoader over (features, target) pairs, both int64
    """
    n_batches = len(words) // batch_size

    # Force int64: nn.CrossEntropyLoss needs int64 class-index targets, and the
    # platform-default integer type is not int64 everywhere.
    # np.array (not asarray) copies, so the tensors never alias the caller's data.
    word_ids = np.array(words[:n_batches * batch_size], dtype=np.int64)

    # Too-short inputs yield an empty dataset rather than a negative count.
    n_samples = max(len(word_ids) - sequence_length, 0)

    # Row i of the index matrix is [i, i+1, ..., i+sequence_length-1]; a single
    # gather replaces building one Python slice per sample.
    window_index = np.arange(n_samples)[:, None] + np.arange(sequence_length)[None, :]
    features = word_ids[window_index]
    # Target for window i is the word right after it: word_ids[i + sequence_length].
    target = word_ids[sequence_length:sequence_length + n_samples]

    data = TensorDataset(torch.from_numpy(features), torch.from_numpy(target))
    data_loader = DataLoader(data, shuffle=True, batch_size=batch_size)
    return data_loader

### Testing dataloader

In [16]:
# Consecutive ids make the result easy to eyeball: every target printed below
# should be the last value of its feature row plus one.
test_text = range(50)
t_loader = batch_data(test_text, sequence_length=5, batch_size=10)

data_iter = iter(t_loader)
# Use the built-in next(): DataLoader iterators implement __next__, and the
# Python-2-style `.next()` alias is not available in current PyTorch releases.
sample_x, sample_y = next(data_iter)

print(sample_x.shape)
print(sample_x)
print()
print(sample_y.shape)
print(sample_y)

torch.Size([10, 5])
tensor([[29, 30, 31, 32, 33],
        [24, 25, 26, 27, 28],
        [ 3,  4,  5,  6,  7],
        [ 2,  3,  4,  5,  6],
        [22, 23, 24, 25, 26],
        [ 4,  5,  6,  7,  8],
        [43, 44, 45, 46, 47],
        [21, 22, 23, 24, 25],
        [39, 40, 41, 42, 43],
        [35, 36, 37, 38, 39]])

torch.Size([10])
tensor([34, 29,  8,  7, 27,  9, 48, 26, 44, 40])


In [17]:
import torch.nn as nn

class RNN(nn.Module):
    """
    Word-level language model: embedding -> stacked LSTM -> linear projection.

    Given a batch of word-id sequences, forward() returns one score per
    vocabulary entry for the word that follows each sequence.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """
        Initialize the PyTorch RNN Module
        :param vocab_size: The number of input dimensions of the neural network (the size of the vocabulary)
        :param output_size: The number of output dimensions of the neural network
        :param embedding_dim: The size of the word embeddings
        :param hidden_dim: The size of the hidden layer outputs
        :param n_layers: The number of stacked LSTM layers
        :param dropout: dropout to add in between LSTM layers
        """
        super(RNN, self).__init__()

        # sizes needed again by init_hidden()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Sub-modules are created in the original order (embedding, lstm, fc) so
        # that parameter registration and random initialisation are unchanged.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network
        :param nn_input: The input to the neural network, word ids of shape (batch, seq_len)
        :param hidden: The hidden state, an (h, c) tuple as produced by init_hidden
        :return: Two Tensors, the output of the neural network for the last time
                 step, shape (batch, output_size), and the latest hidden state
        """
        embeds = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the final time step is returned, so project just that step
        # instead of running every step through the vocabulary-sized layer.
        last_step = lstm_out[:, -1, :]
        out = self.fc(last_step)
        return out, hidden

    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of an LSTM
        :param batch_size: The batch_size of the hidden state
        :return: (h, c) tuple of zero tensors, each of dims (n_layers, batch_size, hidden_dim)
        '''
        # new_zeros inherits dtype and device from the model's own weights, so the
        # state always lives where the model lives (CPU or GPU) without consulting
        # any notebook-level flag.
        weight = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.hidden_dim)
        hidden = (weight.new_zeros(state_shape), weight.new_zeros(state_shape))
        return hidden

### Forward and backpropagation

In [18]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """
    Forward and backward propagation on the neural network
    :param rnn: The PyTorch Module that holds the neural network
    :param optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param inp: A batch of input to the neural network
    :param target: The target output for the batch of input
    :param hidden: The hidden state carried over from the previous batch
    :return: The loss and the latest hidden state Tensor
    """
    # move model and data to the GPU when the notebook-level flag says one exists
    if train_on_gpu:
        rnn.cuda()
        inp, target = inp.cuda(), target.cuda()

    # Cut the hidden state loose from the previous batch's graph; otherwise
    # backward() would try to propagate through every batch seen so far.
    detached_hidden = tuple(state.detach() for state in hidden)

    rnn.zero_grad()

    output, hidden = rnn(inp, detached_hidden)
    loss = criterion(output, target)

    loss.backward()
    # cap the gradient norm at 5 to guard against exploding gradients
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)
    optimizer.step()

    # return the loss over a batch and the hidden state produced by our model
    return loss.item(), hidden

### Training

In [12]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100, data_loader=None):
    """
    Train the network for a number of epochs, printing the running loss.
    :param rnn: The PyTorch Module to train; must provide init_hidden(batch_size)
    :param batch_size: Number of sequences per batch; also sizes the hidden state
    :param optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param n_epochs: Number of passes over the data
    :param show_every_n_batches: Print the average loss every this many batches
    :param data_loader: Loader yielding (inputs, labels) batches; when omitted,
                        the notebook-level `train_loader` is used
    :return: The trained rnn (the same object that was passed in)
    """
    if data_loader is None:
        # Backward-compatible default: fall back to the global loader that
        # existing calls rely on.
        data_loader = train_loader

    # The hidden state is allocated for exactly `batch_size` sequences, so a
    # trailing partial batch cannot be fed through. The cut-off does not change
    # between iterations, so compute it once.
    n_full_batches = len(data_loader.dataset) // batch_size

    batch_losses = []

    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):

        # initialize hidden state
        hidden = rnn.init_hidden(batch_size)

        for batch_i, (inputs, labels) in enumerate(data_loader, 1):

            # iterating over completely full batches only
            if batch_i > n_full_batches:
                break

            # forward, back prop
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            # record loss
            batch_losses.append(loss)

            # printing loss stats (average since the previous print, then reset)
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

    # returns a trained rnn
    return rnn

## Hyperparameters

### Data params

In [38]:
# Number of words in each training window. generate() also reads this global
# to size its seed sequence.
sequence_length = 10
# Sequences per batch; passed to both batch_data() here and train_rnn() later.
batch_size = 128

# data loader
# train_rnn() picks this loader up as a global rather than as an argument.
train_loader = batch_data(int_text, sequence_length, batch_size)

### Training parameters

In [39]:
# Number of passes over train_loader
num_epochs = 10
# Step size handed to the Adam optimizer in the training cell
learning_rate = 0.001

# Model parameters
# One embedding row per vocabulary entry
vocab_size = len(vocab_to_int)
# The model scores every vocabulary word as the possible next word
output_size = vocab_size
embedding_dim = 400
hidden_dim = 256
# Number of RNN Layers
n_layers = 2

# Show stats for every n number of batches
show_every_n_batches = 2000

In [40]:
# create model and move to gpu if available
# Build the model from the hyperparameters defined in the cells above
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if train_on_gpu:
    rnn.cuda()

# defining loss and optimization functions for training
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# training the model
# train_rnn returns the same module it was given, so trained_rnn and rnn are one object
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

# saving the trained model
helper.save_model('./save/trained_rnn', trained_rnn)
print('Model Trained and Saved')

Training for 10 epoch(s)...
Epoch:    1/10    Loss: 4.848484973073005

Epoch:    1/10    Loss: 4.32052004468441

Epoch:    1/10    Loss: 4.17849313557148

Epoch:    2/10    Loss: 3.9889707426337218

Epoch:    2/10    Loss: 3.903261484503746

Epoch:    2/10    Loss: 3.892122924685478

Epoch:    3/10    Loss: 3.770667808878454

Epoch:    3/10    Loss: 3.7277095453739166

Epoch:    3/10    Loss: 3.737306085467339

Epoch:    4/10    Loss: 3.654018964728898

Epoch:    4/10    Loss: 3.618053295135498

Epoch:    4/10    Loss: 3.6385857957601546

Epoch:    5/10    Loss: 3.5590090334736755

Epoch:    5/10    Loss: 3.5295723952054976

Epoch:    5/10    Loss: 3.569708483815193

Epoch:    6/10    Loss: 3.4879602127962035

Epoch:    6/10    Loss: 3.468995282769203

Epoch:    6/10    Loss: 3.5132147393226623

Epoch:    7/10    Loss: 3.427718820276286

Epoch:    7/10    Loss: 3.4228406841754913

Epoch:    7/10    Loss: 3.4621838989257814

Epoch:    8/10    Loss: 3.3865625669532067

Epoch:    8/10    

  "type " + obj.__name__ + ". It won't be checked "


Model Trained and Saved


Loading the saved model

In [41]:
import torch
import helper
import problem_unittests as tests

# Only the lookup tables are needed for generation; the first value returned by
# load_preprocess() (bound to int_text earlier in the notebook) is discarded.
_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
# Same path that was passed to helper.save_model in the training cell
trained_rnn = helper.load_model('./save/trained_rnn')

## Generating TV Script

In [42]:
import torch.nn.functional as F

def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100, top_k=5):
    """
    Generate text using the neural network
    :param rnn: The PyTorch Module that holds the trained neural network
    :param prime_id: The word id to start the first prediction
    :param int_to_vocab: Dict of word id keys to word values
    :param token_dict: Dict mapping each punctuation symbol (key) to its token (value)
    :param pad_value: The value used to pad a sequence
    :param predict_len: The length of text to generate
    :param top_k: How many of the most probable words to sample from at each step
    :return: The generated text
    """
    rnn.eval()

    # create a sequence (batch_size=1) with the prime_id
    # This window stays a NumPy array for the whole loop. A separate tensor is
    # built from it on every step, so np.roll below never receives a (possibly
    # CUDA) torch tensor.
    current_seq = np.full((1, sequence_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    for _ in range(predict_len):
        seq_tensor = torch.LongTensor(current_seq)
        if train_on_gpu:
            seq_tensor = seq_tensor.cuda()

        # initialize the hidden state (fresh for every step; all context comes
        # from the sliding window itself)
        hidden = rnn.init_hidden(seq_tensor.size(0))

        # get the output of the rnn; no gradients are needed for inference
        with torch.no_grad():
            output, _ = rnn(seq_tensor, hidden)

        # get the next word probabilities, on the CPU so they convert to NumPy
        p = F.softmax(output, dim=1).detach().cpu()

        # use top_k sampling to get the index of the next word
        p, top_i = p.topk(top_k)
        # reshape(-1) rather than squeeze() so top_k == 1 still gives a 1-D array
        top_i = top_i.numpy().reshape(-1)

        # select the likely next word index with some element of randomness
        p = p.numpy().reshape(-1)
        word_i = int(np.random.choice(top_i, p=p/p.sum()))

        # retrieve that word from the dictionary
        word = int_to_vocab[word_i]
        predicted.append(word)

        # the generated word becomes the next "current sequence" and the cycle can continue
        current_seq = np.roll(current_seq, -1, 1)
        current_seq[-1][-1] = word_i

    gen_sentences = ' '.join(predicted)

    # Replace punctuation tokens (stored lower-cased in the vocabulary) with the
    # symbols they stand for, dropping the space that preceded each token
    for key, token in token_dict.items():
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')

    # return all the sentences
    return gen_sentences

In [43]:
# Number of words to generate after the prime word
gen_length = 400
prime_word = 'jerry' # name for starting the script

# Word used to left-pad the seed window; generate() places the prime id in the last slot
pad_word = helper.SPECIAL_WORDS['PADDING']
# The ':' is appended because speaker labels appear in the vocabulary with their colon (e.g. "jerry:")
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)



jerry:?

elaine: no, no. i can't believe it.

kramer: yeah, well, i'm sorry, i don't think i should go.

george: i don't understand.(to george) what are you saying?

jerry: yeah, well you have no idea what i said.

george:(to jerry) hey.

george: i know what you want. i'm going to get out with that.

jerry: i know.. you should take a look at the other side.

george: i think i may know.

jerry: what do you mean?

jerry: i don't know! i think you're not going out.

jerry: well i was in the neighborhood, and i was a little girl on the way of the front row.(to jerry) i don't know what to do with it. i don't think i'm going to be able to get going.

george: well, i don't know, but...

elaine:(to the phone) you wanna get out of here?

jerry: i know. i mean, i don't know what you want. i think i'm not really good looking.

george:(to jerry) you don't think you could do that.(to jerry) what?!

jerry: yeah.

jerry: what about?

elaine: yeah.

elaine: i know, but you gotta go to the game, you go