In [1]:
from collections import Counter

In [2]:
import helper
# Location of the raw script corpus; later cells pass this same path to the preprocessing helper.
data_dir = './data/Seinfeld_Scripts.txt'
# Read the corpus through the project helper; the cells below treat `text` as one string (text.split()).
text = helper.load_data(data_dir)

In [3]:
# Half-open range of corpus lines to preview at the end of this cell.
view_line_range = (0, 10)

import numpy as np

# Split into lines once; the same list feeds both the statistics and the preview.
lines = text.split('\n')
word_count_line = [len(line.split()) for line in lines]

print('Dataset Stats')
# "Roughly": this is plain whitespace splitting, so punctuation stays attached to words.
print('Roughly the number of unique words: {}'.format(len(set(text.split()))))
print('Number of lines: {}'.format(len(lines)))
print('Average number of words in each line: {}'.format(np.average(word_count_line)))

print()
print('The lines {} to {}:'.format(*view_line_range))
print('\n'.join(lines[view_line_range[0]:view_line_range[1]]))

Dataset Stats
Roughly the number of unique words: 46367
Number of lines: 109233
Average number of words in each line: 5.544240293684143

The lines 0 to 10:
jerry: do you know what this is all about? do you know, why were here? to be out, this is out...and out is one of the single most enjoyable experiences of life. people...did you ever hear people talking about we should go out? this is what theyre talking about...this whole thing, were all out now, no one is home. not one person here is home, were all out! there are people trying to find us, they dont know where we are. (on an imaginary phone) did you ring?, i cant find him. where did he go? he didnt tell me where he was going. he must have gone out. you wanna go out you get ready, you pick out the clothes, right? you take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...then youre standing around, what do you do? you go we gotta be getting back. once youre out, you wanna get back! y

In [4]:
import problem_unittests as tests

def create_lookup_tables(text):
    """
    Build the two vocabulary mappings used to encode and decode words.
    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    # most_common() lists words by descending count, so frequent words receive the smallest ids
    ranked_words = [word for word, _ in Counter(text).most_common()]

    vocab_to_int = {}
    int_to_vocab = {}
    for index, word in enumerate(ranked_words):
        vocab_to_int[word] = index
        int_to_vocab[index] = word

    return vocab_to_int, int_to_vocab

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_create_lookup_tables(create_lookup_tables)

Tests Passed


In [5]:
def token_lookup():
    """
    Generate a dict to turn punctuation into a token.
    :return: Tokenized dictionary where the key is the punctuation and the value is the token
    """
    # Written as one literal so every symbol appears exactly once and the mapping
    # can be checked at a glance.
    return {
        '.': '<PERIOD>',
        ',': '<COMMA>',
        '"': '<QUOTATION_MARK>',
        ';': '<SEMICOLON>',
        '!': '<EXCLAMATION_MARK>',
        '?': '<QUESTION_MARK>',
        '(': '<LEFT_PAREN>',
        ')': '<RIGHT_PAREN>',
        '-': '<DASH>',
        '\n': '<NEW_LINE>',
    }
        

tests.test_tokenize(token_lookup)

Tests Passed


In [6]:

# Hand the corpus path plus the two functions defined above to the project helper.
# NOTE(review): judging by its name it preprocesses the text and persists the result — confirm what it writes and where.
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

In [7]:

import helper
import problem_unittests as tests

# Checkpoint: fetch the preprocessed artefacts from the project helper.
# int_text feeds batch_data, vocab_to_int sizes the model, and int_to_vocab / token_dict are passed to generate.
int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

In [8]:

import torch

# Global flag read by RNN.init_hidden, forward_back_prop and generate to decide whether to call .cuda().
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    # Only a warning: the notebook keeps running without a GPU.
    print('No GPU found. Please use a GPU to train your neural network.')

In [9]:
from torch.utils.data import TensorDataset, DataLoader


def batch_data(words, sequence_length, batch_size):
    """
    Build a DataLoader of (input window, next word) training pairs.
    :param words: Sliceable sequence of integer word ids (list, range or 1-D array)
    :param sequence_length: Number of consecutive ids in each input window
    :param batch_size: Number of windows per batch; `words` is first trimmed to a multiple of it
    :return: DataLoader over a TensorDataset of int64 tensors, yielding x of shape
             (batch, sequence_length) and y of shape (batch,) in corpus order (no shuffling)
    """
    # Keep only as many ids as fill whole batches.
    n_batches = len(words) // batch_size

    # Explicit int64: NumPy's default integer width is platform dependent, and the
    # embedding and loss layers downstream expect Long tensors.
    word_ids = np.asarray(words[:n_batches * batch_size], dtype=np.int64)

    # Window i covers ids [i, i + sequence_length) and is labelled with the id right after it.
    # Clamp at zero so a corpus shorter than one window gives an empty, well-shaped dataset.
    n_samples = max(len(word_ids) - sequence_length, 0)

    # Broadcasting a column of window starts against a row of offsets gives every window's
    # indices at once, replacing the per-sample Python loop.
    window_starts = np.arange(n_samples)[:, None]
    offsets = np.arange(sequence_length)[None, :]
    x = word_ids[window_starts + offsets]
    y = word_ids[sequence_length:sequence_length + n_samples]

    data = TensorDataset(torch.from_numpy(x), torch.from_numpy(y))

    data_loader = DataLoader(data, shuffle=False, batch_size=batch_size)

    return data_loader


In [10]:


# Smoke-test batch_data on a tiny, easy-to-read sequence of ids.
test_text = range(50)
t_loader = batch_data(test_text, sequence_length=5, batch_size=10)

data_iter = iter(t_loader)
# Use the builtin next(): DataLoader iterators in current PyTorch releases have no .next() method.
sample_x, sample_y = next(data_iter)

print(sample_x.shape)
print(sample_x)
print(sample_y.shape)
print(sample_y)

torch.Size([10, 5])
tensor([[  0,   1,   2,   3,   4],
        [  1,   2,   3,   4,   5],
        [  2,   3,   4,   5,   6],
        [  3,   4,   5,   6,   7],
        [  4,   5,   6,   7,   8],
        [  5,   6,   7,   8,   9],
        [  6,   7,   8,   9,  10],
        [  7,   8,   9,  10,  11],
        [  8,   9,  10,  11,  12],
        [  9,  10,  11,  12,  13]])
torch.Size([10])
tensor([  5,   6,   7,   8,   9,  10,  11,  12,  13,  14])


In [11]:
import torch.nn as nn

class RNN(nn.Module):
    """
    Word-level language model: embedding -> stacked LSTM -> linear layer.

    forward() returns one score vector per input sequence, taken from the last time step.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5, lr=0.001):
        """
        Create the layers of the network.
        :param vocab_size: Number of entries in the embedding table
        :param output_size: Number of scores produced by the final linear layer
        :param embedding_dim: Size of each word embedding
        :param hidden_dim: Size of the LSTM hidden state
        :param n_layers: Number of stacked LSTM layers
        :param dropout: Dropout value passed to nn.LSTM
        :param lr: Not used by the model; kept so existing callers that pass it keep working
        """
        super(RNN, self).__init__()

        # Sizes read again by forward() and init_hidden().
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Layers are created in the order embedding, lstm, fc so that seeded initialisation
        # and the order of self.parameters() stay as they were.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Forward propagation of the network.
        :param nn_input: Tensor of word ids, batch dimension first (the LSTM is batch_first)
        :param hidden: Tuple (h, c) of LSTM states, e.g. from init_hidden
        :return: (scores of shape (batch, output_size) for the last time step, new hidden state)
        """
        embeds = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the final time step is returned, so project just that step rather than
        # running the linear layer over every position and discarding the rest.
        out = self.fc(lstm_out[:, -1, :])

        return out, hidden

    def init_hidden(self, batch_size):
        """
        Create a zeroed (h, c) state for the LSTM.
        :param batch_size: Batch size the state is built for
        :return: Tuple of two zero tensors of shape (n_layers, batch_size, hidden_dim)
        """
        # new_zeros copies dtype (and device) from an existing parameter.
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        hidden = (weight.new_zeros(shape), weight.new_zeros(shape))

        # Placement follows the global train_on_gpu flag, not the parameter device, so a
        # state created before the model is moved to the GPU is still usable afterwards.
        if train_on_gpu:
            hidden = tuple(state.cuda() for state in hidden)

        return hidden

tests.test_rnn(RNN, train_on_gpu)

Tests Passed


In [12]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """
    Run one training step (forward pass, backward pass, optimizer update) on a single batch.
    :param rnn: Model called as rnn(inputs, hidden) -> (output, hidden)
    :param optimizer: Optimizer whose step() applies the update
    :param criterion: Loss called as criterion(output, target)
    :param inp: Batch of inputs
    :param target: Batch of targets
    :param hidden: Tuple of hidden-state tensors carried over from the previous batch
    :return: (loss as a Python float, hidden state returned by the model)
    """
    # `inputs` must be bound on the CPU path as well; binding it only inside the GPU
    # branch leaves the forward pass below with an unbound local when no GPU is used.
    inputs = inp
    if train_on_gpu:
        rnn.cuda()
        inputs, target = inputs.cuda(), target.cuda()

    # Take the raw values of the carried state so backpropagation does not reach
    # into earlier batches.
    h = tuple([each.data for each in hidden])

    rnn.zero_grad()

    output, h = rnn(inputs, h)

    loss = criterion(output, target)

    loss.backward()
    # Clip the gradient norm at 5 before the update.
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)

    optimizer.step()
    return loss.item(), h

tests.test_forward_back_prop(RNN, forward_back_prop, train_on_gpu)

Tests Passed


In [13]:

def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100):
    """
    Train the model on the module-level `train_loader` and report the running loss.
    :param rnn: Model exposing train() and init_hidden(batch_size)
    :param batch_size: Batch size the hidden state is created for
    :param optimizer: Optimizer handed to forward_back_prop
    :param criterion: Loss function handed to forward_back_prop
    :param n_epochs: Number of passes over train_loader
    :param show_every_n_batches: Print the average loss every this many batches
    :return: The same model object, after training
    """
    batch_losses = []

    rnn.train()

    # Number of full batches per epoch. The hidden state is created for exactly
    # batch_size sequences, so a trailing partial batch is skipped. The value does
    # not change during training, so it is computed once rather than per batch.
    n_batches = len(train_loader.dataset) // batch_size

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):

        # Start every epoch from a zero hidden state.
        hidden = rnn.init_hidden(batch_size)

        for batch_i, (inputs, labels) in enumerate(train_loader, 1):

            if batch_i > n_batches:
                break

            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            batch_losses.append(loss)

            # The list is only reset after a report, so a reported average can include
            # batches from the end of the previous epoch.
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

    return rnn

In [14]:

# Number of word ids in each input window; generate() also reads this as a global.
sequence_length =  10

# Number of windows per training batch.
batch_size = 128

# Loader over the preprocessed corpus; train_rnn reads `train_loader` as a global.
train_loader = batch_data(int_text, sequence_length, batch_size)

In [15]:

# Number of passes over train_loader.
num_epochs = 5

# Step size handed to the Adam optimizer in the training cell.
learning_rate = 0.001


# One embedding row per known word.
vocab_size = len(vocab_to_int)

# The model scores every vocabulary word as the possible next word.
output_size = vocab_size

# Size of each word embedding.
embedding_dim = 200

# Size of the LSTM hidden state.
hidden_dim = 250

# Number of stacked LSTM layers.
n_layers = 2

# Print the running loss every this many batches.
show_every_n_batches = 2000

# Show the vocabulary size for reference.
print(len(vocab_to_int))

21388


In [16]:

# Build the model from the hyperparameters above and move it to the GPU when one is available.
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if train_on_gpu:
    rnn.cuda()


# Adam over all model parameters, with cross-entropy loss on the predicted next-word scores.
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()


# train_rnn returns the model it was given, so trained_rnn and rnn are the same object.
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)


# Persist the trained model through the project helper so a later cell can reload it.
helper.save_model('./save/trained_rnn', trained_rnn)
print('Model Trained and Saved')

Training for 5 epoch(s)...
Epoch:    1/5     Loss: 4.947368633985519

Epoch:    1/5     Loss: 4.498962902665138

Epoch:    1/5     Loss: 4.355078360199928

Epoch:    2/5     Loss: 4.116275652961911

Epoch:    2/5     Loss: 3.948169407486916

Epoch:    2/5     Loss: 3.910545413374901

Epoch:    3/5     Loss: 3.806903993424701

Epoch:    3/5     Loss: 3.7283408836126326

Epoch:    3/5     Loss: 3.721267277598381

Epoch:    4/5     Loss: 3.642319423088809

Epoch:    4/5     Loss: 3.5875446372032167

Epoch:    4/5     Loss: 3.588407546877861

Epoch:    5/5     Loss: 3.530836515751168

Epoch:    5/5     Loss: 3.487316879153252

Epoch:    5/5     Loss: 3.4958903646469115



  "type " + obj.__name__ + ". It won't be checked "


Model Trained and Saved


### Question: How did you decide on your model hyperparameters? 
For example, did you try different sequence_lengths and find that one size made the model converge faster? What about your hidden_dim and n_layers; how did you decide on those?

**Answer:** 



I tried:
- sequence_length =  10, batch_size = 64, learning_rate = 0.01, embedding_dim = 200, hidden_dim = 200, n_layers = 2. Started with loss 9.25 and after 4 epochs the loss was still around 9.26.
- sequence_length =  10, batch_size = 64, learning_rate = 0.003, embedding_dim = 300, hidden_dim = 250, n_layers = 2 Started with Loss: 9.202159190654754 and at epoch 4 it was Loss: 9.206429640371343
- sequence_length =  20, batch_size = 20, learning_rate = 0.3, embedding_dim = 300, hidden_dim = 250, n_layers = 2 Started with Loss: 9.70091618013382, and at epoch 4 it was still around 9.6
- sequence_length =  20, batch_size = 124, learning_rate = 1, embedding_dim = 200, hidden_dim = 200, n_layers = 2 Started with Epoch: 1/10 Loss: 9.50547212076187 .. 

At this point I had some bugs in my code related to `zero_grad`, an extra dropout layer, and a sigmoid layer.
I fixed these issues and retried:

- sequence_length =  10, batch_size = 128, learning_rate = 0.001, embedding_dim = 200, hidden_dim = 250, n_layers = 2 Started with:
    Training for 10 epoch(s)...
    Epoch:    1/10    Loss: 4.944083527803421
    ...
    Epoch:    4/10    Loss: 3.5780555000305174
    ...
    Epoch:    7/10    Loss: 3.3266124720573425
    ...
    
- sequence_length =  10, batch_size = 124, learning_rate = 0.1, embedding_dim = 200, hidden_dim = 200, n_layers = 2 Started with 
    Training for 10 epoch(s)... 
    Epoch:    1/5    Loss: 5.481069218158722
    Epoch:    2/5    Loss: 5.025624033570289
    Epoch:    3/5   Loss: 4.981013494968415
    
I stopped here because, even though the loss was decreasing, it seemed to converge much more slowly than the previous experiment, which used a lower learning rate and a slightly bigger hidden_dim.

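The first experiment after the fixes (learning_rate = 0.001, hidden_dim = 250) is the configuration used for the final 5-epoch training run above, which reached: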
`Epoch:   5/5    Loss: 3.4958903646469115`.

In [17]:

import torch
import helper
import problem_unittests as tests

# Checkpoint: reload the vocabulary tables and token map; the first returned value is not needed for generation.
_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
# Reload the model from the same path prefix the training cell passed to helper.save_model.
trained_rnn = helper.load_model('./save/trained_rnn')

In [18]:

import torch.nn.functional as F

def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100, top_k=5):
    """
    Generate text using the neural network, one word at a time.
    :param rnn: Trained model exposing eval(), init_hidden(batch_size) and rnn(seq, hidden)
    :param prime_id: Id of the word that starts the generated text
    :param int_to_vocab: Dict of word id keys to word values
    :param token_dict: Dict of punctuation symbol keys to token values
    :param pad_value: Id used to left-pad the initial input window
    :param predict_len: Number of words to generate after the prime word
    :param top_k: Sample each next word from this many highest-probability candidates
    :return: The generated text, with punctuation tokens turned back into symbols
    """
    rnn.eval()

    # The sliding window stays a NumPy array for the whole loop and a fresh tensor is
    # built from it each step: np.roll below cannot take a CUDA tensor.
    # Its width comes from the module-level `sequence_length`.
    current_seq = np.full((1, sequence_length), pad_value, dtype=np.int64)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for _ in range(predict_len):
            seq_tensor = torch.from_numpy(current_seq)
            if train_on_gpu:
                seq_tensor = seq_tensor.cuda()

            # A fresh zero state each step; the context is carried by the window itself.
            hidden = rnn.init_hidden(seq_tensor.size(0))

            output, _ = rnn(seq_tensor, hidden)

            # Scores -> probabilities, on the CPU so they can be handed to NumPy.
            p = F.softmax(output, dim=1).cpu()

            # Never ask for more candidates than there are scores.
            k = min(top_k, p.size(1))
            p, top_i = p.topk(k)
            # Row 0 is the single sequence in the batch.
            top_i = top_i[0].numpy()
            p = p[0].numpy()

            # Sample among the candidates in proportion to their renormalised probability.
            word_i = int(np.random.choice(top_i, p=p / p.sum()))

            predicted.append(int_to_vocab[word_i])

            # Slide the window left by one and append the word just generated.
            current_seq = np.roll(current_seq, -1, 1)
            current_seq[-1][-1] = word_i

    gen_sentences = ' '.join(predicted)

    # Turn punctuation tokens back into symbols, dropping the space before each one.
    for key, token in token_dict.items():
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')

    return gen_sentences

In [19]:
# run the cell multiple times to get different results!
# Sampling inside generate() is random, so re-running this cell gives a different script each time.
gen_length = 400 # number of words to generate after the prime word; modify to your preference
prime_word = 'jerry' # speaker name that starts the script; ':' is appended below to form the vocabulary key

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
pad_word = helper.SPECIAL_WORDS['PADDING']
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)



jerry: rise.

hoyt: the honor.

jerry: you know what?

kramer: yeah, i can't believe that was the first thing.

[new witness: the phone

chiles: what do you want to do?

hoyt: what do you want to do?

elaine: no.

hoyt: what about the hell?

hoyt: you know what you want to do with the defendants?

elaine: what?!

elaine: no, i don't know.

hoyt: what?

george: yeah.

elaine: yeah! i think you are in the bottom of my shirt.

hoyt: you can't believe the defendants is going to be a little bystander.

hoyt: i think it's the most time.

elaine: oh.

hoyt: i don't want a little bystander to get a little bystander.

george: what?

hoyt: no, that's not the most exciting.

hoyt: you know, i think i could have watched it.

hoyt: the defendants- bone!

jerry: i can't do to see this. it's like a little mishap.

[new witness: voice:

george: so what is this?

hoyt: what do you think?

jerry: oh, yeah.

hoyt: what about that?

jerry: i don't know what i think that i got it, but i have a lot of selfi

In [20]:

# Save the generated script; the with-block closes the file even if the write raises.
with open("generated_script_2.txt", "w") as f:
    f.write(generated_script)