In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Upload the helper module through the Colab file picker and store the first
# uploaded file's bytes next to the notebook as `mylib.py` so it can be imported.
from google.colab import files
src = list(files.upload().values())[0]
# Context manager guarantees the bytes are flushed and the handle is closed
# before the import below reads the file back from disk.
with open('mylib.py', 'wb') as module_file:
    module_file.write(src)
import mylib

Saving helper.py to helper.py


In [4]:
# Load the raw script through the uploaded helper module and preview the
# first 100 characters of what it returned.
data_path = "seinfeld_script.txt"
TEXT = mylib.load_data(data_path)
TEXT[:100]

'jerry: do you know what this is all about? do you know, why were here? to be out, this is out...and '

### Explore Data

In [5]:
# Range of raw lines previewed further down, and the number of distinct
# tokens obtained by splitting the corpus on single spaces.
view_sentence_range = (0, 10)
unique_words = len(set(TEXT.split(" ")))
print("Unique Words : {}".format(unique_words))

Unique Words : 46713


In [6]:
# Split the corpus on blank lines. Despite its name, `num_scene` holds the
# list of chunks, not a count; the count is taken with len() when printing.
# NOTE(review): the recorded output reports the same number here as the line
# count in the next cell, so each blank-line chunk appears to be a single
# line of dialogue rather than a whole scene — confirm against the corpus.
num_scene = TEXT.split("\n\n")
print("Number of scenes : {}".format(len(num_scene)))

Number of scenes : 54617


In [7]:
# Flatten every blank-line chunk into its individual newline-separated lines.
sentences = [line for chunk in num_scene for line in chunk.split('\n')]
print("Number of lines : {}".format(len(sentences)))

Number of lines : 54617


In [8]:
# Whitespace-delimited token count of every line, then its mean.
word_count_sentence = [len(line.split()) for line in sentences]
print("Average number of words in each line : {}".format(
    np.average(word_count_sentence)))

Average number of words in each line : 11.088379076111833


In [9]:
# Show the raw lines selected by view_sentence_range (start, stop).
print('\n'.join(TEXT.split('\n')[slice(*view_sentence_range)]))

jerry: do you know what this is all about? do you know, why were here? to be out, this is out...and out is one of the single most enjoyable experiences of life. people...did you ever hear people talking about we should go out? this is what theyre talking about...this whole thing, were all out now, no one is home. not one person here is home, were all out! there are people trying to find us, they dont know where we are. (on an imaginary phone) did you ring?, i cant find him. where did he go? he didnt tell me where he was going. he must have gone out. you wanna go out you get ready, you pick out the clothes, right? you take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...then youre standing around, what do you do? you go we gotta be getting back. once youre out, you wanna get back! you wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right? where ever you are in life, its my feeling, youve gotta go. 

jerry: (pointi

### Data Preprocessing

In [10]:
def create_lookup_table(text):
    """Build the two vocabulary mappings for a tokenised corpus.

    Every distinct element of ``text`` receives an integer id equal to its
    position in sorted order.

    :param text: iterable of hashable, mutually comparable tokens
    :return: tuple ``(vocab_to_int, int_to_vocab)``
    """
    ordered_vocab = sorted(set(text))
    int_to_vocab = dict(enumerate(ordered_vocab))
    vocab_to_int = {token: index for index, token in int_to_vocab.items()}
    return vocab_to_int, int_to_vocab

In [11]:
def token_lookup():
    """Return the mapping from punctuation symbols to placeholder tokens.

    :return: dict whose keys are punctuation strings and whose values are
        the ``||Name||`` tokens that stand in for them
    """
    return {
        ".": "||Period||",
        ",": "||Comma||",
        "\"": "||Quotation_Mark||",
        ";": "||Semicolon||",
        "!": "||Exclamation_Mark||",
        "?": "||Question_Mark||",
        "(": "||Left_Parntheses||",
        ")": "||Right_Parntheses||",
        "--": "||Dash||",
        "\n": "||Return||",
    }

In [12]:
# Hand the raw script path plus the punctuation-token and vocabulary builders
# defined above to the helper module's preprocessing routine.
# NOTE(review): presumably this persists the result that the checkpoint cell
# reloads with mylib.load_preprocess() — confirm in the helper module.
mylib.preprocess_and_save_data(data_path, token_lookup, create_lookup_table)

### Check-point 1

In [13]:
# Reload the preprocessed corpus and lookup tables from the helper module so
# the notebook can be resumed from here.
int_text, vocab_to_int, int_to_vocab, token_dict = mylib.load_preprocess()

### Build Neural Network

In [28]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [14]:
# Notebook-wide flag: True when PyTorch can see a CUDA device. Later cells
# read it to decide whether to move the model and tensors to the GPU.
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print('No GPU found. Please use a GPU to train your neural network.')

In [17]:
def batch_data(words, sequence_length, batch_size):
    """Build a DataLoader of sliding-window (sequence, next word) pairs.

    Sample ``i`` is ``words[i:i + sequence_length]`` and its target is the
    word that follows it, ``words[i + sequence_length]``.

    :param words: sequence of integer word ids (list, range or 1-D array)
    :param sequence_length: number of words in every input window
    :param batch_size: batch size of the returned loader; trailing words that
        do not fill a whole multiple of ``batch_size`` are dropped first
    :return: an unshuffled ``DataLoader`` over a ``TensorDataset`` of int64
        tensors ``(x, y)`` with shapes ``(n, sequence_length)`` and ``(n,)``
    """
    n_batches = len(words) // batch_size
    # Fixed int64 dtype keeps the tensors integer-typed even when nothing is
    # left after truncation (an empty array would otherwise be float64).
    word_ids = np.asarray(words[:n_batches * batch_size], dtype=np.int64)
    n_samples = max(len(word_ids) - sequence_length, 0)

    # Row i of the index grid is [i, i + 1, ..., i + sequence_length - 1], so
    # one fancy-indexing gather replaces the per-window Python loop.
    window_index = np.arange(n_samples)[:, None] + np.arange(sequence_length)[None, :]
    features = word_ids[window_index]
    # .copy() so the target tensor never aliases the caller's array.
    targets = word_ids[sequence_length:sequence_length + n_samples].copy()

    data = TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    data_loader = DataLoader(data, shuffle=False, batch_size=batch_size)
    return data_loader

In [18]:
# test-case: 50 consecutive ids, windows of 5, batches of 10
test_text = range(50)
t_loader = batch_data(test_text, sequence_length=5, batch_size=10)
data_iter = iter(t_loader)
# Use the built-in next(): the iterator's `.next()` method is a Python 2
# leftover that current PyTorch DataLoader iterators no longer provide.
sample_x, sample_y = next(data_iter)
print(sample_x.shape)
print(sample_x)
print(sample_y.shape)
print(sample_y)

torch.Size([10, 5])
tensor([[ 0,  1,  2,  3,  4],
        [ 1,  2,  3,  4,  5],
        [ 2,  3,  4,  5,  6],
        [ 3,  4,  5,  6,  7],
        [ 4,  5,  6,  7,  8],
        [ 5,  6,  7,  8,  9],
        [ 6,  7,  8,  9, 10],
        [ 7,  8,  9, 10, 11],
        [ 8,  9, 10, 11, 12],
        [ 9, 10, 11, 12, 13]])
torch.Size([10])
tensor([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14])


In [20]:
class RNN(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear layer.

    ``forward`` returns scores for the word that follows the input sequence,
    computed from the LSTM output at the final time step only.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5, lr=0.001):
        """
        :param vocab_size: The number of input dimensions of the neural network (the size of the vocabulary)
        :param output_size: The number of output dimensions of the neural network
        :param embedding_dim: The size of the word embeddings
        :param hidden_dim: The size of the hidden layer outputs
        :param n_layers: The number of stacked LSTM layers
        :param dropout: dropout applied between LSTM layers
        :param lr: Not used by the module; accepted so existing callers keep working
        """
        super(RNN, self).__init__()
        # embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM layer (batch-first: inputs are (batch, seq, feature))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        # class variables
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # fully-connected output layer
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network
        :param nn_input: Batch of word-id sequences, indexed (batch, seq)
        :param hidden: The hidden state, as produced by ``init_hidden`` or a previous call
        :return: Two values: the output scores for the last time step,
                 shaped (batch, output_size), and the latest hidden state
        """
        embeds = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # Only the final time step is returned, so project just that step
        # instead of scoring every position and discarding all but one.
        out = self.fc(lstm_out[:, -1, :])
        return out, hidden

    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of the LSTM
        :param batch_size: The batch_size of the hidden state
        :return: tuple (h, c) of zero tensors, each of dims (n_layers, batch_size, hidden_dim)
        '''
        # new_zeros inherits dtype and device from the model's own weights, so
        # the state always lives where the model does without consulting any
        # notebook-level GPU flag.
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return weight.new_zeros(shape), weight.new_zeros(shape)

In [21]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """
    Forward and backward propagation on the neural network
    :param rnn: The PyTorch Module that holds the neural network
    :param optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param inp: A batch of input to the neural network
    :param target: The target output for the batch of input
    :param hidden: The hidden state carried over from the previous batch
    :return: The loss (as a Python float) and the latest hidden state
    """
    # Same condition the notebook's `train_on_gpu` flag is derived from. The
    # batch is used as-is when no GPU is present, so the CPU path no longer
    # fails on an unbound `inputs` name.
    if torch.cuda.is_available():
        rnn.cuda()
        inp, target = inp.cuda(), target.cuda()

    # Detach the carried-over state so back-propagation stops at the batch
    # boundary instead of reaching into earlier batches.
    h = tuple(each.detach() for each in hidden)

    # zero accumulated gradients
    rnn.zero_grad()

    # get predicted outputs and calculate loss
    output, h = rnn(inp, h)
    loss = criterion(output, target)
    loss.backward()

    # 'clip_grad_norm' helps prevent the exploding gradient problem in RNNs / LSTMs
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)
    optimizer.step()
    return loss.item(), h

### Training the Neural Network

In [22]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100, loader=None):
    """
    Train the network for a number of epochs, printing the running loss
    :param rnn: The PyTorch Module to train; must provide init_hidden(batch_size)
    :param batch_size: Batch size used to size the hidden state and to skip a trailing partial batch
    :param optimizer: The PyTorch optimizer for the neural network
    :param criterion: The PyTorch loss function
    :param n_epochs: Number of passes over the data
    :param show_every_n_batches: Print the average loss every this many batches
    :param loader: DataLoader to iterate; defaults to the notebook-level ``train_loader``
    :return: The trained module (the same object as ``rnn``)
    """
    if loader is None:
        # Backward-compatible default: the loader built in the hyperparameter cell.
        loader = train_loader

    # Number of completely full batches; constant for the whole run, so it is
    # computed once instead of once per batch.
    n_batches = len(loader.dataset) // batch_size

    batch_losses = []
    rnn.train()
    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):
        # initialize hidden state
        hidden = rnn.init_hidden(batch_size)
        for batch_i, (inputs, labels) in enumerate(loader, 1):
            # A trailing partial batch would not match the hidden state's
            # batch dimension, so stop after the last full batch.
            if batch_i > n_batches:
                break
            # forward, back prop
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            # record loss
            batch_losses.append(loss)
            # print the mean loss since the previous report, then reset it
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []
    # returns a trained rnn
    return rnn

In [23]:
# hyperparameters

# number of words in each input window fed to the network
sequence_length = 10
# number of windows per training batch
batch_size = 128
# loader over the preprocessed corpus; train_rnn reads this name as a global
train_loader = batch_data(int_text, sequence_length, batch_size)
# number of passes over train_loader
num_epochs = 10
# step size handed to the optimizer
learning_rate = 0.001
# the model's input and output sizes both equal the vocabulary size
vocab_size = len(vocab_to_int)
output_size = vocab_size
# width of the embedding vectors and of the LSTM hidden state
embedding_dim = 200
hidden_dim = 250
# number of stacked LSTM layers
n_layers = 2
# report the running loss every this many batches
show_every_n_batches = 2000

In [25]:
# create model and move to gpu if available
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if train_on_gpu:
    rnn.cuda()

# defining loss and optimization functions for training
# (the optimizer is created after the model has been moved to its device)
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# training the model
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

# saving the trained model
# NOTE(review): saved under the name 'trained_rnn' but reloaded later from
# './save/trained_rnn' — confirm the helper module resolves both to the same file.
mylib.save_model('trained_rnn', trained_rnn)
print('Model Trained and Saved.')

Training for 10 epoch(s)...
Epoch:    1/10    Loss: 4.924959572196006

Epoch:    1/10    Loss: 4.474561125397682

Epoch:    1/10    Loss: 4.334161711931229

Epoch:    2/10    Loss: 4.084947553785637

Epoch:    2/10    Loss: 3.9270058200359346

Epoch:    2/10    Loss: 3.8821799390316007

Epoch:    3/10    Loss: 3.7825103751431595

Epoch:    3/10    Loss: 3.707846811532974

Epoch:    3/10    Loss: 3.679092717528343

Epoch:    4/10    Loss: 3.6133324389918524

Epoch:    4/10    Loss: 3.567855329632759

Epoch:    4/10    Loss: 3.5341529774665834

Epoch:    5/10    Loss: 3.499656192074907

Epoch:    5/10    Loss: 3.4651908386945722

Epoch:    5/10    Loss: 3.4292626764774323

Epoch:    6/10    Loss: 3.420078241964004

Epoch:    6/10    Loss: 3.3842597613334657

Epoch:    6/10    Loss: 3.353777425408363

Epoch:    7/10    Loss: 3.3542060119146515

Epoch:    7/10    Loss: 3.320476257801056

Epoch:    7/10    Loss: 3.295342052221298

Epoch:    8/10    Loss: 3.3008150756971895

Epoch:    8/10  

  "type " + obj.__name__ + ". It won't be checked "


### Check-point 2

In [27]:
# Resume point: reload the lookup tables (the encoded corpus itself is not
# needed for generation, hence the `_`) and the saved model.
_, vocab_to_int, int_to_vocab, token_dict = mylib.load_preprocess()
trained_rnn = mylib.load_model('./save/trained_rnn')

### Generate TV Script

In [34]:
def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100, top_k=5, seq_length=None):
    """
    Generate text using the neural network
    :param rnn: The PyTorch Module that holds the trained neural network; it is
        called as rnn(batch, hidden) and must provide init_hidden(batch_size)
    :param prime_id: The word id to start the first prediction
    :param int_to_vocab: Dict of word id keys to word values
    :param token_dict: Dict of punctuation keys to punctuation token values
    :param pad_value: The value used to pad a sequence
    :param predict_len: The length of text to generate
    :param top_k: Number of most likely words to sample the next word from
    :param seq_length: Length of the input window; defaults to the
        notebook-level ``sequence_length``
    :return: The generated text
    """
    if seq_length is None:
        # Backward-compatible default: the window length used for training.
        seq_length = sequence_length

    rnn.eval()

    # Feed inputs to wherever the model's weights live rather than trusting a
    # notebook-level GPU flag that may not match the loaded model.
    first_param = next(rnn.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # create a sequence (batch_size=1) with the prime_id in the last slot
    current_seq = np.full((1, seq_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for _ in range(predict_len):
            batch = torch.tensor(current_seq, dtype=torch.long, device=device)

            # initialize the hidden state and get the output of the rnn
            hidden = rnn.init_hidden(batch.size(0))
            output, _hidden = rnn(batch, hidden)

            # get the next word probabilities
            p = F.softmax(output, dim=1).cpu()

            # keep the top_k candidates; reshape(-1) keeps the arrays 1-D even
            # for top_k == 1 (a 0-d array would make np.random.choice sample
            # from arange(n) instead of from the candidate itself)
            p, top_i = p.topk(top_k)
            top_i = top_i.numpy().reshape(-1)
            p = p.numpy().reshape(-1)

            # select the likely next word index with some element of randomness
            word_i = np.random.choice(top_i, p=p / p.sum())

            # retrieve that word from the dictionary
            predicted.append(int_to_vocab[word_i])

            # slide the window left by one and append the generated word
            current_seq = np.roll(current_seq, -1, 1)
            current_seq[-1][-1] = word_i

    gen_sentences = ' '.join(predicted)

    # Replace punctuation tokens (they appear lower-cased in the generated
    # words) with the punctuation they stand for
    for key, token in token_dict.items():
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')

    # return all the sentences
    return gen_sentences

In [35]:
# number of words to generate after the priming word
gen_length = 500
# speaker whose line opens the script; ':' is appended below to form the
# vocabulary entry that is looked up (e.g. 'jerry:')
prime_word = 'jerry'

# padding word supplied by the helper module, used to fill the initial window
pad_word = mylib.SPECIAL_WORDS['PADDING']
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)

jerry: the evening of the eighth county penal ]

hoyt: i told him that you know what i mean.

jerry: what happened?

jerry: no, not really necessary.

kramer: well, i think i can.

jerry: oh.

elaine: hey, what's the matter with you?

kramer: well, it's a little less, huh?

george: i can't tell you something to do it.

kramer: no. i was going to be held accountable for a second.

jerry: you know, i was wondering about this. it's a little adjustment.

jerry: oh, i think it was a good mood - the one. i mean, if you don't have to get that car in the building.

kramer: oh, that's right.

kramer: yeah, that's it. i think we should have found a ride in a wheelchair to ignore him.

george: oh no, no.

jerry: so, what are you doing?

jerry: i can't believe it. it's the one who sent the saab.

jerry: i think it was a whim.

elaine: oh, i was thinking that was a little problem.

jerry: so, you want to get this out?

hoyt: yes, yes. i mean, i don't want to know why i can.

jerry: i don't even kno