In [1]:
%matplotlib inline

In [2]:
# reading the csv
category_lines = {'st' : []}
filterwords = ['NEXTEPISODE']
with open('./star_trek_transcripts_all_episodes_f.csv','r') as f:
    for line in f:
        v=line.strip().replace('=','').replace('/',' ').replace('+',' ').replace('(',' ') \
        .replace('[',' ').replace(')',' ').replace(']',' ').replace(', ','&').replace(',','^').replace('&',', ').split('^')
        for w in v:
            if (w not in filterwords) and (len(w)>1):
                category_lines['st'].append(w)
/display category_lines['st'][2]

"SPOCK: It can't be the screen then. Definitely something out there, Captain, headed this way."

In [2]:
# obtain characters
import string
all_letters = string.ascii_letters + string.digits + " '.,:?!\n"
n_letters = len(all_letters)
n_letters

70

In [7]:
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def IndexToTensor(index):
    tensor = torch.zeros(1, n_letters)
    tensor[0][index] = 1
    return tensor

# Turn a line into a <line_length x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line,dtype):
    tensor = torch.zeros(len(line), n_letters, dtype=dtype)
    for li, letter in enumerate(line):
        tensor[li][letterToIndex(letter)] = 1
    return tensor

idx = letterToIndex('\n')
IndexToTensor(idx)
# idx

tensor([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

In [5]:
import random
def partition(data, ratio=0.3):
    shuffled = random.sample(data, k=len(data))
    split_idx = int(len(shuffled)*0.3)
    return shuffled[:split_idx], shuffled[split_idx:]

In [6]:
from torch.utils.data import Dataset, DataLoader

class WordDataset(Dataset):
    def __init__(self, train_test_idx): # test=0, train=1
        self.train_test_idx = train_test_idx
        self.char_label_list = partition(category_lines['st'])[self.train_test_idx]
        
    def __getitem__(self, index):
        line = self.char_label_list[index] + '\n'
        label = torch.tensor([letterToIndex(letter) for letter in line[1:]], dtype=torch.long)
        word = lineToTensor(line[:-1], dtype=torch.float)
        return word, label # tuple of two tensors
        
    def __len__(self):
        return len(self.char_label_list)

In [7]:
# x=WordDataset(0)[0]
# x[0].shape, x[1].shape

In [8]:
def collate(batch):
    # batch = [(tensor, label), (tensor, label),...]
    # sort batch in descending order of tensor sequence length
#     print(batch)
    sorted_batch = sorted(batch, key=lambda x:x[0].shape[0], reverse=True)
    
    # sends each (tensor, label) in sorted batch into zip
    tensor, label = zip(*sorted_batch)
    
#     print('label', label[0].shape)
    return torch.nn.utils.rnn.pack_sequence(tensor), torch.nn.utils.rnn.pack_sequence(label)

Creating the Network
====================





In [9]:
device = torch.device('cuda:0')

In [10]:
import torch.nn as nn

class LSTMmodel(nn.Module):    
    def __init__(self, input_size, hidden_size, num_layers, dropout, num_chars, device):
        super().__init__()
        self.device = device
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(0.1)
        self.lstm = torch.nn.LSTM(input_size=input_size,
                                  hidden_size=hidden_size,
                                  num_layers=num_layers,
                                  )
        self.fc = torch.nn.Linear(hidden_size, num_chars)
        
    def forward(self, x):
#         print('x', x.data.dtype)
        # x is PackedSequence
        h_0 = torch.zeros((self.num_layers, x.batch_sizes[0], self.hidden_size), device=self.device)
        c_0 = torch.zeros((self.num_layers, x.batch_sizes[0], self.hidden_size), device=self.device)
        
#         print('h0', h_0.dtype)
#         print('c0', c_0.dtype)
        
        output, (h_n, c_n) = self.lstm(x, (h_0, c_0))
        # output = torch.nn.utils.rnn.PackedSequence of shape (seq_len, batch, num_directions * hidden_size)
        output = self.dropout(output.data)
        output = self.fc(output)
        return output
    
    def sample(self, char, temp):
        letter_indices = []
        char_idx = letterToIndex(char)
        
        while(char_idx != 69):
            letter_indices.append(char_idx)
            char_tensor = torch.nn.utils.rnn.pack_sequence(IndexToTensor(char_idx))
            output = self.forward(char_tensor) # (1 x #chars)
            output = output / temp
            output = torch.nn.functional.softmax(output, dim=1)
            output = torch.multinomial(output, 1) # (1 x 1)
            char_idx = output.item()
            

In [11]:
import copy
def train(model, train_dataloader, test_dataloader, train_data_len, device=device):
    model = model.to(device)
    cross_entropy = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    
    best_model_weights = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    total_iter = train_data_len / train_dataloader.batch_size  # len(train_dataloader)
    stats = []
    saved_epoch = 0
    
    for epoch in range(1,6):
        running_loss = 0
        totals = 0
        iteration = 0
        model.train()
        
#         print(len(train_dataloader))
        
        for packedseq, label in train_dataloader:
            if device.type == 'cuda':
                packedseq = packedseq.cuda()
                label = label.cuda()
            elif device.type == "cpu":
                packedseq = packedseq.cpu()
                label = label.cpu()
            
            iteration += 1
            
            optimizer.zero_grad()
            output = model(packedseq)
            
#             print('output', output, output.shape)
#             print('label', label, label.data.shape)
            
            loss = cross_entropy(output, label.data)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * label.data.shape[0]
            
            totals += label.data.shape[0]
            
            print('iter: {}/{}, running loss = {:.4f}'.format(iteration, total_iter, running_loss), end='\r')
            
        training_loss = running_loss / totals
        test_loss, test_acc = test(model, test_dataloader, device=device)
        stats.append((training_loss, test_loss, test_acc))
                   
        if test_acc > best_acc:
            best_acc = test_acc
            saved_epoch = epoch
            best_model_weights = copy.deepcopy(model.state_dict())
            
        print('Epoch: {} \tTraining Loss: {:.4f} \tTest Loss: {:.4f} \tTest Accuracy: {:.4f}'.format(
               epoch, training_loss, test_loss, test_acc))
        
    print('Best Test Acc: {:4f}'.format(best_acc))
    model.load_state_dict(best_model_weights)
    
    return stats, saved_epoch

In [12]:
def test(model, dataloader, device=device):
    cross_entropy = nn.CrossEntropyLoss()
    model = model.to(device)
    model.eval()
    
    total_loss = 0
    total_corrects = 0
    count = 0
  
    with torch.no_grad():
        for packedseq, label in train_dataloader:
            if device.type == 'cuda':
                packedseq = packedseq.cuda()
                label = label.cuda()
            elif device.type == "cpu":
                packedseq = packedseq.cpu()
                label = label.cpu()
            
            output = model(packedseq)
            
            loss = cross_entropy(output, label.data)
            total_loss += loss.item() * label.data.shape[0]
            
            correct = output.argmax(dim=1) == label.data
            total_corrects += correct.sum().item()
            count += label.data.shape[0]
            
    loss = total_loss/count
    accuracy = total_corrects/count
    
    return loss, accuracy

In [13]:
train_data = WordDataset(1)
test_data = WordDataset(0)

train_dataloader = DataLoader(train_data, batch_size=5, shuffle=True, collate_fn=collate)
test_dataloader = DataLoader(test_data, batch_size=5, shuffle=True, collate_fn=collate)

In [14]:
model = LSTMmodel(input_size=n_letters, hidden_size=100, num_layers=2, num_chars=n_letters, dropout=0.1, device=device)
epoch_stats = train(model, train_dataloader=train_dataloader, test_dataloader=test_dataloader, train_data_len=len(train_data), device=device)
test_loss, test_acc = test(model, test_dataloader)

Epoch: 1 	Training Loss: 3.1430 	Test Loss: 3.0503 	Test Accuracy: 0.1821
Epoch: 2 	Training Loss: 2.7382 	Test Loss: 2.4904 	Test Accuracy: 0.2978
Epoch: 3 	Training Loss: 2.3713 	Test Loss: 2.2694 	Test Accuracy: 0.3452
Epoch: 4 	Training Loss: 2.1788 	Test Loss: 2.1264 	Test Accuracy: 0.3812
Epoch: 5 	Training Loss: 2.0377 	Test Loss: 2.0680 	Test Accuracy: 0.4011
Best Test Acc: 0.401103


In [15]:
test_loss, test_acc

(2.067983208024586, 0.40110295782257893)

Training
=========
Preparing for Training
----------------------

First of all, helper functions to get random pairs of (category, line):




In [None]:
import random

# Random item from a list
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

# Get a random category and random line from that category
def randomTrainingPair():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    return category, line

For each timestep (that is, for each letter in a training word) the
inputs of the network will be
``(category, current letter, hidden state)`` and the outputs will be
``(next letter, next hidden state)``. So for each training set, we'll
need the category, a set of input letters, and a set of output/target
letters.

Since we are predicting the next letter from the current letter for each
timestep, the letter pairs are groups of consecutive letters from the
line - e.g. for ``"ABCD<EOS>"`` we would create ("A", "B"), ("B", "C"),
("C", "D"), ("D", "EOS").

.. figure:: https://i.imgur.com/JH58tXY.png
   :alt:

The category tensor is a `one-hot
tensor <https://en.wikipedia.org/wiki/One-hot>`__ of size
``<1 x n_categories>``. When training we feed it to the network at every
timestep - this is a design choice, it could have been included as part
of initial hidden state or some other strategy.




In [None]:
# # One-hot vector for category
# def categoryTensor(category):
#     li = all_categories.index(category)
#     tensor = torch.zeros(1, n_categories)
#     tensor[0][li] = 1
#     return tensor

# # One-hot matrix of first to last letters (not including EOS) for input
# def inputTensor(line):
#     tensor = torch.zeros(len(line), 1, n_letters)
#     for li in range(len(line)):
#         letter = line[li]
#         tensor[li][0][all_letters.find(letter)] = 1
#     return tensor

# # LongTensor of second letter to end (EOS) for target
# def targetTensor(line):
#     letter_indexes = [all_letters.find(line[li]) for li in range(1, len(line))]
#     letter_indexes.append(n_letters - 1) # EOS
#     return torch.LongTensor(letter_indexes)

For convenience during training we'll make a ``randomTrainingExample``
function that fetches a random (category, line) pair and turns them into
the required (category, input, target) tensors.




In [None]:
# Make category, input, and target tensors from a random category, line pair
def randomTrainingExample():
    category, line = randomTrainingPair()
    category_tensor = categoryTensor(category)
    input_line_tensor = inputTensor(line)
    target_line_tensor = targetTensor(line)
    return category_tensor, input_line_tensor, target_line_tensor

Training the Network
--------------------

In contrast to classification, where only the last output is used, we
are making a prediction at every step, so we are calculating loss at
every step.

The magic of autograd allows you to simply sum these losses at each step
and call backward at the end.




In [None]:
criterion = nn.CrossEntropyLoss()

learning_rate = 0.0005

def train(category_tensor, input_line_tensor, target_line_tensor):
    target_line_tensor.unsqueeze_(-1)
    hidden = rnn.initHidden()

    rnn.zero_grad()

    loss = 0

    for i in range(input_line_tensor.size(0)):
        output, hidden = rnn(category_tensor, input_line_tensor[i], hidden)
        l = criterion(output, target_line_tensor[i])
        loss += l

    loss.backward()

    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return output, loss.item() / input_line_tensor.size(0)

To keep track of how long training takes I am adding a
``timeSince(timestamp)`` function which returns a human readable string:

In [None]:
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

Training is business as usual - call train a bunch of times and wait a
few minutes, printing the current time and loss every ``print_every``
examples, and keeping store of an average loss per ``plot_every`` examples
in ``all_losses`` for plotting later.




In [None]:
rnn = RNN(n_letters, 128, n_letters)

n_iters = 100000
print_every = 5000
plot_every = 500
all_losses = []
total_loss = 0 # Reset every plot_every iters

start = time.time()

for iter in range(1, n_iters + 1):
    output, loss = train(*randomTrainingExample())
    total_loss += loss

    if iter % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, loss))

    if iter % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0

Plotting the Losses
-------------------

Plotting the historical loss from all\_losses shows the network
learning:




In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

plt.figure()
plt.plot(all_losses)

Sampling the Network
====================

To sample we give the network a letter and ask what the next one is,
feed that in as the next letter, and repeat until the EOS token.

-  Create tensors for input category, starting letter, and empty hidden
   state
-  Create a string ``output_name`` with the starting letter
-  Up to a maximum output length,

   -  Feed the current letter to the network
   -  Get the next letter from highest output, and next hidden state
   -  If the letter is EOS, stop here
   -  If a regular letter, add to ``output_name`` and continue

-  Return the final name

.. Note::
   Rather than having to give it a starting letter, another
   strategy would have been to include a "start of string" token in
   training and have the network choose its own starting letter.




In [None]:
max_length = 20

# Sample from a category and starting letter
def sample(category, start_letter='A'):
    with torch.no_grad():  # no need to track history in sampling
        category_tensor = categoryTensor(category)
        input = inputTensor(start_letter)
        hidden = rnn.initHidden()

        output_name = start_letter

        for i in range(max_length):
            output, hidden = rnn(category_tensor, input[0], hidden)
            topv, topi = output.topk(1)
            topi = topi[0][0]
            if topi == n_letters - 1:
                break
            else:
                letter = all_letters[topi]
                output_name += letter
            input = inputTensor(letter)

        return output_name

# Get multiple samples from one category and multiple starting letters
def samples(category, start_letters='ABC'):
    for start_letter in start_letters:
        print(sample(category, start_letter))

samples('Russian', 'RUS')

samples('German', 'GER')

samples('Spanish', 'SPA')

samples('Chinese', 'CHI')

Exercises
=========

-  Try with a different dataset of category -> line, for example:

   -  Fictional series -> Character name
   -  Part of speech -> Word
   -  Country -> City

-  Use a "start of sentence" token so that sampling can be done without
   choosing a start letter
-  Get better results with a bigger and/or better shaped network

   -  Try the nn.LSTM and nn.GRU layers
   -  Combine multiple of these RNNs as a higher level network


