In [1]:
from argparse import Namespace
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

from vocabulary import Vocabulary

%matplotlib inline

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (14, 6)

# Characters handed to the surname Vocabulary as its start- and end-of-sequence tokens.
START_TOKEN = "^"
END_TOKEN = "_"

# Fill value for unused target positions; also passed as the index that the
# loss and accuracy computations skip.
IGNORE_INDEX_VALUE = -1

## Class Definitions 

Data Model:
- Raw data
- Vectorizer
- Vectorized Data
- Data generator

In [2]:
class RawSurnames(object):
    """Holds the surnames table read from a delimited text file."""

    def __init__(self, data_path, delimiter=","):
        # The file is parsed once; get_data() only slices the in-memory frame.
        self.data = pd.read_csv(data_path, delimiter=delimiter)

    def get_data(self, filter_to_nationality=None):
        """Return the loaded DataFrame, optionally restricted by nationality.

        Args:
            filter_to_nationality: collection of nationality values to keep,
                or None to return every row.

        Returns:
            The full DataFrame, or the subset of rows whose `nationality`
            column is contained in `filter_to_nationality`.
        """
        if filter_to_nationality is None:
            return self.data
        keep_rows = self.data.nationality.isin(filter_to_nationality)
        return self.data[keep_rows]

# vectorizer

class SurnamesVectorizer(object):
    """Turns surname rows into padded integer sequences for next-character prediction.

    Keeps a character vocabulary for surnames, a vocabulary of nationalities,
    and the fixed sequence length used when padding.
    """

    def __init__(self, surname_vocab, nationality_vocab, max_seq_length):
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab
        self.max_seq_length = max_seq_length

    def save(self, filename):
        """Write both vocabularies and the sequence length to `filename` as JSON."""
        payload = {
            "surname_vocab": self.surname_vocab.get_serializable_contents(),
            "nationality_vocab": self.nationality_vocab.get_serializable_contents(),
            "max_seq_length": self.max_seq_length,
        }
        with open(filename, "w") as fp:
            json.dump(payload, fp)

    @classmethod
    def load(cls, filename):
        """Rebuild a vectorizer from a JSON file produced by `save`."""
        with open(filename, "r") as fp:
            contents = json.load(fp)

        # The two vocab entries are stored in serialized form; everything else
        # in the file is passed to the constructor unchanged.
        for key in ("surname_vocab", "nationality_vocab"):
            contents[key] = Vocabulary.deserialize_from_contents(contents[key])
        return cls(**contents)

    @classmethod
    def fit(cls, surname_df):
        """Build the vocabularies and sequence length from every row of `surname_df`.

        Args:
            surname_df: DataFrame with `surname` and `nationality` columns.

        Returns:
            A new vectorizer instance.
        """
        surname_vocab = Vocabulary(use_unks=False,
                                   use_mask=True,
                                   use_start_end=True,
                                   start_token=START_TOKEN,
                                   end_token=END_TOKEN)
        nationality_vocab = Vocabulary(use_unks=False, use_start_end=False, use_mask=False)

        longest_surname = 0
        for _, record in surname_df.iterrows():
            surname_vocab.add_many(record.surname)
            nationality_vocab.add(record.nationality)
            longest_surname = max(longest_surname, len(record.surname))

        # Two extra positions on top of the longest surname
        # (presumably for the start and end tokens).
        return cls(surname_vocab, nationality_vocab, longest_surname + 2)

    @classmethod
    def fit_transform(cls, surname_df, split='train'):
        """Fit on `surname_df`, then vectorize the requested split of the same frame."""
        fitted = cls.fit(surname_df)
        vectorized = fitted.transform(surname_df, split)
        return fitted, vectorized

    def transform(self, surname_df, split='train'):
        """Vectorize the rows of `surname_df` whose `split` column equals `split`.

        Each surname is mapped to indices with start/end tokens included. The
        input sequence is every index but the last, and the target sequence is
        every index but the first. Inputs are padded with 0 and targets with
        IGNORE_INDEX_VALUE.

        Returns:
            VectorizedSurnames wrapping two int64 arrays of shape
            (number of rows, max_seq_length).
        """
        split_df = surname_df[surname_df.split == split].reset_index()
        padded_shape = (len(split_df), self.max_seq_length)

        x_surnames = np.zeros(padded_shape, dtype=np.int64)
        y_surnames = np.full(padded_shape, IGNORE_INDEX_VALUE, dtype=np.int64)

        for row_number, record in split_df.iterrows():
            token_ids = list(self.surname_vocab.map(record.surname,
                                                    include_start_end=True))
            inputs, targets = token_ids[:-1], token_ids[1:]
            x_surnames[row_number, :len(inputs)] = inputs
            y_surnames[row_number, :len(targets)] = targets

        return VectorizedSurnames(x_surnames, y_surnames)

# vec data

class VectorizedSurnames(Dataset):
    """Dataset over paired input/target index arrays, one row per surname."""

    def __init__(self, x_surnames, y_surnames):
        self.x_surnames = x_surnames
        self.y_surnames = y_surnames

    def __len__(self):
        """Number of rows in the input array."""
        return len(self.x_surnames)

    def __getitem__(self, index):
        """Return the input row, target row and count of non-zero input entries."""
        inputs = self.x_surnames[index]
        targets = self.y_surnames[index]
        # The sequence length is taken to be the number of non-zero input entries.
        nonzero_positions = inputs.nonzero()[0]
        return {'x_surnames': inputs,
                'y_surnames': targets,
                'x_lengths': len(nonzero_positions)}

# data generator

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """Yield batches from a PyTorch DataLoader with every tensor moved to `device`.

    Args:
        dataset: dataset whose items are dicts of values the DataLoader can collate.
        batch_size: number of items per batch.
        shuffle: forwarded to the DataLoader.
        drop_last: forwarded to the DataLoader.
        device: target passed to each tensor's `.to()`.

    Yields:
        dict with the same keys as the collated batch, holding tensors on `device`.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {name: tensor.to(device) for name, tensor in batch.items()}

## Class definitions for the model

In [3]:
def new_parameter(*size):
    """Create a float32 `nn.Parameter` of shape `size` with Xavier-normal values.

    The tensor is first drawn with `torch.randn` and then overwritten in place
    by `xavier_normal_`, so two rounds of random numbers are consumed.
    """
    weights = torch.randn(*size, dtype=torch.float32, requires_grad=True)
    return nn.Parameter(torch.nn.init.xavier_normal_(weights))

def column_gather(y_out, x_lengths):
    '''Pick one feature vector per batch row from `y_out`.

    For every row `i`, the vector at sequence position `x_lengths[i] - 1`
    is selected, and the selected vectors are stacked in row order.

    Args:
        y_out (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, sequence, feature)
        x_lengths (torch.LongTensor, torch.cuda.LongTensor)
            shape: (batch,)

    Returns:
        y_out (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, feature)
    '''
    # Lengths are converted to zero-based positions on the CPU as a numpy array.
    last_positions = x_lengths.long().detach().cpu().numpy() - 1
    picked = [y_out[row, position] for row, position in enumerate(last_positions)]
    return torch.stack(picked)


class ExplicitRNN(nn.Module):
    """A tanh recurrent layer unrolled explicitly, one time step at a time.

    At each step the next hidden state is
    `tanh(x_t @ W_in2hid + h_prev @ W_hid2hid + b_hid)`.

    Args:
        input_size: size of the feature dimension of the input.
        hidden_size: size of the hidden state.
        batch_first: when True the input and output are laid out as
            (batch, sequence, feature); otherwise (sequence, batch, feature).
    """

    def __init__(self, input_size, hidden_size, batch_first=False):
        super(ExplicitRNN, self).__init__()
        self.W_in2hid = new_parameter(input_size, hidden_size)
        self.W_hid2hid = new_parameter(hidden_size, hidden_size)

        self.b_hid = new_parameter(1, hidden_size)

        self.hidden_size = hidden_size

        self.batch_first = batch_first

    def _compute_next_hidden(self, x, h):
        """Single recurrence step; returns the new hidden state."""
        # torch.tanh replaces the legacy functional alias F.tanh.
        return torch.tanh(x.matmul(self.W_in2hid) +
                          h.matmul(self.W_hid2hid) +
                          self.b_hid)

    def forward(self, x_in, hid_t=None):
        """Run the recurrence over the whole sequence.

        Args:
            x_in: 3-D input tensor laid out according to `batch_first`.
            hid_t: optional initial hidden state of shape (batch, hidden_size).
                When omitted, the initial state is a tensor of ones.

        Returns:
            Tensor with the hidden state of every time step, in the same
            layout as `x_in`.
        """
        if self.batch_first:
            batch_size, seq_size, _ = x_in.size()
            # Internally the loop always runs over a sequence-first layout.
            x_in = x_in.permute(1, 0, 2)
        else:
            seq_size, batch_size, _ = x_in.size()

        if hid_t is None:
            # Created directly on the input's device so the layer works on any
            # device (including a non-default GPU), not just the default CUDA one.
            hid_t = torch.ones((batch_size, self.hidden_size), device=x_in.device)
        else:
            hid_t = hid_t.to(x_in.device)

        hiddens = []
        for t in range(seq_size):
            hid_t = self._compute_next_hidden(x_in[t], hid_t)
            hiddens.append(hid_t)
        hiddens = torch.stack(hiddens)

        if self.batch_first:
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens
    
    
    
class CharNN(nn.Module):
    """Character-level sequence model: embedding, then ExplicitRNN, then a linear layer.

    Args:
        embedding_size: dimension of the character embeddings.
        in_vocab_size: number of rows in the embedding table.
        out_vocab_size: number of output classes of the linear layer.
        hidden_size: hidden state size of the recurrent layer.
        batch_first: forwarded to ExplicitRNN.
    """

    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size,
                 batch_first=False):
        super(CharNN, self).__init__()

        # Index 0 is the padding index of the embedding table.
        self.emb = nn.Embedding(num_embeddings=in_vocab_size,
                                embedding_dim=embedding_size,
                                padding_idx=0)
        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)
        self.rnn = ExplicitRNN(input_size=embedding_size,
                               hidden_size=hidden_size,
                               batch_first=batch_first)

    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        """Compute per-position scores over the output vocabulary.

        Args:
            x_in: integer index tensor; (batch_size, max_seq_length) when the
                model was built with batch_first=True.
            x_lengths: accepted but not used by this method.
            apply_softmax: when True, scores are normalised with a softmax over
                the vocabulary dimension.

        Returns:
            3-D tensor whose first two dimensions match the recurrent output and
            whose last dimension is the output vocabulary size.
        """
        embedded = self.emb(x_in)
        hidden_states = self.rnn(embedded)

        # Collapse the two leading dimensions so the linear layer sees a matrix.
        lead0, lead1, n_features = hidden_states.size()
        scores = self.fc(hidden_states.contiguous().view(-1, n_features))

        if apply_softmax:
            scores = F.softmax(scores, dim=1)

        # Restore the original two leading dimensions.
        return scores.view(lead0, lead1, -1)
    
def normalize_sizes(net_output, y_true):
    """Move predictions and targets to the CPU and flatten them for loss/metric code.

    Args:
        net_output: prediction tensor; a 3-D tensor is flattened to
            (dim0 * dim1, dim2), any other rank is returned as is.
        y_true: target tensor; a 2-D tensor is flattened to 1-D, any other
            rank is returned as is.

    Returns:
        Tuple `(net_output, y_true)` of CPU tensors.
    """
    net_output = net_output.cpu()
    y_true = y_true.cpu()
    if net_output.dim() == 3:
        # .contiguous() returns a new tensor, so its result must be the one that
        # is viewed; otherwise .view() raises on non-contiguous inputs
        # (e.g. the output of a permute).
        net_output = net_output.contiguous().view(-1, net_output.size(2))
    if y_true.dim() == 2:
        y_true = y_true.contiguous().view(-1)
    return net_output, y_true

def compute_accuracy(y_pred, y_true, mask_index):
    """Percentage of non-masked positions whose arg-max prediction equals the target.

    Args:
        y_pred: prediction scores; flattened by `normalize_sizes` so that the
            class scores lie along dimension 1.
        y_true: target indices; flattened by `normalize_sizes`.
        mask_index: target value marking positions that must not be counted.

    Returns:
        Accuracy as a float in [0, 100]. Returns 0.0 when every target equals
        `mask_index`, instead of dividing by zero.
    """
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)

    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()

    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    if n_valid == 0:
        # Nothing to score (e.g. a fully masked batch).
        return 0.0

    return n_correct / n_valid * 100


## Make, Train, and Eval

In [4]:
# Run configuration: data location, optimisation settings and the description
# of the pre-trained ("zoo") model files.
args = Namespace(
    surname_csv="../data/surnames.csv",
    batch_size=128,
    cuda=True,
    learning_rate=0.001,
    num_epochs=100,
    load_zoo_model=True,
    zoo=dict(
        filename='../modelzoo/charnn_emb16_hid64_surnames_predict.state',
        vocab='../modelzoo/surnames_classify.vocab',
        comments='pre-trained surname sequence prediction (& generation model)',
        parameters=dict(embedding_size=16, hidden_size=64),
    ),
)

# Keep the CUDA flag on only when a CUDA device is actually available.
args.cuda = args.cuda and torch.cuda.is_available()

print("Using CUDA: {}".format(args.cuda))

args.device = torch.device("cuda" if args.cuda else "cpu")
args.device

Using CUDA: True


device(type='cuda')

In [5]:
# optional: set this to false to learn from scratch!
# args.load_zoo_model = False

In [6]:
# Load the full surnames table (no nationality filter is applied).
raw_data = RawSurnames(args.surname_csv).get_data()

# Reuse the serialized vectorizer when its file exists; otherwise fit a new
# one on the loaded data. Note this check does not depend on args.load_zoo_model.
if os.path.exists(args.zoo['vocab']):
    vectorizer = SurnamesVectorizer.load(args.zoo['vocab'])
    print("Loading vectorizer!")
else:
    vectorizer = SurnamesVectorizer.fit(raw_data)
    print("Creating a new vectorizer.")
    
# Vectorize the rows of each split separately.
train_dataset = vectorizer.transform(raw_data, split='train')
test_dataset = vectorizer.transform(raw_data, split='test')

zoo_params = args.zoo['parameters']

# Input and output vocabularies are the same surname character vocabulary,
# since the model predicts the next character of the surname.
net = CharNN(embedding_size=zoo_params['embedding_size'], 
             hidden_size=zoo_params['hidden_size'],
             in_vocab_size=len(vectorizer.surname_vocab), 
             out_vocab_size=len(vectorizer.surname_vocab), 
             batch_first=True)

# Load the pre-trained weights only when requested and the file exists.
# The map_location callable returns the storage unchanged, which per the
# torch.load documentation keeps the loaded tensors on the CPU.
if args.load_zoo_model and os.path.exists(args.zoo['filename']):
    print("Loading state dict!")
    net.load_state_dict(torch.load(args.zoo['filename'], 
                                   map_location=lambda storage, loc: storage))
else:
    print("Using newly initiated network!")

Loading vectorizer!
Loading state dict!


In [7]:
# Move the model to the selected device before building the optimizer over its parameters.
net = net.to(args.device)
    
# Adam over all model parameters at the configured learning rate.
optimizer = optim.Adam(net.parameters(), lr=args.learning_rate)

# loss function

def sequence_loss(y_pred, y_true, mask_index):
    """Cross-entropy over all sequence positions, skipping targets equal to `mask_index`.

    Predictions and targets are first flattened with `normalize_sizes`.
    """
    flat_pred, flat_true = normalize_sizes(y_pred, y_true)
    loss = F.cross_entropy(flat_pred, flat_true, ignore_index=mask_index)
    return loss

# Progress bars: one for epochs, one each for the training and test batches.

epoch_bar = tqdm_notebook(desc='epochs', total=args.num_epochs, position=1)

# Floor division matches generate_batches' default drop_last=True, which
# discards a final incomplete batch.
num_train_batches = len(train_dataset) // args.batch_size
train_bar = tqdm_notebook(desc='training', total=num_train_batches, position=2)

num_test_batches = len(test_dataset) // args.batch_size
test_bar = tqdm_notebook(desc='test', total=num_test_batches, position=3)

# History: one mean loss / mean accuracy value is appended per epoch.

train_loss_history = []
train_accuracy_history = []

test_loss_history = []
test_accuracy_history = []


# Main loop: each epoch runs one optimisation pass over the training split and
# one evaluation pass over the test split, recording the mean loss and accuracy.
# Interrupting the kernel stops training early and keeps the current model.
try:
    for _ in range(args.num_epochs):
        # ---- training pass ----
        batch_generator = generate_batches(train_dataset, batch_size=args.batch_size,
                                           device=args.device)

        per_epoch_loss = []
        per_epoch_accuracy = []

        net.train()

        for batch_dict in batch_generator:
            # step 1: clear gradients left over from the previous batch
            optimizer.zero_grad()

            # step 2: forward pass
            y_pred = net(batch_dict['x_surnames'], batch_dict['x_lengths'])
            y_target = batch_dict['y_surnames']

            # step 3: loss over non-ignored target positions
            loss = sequence_loss(y_pred, y_target, IGNORE_INDEX_VALUE)

            # step 4: backward pass and parameter update
            loss.backward()
            optimizer.step()

            # bonus steps: bookkeeping

            per_epoch_loss.append(loss.item())

            accuracy = compute_accuracy(y_pred, batch_dict['y_surnames'], IGNORE_INDEX_VALUE)
            per_epoch_accuracy.append(accuracy)

            train_bar.update()

            train_bar.set_postfix(loss=per_epoch_loss[-1],
                                  accuracy=per_epoch_accuracy[-1])

        train_loss_history.append(np.mean(per_epoch_loss))
        train_accuracy_history.append(np.mean(per_epoch_accuracy))

        # ---- evaluation pass over the test dataset ----

        batch_generator = generate_batches(test_dataset, batch_size=args.batch_size,
                                           device=args.device)

        per_epoch_loss = []
        per_epoch_accuracy = []

        # set it to eval mode; this turns stochastic functions off
        net.eval()

        # No gradients are needed for evaluation; disabling autograd avoids
        # building (and holding in memory) a graph for every test batch.
        with torch.no_grad():
            for batch_dict in batch_generator:

                # step 1: compute output
                y_pred = net(batch_dict['x_surnames'], batch_dict['x_lengths'])
                y_target = batch_dict['y_surnames']

                # step 2: compute metrics

                loss = sequence_loss(y_pred, y_target, IGNORE_INDEX_VALUE)
                per_epoch_loss.append(loss.item())

                accuracy = compute_accuracy(y_pred, batch_dict['y_surnames'], IGNORE_INDEX_VALUE)
                per_epoch_accuracy.append(accuracy)

                test_bar.update()

                test_bar.set_postfix(loss=per_epoch_loss[-1],
                                     accuracy=per_epoch_accuracy[-1])

        test_loss_history.append(np.mean(per_epoch_loss))
        test_accuracy_history.append(np.mean(per_epoch_accuracy))

        # update bars

        epoch_bar.set_postfix(train_loss=train_loss_history[-1],
                              train_accuracy=train_accuracy_history[-1],
                              test_loss=test_loss_history[-1],
                              test_accuracy=test_accuracy_history[-1])
        epoch_bar.update()
        # Rewind the per-batch bars so they start from zero next epoch.
        test_bar.n = 0
        train_bar.n = 0

except KeyboardInterrupt:
    print("...")

HBox(children=(IntProgress(value=0, description='epochs'), HTML(value='')))

HBox(children=(IntProgress(value=0, description='training', max=125), HTML(value='')))

HBox(children=(IntProgress(value=0, description='test', max=31), HTML(value='')))



In [8]:
# Save only the model's state dict (not the module object) to a file at a relative path.
torch.save(net.state_dict(), '02-Char-RNN-Predict-Surnames.pt')


##  Exercise!

Now that we have a model which was trained to predict sequences, let's make our own sampler!

The sampler will walk through the generation procedure, selecting one character at a time. The result is something like this:

```
['Poldtoff',
 'Schestars',
 'Gordoud',
 'Kinsen',
 'Venzey',
 'Tumali',
 'Pets',
 'Aänchekin',
 'GDigkov',
 'Shadonov',
 'Boulyanson',
 'Gwae',
 'Zgerege',
 'Foxchevtsev',
 'Progkin',
 'Ussin']
```


See dl4nlp.info for more information on this exercise.