In [1]:
from argparse import Namespace
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

from vocabulary import Vocabulary

%matplotlib inline

# Global plotting defaults applied to every figure drawn in this notebook.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (14, 6)

# Sequence-boundary tokens handed to the Vocabulary in TrumpTweetVectorizer.fit.
START_TOKEN = "^"
END_TOKEN = "_"
# Value written into padded target positions and passed as `ignore_index` to the loss.
IGNORE_INDEX_VALUE = -1

## Class Definitions 

Data Model:
- Raw data
- Vectorizer
- Vectorized Data
- Data generator

In [2]:
class RawTrumpTweets(object):
    """Thin holder around the tweet CSV, which is read eagerly on construction."""

    def __init__(self, data_path):
        # The whole file is parsed once; `get_data` hands back this same frame.
        frame = pd.read_csv(data_path)
        self.data = frame

    def get_data(self):
        """Return the DataFrame parsed from `data_path`."""
        return self.data

# vectorizer

class TrumpTweetVectorizer(object):
    """Turns space-tokenised tweets into padded integer index matrices.

    Attributes:
        word_vocab: vocabulary object used for the token -> index mapping
        max_seq_length (int): number of columns in every vectorized row
    """

    def __init__(self, word_vocab, max_seq_length):
        self.max_seq_length = max_seq_length
        self.word_vocab = word_vocab

    def save(self, filename):
        """Write the vocabulary contents and the sequence length to `filename` as JSON."""
        payload = {
            "word_vocab": self.word_vocab.get_serializable_contents(),
            'max_seq_length': self.max_seq_length,
        }
        with open(filename, "w") as handle:
            json.dump(payload, handle)

    @classmethod
    def load(cls, filename):
        """Rebuild a vectorizer from a JSON file produced by `save`."""
        with open(filename, "r") as handle:
            payload = json.load(handle)

        # Swap the serialized contents for a live Vocabulary before constructing.
        restored_vocab = Vocabulary.deserialize_from_contents(payload["word_vocab"])
        payload["word_vocab"] = restored_vocab
        return cls(**payload)

    @classmethod
    def fit(cls, tweet_df):
        """Build the vocabulary and the maximum sequence length from `tweet_df.text`."""
        vocab = Vocabulary(use_unks=False,
                           use_start_end=True,
                           use_mask=True,
                           start_token=START_TOKEN,
                           end_token=END_TOKEN)
        longest = 0
        for text in tweet_df.text:
            tokens = text.split(" ")
            vocab.add_many(tokens)
            longest = max(longest, len(tokens))
        # Two extra slots on top of the longest tweet (start and end tokens).
        return cls(vocab, longest + 2)

    @classmethod
    def fit_transform(cls, tweet_df, split='train'):
        """Fit on the full frame, then vectorize the requested split."""
        fitted = cls.fit(tweet_df)
        vectorized = fitted.transform(tweet_df, split)
        return fitted, vectorized

    def transform(self, tweet_df, split='train'):
        """Vectorize the rows whose `split` column equals `split`.

        Returns:
            VectorizedTrumpTweets holding the input matrix (padded with 0) and
            the target matrix (padded with IGNORE_INDEX_VALUE); targets are the
            inputs shifted left by one position.
        """
        subset = tweet_df[tweet_df.split == split]
        shape = (len(subset), self.max_seq_length)

        x_words = np.zeros(shape, dtype=np.int64)
        y_words = np.full(shape, IGNORE_INDEX_VALUE, dtype=np.int64)

        for row_idx, text in enumerate(subset.text):
            mapped = list(self.word_vocab.map(text.split(' '), include_start_end=True))
            inputs, targets = mapped[:-1], mapped[1:]

            x_words[row_idx, :len(inputs)] = inputs
            y_words[row_idx, :len(targets)] = targets

        return VectorizedTrumpTweets(x_words, y_words)

# vec data


class VectorizedTrumpTweets(Dataset):
    """Dataset view over aligned input / target index matrices."""

    def __init__(self, x_words, y_words):
        self.x_words, self.y_words = x_words, y_words

    def __len__(self):
        """Number of rows (tweets)."""
        return len(self.x_words)

    def __getitem__(self, index):
        """Return one example as a dict of inputs, targets and input length."""
        inputs = self.x_words[index]
        targets = self.y_words[index]
        # Input rows are padded with 0, so the nonzero count is the true length.
        nonzero_positions = inputs.nonzero()[0]
        return {'x_words': inputs,
                'y_words': targets,
                'x_lengths': len(nonzero_positions)}

# data generator

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    Generator that wraps a PyTorch DataLoader and yields one dict per batch,
      with every tensor in the dict moved onto `device`.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {name: tensor.to(device) for name, tensor in batch.items()}

## Class definitions for the model

In [3]:
def new_parameter(*size):
    """Create a trainable float32 parameter of shape `size` with Xavier-normal values."""
    # xavier_normal_ re-initialises the randn draw in place and returns the tensor.
    return nn.Parameter(
        torch.nn.init.xavier_normal_(
            torch.randn(*size, requires_grad=True, dtype=torch.float32)))

def column_gather(y_out, x_lengths):
    '''Get a specific vector from each batch datapoint in `y_out`.

    For every batch row `i`, pick the vector at sequence position
    `x_lengths[i] - 1` (the last real timestep of that row). The selection is
    done with a single indexing operation on the tensor's own device, so there
    is no host round trip and an empty batch yields an empty result.

    Args:
        y_out (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, sequence, feature)
        x_lengths (torch.LongTensor, torch.cuda.LongTensor)
            shape: (batch,)

    Returns:
        y_out (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, feature)
    '''
    batch_rows = torch.arange(y_out.size(0), device=y_out.device)
    # Lengths are 1-based counts; subtract one to get the index of the last step.
    last_columns = x_lengths.long().detach().to(y_out.device) - 1
    return y_out[batch_rows, last_columns]


class ExplicitRNN(nn.Module):
    """Recurrent layer with the tanh recurrence written out step by step.

    Each step computes
        h_t = tanh(x_t @ W_in2hid + h_{t-1} @ W_hid2hid + b_hid)

    Args:
        input_size (int): size of each input feature vector
        hidden_size (int): size of the hidden state
        batch_first (bool): if True, inputs/outputs are (batch, seq, feature);
            otherwise (seq, batch, feature)
    """

    def __init__(self, input_size, hidden_size, batch_first=False):
        super(ExplicitRNN, self).__init__()
        self.W_in2hid = new_parameter(input_size, hidden_size)
        self.W_hid2hid = new_parameter(hidden_size, hidden_size)

        self.b_hid = new_parameter(1, hidden_size)

        self.hidden_size = hidden_size

        self.batch_first = batch_first

    def _compute_next_hidden(self, x, h):
        """One recurrence step: combine input `x` and previous hidden `h`."""
        # torch.tanh replaces the deprecated F.tanh.
        return torch.tanh(x.matmul(self.W_in2hid) +
                          h.matmul(self.W_hid2hid) +
                          self.b_hid)

    def forward(self, x_in, hid_t=None):
        """Run the recurrence over the whole sequence.

        Args:
            x_in (torch.Tensor): 3-D input, laid out according to `batch_first`
            hid_t (torch.Tensor, optional): initial hidden state of shape
                (batch, hidden_size); defaults to a tensor of ones

        Returns:
            torch.Tensor: the hidden state at every step, laid out like `x_in`
        """
        if self.batch_first:
            # Work internally in (seq, batch, feature) order.
            x_in = x_in.permute(1, 0, 2)
        seq_size, batch_size, _ = x_in.size()

        if hid_t is None:
            # Create the state directly on the input's device, not CPU-then-move.
            hid_t = torch.ones((batch_size, self.hidden_size),
                               dtype=x_in.dtype, device=x_in.device)
        else:
            hid_t = hid_t.to(x_in.device)

        hiddens = []
        for t in range(seq_size):
            hid_t = self._compute_next_hidden(x_in[t], hid_t)
            hiddens.append(hid_t)
        hiddens = torch.stack(hiddens)

        if self.batch_first:
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens
    
    
class WordRNN(nn.Module):
    """Embedding -> ExplicitRNN -> linear layer, scoring a word at every timestep."""

    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size,
                 batch_first=True):
        super(WordRNN, self).__init__()

        # Index 0 is the padding slot of the embedding table.
        self.emb = nn.Embedding(num_embeddings=in_vocab_size,
                                embedding_dim=embedding_size,
                                padding_idx=0)
        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)

        self.rnn = ExplicitRNN(input_size=embedding_size,
                               hidden_size=hidden_size,
                               batch_first=batch_first)

    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        """Score the vocabulary at every position of `x_in`.

        Args:
            x_in (torch.LongTensor): token indices
            x_lengths: accepted for interface compatibility; not used here
            apply_softmax (bool): if True, normalise the scores over the
                vocabulary dimension

        Returns:
            torch.Tensor: 3-D tensor whose first two dimensions match the RNN
            output and whose last dimension is the output vocabulary
        """
        hidden_states = self.rnn(self.emb(x_in))
        lead0, lead1, n_features = hidden_states.size()

        # Flatten the two leading dimensions so the linear layer sees a matrix.
        flat_states = hidden_states.contiguous().view(-1, n_features)
        scores = self.fc(flat_states)

        if apply_softmax:
            scores = F.softmax(scores, dim=1)

        return scores.view(lead0, lead1, -1)
    
def normalize_sizes(net_output, y_true):
    """Flatten predictions and targets so they line up row for row.

    Both tensors are moved to the CPU. A 3-D `net_output` is flattened to
    (rows, features) and a 2-D `y_true` to a 1-D vector; other ranks are
    returned unchanged.

    `reshape` is used rather than `view` so that non-contiguous inputs (for
    example permuted tensors) are handled; the earlier `.contiguous()` calls
    discarded their result and therefore had no effect.

    Args:
        net_output (torch.Tensor): model scores
        y_true (torch.Tensor): target indices

    Returns:
        tuple: (net_output, y_true) after flattening
    """
    net_output = net_output.cpu()
    y_true = y_true.cpu()
    if net_output.dim() == 3:
        net_output = net_output.reshape(-1, net_output.size(2))
    if y_true.dim() == 2:
        y_true = y_true.reshape(-1)
    return net_output, y_true

def sequence_loss(net_output, y_true, loss_func=F.cross_entropy):
    """Loss over every non-padded position of a batch of sequences.

    Args:
        net_output (torch.Tensor): model scores; flattened by `normalize_sizes`
        y_true (torch.Tensor): target indices; flattened by `normalize_sizes`
        loss_func: callable taking (scores, targets, ignore_index=...).
            Previously this argument was silently ignored. For backward
            compatibility with call sites that pass the ignore index in this
            position, a non-callable value is treated as the ignore index and
            cross entropy is used.

    Returns:
        torch.Tensor: scalar loss
    """
    if callable(loss_func):
        ignore_index = IGNORE_INDEX_VALUE
    else:
        # Legacy call style: sequence_loss(pred, target, IGNORE_INDEX_VALUE).
        ignore_index = loss_func
        loss_func = F.cross_entropy

    net_output, y_true = normalize_sizes(net_output, y_true)
    return loss_func(net_output, y_true, ignore_index=ignore_index)
def new_parameter(*size):
    """Create a trainable float32 parameter of shape `size` with Xavier-normal values."""
    # xavier_normal_ re-initialises the randn draw in place and returns the tensor.
    return nn.Parameter(
        torch.nn.init.xavier_normal_(
            torch.randn(*size, requires_grad=True, dtype=torch.float32)))

def column_gather(y_out, x_lengths):
    '''Get a specific vector from each batch datapoint in `y_out`.

    For every batch row `i`, pick the vector at sequence position
    `x_lengths[i] - 1` (the last real timestep of that row). The selection is
    done with a single indexing operation on the tensor's own device, so there
    is no host round trip and an empty batch yields an empty result.

    Args:
        y_out (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, sequence, feature)
        x_lengths (torch.LongTensor, torch.cuda.LongTensor)
            shape: (batch,)

    Returns:
        y_out (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, feature)
    '''
    batch_rows = torch.arange(y_out.size(0), device=y_out.device)
    # Lengths are 1-based counts; subtract one to get the index of the last step.
    last_columns = x_lengths.long().detach().to(y_out.device) - 1
    return y_out[batch_rows, last_columns]


class ExplicitRNN(nn.Module):
    """Recurrent layer with the tanh recurrence written out step by step.

    Each step computes
        h_t = tanh(x_t @ W_in2hid + h_{t-1} @ W_hid2hid + b_hid)

    Args:
        input_size (int): size of each input feature vector
        hidden_size (int): size of the hidden state
        batch_first (bool): if True, inputs/outputs are (batch, seq, feature);
            otherwise (seq, batch, feature)
    """

    def __init__(self, input_size, hidden_size, batch_first=False):
        super(ExplicitRNN, self).__init__()
        self.W_in2hid = new_parameter(input_size, hidden_size)
        self.W_hid2hid = new_parameter(hidden_size, hidden_size)

        self.b_hid = new_parameter(1, hidden_size)

        self.hidden_size = hidden_size

        self.batch_first = batch_first

    def _compute_next_hidden(self, x, h):
        """One recurrence step: combine input `x` and previous hidden `h`."""
        # torch.tanh replaces the deprecated F.tanh.
        return torch.tanh(x.matmul(self.W_in2hid) +
                          h.matmul(self.W_hid2hid) +
                          self.b_hid)

    def forward(self, x_in, hid_t=None):
        """Run the recurrence over the whole sequence.

        Args:
            x_in (torch.Tensor): 3-D input, laid out according to `batch_first`
            hid_t (torch.Tensor, optional): initial hidden state of shape
                (batch, hidden_size); defaults to a tensor of ones

        Returns:
            torch.Tensor: the hidden state at every step, laid out like `x_in`
        """
        if self.batch_first:
            # Work internally in (seq, batch, feature) order.
            x_in = x_in.permute(1, 0, 2)
        seq_size, batch_size, _ = x_in.size()

        if hid_t is None:
            # Create the state directly on the input's device, not CPU-then-move.
            hid_t = torch.ones((batch_size, self.hidden_size),
                               dtype=x_in.dtype, device=x_in.device)
        else:
            hid_t = hid_t.to(x_in.device)

        hiddens = []
        for t in range(seq_size):
            hid_t = self._compute_next_hidden(x_in[t], hid_t)
            hiddens.append(hid_t)
        hiddens = torch.stack(hiddens)

        if self.batch_first:
            hiddens = hiddens.permute(1, 0, 2)

        return hiddens
    
    
class WordRNN(nn.Module):
    """Embedding -> ExplicitRNN -> linear layer, scoring a word at every timestep."""

    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, hidden_size,
                 batch_first=True):
        super(WordRNN, self).__init__()

        # Index 0 is the padding slot of the embedding table.
        self.emb = nn.Embedding(num_embeddings=in_vocab_size,
                                embedding_dim=embedding_size,
                                padding_idx=0)
        self.fc = nn.Linear(in_features=hidden_size, out_features=out_vocab_size)

        self.rnn = ExplicitRNN(input_size=embedding_size,
                               hidden_size=hidden_size,
                               batch_first=batch_first)

    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        """Score the vocabulary at every position of `x_in`.

        Args:
            x_in (torch.LongTensor): token indices
            x_lengths: accepted for interface compatibility; not used here
            apply_softmax (bool): if True, normalise the scores over the
                vocabulary dimension

        Returns:
            torch.Tensor: 3-D tensor whose first two dimensions match the RNN
            output and whose last dimension is the output vocabulary
        """
        hidden_states = self.rnn(self.emb(x_in))
        lead0, lead1, n_features = hidden_states.size()

        # Flatten the two leading dimensions so the linear layer sees a matrix.
        flat_states = hidden_states.contiguous().view(-1, n_features)
        scores = self.fc(flat_states)

        if apply_softmax:
            scores = F.softmax(scores, dim=1)

        return scores.view(lead0, lead1, -1)
    


def sequence_loss(net_output, y_true, loss_func=F.cross_entropy):
    """Loss over every non-padded position of a batch of sequences.

    Args:
        net_output (torch.Tensor): model scores; flattened by `normalize_sizes`
        y_true (torch.Tensor): target indices; flattened by `normalize_sizes`
        loss_func: callable taking (scores, targets, ignore_index=...).
            Previously this argument was silently ignored. For backward
            compatibility with call sites that pass the ignore index in this
            position, a non-callable value is treated as the ignore index and
            cross entropy is used.

    Returns:
        torch.Tensor: scalar loss
    """
    if callable(loss_func):
        ignore_index = IGNORE_INDEX_VALUE
    else:
        # Legacy call style: sequence_loss(pred, target, IGNORE_INDEX_VALUE).
        ignore_index = loss_func
        loss_func = F.cross_entropy

    net_output, y_true = normalize_sizes(net_output, y_true)
    return loss_func(net_output, y_true, ignore_index=ignore_index)

def normalize_sizes(net_output, y_true):
    """Flatten predictions and targets so they line up row for row.

    Both tensors are moved to the CPU. A 3-D `net_output` is flattened to
    (rows, features) and a 2-D `y_true` to a 1-D vector; other ranks are
    returned unchanged.

    `reshape` is used rather than `view` so that non-contiguous inputs (for
    example permuted tensors) are handled; the earlier `.contiguous()` calls
    discarded their result and therefore had no effect.

    Args:
        net_output (torch.Tensor): model scores
        y_true (torch.Tensor): target indices

    Returns:
        tuple: (net_output, y_true) after flattening
    """
    net_output = net_output.cpu()
    y_true = y_true.cpu()
    if net_output.dim() == 3:
        net_output = net_output.reshape(-1, net_output.size(2))
    if y_true.dim() == 2:
        y_true = y_true.reshape(-1)
    return net_output, y_true

def compute_accuracy(y_pred, y_true, mask_index):
    """Percentage of non-masked positions whose arg-max prediction is correct.

    Args:
        y_pred (torch.Tensor): model scores; flattened by `normalize_sizes`
        y_true (torch.Tensor): target indices; flattened by `normalize_sizes`
        mask_index (int): target value marking positions to exclude

    Returns:
        float: accuracy in the range [0, 100]; 0.0 when every position is
        masked (previously this case raised ZeroDivisionError)
    """
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)

    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()

    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    if n_valid == 0:
        # Nothing to score in this batch.
        return 0.0
    return n_correct / n_valid * 100

### sampling functions

In the last couple of examples, we've seen sampling. We include the sampling code up front here so that we can sample during training to get a sense of what's going on!

In [4]:
def sample(emb, rnn, fc, h_t=None, idx_t=None, n=20, temp=1):
    """Draw `n` tokens per batch row by repeatedly stepping the RNN.

    At each step the current indices are embedded, the hidden state is advanced
    with `rnn._compute_next_hidden`, and the next index is drawn from the
    softmax of `fc(h_t) / temp`.

    Args:
        emb: embedding module mapping indices to vectors
        rnn: object exposing `_compute_next_hidden(x, h)`
        fc: module mapping a hidden state to vocabulary scores
        h_t (torch.Tensor): initial hidden state, (batch, hidden); required
        idx_t (torch.LongTensor): initial indices, (batch,); required
        n (int): number of tokens to draw
        temp (float): softmax temperature; lower values sharpen the distribution

    Returns:
        torch.LongTensor: shape (batch, n + 1); column 0 is the initial `idx_t`

    Raises:
        ValueError: if `h_t` or `idx_t` is not provided
    """
    if h_t is None or idx_t is None:
        raise ValueError("sample() needs both an initial hidden state (h_t) "
                         "and initial indices (idx_t)")

    indices = [idx_t]

    # Sampling never needs gradients; skipping the graph saves memory.
    with torch.no_grad():
        for _ in range(n):
            x_t = emb(idx_t)
            h_t = rnn._compute_next_hidden(x_t, h_t)

            probabilities = F.softmax(fc(h_t) / temp, dim=1)
            idx_t = torch.multinomial(probabilities, 1)[:, 0]

            indices.append(idx_t)

    # Stacking on dim=1 gives (batch, n + 1) directly and, unlike
    # squeeze()+permute(), keeps working for batch size 1 and for n == 0.
    return torch.stack(indices, dim=1)

def make_initial_hidden(batch_size, hidden_size):
    """Return a (batch_size, hidden_size) tensor of ones to seed the recurrence."""
    shape = (batch_size, hidden_size)
    return torch.ones(shape)
    
def make_initial_x(batch_size, vectorizer):
    """Return `batch_size` copies of the vocabulary's start index as an int64 vector."""
    start_index = vectorizer.word_vocab.start_index
    ones = torch.ones(batch_size, dtype=torch.int64)
    return ones * start_index


def decode_one(vectorizer, seq):
    """Turn one index sequence into text.

    Start indices are skipped, decoding stops at the first end index, and the
    remaining tokens are looked up in the vocabulary and joined with spaces.
    """
    words = []
    for token_index in seq:
        vocab = vectorizer.word_vocab
        if vocab.start_index == token_index:
            continue
        if vocab.end_index == token_index:
            break
        words.append(vocab.lookup(token_index))
    return ' '.join(words)
            
def decode_matrix(vectorizer, mat):
    """Decode every row of an index tensor into a string via `decode_one`."""
    rows = mat.cpu().detach().numpy()
    return [decode_one(vectorizer, rows[row_number])
            for row_number in range(len(rows))]

## Make, Train, and Eval

In [5]:
# Every tunable setting of the notebook lives in this single namespace.
args = Namespace(
    trump_csv="../data/trump.csv",
    glove_filename="../data/glove.6B.100d.txt",
    batch_size=128,
    cuda=True,
    learning_rate=0.001,
    num_epochs=100,
    load_zoo_model=True,
    # Locations and hyper-parameters of the pre-trained ("zoo") model.
    zoo={
        'filename': '../modelzoo/wordrnn_emb100_hid64_trump_tweets_predict.state',
        'vocab': '../modelzoo/trump_twitter.vocab',
        'comments': 'pre-trained trump sequence prediction (& generation)',
        'parameters': {
            'embedding_size': 100,
            'hidden_size': 64
        }
    }

)


# Check CUDA: fall back to the CPU when no GPU is available.
if not torch.cuda.is_available():
    args.cuda = False

print("Using CUDA: {}".format(args.cuda))

# Device object used below for the model and for every batch.
args.device = torch.device("cuda" if args.cuda else "cpu")
args.device

Using CUDA: True


device(type='cuda')

In [6]:
# optional: set this to false to learn from scratch!
# args.load_zoo_model = False

In [7]:
# Read the tweet CSV into a DataFrame.
raw_data = RawTrumpTweets(args.trump_csv).get_data()

# Reuse the saved vectorizer when its file exists; otherwise fit a new one
# on the raw data.
if os.path.exists(args.zoo['vocab']):
    vectorizer = TrumpTweetVectorizer.load(args.zoo['vocab'])
    print("Loading vectorizer!")
else:
    vectorizer = TrumpTweetVectorizer.fit(raw_data)
    print("Creating a new vectorizer.")

# Vectorize each split separately (rows are selected by the `split` column).
train_dataset = vectorizer.transform(raw_data, split='train')
test_dataset = vectorizer.transform(raw_data, split='test')

zoo_params = args.zoo['parameters']

# Input and output vocabularies are the same: the model predicts the next word.
net = WordRNN(embedding_size=zoo_params['embedding_size'], 
              hidden_size=zoo_params['hidden_size'],
              in_vocab_size=len(vectorizer.word_vocab), 
              out_vocab_size=len(vectorizer.word_vocab), 
              batch_first=True)

Loading vectorizer!


### quick inspection of model performance 

Before we load the model weights, what do the samples look like? 

In [8]:
# initial vectors: a hidden state of ones and the start index, for 8 samples
initial_hidden = make_initial_hidden(batch_size=8, hidden_size=args.zoo['parameters']['hidden_size'])
initial_x = make_initial_x(batch_size=8, vectorizer=vectorizer)

# sampled matrix of indices; n=5 limits each sample to five drawn tokens
sample_matrix = sample(net.emb, net.rnn, net.fc, 
                       initial_hidden, initial_x,
                       temp=0.8, n=5)

# decode matrix into text!
decode_matrix(vectorizer, sample_matrix)



['solve Important Your Macomb appearance',
 'trip biggest 43 lion Four',
 'describing fired ball opinions lowest',
 'take disproportionate Catholics receive ALConvention2016',
 'poses Frozen Lines SHARE flip',
 'FactCheck facility Good -- entertainer',
 'Former operative November Wednesday blows',
 'violation reviewing You treaties Malik']

###  loading model weights

In [9]:
# Load the pre-trained weights only when requested AND the file is present.
if args.load_zoo_model and os.path.exists(args.zoo['filename']):
    print("Loading state dict!")
    # The map_location lambda returns each storage unchanged, so tensors are
    # loaded onto the CPU regardless of the device they were saved from.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    net.load_state_dict(torch.load(args.zoo['filename'], map_location=lambda storage, loc: storage))
else:
    print("Using newly initiated network!")

Loading state dict!


### introspection post model weight load

Notice we drop the `n` argument to `sample` this time, so it falls back to its default. This was done assuming pretrained model weights were loaded: we assume the model is now able to predict the end of the sequence. Before, it could potentially go on forever.

In [10]:
# initial vectors: a hidden state of ones and the start index, for 8 samples
initial_hidden = make_initial_hidden(batch_size=8, hidden_size=args.zoo['parameters']['hidden_size'])
initial_x = make_initial_x(batch_size=8, vectorizer=vectorizer)

# sampled matrix of indices; `n` is omitted, so sample() uses its default of 20
sample_matrix = sample(net.emb, net.rnn, net.fc, 
                       initial_hidden, initial_x,
                       temp=0.8)

# decode matrix into text! (decoding stops at the first end token in each row)
decode_matrix(vectorizer, sample_matrix)

['Poll : Trump 38 Carson has no sense of politics . We need great American prosperity . This is the',
 'Poll is going to spend on immigration !',
 'for massive Muslim problem works rally last night . It is is rigged down . Not well ۪ t be',
 'who tells it to me that I want to refocus NATO in Iowa to Barack Obama ( in primary states',
 'who can negotiate his incompetent deal - for all of my new book against me concerning s president , VETS',
 'the grubby is full of the twelve year old gives up down for the house , live & amp ;',
 'who voted for an arm of the U . S . P . Let ۪ s I want to thank',
 'Poll with Trump - wrong & amp ; incompetence that are for a speech . MAGA !']

In [11]:
# Train `net` for `args.num_epochs` epochs, evaluating on the test split after
# every epoch. Mean loss/accuracy per epoch is kept in the *_history lists.
net = net.to(args.device)

# optimizer
optimizer = optim.Adam(net.parameters(), lr=args.learning_rate)

# progress bars

epoch_bar = tqdm_notebook(desc='epochs', total=args.num_epochs, position=1)

num_train_batches = len(train_dataset) // args.batch_size
train_bar = tqdm_notebook(desc='training', total=num_train_batches, position=2)

num_test_batches = len(test_dataset) // args.batch_size
test_bar = tqdm_notebook(desc='test', total=num_test_batches, position=3)

# history

train_loss_history = []
train_accuracy_history = []

test_loss_history = []
test_accuracy_history = []


try:
    for _ in range(args.num_epochs):
        batch_generator = generate_batches(train_dataset, batch_size=args.batch_size,
                                           device=args.device)

        per_epoch_loss = []
        per_epoch_accuracy = []

        net.train()

        for batch_dict in batch_generator:

            # step 1: clear gradients left over from the previous batch
            optimizer.zero_grad()

            # step 2: forward pass
            y_pred = net(batch_dict['x_words'], batch_dict['x_lengths'])
            y_target = batch_dict['y_words']

            # step 3: loss. sequence_loss already ignores IGNORE_INDEX_VALUE;
            # its third parameter is a loss function, not an index.
            loss = sequence_loss(y_pred, y_target)

            # step 4: backward pass and parameter update
            loss.backward()
            optimizer.step()

            # bonus steps: bookkeeping

            per_epoch_loss.append(loss.item())

            accuracy = compute_accuracy(y_pred, y_target, IGNORE_INDEX_VALUE)
            per_epoch_accuracy.append(accuracy)

            train_bar.update()

            train_bar.set_postfix(loss=per_epoch_loss[-1],
                                  accuracy=per_epoch_accuracy[-1])

        train_loss_history.append(np.mean(per_epoch_loss))
        train_accuracy_history.append(np.mean(per_epoch_accuracy))

        # loop over test dataset

        batch_generator = generate_batches(test_dataset, batch_size=args.batch_size,
                                           device=args.device)
        per_epoch_loss = []
        per_epoch_accuracy = []

        # set it to eval mode; this turns stochastic functions off
        net.eval()

        # Evaluation needs no gradients; no_grad avoids building the autograd
        # graph for every test batch and so lowers memory use.
        with torch.no_grad():
            for batch_dict in batch_generator:
                # step 1: compute output
                y_pred = net(batch_dict['x_words'], batch_dict['x_lengths'])
                y_target = batch_dict['y_words']

                # step 2: compute metrics
                loss = sequence_loss(y_pred, y_target)
                per_epoch_loss.append(loss.item())

                accuracy = compute_accuracy(y_pred, y_target, IGNORE_INDEX_VALUE)
                per_epoch_accuracy.append(accuracy)

                test_bar.update()

                test_bar.set_postfix(loss=per_epoch_loss[-1],
                                     accuracy=per_epoch_accuracy[-1])

        test_loss_history.append(np.mean(per_epoch_loss))
        test_accuracy_history.append(np.mean(per_epoch_accuracy))

        # update bars

        epoch_bar.set_postfix(train_loss=train_loss_history[-1],
                              train_accuracy=train_accuracy_history[-1],
                              test_loss=test_loss_history[-1],
                              test_accuracy=test_accuracy_history[-1])
        epoch_bar.update()
        # rewind the per-epoch bars for the next epoch
        test_bar.n = 0
        train_bar.n = 0

except KeyboardInterrupt:
    # Interrupting the cell stops training but keeps the model and histories.
    print("...")

HBox(children=(IntProgress(value=0, description='epochs'), HTML(value='')))

HBox(children=(IntProgress(value=0, description='training', max=38), HTML(value='')))

HBox(children=(IntProgress(value=0, description='test', max=9), HTML(value='')))

RuntimeError: $ Torch: not enough memory: you tried to allocate 0GB. Buy new RAM! at /pytorch/aten/src/TH/THGeneral.cpp:204

In [None]:
# Save model: write the network's state dict to the current directory.
torch.save(net.state_dict(), '04-Word-RNN-Trump-Generator.pt')


## Hypothetical Scenario

Up until now, you haven't had pretrained weights. You've only been using the freshly initialized network. Suddenly, you realize pre-trained word vectors exist and can be used in your network!

So, we will start by loading the glove word vectors and then incorporating them into our network

In [None]:
def load_word_vectors(filename):
    """Read a whitespace-separated word-vector text file.

    Each line is expected to hold a token followed by its float components,
    separated by single spaces. The file is streamed line by line rather than
    read fully into memory first.

    Args:
        filename (str): path to the vector file (read as UTF-8)

    Returns:
        tuple: (word_to_index, word_vectors, word_vector_size) where
            word_to_index maps a token to its position in word_vectors,
            word_vectors is a list of 1-D numpy arrays, and
            word_vector_size is the length of the first vector

    Raises:
        ValueError: if the file contains no vectors
    """
    word_to_index = {}
    word_vectors = []

    with open(filename, encoding="utf-8") as fp:
        for line in tqdm_notebook(fp, leave=False):
            fields = line.rstrip("\r\n").split(" ")
            if len(fields) < 2:
                # blank or malformed line: nothing to parse
                continue

            word = fields[0]
            # Index by the vector list's length so the mapping stays aligned
            # even if a token appears more than once in the file.
            word_to_index[word] = len(word_vectors)

            vec = np.array([float(x) for x in fields[1:]])
            word_vectors.append(vec)

    if not word_vectors:
        raise ValueError("no word vectors found in {}".format(filename))

    word_vector_size = len(word_vectors[0])
    return word_to_index, word_vectors, word_vector_size

# Parse the GloVe file named in the config cell.
word_to_index, word_vectors, word_vector_size = load_word_vectors(args.glove_filename)

Now, we want to collate what we have from the word vectors with what is in our vocabulary!

In [None]:
# Shape of the embedding table: (number of embeddings, embedding dimension).
net.emb.weight.size() 

In [None]:
# Build a fresh, randomly initialised network with the same sizes as before;
# the previous `net` object (and its weights) is replaced.
net = WordRNN(embedding_size=zoo_params['embedding_size'], 
              hidden_size=zoo_params['hidden_size'],
              in_vocab_size=len(vectorizer.word_vocab), 
              out_vocab_size=len(vectorizer.word_vocab), 
              batch_first=True)

net = net.to(args.device)

In [None]:
# Copy the GloVe vector of every vocabulary word that GloVe knows (matched on
# the lower-cased word) into the corresponding row of the embedding table.
#
# The rows are written with copy_ under no_grad. The earlier
# `weight.data[i, :].set_(vec)` only re-pointed a temporary view at the GloVe
# storage and left the embedding matrix itself untouched.
n = 0
with torch.no_grad():
    for word, emb_index in tqdm_notebook(vectorizer.word_vocab.items(), leave=False):
        glove_index = word_to_index.get(word.lower())
        if glove_index is None:
            # word has no pre-trained vector; keep its random initialisation
            continue

        glove_vec = torch.as_tensor(word_vectors[glove_index],
                                    dtype=net.emb.weight.dtype,
                                    device=net.emb.weight.device)
        net.emb.weight[emb_index].copy_(glove_vec)
        n += 1

print(n, 'replaced')

### Re-running training with these vectors

While you won't be able to really appreciate the gains if training on a cpu, if you were to run the code again (this is the same training routine as above), you would see faster convergence as a lot of the ground work is already done by the pre-trained embeddings!

In [None]:
# Train `net` for `args.num_epochs` epochs, evaluating on the test split after
# every epoch. Mean loss/accuracy per epoch is kept in the *_history lists.
net = net.to(args.device)

# optimizer
optimizer = optim.Adam(net.parameters(), lr=args.learning_rate)

# progress bars

epoch_bar = tqdm_notebook(desc='epochs', total=args.num_epochs, position=1)

num_train_batches = len(train_dataset) // args.batch_size
train_bar = tqdm_notebook(desc='training', total=num_train_batches, position=2)

num_test_batches = len(test_dataset) // args.batch_size
test_bar = tqdm_notebook(desc='test', total=num_test_batches, position=3)

# history

train_loss_history = []
train_accuracy_history = []

test_loss_history = []
test_accuracy_history = []


try:
    for _ in range(args.num_epochs):
        batch_generator = generate_batches(train_dataset, batch_size=args.batch_size,
                                           device=args.device)

        per_epoch_loss = []
        per_epoch_accuracy = []

        net.train()

        for batch_dict in batch_generator:

            # step 1: clear gradients left over from the previous batch
            optimizer.zero_grad()

            # step 2: forward pass
            y_pred = net(batch_dict['x_words'], batch_dict['x_lengths'])
            y_target = batch_dict['y_words']

            # step 3: loss. sequence_loss already ignores IGNORE_INDEX_VALUE;
            # its third parameter is a loss function, not an index.
            loss = sequence_loss(y_pred, y_target)

            # step 4: backward pass and parameter update
            loss.backward()
            optimizer.step()

            # bonus steps: bookkeeping

            per_epoch_loss.append(loss.item())

            accuracy = compute_accuracy(y_pred, y_target, IGNORE_INDEX_VALUE)
            per_epoch_accuracy.append(accuracy)

            train_bar.update()

            train_bar.set_postfix(loss=per_epoch_loss[-1],
                                  accuracy=per_epoch_accuracy[-1])

        train_loss_history.append(np.mean(per_epoch_loss))
        train_accuracy_history.append(np.mean(per_epoch_accuracy))

        # loop over test dataset

        batch_generator = generate_batches(test_dataset, batch_size=args.batch_size,
                                           device=args.device)
        per_epoch_loss = []
        per_epoch_accuracy = []

        # set it to eval mode; this turns stochastic functions off
        net.eval()

        # Evaluation needs no gradients; no_grad avoids building the autograd
        # graph for every test batch and so lowers memory use.
        with torch.no_grad():
            for batch_dict in batch_generator:
                # step 1: compute output
                y_pred = net(batch_dict['x_words'], batch_dict['x_lengths'])
                y_target = batch_dict['y_words']

                # step 2: compute metrics
                loss = sequence_loss(y_pred, y_target)
                per_epoch_loss.append(loss.item())

                accuracy = compute_accuracy(y_pred, y_target, IGNORE_INDEX_VALUE)
                per_epoch_accuracy.append(accuracy)

                test_bar.update()

                test_bar.set_postfix(loss=per_epoch_loss[-1],
                                     accuracy=per_epoch_accuracy[-1])

        test_loss_history.append(np.mean(per_epoch_loss))
        test_accuracy_history.append(np.mean(per_epoch_accuracy))

        # update bars

        epoch_bar.set_postfix(train_loss=train_loss_history[-1],
                              train_accuracy=train_accuracy_history[-1],
                              test_loss=test_loss_history[-1],
                              test_accuracy=test_accuracy_history[-1])
        epoch_bar.update()
        # rewind the per-epoch bars for the next epoch
        test_bar.n = 0
        train_bar.n = 0

except KeyboardInterrupt:
    # Interrupting the cell stops training but keeps the model and histories.
    print("...")

In [None]:
# Save model: write the retrained network's state dict to the current directory.
torch.save(net.state_dict(), '04-Word-RNN-Trump-Generator_Retrained.pt')


In [None]:
# Move the model to the CPU: the initial tensors built below are created
# without a device argument, so they are CPU tensors.
net = net.cpu()

# initial vectors: a hidden state of ones and the start index, for 8 samples
initial_hidden = make_initial_hidden(batch_size=8, hidden_size=args.zoo['parameters']['hidden_size'])
initial_x = make_initial_x(batch_size=8, vectorizer=vectorizer)

# sampled matrix of indices; `n` is omitted, so sample() uses its default of 20
sample_matrix = sample(net.emb, net.rnn, net.fc, 
                       initial_hidden, initial_x,
                       temp=0.8)

# decode matrix into text!
decode_matrix(vectorizer, sample_matrix)