# Project 2
Kai Bagley - 21984315

* Task 2

#### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm

## Task 2: Multiclass Document Classification

This time, I will be using the `ACTIVITY_CD` labels as classes, and continue to use `NARRATIVE` as the documents.

#### Define `Task2Dataset` class

I must define a new dataset class, since the column names in the dataframe are all different.

The descriptions for many of the below classes are omitted, as they are just the same as in Task 1, and are defined because Task 2 must be in a seperate notebook.

In [2]:
from torch.utils.data import Dataset

import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

class Task2Dataset(Dataset):
    def __init__(self, train_df, test_df, valid_df, vectorizer):
        """
        Args:
            train_df (pandas.DataFrame): Training dataset
            test_df (pandas.DataFrame): Test dataset
            valid_df (pandas.DataFrame): Validation dataset
            vectorizer (object): Vectorizer created from dataset
        """
        self._vectorizer = vectorizer
        
        self.train_df = train_df
        self.train_size = len(self.train_df)
        
        self.test_df = test_df
        self.test_size = len(self.test_df)
        
        self.valid_df = valid_df
        self.valid_size = len(self.valid_df)
        
        self.df = train_df.append(test_df).append(valid_df)
        
        # +2 for end and begin tokens
        measure_len = lambda sent: len(sent.split(" "))
        self._max_seq_length = max(map(measure_len, self.df["NARRATIVE"])) + 2
        
        self._lookup_dict = {"train": (self.train_df, self.train_size),
                             "test": (self.test_df, self.test_size),
                             "valid": (self.valid_df, self.valid_size)}
        self.set_split("train")
        
        # Class weights
        class_counts = self.train_df["ACTIVITY_CD"].value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.activity_codes.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        freq = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(freq, dtype=torch.float32)
        
    @classmethod
    def load_dataset_make_vectorizer(cls, folder_path, vectorizer):
        """Load dataset and make a new vectorizer from it
        Args:
            csv (str): Path to folder containing data CSVs
            vectorizer (object): One of the two Vectorizer classes,
                OHVectorizer for one hot, or 
                PEVectorizer for use with pretrained embeddings
        Returns:
            Instance of Task1Dataset
        """
        train_df = pd.read_csv(folder_path + "train.csv")
        test_df  = pd.read_csv(folder_path + "test.csv")
        valid_df = pd.read_csv(folder_path + "valid.csv")
        
        # Lemmatize
        for x in [train_df, test_df, valid_df]:
            x["NARRATIVE"] = x["NARRATIVE"].apply(lambda x:" ".join([lemmatizer.lemmatize(w) for w in x.split(" ")]))
        
        return cls(train_df, test_df, valid_df, vectorizer.from_dataframe(train_df))
    
    def get_vectorizer(self):
        """Returns vectorizer"""
        return self._vectorizer
    
    def set_split(self, split="train"):
        """Selects the chosen dataset
        Args: 
            split (str): Select "train", "test", "valid"
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """Allow indexing of dataset
        Args:
            index (int): Index of desired datapoint
        Returns:
            Dictionary with datapoint's features and labels
        """
        row = self._target_df.iloc[index]
        
        narr_vector, vec_length = self._vectorizer.vectorize(row["NARRATIVE"], self._max_seq_length)
        activity_idx = self._vectorizer.activity_codes.lookup_token(row["ACTIVITY_CD"])
        return {"features":narr_vector,
                "labels":activity_idx,
                "feat_length":vec_length}

    def get_num_batches(self, batch_size):
        """Return number of batches in dataset from a given batch size
        Args:
            batch_size (int)
        Returns:
            Number of batches in the dataset
        """
        return len(self) // batch_size

In [3]:
from torch.utils.data import DataLoader
def gen_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    """Generator function, wraps PyTorch DataLoader and ensures 
    each tensor is in the right device
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [4]:
class Vocabulary(object):
    """Processes text and extracts vocab"""
    def __init__(self, tok_to_idx=None):
        """Args:
            tok_to_idx (dict): Dictionary that maps tokens to indices
        """
        if tok_to_idx is None:
            tok_to_idx = {}
        self._tok_to_idx = tok_to_idx
        self._idx_to_tok = {idx:token for token, idx in self._tok_to_idx.items()}

    def to_serializable(self):
        return {"tok_to_idx":self._tok_to_idx}

    @classmethod
    def from_serializable(cls, dictionary):
        """Creates a Vocabulary from a serialized dict"""
        return cls(**dictionary)

    def add_token(self, token):
        """Update Vocabulary with a new token

        Args:
            token (str): The token to add to the Vocabulary
        Returns:
            index (int): Integer index corresponding to the token 
        """

        if token in self._tok_to_idx:
            index = self._tok_to_idx[token]
        else:
            index = len(self._tok_to_idx)
            self._tok_to_idx[token] = index
            self._idx_to_tok[index] = token
        return index

    def add_tokens(self, tokens):
        """Updates Vocabulary with multiple tokens

        Args: 
            tokens (list): List of tokens (str)s
        Returns:
            indices (list): List of indices (int)s
        """

        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Returns index of a token

        Args:
            token (str): Token to to find index for
        Returns:
            index (int): The index of the token
        """
        return self._tok_to_idx[token]

    def lookup_index(self, index):
        """Returns token at index

        Args:
            index (int): Index to search for
        Returns:
            token (str): Associated token
        """

        if index not in self._idx_to_tok:
            raise KeyError(f"Index ({index}) is not in the Vocabulary")
        return self._idx_to_tok[index]

    def __str__(self):
        return f"<Vocabulary(size={len(self)})>"

    def __len__(self):
        return len(self._tok_to_idx)

In [5]:
# The below Vocabulary object is from lab 9

class SequenceVocabulary(Vocabulary):
    """Processes text and extracts vocab, for sequences"""
    def __init__(self, add_unk=True, tok_to_idx=None, unk_token="<UNK>", mask_token="<MASK>", begin_seq_token="<BEGIN>", end_seq_token="<END>"):
        """Args:
            tok_to_idx (dict): Dictionary that maps tokens to indices
            unk_token (str): The UNK token that will be added to the Vocabulary
            mask_token (str): Used as padding for embedding
            begin_seq_token (str): Start of a sequence
            end_seq_token (str): End of a sequence
        """
        super(SequenceVocabulary, self).__init__(tok_to_idx)
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        self._mask_token = mask_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token
        
        self.unk_index = self.add_token(self._unk_token)
        self.mask_index = self.add_token(self._mask_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)
        
    def to_serializable(self):
        return {"tok_to_idx":self._tok_to_idx,
                "unk_token":self._unk_token,
                "mask_token":self._mask_token,
                "begin_seq_token":self._begin_seq_token,
                "end_seq_token":self._end_seq_token}

    def lookup_token(self, token):
        """Returns index of a token, or <UNK> index if not present

        Args:
            token (str): Token to to find index for
        Returns:
            index (int): The index of the token
        """
        if self.unk_index >= 0:
            return self._tok_to_idx.get(token, self.unk_index)
        else:
            return self._tok_to_idx[token]

In [6]:
from collections import Counter
import string

class Task2Vectorizer(object):
    """Vectorizer for Task 2"""
    def __init__(self, narrative_vocab, activity_codes):
        """
        Args:
            narrative_vocab (Vocabulary): Vocab for the narrative feature
            activity_codes (Vocabulary): Injuries are already encoded as integers
        """
        self.narrative_vocab = narrative_vocab
        self.activity_codes = activity_codes
    
    def vectorize(self, narrative, vec_length=-1):
        """
        Args:
            narrative (str): The space-separated narrative
            vec_length (int): Fix length of vector
        Returns:
            out (np.ndarray): Vectorized narrative
        """
        idx = [self.narrative_vocab.begin_seq_index]
        idx.extend(self.narrative_vocab.lookup_token(token) for token in narrative.split(" "))
        idx.append(self.narrative_vocab.end_seq_index)
        
        if vec_length < 0:
            vec_length = len(idx)
            
        out = np.zeros(vec_length, dtype=np.int64)
        out[:len(idx)] = idx
        out[len(idx):] = self.narrative_vocab.mask_index
        
        return out, len(idx)
    
    @classmethod
    def from_dataframe(cls, df, cutoff=0):
        """Create a vectorizer from a dataframe
        
        Args:
            df (pandas.DataFrame): Dataframe
        Returns:
            Task2Vectorizer object
        """
        word_counter = Counter()
        narrative_vocab = SequenceVocabulary(add_unk=True)
        for narrative in sorted(set(df["NARRATIVE"])):
            for token in narrative.split(" "):
                if token not in string.punctuation:
                    word_counter[token] += 1
                    
        for word, count in word_counter.items():
            if count > cutoff:
                narrative_vocab.add_token(word)
        
        activity_codes = Vocabulary()
        for code in sorted(set(df["ACTIVITY_CD"])):
            activity_codes.add_token(code)
        
        return cls(narrative_vocab, activity_codes)

In [7]:
def column_gather(outputs, input_lengths):
    '''Get a vector from each batch datapoint in outputs 

    Args:
        outputs (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, sequence, feature)
        input_lengths (torch.LongTensor, torch.cuda.LongTensor)
            shape: (batch,)

    Returns:
        outputs (torch.FloatTensor, torch.cuda.FloatTensor)
            shape: (batch, feature)
    '''
    input_lengths = input_lengths.long().detach().cpu().numpy() - 1

    out = []
    for batch_index, column_index in enumerate(input_lengths):
        out.append(outputs[batch_index, column_index])

    return torch.stack(out)

### Define the RNN

Below I will define a Elman RNN cell, which will be more or less copied from the lab notes. This will be the main part of the classifier, which uses embeddings, fed into the RNN cell, fed into a dense later, which is fully connected to a dense layer as the output.

Inputs into the RNN will be word vectors from the dataset. These will be embedded using an embedding function within the RNN classfier. These embedded word vectors are run through a series of cells, in this case the RNN cell first, then two dense layers, with the second being the output layer.

In [8]:
class ElmanRNN(nn.Module):
    """Elman RNN using RNNCell"""
    def __init__(self, input_size, hidden_size, batch_first=False):
        """
        Args:
            input_size (int): size of input vector
            hidden_size (int): size of the hidden states
            batch_first (bool): true if 0th dim is batch
        """
        super(ElmanRNN, self).__init__()
        
        self.rnn_cell = nn.RNNCell(input_size, hidden_size)
        
        self.batch_first = batch_first
        self.hidden_size = hidden_size
        
    def _initial_hidden(self, batch_size):
        return torch.zeros((batch_size, self.hidden_size))
    
    def forward(self, inputs, initial_hidden=None):
        """
        Args:
            inputs (torch.Tensor): input data tensor,
                if self.batch_first: 
                    inputs.shape = (batch, seq_size, feature_size)
                else: 
                    inputs.shape = (seq_size, batch, feature_size)
            initial_hidden (torch.Tensor): the initial RNN hidden state
            
        Returns:
            hiddens (torch.Tensor): The outputs of the RNN at each step
                hiddens.shape = inputs.shape
        """
        if self.batch_first:
            batch_size, seq_size, feat_size = inputs.size()
            x = inputs.permute(1, 0, 2)
        else:
            seq_size, batch_size, feat_size = inputs.size()
            
        hiddens = []
        
        if initial_hidden is None:
            initial_hidden = self._initial_hidden(batch_size)
            initial_hidden = initial_hidden.to(x.device)
            
        hidden_t = initial_hidden
        
        for t in range(seq_size):
            hidden_t = self.rnn_cell(x[t], hidden_t)
            hiddens.append(hidden_t)
            
        hiddens = torch.stack(hiddens)
        
        if self.batch_first:
            hiddens = hiddens.permute(1, 0, 2)
        
        return hiddens
    

In [9]:
class NarrRNNClf(nn.Module):
    """Narrative classifier using RNN"""
    def __init__(self, embedding_size, num_embeddings, num_classes, rnn_hidden_size, batch_first=True, padding_idx=0):
        """
        Args:
            embedding_size (int): The size of embeddings
            num_embeddings (int): The number of chars to embed
            num_classes (int): The number of classes
            rnn_hidden_size (int): Size of RNN hidden state
            batch_first (bool): True batch is 0th dim
            padding_idx (int): Index for tensor pad        
        """
        super(NarrRNNClf, self).__init__()
        
        self.emb = nn.Embedding(num_embeddings=num_embeddings,
                                embedding_dim=embedding_size,
                                padding_idx=padding_idx)
        self.rnn = ElmanRNN(input_size=embedding_size,
                            hidden_size=rnn_hidden_size,
                            batch_first=batch_first)
        self.dense1 = nn.Linear(in_features=rnn_hidden_size,
                                out_features=rnn_hidden_size)
        self.dense2 = nn.Linear(in_features=rnn_hidden_size,
                                out_features=num_classes)
        self.relu    = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, inputs, in_lengths=None, apply_softmax=False):
        """
        Args:
            inputs (torch.Tensor): input data tensor
                inputs.shape = (batch, input_dim)
            in_lengths (torch.Tensor): lengths of each seq in batch
            apply_softmax (bool): true if use softmax activation.
        Returns:
            output tensor, tensor.shape = (batch, output_dim)
        """
        x_emb = self.emb(inputs)
        x = self.rnn(x_emb)
        
        if in_lengths is not None:
            x = column_gather(x, in_lengths)
        else:
            x = x[:, -1, :]
            
        x = self.dropout(x)
        x = self.dense1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.dense2(x)
        
        if apply_softmax:
            x = self.softmax(x)
        
        return x
    

### GRU Classifier

While I'm at it, I will also define the GRU classfier for the second part of Task 2. This just uses the PyTorch `nn.GRU` class as its main part.

Very similar to the above RNN classifier, it simply takes word vectors which are calculated in the `Task2Dataset` object, then embedded within the classifier `NarrGRUClf`. These are fed into a GRU cell, then into 1 dense layer as the output.

In [10]:
class NarrGRUClf(nn.Module):
    """Simple GRU nn"""
    def __init__(self, embedding_size, num_embeddings, hidden_size, num_classes, batch_first=True, padding_idx=0):
        super(NarrGRUClf, self).__init__()
        
        self.emb = nn.Embedding(num_embeddings=num_embeddings,
                                embedding_dim=embedding_size,
                                padding_idx=padding_idx)
        self.gru = nn.GRU(input_size=embedding_size,
                          hidden_size=hidden_size,
                          num_layers=1,
                          batch_first=batch_first)
        self.dense1 = nn.Linear(in_features=hidden_size,
                                out_features=num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, inputs, apply_softmax=False):
        x = self.emb(inputs)
        x, _ = self.gru(x)
        
        batch_size, seq_size, feat_size = x.shape
        x = x.contiguous().view(batch_size*seq_size, feat_size)
        
        x = self.dropout(x)
        x = self.dense1(x)
        
        if apply_softmax:
            x = self.softmax(x)
        
        new_feat_size = x.shape[-1]
        x = x.view(batch_size, seq_size, new_feat_size)
        
        return x
        

Redefine the arguments namespace once again for the RNN

In [11]:
from argparse import Namespace
args = Namespace(
    frequency_cutoff=0,
    narrative_folder_path="./data/task2/",
    model_state_file="./data/task2/RNNmodel.pth",
    vectorizer_file="./data/task2/RNNvectorizer.json",
    save_dir="./data/task2/",
    embedding_size=100,
    rnn_hidden_size=100,
    num_epochs=100,
    learning_rate=1e-3,
    batch_size=64,
    early_stopping_criteria=5,
    cuda=True
)
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

In [12]:
dataset = Task2Dataset.load_dataset_make_vectorizer(args.narrative_folder_path, Task2Vectorizer)
vectorizer = dataset.get_vectorizer()

clf = NarrRNNClf(embedding_size=args.embedding_size,
                 num_embeddings=len(vectorizer.narrative_vocab),
                 rnn_hidden_size=args.rnn_hidden_size,
                 num_classes=len(vectorizer.activity_codes),
                 padding_idx=vectorizer.narrative_vocab.mask_index)

clf = clf.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)

In [13]:
import torch.optim as optim

def make_train_state(args):
    return {"epoch_index": 0,
            "train_loss": [],
            "train_acc": [],
            "valid_loss": [],
            "valid_acc": [],
            "test_loss": 1,
            "test_acc": 1,
            "learning_rate":args.learning_rate,
            "stop_early":False,
            "early_stopping_step":0,
            "early_stopping_best_valid": 1e8,
            "model_filename": args.model_state_file}

def update_train_state(args, model, train_state):
    """Implements early stopping
    Args:
        args (Namespace): Main model arguments
        model (nn.Module): Model
        train_state (dict): Current training state
    """
    # Save first model
    if train_state["epoch_index"] == 0:
        torch.save(model.state_dict(), train_state["model_filename"])
        train_state["stop_early"] = False
    
    # Save model if performance increase
    if train_state["epoch_index"] >= 1:
        _, loss = train_state["valid_loss"][-2:]
        
        # If loss increased
        if loss >= train_state["early_stopping_best_valid"]:
            train_state["early_stopping_step"] += 1
        # If loss decreased
        else:
            # Save best model
            if loss < train_state["early_stopping_best_valid"]:
                torch.save(model.state_dict(), train_state["model_filename"])
            
            train_state["early_stopping_step"] = 0
            
        train_state["stop_early"] = train_state["early_stopping_step"] >= args.early_stopping_criteria
        
    return train_state

def compute_accuracy(y_pred, y_targ):
    _, y_pred_idx = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_idx, y_targ).sum().item()
    return n_correct / len(y_pred_idx) * 100

#### Training

In [14]:
import torch.optim as optim

loss_f    = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(clf.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode="min", factor=0.5, patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc="Epoch",
                 total=args.num_epochs,
                 position=0)

dataset.set_split("train")
train_bar = tqdm(desc="Train Split",
                 total=dataset.get_num_batches(args.batch_size),
                 position=1,
                 leave=True)

for epoch_index in range(args.num_epochs):
    train_state["epoch_index"] = epoch_index
    
    # Training dataset
    dataset.set_split("train")
    batch_gen = gen_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    clf.train()
    for batch_index, batch_dict in enumerate(batch_gen):
        # Zero gradients
        optimizer.zero_grad()
        # Compute output
        y_pred = clf(inputs=batch_dict["features"], 
                     in_lengths=batch_dict["feat_length"])
        # Compute loss
        loss = loss_f(y_pred, batch_dict["labels"])
        loss_batch = loss.item()
        running_loss += (loss_batch-running_loss) / (batch_index+1)
        # Compute gradients using loss
        loss.backward()
        # Use optimizer to take gradient step
        optimizer.step()
        # Compute accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict["labels"])
        running_acc += (acc_batch-running_acc) / (batch_index+1)
        
        # Update the bar
        train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
        train_bar.update()
            
    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)
    
    # Validation dataset
    dataset.set_split("valid")
    batch_gen = gen_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    clf.eval()
    for batch_index, batch_dict in enumerate(batch_gen):
        # Compute output
        y_pred = clf(inputs=batch_dict["features"],
                     in_lengths=batch_dict["feat_length"])
        # Compute loss
        loss = loss_f(y_pred, batch_dict["labels"])
        loss_batch = loss.item()
        running_loss += (loss_batch-running_loss) / (batch_index+1)
        # Compute accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict["labels"])
        running_acc += (acc_batch-running_acc) / (batch_index+1)
        
    train_state["valid_loss"].append(running_loss)
    train_state["valid_acc"].append(running_acc)
    
    train_state = update_train_state(args=args, model=clf, train_state=train_state)

    scheduler.step(train_state['valid_loss'][-1])

    train_bar.n = 0
    epoch_bar.update()
    
    if train_state['stop_early']:
        break
train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
train_bar.update()

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Train Split:   0%|          | 0/14 [00:00<?, ?it/s]

#### Evaluate on test set

In [15]:
# Test dataset
dataset.set_split("test")
batch_gen = gen_batches(dataset, batch_size=args.batch_size, device=args.device)
running_loss = 0.0
running_acc = 0.0
clf.eval()

for batch_index, batch_dict in enumerate(batch_gen):
    # Compute output
    y_pred = clf(inputs=batch_dict["features"], 
                 in_lengths=batch_dict["feat_length"])
    # Compute loss
    loss = loss_f(y_pred, batch_dict["labels"])
    loss_batch = loss.item()
    running_loss += (loss_batch-running_loss) / (batch_index+1)
    # Compute accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict["labels"])
    running_acc += (acc_batch-running_acc) / (batch_index+1)

train_state["test_loss"] = running_loss
train_state["test_acc"] = running_acc

In [16]:
print(f"Test accuracy: {train_state['test_acc']}")
print(f"Test loss    : {train_state['test_loss']}")

Test accuracy: 3.515625
Test loss    : 2.8411765694618225


#### Discussion

Clearly the RNN defined here is a terrible example of an RNN, as the purpose of an RNN is to run very well on sequences, which is what sentences/documents are! But due to time constraints, I was unable to troubleshoot this stage too much.

### GRU classifier

Define the classifier, dataset etc.

In [17]:
dataset = Task2Dataset.load_dataset_make_vectorizer(args.narrative_folder_path, Task2Vectorizer)
vectorizer = dataset.get_vectorizer()

clf = NarrGRUClf(embedding_size=args.embedding_size,
                 num_embeddings=len(vectorizer.narrative_vocab),
                 hidden_size=args.rnn_hidden_size,
                 num_classes=len(vectorizer.activity_codes),
                 padding_idx=vectorizer.narrative_vocab.mask_index)

clf = clf.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)

In [18]:
def normalize_sizes(y_pred, y_true):
    """Normalize tensor sizes
    
    Args:
        y_pred (torch.Tensor): the output of the model
            If a 3-dimensional tensor, reshapes to a matrix
        y_true (torch.Tensor): the target predictions
            If a matrix, reshapes to be a vector
    """
    if len(y_pred.size()) == 3:
#         y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
        y_pred = y_pred.contiguous().view(y_pred.size(0),-1)
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()
    
    return n_correct / n_valid*100

def sequence_loss(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)

#### Training

In [19]:
import torch.optim as optim

mask_index = vectorizer.narrative_vocab.mask_index
optimizer = optim.Adam(clf.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode="min", factor=0.5, patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm(desc="Epoch",
                 total=args.num_epochs,
                 position=0)

dataset.set_split("train")
train_bar = tqdm(desc="Train Split",
                 total=dataset.get_num_batches(args.batch_size),
                 position=1,
                 leave=True)

for epoch_index in range(args.num_epochs):
    train_state["epoch_index"] = epoch_index
    
    # Training dataset
    dataset.set_split("train")
    batch_gen = gen_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    clf.train()
    for batch_index, batch_dict in enumerate(batch_gen):
        # Zero gradients
        optimizer.zero_grad()
        # Compute output
        y_pred = clf(inputs=batch_dict["features"])
        # Compute loss
        loss = sequence_loss(y_pred, batch_dict["labels"], mask_index)
        loss_batch = loss.item()
        running_loss += (loss_batch-running_loss) / (batch_index+1)
        # Compute gradients using loss
        loss.backward()
        # Use optimizer to take gradient step
        optimizer.step()
        # Compute accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict["labels"], mask_index)
        running_acc += (acc_batch-running_acc) / (batch_index+1)
        
        # Update the bar
        train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
        train_bar.update()
            
    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)
    
    # Validation dataset
    dataset.set_split("valid")
    batch_gen = gen_batches(dataset, batch_size=args.batch_size, device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    clf.eval()
    for batch_index, batch_dict in enumerate(batch_gen):
        # Compute output
        y_pred = clf(inputs=batch_dict["features"])
        # Compute loss
        loss = sequence_loss(y_pred, batch_dict["labels"], mask_index)
        loss_batch = loss.item()
        running_loss += (loss_batch-running_loss) / (batch_index+1)
        # Compute accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict["labels"], mask_index)
        running_acc += (acc_batch-running_acc) / (batch_index+1)
        
    train_state["valid_loss"].append(running_loss)
    train_state["valid_acc"].append(running_acc)
    
    train_state = update_train_state(args=args, model=clf, train_state=train_state)

    scheduler.step(train_state['valid_loss'][-1])

    train_bar.n = 0
    epoch_bar.update()
    
    if train_state['stop_early']:
        break
train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
train_bar.update()

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Train Split:   0%|          | 0/14 [00:00<?, ?it/s]

#### Evaluate on test set

In [20]:
dataset.set_split("test")
batch_gen = gen_batches(dataset, batch_size=args.batch_size, device=args.device)
running_loss = 0.0
running_acc = 0.0
clf.eval()
for batch_index, batch_dict in enumerate(batch_gen):
    # Compute output
    y_pred = clf(inputs=batch_dict["features"])
    # Compute loss
    loss = sequence_loss(y_pred, batch_dict["labels"], mask_index)
    loss_batch = loss.item()
    running_loss += (loss_batch-running_loss) / (batch_index+1)
    # Compute accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict["labels"], mask_index)
    running_acc += (acc_batch-running_acc) / (batch_index+1)
        
train_state["test_loss"] = running_loss
train_state["test_acc"] = running_acc

In [21]:
print(f"Test accuracy: {train_state['test_acc']}")
print(f"Test loss    : {train_state['test_loss']}")

Test accuracy: 20.013228387600414
Test loss    : 2.373884081840515


### Comparison of the RNN and the GRU classifiers

Unfortunate I was unable to delve too deep into the results of the two classifiers, but in this case, the GRU classifier performed somewhat better.

That isn't say that it performed well, which it certainly didn't (accuracy of ~21%), but its better than the RNNs ~3%.

Similary to the comparison done in Task 1, I didn't experiment much with the hyperparameters, and there is likely much room for improvement. Same with the arcitechture of the RNN/GRU model. As far as I could tell, there was no notable improvement with increasing the number of layers in the RNN/GRU.