# Sequence Classification Task (POS Tagging)

In this tutorial we will see how we can use PyTorchWrapper to tackle the task of part-of-speech (POS) tagging on the
Penn Treebank dataset.

#### Additional libraries

First of all we need to install the `nltk` library in order to download the data.

In [None]:
# Notebook shell escape: installs the `nltk` package that the next cell imports.
! pip install nltk


#### Downloading Data
Next we download the data.

In [None]:
import nltk

# Fetch the Treebank sample and the universal tagset resource used when the tagged sentences are loaded.
for resource in ('treebank', 'universal_tagset'):
    nltk.download(resource)


#### Import Statements

In [None]:
import numpy as np
import torch
import random
import math

from tqdm.auto import tqdm
from nltk.corpus import treebank
from torch import nn
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from pytorch_wrapper import modules, System
from pytorch_wrapper import evaluators as evaluators
from pytorch_wrapper.samplers import SubsetSequentialSampler
from pytorch_wrapper.loss_wrappers import TokenLabelingGenericPointWiseLossWrapper
from pytorch_wrapper.training_callbacks import EarlyStoppingCriterionCallback


#### Dataset Definition

In [None]:
class TreeBankDataset(Dataset):
    """
    In-memory dataset of tagged sentences for token-level labeling.

    Each example is stored as a list of token indices, the sentence length and a list of label indices. The examples
    are shuffled once, with a fixed seed, when the dataset is created.
    """

    def __init__(self, sentences, w2i, l2i):
        """
        :param sentences: Iterable of sentences, each one a non-empty sequence of (token, label) pairs.
        :param w2i: Dict that maps tokens to indices.
        :param l2i: Dict that maps labels to indices.
        """

        self.ids = []
        self.texts = []
        self.texts_len = []
        self.targets = []

        for i, ex in enumerate(tqdm(sentences)):
            self.ids.append(i)
            tokens, labels = zip(*ex)
            self.texts.append(TreeBankDataset.convert_tokens_to_indices(tokens, w2i))
            self.texts_len.append(len(tokens))
            self.targets.append(TreeBankDataset.convert_tokens_to_indices(labels, l2i))

        self._shuffle_examples()

    def __getitem__(self, index):
        """
        :param index: Position of the example (after shuffling).
        :return: Tuple (id, (token indices, length), label indices).
        """

        return (
            self.ids[index],
            (
                self.texts[index],
                self.texts_len[index]
            ),
            self.targets[index]
        )

    def __len__(self):
        return len(self.ids)

    def _shuffle_examples(self, seed=12345):
        """
        Shuffles the examples with the given seed. Note that this seeds the global `random` generator.
        After shuffling, the example attributes are tuples instead of lists.

        :param seed: The seed used for shuffling.
        """
        random.seed(seed)
        examples = list(zip(self.ids, self.texts, self.texts_len, self.targets))
        if not examples:
            # Nothing to shuffle; unpacking zip(*[]) into four names would raise a ValueError.
            return
        random.shuffle(examples)
        self.ids, self.texts, self.texts_len, self.targets = zip(*examples)

    @staticmethod
    def collate_fn(batch):
        """
        Function that combines a list of examples in order to a batch. Called internally
        by dataloaders.

        :param batch: Non-empty list of examples as returned by `__getitem__`.
        :return: Dict with the example ids ('id'), the padded token indices together with the lengths ('input') and
            the padded label indices ('target').
        """
        batch_zipped = list(zip(*batch))
        input_zipped = list(zip(*batch_zipped[1]))
        ids = batch_zipped[0]
        texts = torch.tensor(TreeBankDataset.pad_to_max(input_zipped[0]), dtype=torch.long)
        texts_len = torch.tensor(input_zipped[1], dtype=torch.int)
        targets = torch.tensor(TreeBankDataset.pad_to_max(batch_zipped[2]), dtype=torch.long)

        return {
            'id': ids,
            'input': [texts, texts_len],
            'target': targets
        }

    @staticmethod
    def convert_tokens_to_indices(token_list, t2i, unk_token_index=1):
        """
        Maps every token to its index, falling back to `unk_token_index` for tokens missing from `t2i`.

        :param token_list: Iterable of tokens.
        :param t2i: Dict that maps tokens to indices.
        :param unk_token_index: Index used for unknown tokens.
        :return: List of indices.
        """
        return [t2i.get(t, unk_token_index) for t in token_list]

    @staticmethod
    def pad_to_max(lst, pad_int=0):
        """
        Right-pads every sequence with `pad_int` up to the length of the longest one.

        :param lst: Sequence of sequences of ints (may be empty).
        :param pad_int: Value used for padding.
        :return: List of equally sized lists.
        """
        max_len = max(map(len, lst), default=0)
        return [list(seq) + [pad_int] * (max_len - len(seq)) for seq in lst]

    @staticmethod
    def create_vocab(sentences):
        """
        Builds the token and label vocabularies.

        :param sentences: Iterable of sentences, each one a non-empty sequence of (token, label) pairs.
        :return: Tuple (w2i, i2w, l2i, i2l). Index 0 of i2w is the padding token and index 1 the unknown token.
        """
        vocab = set()
        labels = set()
        for s in tqdm(sentences):
            s_tokens, s_labels = zip(*s)
            vocab.update(s_tokens)
            labels.update(s_labels)

        # Sorting makes the indices reproducible. Set iteration order of strings can change between interpreter
        # runs, which would make previously saved weights incompatible with a rebuilt vocabulary.
        i2w = ['!!PAD!!', '!!UNK!!'] + sorted(vocab)
        w2i = {w: i for i, w in enumerate(i2w)}

        i2l = sorted(labels)
        l2i = {label: i for i, label in enumerate(i2l)}

        return w2i, i2w, l2i, i2l


#### Model Definition
In this example we will use a bidirectional GRU.

In [None]:
class Model(nn.Module):
    """
    POS tagger made of an embedding layer, a 2-layer bidirectional GRU and an MLP that produces one score per label
    for every token position.
    """

    def __init__(self, vocab_size, output_size):
        """
        :param vocab_size: Number of entries of the embedding layer (index 0 is used as the padding index).
        :param output_size: Number of labels.
        """
        super(Model, self).__init__()

        embeddings_size = 128
        rnn_hidden_size = 128

        self.embedding_layer = modules.EmbeddingLayer(
            vocab_size,
            embeddings_size,
            trainable=True,
            padding_idx=0
        )

        self.text_rnn = nn.GRU(
            input_size=embeddings_size,
            hidden_size=rnn_hidden_size,
            num_layers=2,
            bidirectional=True,
            batch_first=True
        )

        self.output_mlp = modules.MLP(
            # A bidirectional GRU concatenates the forward and the backward hidden states.
            input_size=2 * rnn_hidden_size,
            num_hidden_layers=1,
            hidden_layer_size=128,
            hidden_activation=nn.ReLU,
            output_size=output_size,
            output_activation=None
        )

    def forward(self, text, text_len):
        """
        :param text: Padded token indices; assumed to be of shape (batch, max_len) as built by the collate function.
        :param text_len: Number of real (non-padding) tokens of every example; every length must be at least 1.
        :return: Label scores for every position; presumably of shape (batch, max_len, output_size).
        """
        max_len = text.size(1)
        text = self.embedding_layer(text)

        # Pack the batch so that the GRU never steps over padding. Without packing, the backward direction reads the
        # padding positions before the real tokens, which makes the output of a sentence depend on how much padding
        # its batch happens to contain. The lengths must live on the CPU as int64.
        lengths = torch.as_tensor(text_len, dtype=torch.int64, device='cpu')
        packed_text = nn.utils.rnn.pack_padded_sequence(text, lengths, batch_first=True, enforce_sorted=False)
        packed_rnn_out = self.text_rnn(packed_text)[0]

        # `total_length` restores the original padded length so the output lines up with the padded targets.
        text_rnn_out = nn.utils.rnn.pad_packed_sequence(packed_rnn_out, batch_first=True, total_length=max_len)[0]
        out = self.output_mlp(text_rnn_out)

        return out


#### Training

Next we create the dataset object along with three data loaders (for training, validation, and testing).

In [None]:
# Load the tagged sentences (universal tagset), build the vocabularies and index the whole corpus.
sentences = treebank.tagged_sents(tagset='universal')
w2i, i2w, l2i, i2l = TreeBankDataset.create_vocab(sentences)
dataset = TreeBankDataset(sentences, w2i, l2i)

# The dataset shuffles its examples on creation, so contiguous index ranges are used as splits:
# first 10% -> test, next 10% -> validation, the rest -> training.
eval_size = math.floor(0.1 * len(dataset))
dataset_indicis = list(range(len(dataset)))
test_split_indicis = dataset_indicis[:eval_size]
val_split_indicis = dataset_indicis[eval_size:2 * eval_size]
train_split_indicis = dataset_indicis[2 * eval_size:]

# Settings shared by the three loaders.
loader_kwargs = {'batch_size': 128, 'collate_fn': TreeBankDataset.collate_fn}

# Training batches are drawn in random order; validation and test keep the order of their index lists.
train_dataloader = DataLoader(dataset, sampler=SubsetRandomSampler(train_split_indicis), **loader_kwargs)
val_dataloader = DataLoader(dataset, sampler=SubsetSequentialSampler(val_split_indicis), **loader_kwargs)
test_dataloader = DataLoader(dataset, sampler=SubsetSequentialSampler(test_split_indicis), **loader_kwargs)


Then we create the model and we wrap it with a System object.

In [None]:
model = Model(len(i2w), len(i2l))

# Softmax over the last dimension is handed to System as the model's last activation.
last_activation = nn.Softmax(dim=-1)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
system = System(model, last_activation=last_activation, device=device)

Next we train the model on the training set, using the validation set for early stopping. PyTorchWrapper provides
`pytorch_wrapper.loss_wrappers.TokenLabelingGenericPointWiseLossWrapper`, which wraps a native pointwise loss, and
`pytorch_wrapper.evaluators.TokenLabelingEvaluatorWrapper`, which wraps an evaluator. These two classes make sure that
labels that correspond to padding tokens are ignored. For this reason they need the `batch_input_sequence_length_idx`
argument that points to the position of the input list where the length of each example of the batch resides.

In [None]:
import os

# Index 1 of the 'input' list built by the collate function holds the sequence lengths.
loss_wrapper = TokenLabelingGenericPointWiseLossWrapper(
    loss=nn.CrossEntropyLoss(),
    batch_input_sequence_length_idx=1
)

# Macro-averaged precision, recall and F1, each wrapped with the same sequence-length index as the loss.
evals = {

    'prec': evaluators.TokenLabelingEvaluatorWrapper(
        evaluators.MultiClassPrecisionEvaluator(average='macro'),
        batch_input_sequence_length_idx=1
    ),

    'rec': evaluators.TokenLabelingEvaluatorWrapper(
        evaluators.MultiClassRecallEvaluator(average='macro'),
        batch_input_sequence_length_idx=1
    ),

    'f1': evaluators.TokenLabelingEvaluatorWrapper(
        evaluators.MultiClassF1Evaluator(average='macro'),
        batch_input_sequence_length_idx=1
    )

}

# Only parameters that require gradients are optimized.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, system.model.parameters()))

# The early-stopping callback is given a file path under 'data/'; make sure that directory exists first.
os.makedirs('data', exist_ok=True)

_ = system.train(
    loss_wrapper,
    optimizer,
    train_data_loader=train_dataloader,
    evaluators=evals,
    evaluation_data_loaders={
        'val': val_dataloader
    },
    callbacks=[
        # Early stopping is driven by the 'f1' evaluator on the 'val' loader, with a patience of 3.
        EarlyStoppingCriterionCallback(
            patience=3,
            evaluation_data_loader_key='val',
            evaluator_key='f1',
            tmp_best_state_filepath='data/pos_tagging_cur_best.weights'
        )
    ]
)


Next we evaluate the model.

In [None]:
# Run the evaluators on the test split and print each result.
results = system.evaluate(test_dataloader, evals)
for evaluator_key in results:
    print(results[evaluator_key])


We can also use the `predict` method in order to predict for all the examples returned by a `DataLoader`.

In [None]:
# Predict for every example yielded by the test loader. `perform_last_activation=True` presumably applies the
# Softmax that was passed to System as `last_activation` — confirm against the System documentation.
predictions = system.predict(test_dataloader, perform_last_activation=True)


In [None]:
example_id = 50
input_loc = 1  # position of the input tuple inside a dataset example
text_loc = 0   # position of the token indices inside the input tuple

# The test loader uses a sequential sampler over `test_split_indicis`, so prediction number `example_id` is
# matched with the dataset example at `test_split_indicis[example_id]`.
example = dataset[test_split_indicis[example_id]]
token_indices = example[input_loc][text_loc]
tokens = [i2w[x] for x in token_indices]

# Keep only the scores of the real tokens (drop the padded positions) and pick the best label for each one.
example_scores = predictions['outputs'][example_id][:len(tokens)]
predicted_labes = [i2l[np.argmax(scores)] for scores in example_scores]


In [None]:
# Show every token of the example next to its predicted label.
print(list(zip(tokens, predicted_labes)))


Finally we save the model's weights.

In [None]:
import os

# Make sure the target directory exists, then save the model's state.
os.makedirs('data', exist_ok=True)
system.save_model_state('data/pos_tagging_final.weights')
