# Text Classification Task (IMDB Reviews)

In this example we will demonstrate how to train and evaluate a Text Classification model on the IMDB Reviews dataset using PyTorchWrapper.

#### Downloading Data
First of all we download and extract the data.

In [None]:
# Create the data folder, download the IMDB archive into it and extract it there (tar's file listing is discarded).
! mkdir -p data/
! wget -P data/ http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
! tar xvzf data/aclImdb_v1.tar.gz -C data/ > /dev/null


#### Import Statements

In [None]:
import torch
import os
import random
import math

from torch import nn
from collections import Counter
from glob import glob
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.sampler import SequentialSampler, SubsetRandomSampler
from tqdm.auto import tqdm
from pytorch_wrapper import modules, System
from pytorch_wrapper import evaluators as evaluators
from pytorch_wrapper.loss_wrappers import GenericPointWiseLossWrapper
from pytorch_wrapper.training_callbacks import EarlyStoppingCriterionCallback
from pytorch_wrapper.samplers import SubsetSequentialSampler


#### Dataset Definition
In this example we will see how we can use a custom `collate_fn`. This function will be called internally by the
dataloaders in order to combine a list of examples into a ready-to-use batch. Each individual example will be the output of the `Dataset`'s `__getitem__` method.

In [None]:
class IMDBDataset(Dataset):
    """
    In-memory dataset of IMDB reviews stored as lists of vocabulary indices.

    Reviews are read from the `pos` and `neg` sub-folders of `folder_root`. Each example is returned as
    `(id, (token_indices, length), target)`; `collate_fn` combines a list of such examples into a batch.
    """

    # Reviews with more tokens than this are truncated when a batch is built.
    MAX_LEN = 1000

    def __init__(self, folder_root, w2i):
        """
        Reads, preprocesses and shuffles all the reviews found under `folder_root`.
        :param folder_root: Folder that contains the `pos` and `neg` sub-folders.
        :param w2i: Dictionary that maps a word to its index.
        """
        self.w2i = w2i
        self.ids = []
        self.texts = []
        self.texts_len = []
        self.targets = []

        for label, filename, raw_text in IMDBDataset._read_reviews(folder_root):
            text = self.process_example(raw_text)
            self.ids.append(filename)
            self.texts.append(text)
            self.texts_len.append(len(text))
            self.targets.append(label == 'pos')

        self._shuffle_examples()

    def __getitem__(self, index):
        """
        :param index: Position of the example.
        :return: Tuple `(id, (token_indices, length), target)`.
        """
        return (
            self.ids[index],
            (
                self.texts[index],
                self.texts_len[index]
            ),
            self.targets[index]
        )

    def __len__(self):
        return len(self.ids)

    def _shuffle_examples(self, seed=12345):
        """
        Shuffles the examples with the given seed.
        :param seed: The seed used for shuffling.
        """
        if not self.ids:
            # Nothing to shuffle; unpacking `zip(*[])` below would fail.
            return

        # A dedicated generator yields the same permutation as seeding the module-level one,
        # without resetting the global random state as a side effect.
        rng = random.Random(seed)
        examples = list(zip(self.ids, self.texts, self.texts_len, self.targets))
        rng.shuffle(examples)
        ids, texts, texts_len, targets = zip(*examples)
        self.ids, self.texts, self.texts_len, self.targets = list(ids), list(texts), list(texts_len), list(targets)

    @staticmethod
    def _read_reviews(folder_root):
        """
        Iterates over the raw reviews stored under `folder_root`.
        :param folder_root: Folder that contains the `pos` and `neg` sub-folders.
        :return: Generator of `(label, filename, raw_text)` tuples.
        """
        for label in ['pos', 'neg']:
            # glob returns paths in arbitrary order; sorting keeps the seeded shuffle and the vocabulary
            # indices reproducible across machines.
            for filepath in tqdm(sorted(glob(f'{folder_root}/{label}/*'))):
                # Explicit encoding so that reading does not depend on the platform default.
                with open(filepath, 'r', encoding='utf-8') as fr:
                    raw_text = fr.read()
                yield label, os.path.basename(filepath), raw_text

    @staticmethod
    def preprocess_text(text):
        """
        Preprocess text: drops every character that is neither alphanumeric nor whitespace, collapses
        whitespace runs to single spaces and lowercases the result.
        :param text: Text to be preprocessed.
        :return: Preprocessed text ('' if the text is empty or None).
        """
        if not text:
            return ''
        text = ''.join(ch for ch in text if ch.isspace() or ch.isalnum())
        text = ' '.join(text.split())
        return text.lower()

    @staticmethod
    def collate_fn(batch):
        """
        Function that combines a list of examples into a batch (Called internally by dataloaders).
        :param batch: List of examples as returned by `__getitem__`.
        :return: Dictionary with the keys 'id' (tuple of ids), 'input' ([padded texts, lengths]) and 'target'.
        """
        ids, inputs, targets = zip(*batch)
        token_lists, lengths = zip(*inputs)

        texts = torch.tensor(IMDBDataset.pad_to_max(token_lists, IMDBDataset.MAX_LEN), dtype=torch.long)
        # Texts longer than MAX_LEN were truncated above, so the reported lengths must be capped as well.
        padded_len = texts.size(1)
        texts_len = torch.tensor([min(length, padded_len) for length in lengths], dtype=torch.int)
        targets = torch.tensor(targets, dtype=torch.float)

        return {
            'id': ids,
            'input': [texts, texts_len],
            'target': targets
        }

    def process_example(self, ex):
        """
        Preprocesses a single example.
        :param ex: The text to preprocess.
        :return: A list of indexes that correspond to the tokens of the text ([0] if no token is left).
        """
        tokens = IMDBDataset.preprocess_text(ex).split()
        if not tokens:
            # Keep at least one (padding) index so that no example is empty.
            return [0]

        return self.convert_tokens_to_indices(tokens)

    def convert_tokens_to_indices(self, token_list, unk_token_index=1):
        """
        Converts Token to indices based on a dictionary.
        :param token_list: List of tokens.
        :param unk_token_index: Number with which unknown tokens will be replaced.
        :return: List of indices.
        """
        return [self.w2i.get(t, unk_token_index) for t in token_list]

    @staticmethod
    def pad_to_max(lst, max_len=None, pad_int=0):
        """
        Pads the given list of list of tokens to the maximum length.
        :param lst: List of list of tokens.
        :param max_len: Optional upper bound of the resulting length; longer lists are truncated to it.
        :param pad_int: Value used for padding.
        :return: List of lists that all have the same length.
        """
        if not lst:
            return []

        pad = max(len(seq) for seq in lst)
        if max_len is not None:
            pad = min(max_len, pad)

        # Multiplying by a negative number gives an empty list, so truncated sequences get no padding.
        return [list(seq[:pad]) + [pad_int] * (pad - len(seq)) for seq in lst]

    @staticmethod
    def create_vocab(folder_root, thr=10):
        """
        Creates a vocabulary from a dataset while discarding words that show up less than 'thr' times.
        :param folder_root: The path where the IMDB dataset was extracted.
        :param thr: The threshold.
        :returns: A dictionary that maps a word to its index and the list of words ordered by index
            (index 0 is the padding token and index 1 is the unknown token).
        """
        vocab = Counter()
        for _, _, raw_text in IMDBDataset._read_reviews(folder_root):
            vocab.update(IMDBDataset.preprocess_text(raw_text).split())

        i2w = ['!!PAD!!', '!!UNK!!'] + [word for word, count in vocab.items() if count >= thr]
        w2i = {word: index for index, word in enumerate(i2w)}
        return w2i, i2w


#### Model Definition
In this example we will use a bidirectional GRU with deep self-attention.

In [None]:
class Model(nn.Module):
    """
    Text classifier: embeddings -> 2-layer bidirectional GRU -> softmax self-attention -> MLP with a single
    output per example (no output activation is applied inside the model).
    """

    def __init__(self, vocab_size, embeddings_size=128, rnn_hidden_size=128):
        """
        :param vocab_size: Number of rows of the embedding matrix.
        :param embeddings_size: Size of each embedding vector.
        :param rnn_hidden_size: Hidden size of each direction of the GRU.
        """
        super(Model, self).__init__()

        # A bidirectional GRU concatenates the forward and backward states, so the layers that consume its
        # output must expect twice the hidden size.
        rnn_output_size = 2 * rnn_hidden_size

        self.embedding_layer = modules.EmbeddingLayer(
            vocab_size,
            embeddings_size,
            trainable=True,
            padding_idx=0
        )

        self.text_rnn = nn.GRU(
            input_size=embeddings_size,
            hidden_size=rnn_hidden_size,
            num_layers=2,
            bidirectional=True,
            batch_first=True
        )

        # Scores each time step of the GRU output; the scores are consumed by the attention encoder below.
        text_att_mlp = modules.MLP(
            input_size=rnn_output_size,
            num_hidden_layers=1,
            hidden_layer_size=128,
            hidden_activation=nn.ReLU,
            output_size=1,
            output_activation=None
        )

        self.text_att = modules.SoftmaxSelfAttentionEncoder(text_att_mlp)

        self.output_mlp = modules.MLP(
            input_size=rnn_output_size,
            num_hidden_layers=1,
            hidden_layer_size=128,
            hidden_activation=nn.ReLU,
            output_size=1,
            output_activation=None
        )

    def forward(self, text, text_len):
        """
        :param text: Batch of padded token indices (batch first).
        :param text_len: Lengths of the texts, forwarded to the attention encoder.
        :return: Output of the final MLP with its last dimension squeezed.
        """
        text = self.embedding_layer(text)
        # nn.GRU returns (output, h_n); only the per-time-step output is needed.
        text_rnn_out = self.text_rnn(text)[0]
        text_encoding = self.text_att(text_rnn_out, text_len)['output']
        out = self.output_mlp(text_encoding).squeeze(-1)

        return out


#### Training

Next we create the dataset object along with three data loaders (for training, validation, and testing).

In [None]:
# The vocabulary is built from the training split only and is shared by both datasets.
w2i, i2w = IMDBDataset.create_vocab('data/aclImdb/train/')

train_val_dataset = IMDBDataset('data/aclImdb/train/', w2i)
test_dataset = IMDBDataset('data/aclImdb/test/', w2i)

# The dataset shuffles its examples on creation, so the first 10% can be held out for validation.
train_val_dataset_indexes = list(range(len(train_val_dataset)))
eval_size = math.floor(0.1 * len(train_val_dataset))
val_split_indexes = train_val_dataset_indexes[:eval_size]
train_split_indexes = train_val_dataset_indexes[eval_size:]

batch_size = 128
# Settings shared by the three data loaders.
loader_kwargs = {'batch_size': batch_size, 'collate_fn': IMDBDataset.collate_fn}

train_dataloader = DataLoader(train_val_dataset, sampler=SubsetRandomSampler(train_split_indexes), **loader_kwargs)
val_dataloader = DataLoader(train_val_dataset, sampler=SubsetSequentialSampler(val_split_indexes), **loader_kwargs)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), **loader_kwargs)


Then we create the model and we wrap it with a `System` object.

In [None]:
model = Model(len(i2w))

last_activation = nn.Sigmoid()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
system = System(model, last_activation=last_activation, device=device)


Next we train the model on the training set, using the validation set for early stopping.

In [None]:
loss_wrapper = GenericPointWiseLossWrapper(nn.BCEWithLogitsLoss())
evals = {'acc': evaluators.AccuracyEvaluator(), 'auc': evaluators.AUROCEvaluator()}

# Only the parameters that require gradients are handed to the optimizer.
trainable_parameters = (p for p in system.model.parameters() if p.requires_grad)
optimizer = torch.optim.Adam(trainable_parameters)

# Early stopping is driven by the 'acc' evaluator on the 'val' data loader.
early_stopping = EarlyStoppingCriterionCallback(
    patience=3,
    evaluation_data_loader_key='val',
    evaluator_key='acc',
    tmp_best_state_filepath='data/imdb_cur_best.weights'
)

_ = system.train(
    loss_wrapper,
    optimizer,
    train_data_loader=train_dataloader,
    evaluators=evals,
    evaluation_data_loaders={'val': val_dataloader},
    callbacks=[early_stopping]
)


Next we evaluate the model.

In [None]:
# Evaluate on the test set and print the result of every evaluator.
results = system.evaluate(test_dataloader, evals)
for evaluator_result in results.values():
    print(evaluator_result)


We can also use the `predict` method in order to predict for all the examples returned by a `DataLoader`.

In [None]:
# Run the model over every batch of the test data loader.
# NOTE(review): `perform_last_activation=True` presumably applies the `last_activation` (sigmoid) given to
# `System`, so the outputs would be probabilities rather than logits - confirm against the library docs.
predictions = system.predict(test_dataloader, perform_last_activation=True)


In [None]:
# Position of the example to inspect and of its token indices inside the tuple returned by the dataset.
example_id = 3
input_loc = 1
text_loc = 0

# Map the token indices back to words to show the (preprocessed) review next to its prediction.
example_token_indices = test_dataset[example_id][input_loc][text_loc]
example_words = [i2w[token_index] for token_index in example_token_indices]
print(' '.join(example_words))
print(predictions['outputs'][example_id])


Finally we save the model's weights.

In [None]:
# Save the trained model's state to 'data/imdb_final.weights'.
system.save_model_state('data/imdb_final.weights')
