# BERT classifier fine-tuning with PyTorch, HuggingFace, and Catalyst

## 1. Reading data and basic EDA

**The task is to classify articles into Sustainable Development Goals**
<img src="../img/all_sdgs.png">

In [None]:
import numpy as np
import pandas as pd

In [None]:
train_df = pd.read_csv('../data/sdg_classification/train_set_sdg_1_7_8_12_13_toy.csv')
valid_df = pd.read_csv('../data/sdg_classification/val_set_sdg_1_7_8_12_13_toy.csv')
test_df = pd.read_csv('../data/sdg_classification/eval_set_sdg_1_7_8_12_13_curated_journals_toy.csv')

In [None]:
train_df.info()

In [None]:
train_df.head(2)

In [None]:
train_df.loc[0, 'title_keywords_abstract']

In [None]:
train_df['sdg_id'].value_counts()

In [None]:
valid_df['sdg_id'].value_counts()

In [None]:
test_df['sdg_id'].value_counts()

In [None]:
# the model accepts at most 512 tokens (for the basic implementation);
# here we count whitespace-separated words, which is only a rough proxy
# for the number of tokens the tokenizer will actually produce

train_df['title_keywords_abstract'].apply(
    lambda s: len(s.split())).describe()

## 2. PyTorch Datasets and DataLoaders

In [None]:
from typing import List, Mapping, Tuple

In [None]:
import logging

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

In [None]:
class TextClassificationDataset(Dataset):
    """
    Torch Dataset that tokenizes raw texts for transformer-based text
    classification and, when labels are given, encodes them as class ids.
    """

    def __init__(
        self,
        texts: List[str],
        labels: List[str] = None,
        label_dict: Mapping[str, int] = None,
        max_seq_length: int = 512,
        model_name: str = "distilbert-base-uncased",
    ):
        """
        Args:
            texts (List[str]): a list with texts to classify or to train the
                classifier on
            labels List[str]: a list with classification labels (optional)
            label_dict (dict): a dictionary mapping class names to class ids,
                to be passed to the validation data (optional); when omitted
                and ``labels`` is given, it is built from the sorted unique
                labels
            max_seq_length (int): maximal sequence length in tokens,
                texts will be truncated/padded to this length
            model_name (str): transformer model name, needed to perform
                appropriate tokenization

        """

        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length

        if self.label_dict is None and labels is not None:
            # {'class1': 0, 'class2': 1, 'class3': 2, ...}
            # using this instead of `sklearn.preprocessing.LabelEncoder`
            # to easily handle unknown target values
            self.label_dict = {
                label: idx for idx, label in enumerate(sorted(set(labels)))
            }

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.FATAL)

        # special tokens for transformers
        # in the simplest case a [CLS] token is added in the beginning
        # and [SEP] token is added in the end of a piece of text
        # [CLS] <indexes text tokens> [SEP] .. <[PAD]>
        # ids are taken from the tokenizer's own special-token attributes so
        # that models whose vocabulary spells these tokens differently
        # do not fail with a KeyError
        self.sep_vid = self.tokenizer.sep_token_id
        self.cls_vid = self.tokenizer.cls_token_id
        self.pad_vid = self.tokenizer.pad_token_id

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.texts)

    def __getitem__(self, index) -> Mapping[str, torch.Tensor]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Mapping with ``features`` (token ids) and ``attention_mask``,
            each of size [max_seq_length], plus a scalar ``targets`` tensor
            when labels are available. Labels missing from ``label_dict``
            are encoded as -1.
        """

        # encoding the text
        text = self.texts[index]

        # a dictionary with `input_ids` and `attention_mask` as keys
        output_dict = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding="max_length",
            max_length=self.max_seq_length,
            return_tensors="pt",
            truncation=True,
            return_attention_mask=True,
        )

        # `return_tensors="pt"` yields tensors with a leading batch dimension
        # of 1; drop it from every tensor (not only `input_ids`) so that the
        # DataLoader collates all of them to [bs, seq_length]
        for key in list(output_dict.keys()):
            value = output_dict[key]
            if isinstance(value, torch.Tensor):
                output_dict[key] = value.squeeze(0)

        # for Catalyst, there needs to be a key called features
        output_dict["features"] = output_dict["input_ids"]
        del output_dict["input_ids"]

        # encoding target
        if self.labels is not None:
            # NOTE(review): -1 for unknown labels is not the default
            # `ignore_index` of torch.nn.CrossEntropyLoss (-100) - confirm
            # how unknown targets are meant to be handled downstream
            label_id = self.label_dict.get(self.labels[index], -1)
            output_dict["targets"] = torch.tensor(label_id, dtype=torch.long)

        return output_dict

In [None]:
MODEL_NAME = 'distilbert-base-uncased'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
tokenizer

In [None]:
input_text = 'yidude! I enjoy playing football under rain'

In [None]:
output_dict = tokenizer.encode_plus(
            input_text,
            add_special_tokens=True,
            padding="max_length",
            max_length=16,
            return_tensors="pt",
            truncation=True,
            return_attention_mask=True,
        )

In [None]:
output_dict

In [None]:
voc = tokenizer.get_vocab()
len(voc)

In [None]:
inv_voc = {v: k for (k, v) in voc.items()}

In [None]:
# BERT-style WordPiece tokenization: map the ids back to their tokens to
# see how the text was split (special and padding tokens are included)

' '.join([inv_voc[i] for i in output_dict['input_ids'].tolist()[0]])

In [None]:
output_dict['attention_mask'].tolist()[0]

In [None]:
train_dataset = TextClassificationDataset(
        texts=train_df['title_keywords_abstract'].values.tolist(),
        labels=train_df['sdg_id'].values,
        max_seq_length=16,
        model_name=MODEL_NAME,
    )

In [None]:
valid_dataset = TextClassificationDataset(
        texts=valid_df['title_keywords_abstract'].values.tolist(),
        labels=valid_df['sdg_id'].values,
        # reuse the training label encoding so that class ids agree between
        # the splits even if validation lacks (or adds) some class
        label_dict=train_dataset.label_dict,
        max_seq_length=16,
        model_name=MODEL_NAME,
    )

In [None]:
train_dataset[17]

In [None]:
train_dataset.label_dict

In [None]:
train_val_loaders = {
        "train": DataLoader(
            dataset=train_dataset,
            batch_size=32,
            shuffle=True,
        ),
        "valid": DataLoader(
            dataset=valid_dataset,
            batch_size=32,
            shuffle=False,
        ),
    }

In [None]:
train_val_loaders['train']

In [None]:
# next(iter(train_val_loaders['train']))

In [None]:
next(iter(train_val_loaders['train']))['features'].size()

## 3. The model

In [None]:
import torch.nn as nn
from transformers import AutoConfig, AutoModel

In [None]:
class BertForSequenceClassification(nn.Module):
    """
    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.

    A pretrained transformer encoder followed by mean pooling over the
    non-padding tokens, dropout and a linear classification head.
    """

    def __init__(
        self,
        pretrained_model_name: str,
        num_classes: int = None,
        dropout: float = 0.3
    ):
        """
        Args:
            pretrained_model_name (str): HuggingFace model name.
                See transformers/modeling_auto.py
            num_classes (int): the number of class labels
                in the classification task
            dropout (float): dropout probability applied to the pooled
                representation before the classification head
        """
        super().__init__()

        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_classes
        )

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config)
        self.classifier = nn.Linear(config.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features, attention_mask=None, head_mask=None):
        """Compute unnormalized class scores (logits) for the input sequence.

        Args:
            features (torch.Tensor): ids of each token,
                size [bs, seq_length]
            attention_mask (torch.Tensor): binary tensor, used to select
                tokens which are used to compute attention scores
                in the self-attention heads, size [bs, seq_length]
                (a [bs, 1, seq_length] mask is accepted as well)
            head_mask (torch.Tensor): 1.0 in head_mask indicates that
                we keep the head, size: [num_heads]
                or [num_hidden_layers x num_heads]
        Returns:
            PyTorch Tensor with predicted class scores,
            size [bs, num_classes]
        Raises:
            ValueError: if ``attention_mask`` is not provided
        """
        # explicit check instead of `assert`, which is stripped under `-O`
        if attention_mask is None:
            raise ValueError("attention mask is none")

        # taking BERTModel output
        # see https://huggingface.co/transformers/model_doc/bert.html#transformers.BertModel
        bert_output = self.model(
            input_ids=features, attention_mask=attention_mask, head_mask=head_mask
        )
        # we only need the hidden state here and don't need
        # transformer output, so index 0
        seq_output = bert_output[0]  # (bs, seq_len, dim)

        # masked mean pooling: average only over real tokens so that padding
        # positions do not dilute the representation of short texts
        batch_size, seq_len = seq_output.shape[0], seq_output.shape[1]
        token_mask = attention_mask.reshape(batch_size, seq_len, 1).to(
            seq_output.dtype
        )  # (bs, seq_len, 1)
        summed = (seq_output * token_mask).sum(dim=1)  # (bs, dim)
        # clamp guards against division by zero for an all-padding row
        n_tokens = token_mask.sum(dim=1).clamp(min=1.0)  # (bs, 1)
        pooled_output = summed / n_tokens  # (bs, dim)

        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        scores = self.classifier(pooled_output)  # (bs, num_classes)

        return scores


In [None]:
MODEL_NAME

In [None]:
config = AutoConfig.from_pretrained(
            MODEL_NAME, num_labels=5
        )

model = AutoModel.from_pretrained(MODEL_NAME, config=config)

In [None]:
config

In [None]:
model

In [None]:
# model.forward?

In [None]:
mini_batch = next(iter(train_val_loaders['train']))

In [None]:
mini_batch.keys()

In [None]:
bert_output = model(
    input_ids=mini_batch['features'],
    attention_mask=mini_batch['attention_mask'],
)

In [None]:
type(bert_output)

In [None]:
bert_output[0].size()
# [batch_size x seq_len x emb_dim]

In [None]:
bert_output2 = model(
    input_ids=mini_batch['features'],
    attention_mask=mini_batch['attention_mask'],
    output_hidden_states=True,
    return_dict=True
)

In [None]:
bert_output2.keys()

In [None]:
bert_output2['last_hidden_state'].size()

In [None]:
# `hidden_states` holds the embedding output plus one tensor per transformer
# layer (num_layers + 1 entries); it is not the number of attention heads
len(bert_output2['hidden_states'])

In [None]:
bert_output2['hidden_states'][0].size()

In [None]:
torch.allclose(bert_output2['hidden_states'][-1], bert_output[0])

In [None]:
seq_output = bert_output[0]  
# [batch_size x seq_len x emb_dim]

In [None]:
seq_output.size()

In [None]:
# seq_output[:, 0, :].size() # [CLS-token]

In [None]:
# size the demo head from the loaded config instead of hard-coding 768 and 5,
# mirroring what BertForSequenceClassification does internally
classifier = nn.Linear(config.hidden_size, config.num_labels)
dropout = nn.Dropout(0.3)

In [None]:
# mean pooling, i.e. averaging the representations of all token positions
# open question: which works better, the [CLS] token or the average of all tokens?
pooled_output = seq_output.mean(axis=1)  # [batch_size, emb_dim]
pooled_output = dropout(pooled_output)  # [batch_size, emb_dim]
scores = classifier(pooled_output)  # [batch_size, num_classes]

In [None]:
scores.size()

In [None]:
# or we can use [CLS-token]
# classifier(dropout(seq_output[:, 0, :])).size()

In [None]:
clf_model = BertForSequenceClassification(
    pretrained_model_name=MODEL_NAME,
    num_classes=5
)

In [None]:
mini_batch = next(iter(train_val_loaders['train']))

In [None]:
clf_model(
    features=mini_batch['features'],
    attention_mask=mini_batch['attention_mask']
)

## 4. Training

In [None]:
import torch
import transformers
import catalyst

from catalyst.dl import SupervisedRunner
from catalyst.dl.callbacks import (
    AccuracyCallback,
    CheckpointCallback,
    InferCallback,
    OptimizerCallback,
)
from catalyst.utils import prepare_cudnn, set_global_seed

In [None]:
torch.__version__

In [None]:
transformers.__version__

In [None]:
catalyst.__version__

In [None]:
# specify criterion for the multi-class classification task, optimizer and scheduler
criterion = torch.nn.CrossEntropyLoss()
# the optimizer must own the parameters of `clf_model`, the classifier that
# is passed to `runner.train`; `model` is only the bare encoder used in the
# exploration cells, and optimizing it would leave `clf_model` untrained
optimizer = torch.optim.Adam(
    clf_model.parameters(), lr=3e-5
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)


In [None]:
set_global_seed(42)
prepare_cudnn(deterministic=True)

In [None]:
MAX_SEQ_LENGTH = 512
BATCH_SIZE = 2

In [None]:
train_dataset = TextClassificationDataset(
        texts=train_df['title_keywords_abstract'].values.tolist(),
        labels=train_df['sdg_id'].values,
        max_seq_length=MAX_SEQ_LENGTH,
        model_name=MODEL_NAME,
    )

In [None]:
valid_dataset = TextClassificationDataset(
        texts=valid_df['title_keywords_abstract'].values.tolist(),
        labels=valid_df['sdg_id'].values,
        # reuse the training label encoding so that class ids agree between
        # the splits even if validation lacks (or adds) some class
        label_dict=train_dataset.label_dict,
        max_seq_length=MAX_SEQ_LENGTH,
        model_name=MODEL_NAME,
    )

In [None]:
test_dataset = TextClassificationDataset(
        texts=test_df['title_keywords_abstract'].values.tolist(),
        max_seq_length=MAX_SEQ_LENGTH,
        model_name=MODEL_NAME,
    )

In [None]:
train_val_loaders = {
        "train": DataLoader(
            dataset=train_dataset,
            batch_size=BATCH_SIZE,
            shuffle=True,
        ),
        "valid": DataLoader(
            dataset=valid_dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
        ),
    }

In [None]:
test_loaders = {
        "test": DataLoader(
            dataset=test_dataset,
            batch_size=BATCH_SIZE,
            # keep the original order: predictions are later compared
            # row-by-row against `test_df['sdg_id']`
            shuffle=False,
        )
    }

In [None]:
runner = SupervisedRunner(input_key=("features", "attention_mask"))

In [None]:
!rm -r ../logdir/

In [None]:
# finally, training the model with Catalyst
# logs and checkpoints are written under `logdir`

runner.train(
    model=clf_model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=train_val_loaders,
    callbacks=[
        # accuracy is computed for the 5 SDG classes of this toy dataset
        AccuracyCallback(num_classes=5),
        # presumably steps the optimizer once per 4 accumulated batches,
        # i.e. an effective batch of 4 * BATCH_SIZE - confirm in Catalyst docs
        OptimizerCallback(accumulation_steps=4),
    ],
    logdir='../logdir/',
    num_epochs=1,
    verbose=True,
)

In [None]:
# and running inference
torch.cuda.empty_cache()
runner.infer(
    model=clf_model,
    loaders=test_loaders,
    callbacks=[
        CheckpointCallback(
            resume="../logdir/checkpoints/best.pth"
        ),
        InferCallback(),
    ],
    verbose=True,
)

In [None]:
# lastly, saving predicted scores for the test set
# NOTE(review): assumes `runner.callbacks[0]` is the callback holding the
# inference predictions and that `../data/output/` already exists - confirm
predicted_scores = runner.callbacks[0].predictions["logits"]
np.savetxt(X=predicted_scores, fname='../data/output/pred.txt')

## 5. Evaluating predictions

In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report

In [None]:
PRESENT_LABELS = sorted(train_df['sdg_id'].unique())

In [None]:
# read back the scores from the same path they were saved to
# (the original '.../data/output/pred.txt' had a stray extra dot)
test_pred_scores = np.loadtxt('../data/output/pred.txt')
# the argmax column index is a position in the sorted label list
test_pred = [PRESENT_LABELS[i] for i in test_pred_scores.argmax(axis=1)]

In [None]:
f1_score(y_true=test_df['sdg_id'], y_pred=test_pred, average='macro'), \
f1_score(y_true=test_df['sdg_id'], y_pred=test_pred, average='micro')

In [None]:
test_clf_report = classification_report(y_true=test_df['sdg_id'],
                                        y_pred=test_pred,
                                        output_dict=True
                                       )

In [None]:
def highlight_greaterthan(s, threshold, column, color='yellow'):
    """Return per-cell CSS that colours a whole row when one value is large.

    Args:
        s (pd.Series): one row of the frame being styled
        threshold: minimal value of ``s[column]`` that triggers highlighting
        column: label of the entry in ``s`` to compare with ``threshold``
        color (str): CSS colour used as the background
    Returns:
        list of CSS strings, one per entry of ``s``: all of them set the
        background colour if ``s[column] >= threshold``, otherwise all are
        empty
    """
    flags = pd.Series(data=False, index=s.index)
    flags[column] = s.loc[column] >= threshold
    # the decision is row-wide, so every cell receives the same style
    css = f'background-color: {color}' if flags.any() else ''
    return [css for _ in range(len(flags))]

In [None]:
# turn the report dict into a frame with one row per class/aggregate,
# renaming class columns from '<label>' to 'sdg_<label>' before transposing
test_clf_report_df = pd.DataFrame(test_clf_report).rename(columns={str(i): f'sdg_{i}'
                                                                   for i in PRESENT_LABELS}).transpose()
test_clf_report_df['support'] = test_clf_report_df['support'].astype('int')

# highlight in green every row whose f1-score is at least 0.7;
# note that the name is rebound to the Styler returned by `.style.apply`
test_clf_report_df = test_clf_report_df.style.apply(highlight_greaterthan,
                                                    threshold=0.7,
                                                    column='f1-score',
                                                    color='green',
                                                    axis=1)