# BERT classifier fine-tuning with PyTorch, HuggingFace, and Catalyst

## 1. Reading data and basic EDA

**The task is to classify articles into Sustainable Development Goals**
<img src="../img/all_sdgs.png">

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv('../data/sdg_classification/train_set_sdg_1_7_8_12_13_toy.csv')
valid_df = pd.read_csv('../data/sdg_classification/val_set_sdg_1_7_8_12_13_toy.csv')
test_df = pd.read_csv('../data/sdg_classification/eval_set_sdg_1_7_8_12_13_curated_journals_toy.csv')

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   eid                      150 non-null    int64 
 1   sdg_id                   150 non-null    int64 
 2   title                    150 non-null    object
 3   keywords                 150 non-null    object
 4   abstract                 150 non-null    object
 5   title_keywords_abstract  150 non-null    object
dtypes: int64(2), object(4)
memory usage: 7.2+ KB


In [4]:
train_df.head(2)

Unnamed: 0,eid,sdg_id,title,keywords,abstract,title_keywords_abstract
0,84895022699,13,GIS-based risk assessment for the Nile Delta c...,GIS Inundation Sea level rise,Sea level changes are caused by several natura...,[TITLE] gis-based risk assessment for the nile...
1,84978997581,1,Ritual well-being: toward a social signaling m...,Costly signaling religion and mental health re...,Religion is positively correlated with subject...,[TITLE] ritual well-being: toward a social sig...


In [5]:
train_df.loc[0, 'title_keywords_abstract']

'[TITLE] gis-based risk assessment for the nile delta coastal zone under different sea level rise scenarios case study: kafr el sheikh governorate, egypt [KEYWORDS] gis inundation sea level rise [ABSTRACT] sea level changes are caused by several natural phenomena, including mainly ocean thermal expansion, glacial melt from greenland and antarctica. it was estimated, in this respect, that global average sea level rose, during the 20th century, by at least 10 cm. this trend is expected to continue and most likely accelerated during the 21st century due to human-induced global warming. global average sea level is expected to rise, by the year 2100, due to global warming between 0.18 and 0.59 cm. such a rise in sea-level will significantly impact coastal areas due to the high concentration of natural and socioeconomic activities and assets located along the coast. the northern coastal zone of the nile delta is generally low land, and is consequently vulnerable to direct and indirect impact

In [6]:
train_df['sdg_id'].value_counts()

13    33
12    31
8     31
1     29
7     26
Name: sdg_id, dtype: int64

In [7]:
valid_df['sdg_id'].value_counts()

13    12
1     12
8     11
12    10
7      5
Name: sdg_id, dtype: int64

In [8]:
test_df['sdg_id'].value_counts()

7     88
8     46
13     9
12     7
Name: sdg_id, dtype: int64

In [9]:
# we have limitation of 512 tokens (for basic implementation)

train_df['title_keywords_abstract'].apply(
    lambda s: len(s.split())).describe()

count    150.000000
mean     203.773333
std       78.735147
min       58.000000
25%      148.000000
50%      191.000000
75%      234.750000
max      564.000000
Name: title_keywords_abstract, dtype: float64

## 2. PyTorch Datasets and DataLoaders

In [10]:
from typing import List, Mapping, Tuple

In [11]:
import logging

In [12]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

In [13]:
class TextClassificationDataset(Dataset):
    """
    Wrapper around Torch Dataset to perform text classification
    """

    def __init__(
        self,
        texts: List[str],
        labels: List[str] = None,
        label_dict: Mapping[str, int] = None,
        max_seq_length: int = 512,
        model_name: str = "distilbert-base-uncased",
    ):
        """
        Args:
            texts (List[str]): a list with texts to classify or to train the
                classifier on
            labels List[str]: a list with classification labels (optional)
            label_dict (dict): a dictionary mapping class names to class ids,
                to be passed to the validation data (optional)
            max_seq_length (int): maximal sequence length in tokens,
                texts will be stripped to this length
            model_name (str): transformer model name, needed to perform
                appropriate tokenization

        """

        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length

        if self.label_dict is None and labels is not None:
            # {'class1': 0, 'class2': 1, 'class3': 2, ...}
            # using this instead of `sklearn.preprocessing.LabelEncoder`
            # no easily handle unknown target values
            self.label_dict = dict(zip(sorted(set(labels)), range(len(set(labels)))))

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.FATAL)

        # special tokens for transformers
        # in the simplest case a [CLS] token is added in the beginning
        # and [SEP] token is added in the end of a piece of text
        # [CLS] <indexes text tokens> [SEP] .. <[PAD]>
        self.sep_vid = self.tokenizer.vocab["[SEP]"]
        self.cls_vid = self.tokenizer.vocab["[CLS]"]
        self.pad_vid = self.tokenizer.vocab["[PAD]"]

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.texts)

    def __getitem__(self, index) -> Mapping[str, torch.Tensor]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """

        # encoding the text
        x = self.texts[index]

        # a dictionary with `input_ids` and `attention_mask` as keys
        output_dict = self.tokenizer.encode_plus(
            x,
            add_special_tokens=True,
            padding="max_length",
            max_length=self.max_seq_length,
            return_tensors="pt",
            truncation=True,
            return_attention_mask=True,
        )

        # for Catalyst, there needs to be a key called features
        output_dict["features"] = output_dict["input_ids"].squeeze(0)
        del output_dict["input_ids"]

        # encoding target
        if self.labels is not None:
            y = self.labels[index]
            y_encoded = torch.Tensor([self.label_dict.get(y, -1)]).long().squeeze(0)
            output_dict["targets"] = y_encoded

        return output_dict

In [14]:
MODEL_NAME = 'distilbert-base-uncased'

In [15]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [16]:
tokenizer

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [32]:
input_text = 'yidude! I enjoy playing football under rain'

In [33]:
output_dict = tokenizer.encode_plus(
            input_text,
            add_special_tokens=True,
            padding="max_length",
            max_length=16,
            return_tensors="pt",
            truncation=True,
            return_attention_mask=True,
        )

In [34]:
output_dict

{'input_ids': tensor([[  101, 12316,  8566,  3207,   999,  1045,  5959,  2652,  2374,  2104,
          4542,   102,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}

In [35]:
voc = tokenizer.get_vocab()
len(voc)

30522

In [36]:
inv_voc = {v: k for (k, v) in voc.items()}

In [37]:
# wordpiece tokenization bert

' '.join([inv_voc[i] for i in output_dict['input_ids'].tolist()[0]])

'[CLS] yi ##du ##de ! i enjoy playing football under rain [SEP] [PAD] [PAD] [PAD] [PAD]'

In [38]:
output_dict['attention_mask'].tolist()[0]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]

In [39]:
train_dataset = TextClassificationDataset(
        texts=train_df['title_keywords_abstract'].values.tolist(),
        labels=train_df['sdg_id'].values,
        max_seq_length=16,
        model_name=MODEL_NAME,
    )

In [40]:
valid_dataset = TextClassificationDataset(
        texts=valid_df['title_keywords_abstract'].values.tolist(),
        labels=valid_df['sdg_id'].values,
        max_seq_length=16,
        model_name=MODEL_NAME,
    )

In [41]:
train_dataset[17]

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'features': tensor([  101,  1031,  2516,  1033, 10808,  3169, 19940,  5549,  1998, 20600,
         2241,  2006,  3612,  2373,  1998,   102]), 'targets': tensor(1)}

In [42]:
train_dataset.label_dict

{1: 0, 7: 1, 8: 2, 12: 3, 13: 4}

In [43]:
train_val_loaders = {
        "train": DataLoader(
            dataset=train_dataset,
            batch_size=32,
            shuffle=True,
        ),
        "valid": DataLoader(
            dataset=valid_dataset,
            batch_size=32,
            shuffle=False,
        ),
    }

In [44]:
train_val_loaders['train']

<torch.utils.data.dataloader.DataLoader at 0x7fe24280c100>

In [45]:
# next(iter(train_val_loaders['train']))

In [46]:
next(iter(train_val_loaders['train']))['features'].size()

torch.Size([32, 16])

## 3. The model

In [47]:
import torch.nn as nn
from transformers import AutoConfig, AutoModel

In [48]:
class BertForSequenceClassification(nn.Module):
    """
    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.
    """

    def __init__(
        self,
        pretrained_model_name: str,
        num_classes: int = None,
        dropout: float = 0.3
    ):
        """
        Args:
            pretrained_model_name (str): HuggingFace model name.
                See transformers/modeling_auto.py
            num_classes (int): the number of class labels
                in the classification task
        """
        super().__init__()

        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_classes
        )

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config)
        self.classifier = nn.Linear(config.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features, attention_mask=None, head_mask=None):
        """Compute class probabilities for the input sequence.

        Args:
            features (torch.Tensor): ids of each token,
                size ([bs, seq_length]
            attention_mask (torch.Tensor): binary tensor, used to select
                tokens which are used to compute attention scores
                in the self-attention heads, size [bs, seq_length]
            head_mask (torch.Tensor): 1.0 in head_mask indicates that
                we keep the head, size: [num_heads]
                or [num_hidden_layers x num_heads]
        Returns:
            PyTorch Tensor with predicted class scores
        """
        assert attention_mask is not None, "attention mask is none"

        # taking BERTModel output
        # see https://huggingface.co/transformers/model_doc/bert.html#transformers.BertModel
        bert_output = self.model(
            input_ids=features, attention_mask=attention_mask, head_mask=head_mask
        )
        # we only need the hidden state here and don't need
        # transformer output, so index 0
        seq_output = bert_output[0]  # (bs, seq_len, dim)
        # mean pooling, i.e. getting average representation of all tokens
        pooled_output = seq_output.mean(axis=1)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        scores = self.classifier(pooled_output)  # (bs, num_classes)

        return scores


## 4. Training

## 5. Evaluating predictions