This notebook follows these tutorials:
* [Building text classifier with Differential Privacy](https://github.com/pytorch/opacus/blob/main/tutorials/building_text_classifier.ipynb)
* [Fine-tuning with custom datasets](https://huggingface.co/transformers/v3.4.0/custom_datasets.html#seq-imdb)

# Initial Setup
https://huggingface.co/docs/transformers/training

## Install

In [1]:
!pip install datasets
import datasets

[0m

## Import

In [2]:
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
import torch
from torch.utils.data import DataLoader

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import gc

pd.set_option('display.max_columns', None)

In [3]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

/kaggle/input/tokenize-social-bias-using-bert/__results__.html
/kaggle/input/tokenize-social-bias-using-bert/validation.pkl
/kaggle/input/tokenize-social-bias-using-bert/train.pkl
/kaggle/input/tokenize-social-bias-using-bert/test.pkl
/kaggle/input/tokenize-social-bias-using-bert/validation.csv
/kaggle/input/tokenize-social-bias-using-bert/__notebook__.ipynb
/kaggle/input/tokenize-social-bias-using-bert/__output__.json
/kaggle/input/tokenize-social-bias-using-bert/train.csv
/kaggle/input/tokenize-social-bias-using-bert/test.csv
/kaggle/input/tokenize-social-bias-using-bert/custom.css


## Config

In [4]:
from dataclasses import dataclass

@dataclass
class Config:
    """Experiment configuration shared by the whole notebook.

    Every setting is an annotated dataclass field with a default value, so it
    can still be read straight from the class (``Config.seed``) and an instance
    with overrides can be created when needed (``Config(batch_size=32)``).
    Without the annotations ``@dataclass`` would not register any field.
    """

    # train config
    model_name: str = 'bert-base-uncased'
    batch_size: int = 64
    learning_rate: float = 1e-4
    epochs: int = 20
    num_labels: int = 2

    dataset_name: str = 'social_bias_frames'
    text_column: str = 'post'

    # the original id column HITId has been replaced with index because it was a string
    # and torch didn't support the str format
    raw_id_column: str = 'HITId'
    id_column: str = 'index'

    # target in raw dataset is offensiveYN. However, it will be renamed to `labels` here to facilitate training setup
    raw_target_column: str = 'offensiveYN'
    target_column: str = 'labels'

    # whether the data needs to be split into train, test and validation sets
    need_to_split: bool = False
    # test and validation data will each be 50% of this amount
    test_size: float = 0.3
    max_seq_length: int = 128
    seed: int = 2022

## Set seed

In [5]:
import random

def seed_torch(seed=7):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only influences
    # Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one; this call is
    # safe when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-select algorithms, which may differ from
    # run to run; force it off in case it was enabled earlier.
    torch.backends.cudnn.benchmark = False

# Seed all generators with the value recorded in the config.
global_seed = Config.seed
seed_torch(seed=global_seed)

## Get device

In [6]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Load tokenized data

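The tokenized data comes from my [other notebook](https://www.kaggle.com/code/khairulislam/tokenize-jigsaw-comments), where the dataset was tokenized from the [Jigsaw competition](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification) file [all_data.csv](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/data?select=all_data.csv). **Note:** the code below actually loads the output of the `tokenize-social-bias-using-bert` notebook (the `social_bias_frames` dataset named in the config), so this description appears to be left over from the Jigsaw version of this notebook.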

In [7]:
# Directory that holds the pickled, pre-tokenized splits.
root = '/kaggle/input/tokenize-social-bias-using-bert/'
# Column names taken from the config.
text, target = Config.text_column, Config.target_column

In [8]:
import pickle
    
def load_tokenized_split(split_name):
    """Load one pre-tokenized dataset split from ``root``.

    Args:
        split_name: File stem of the split: 'train', 'validation' or 'test'.

    Returns:
        The object unpickled from ``root + split_name + '.pkl'``.
    """
    # NOTE(security): pickle.load can execute arbitrary code while unpickling;
    # only point `root` at files you produced yourself or otherwise trust.
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(root + split_name + '.pkl', 'rb') as input_file:
        return pickle.load(input_file)


train_tokenized = load_tokenized_split('train')
validation_tokenized = load_tokenized_split('validation')
test_tokenized = load_tokenized_split('test')

In [9]:
# Show the feature names and row count of the training split.
print(train_tokenized)

Dataset({
    features: ['index', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 35933
})


In [10]:
# The original id column HITId has been replaced with index because it was a
# string and torch didn't support the str format.
id_column = Config.id_column

# Keep the ids of every split aside ...
train_ids, test_ids, validation_ids = [
    split[id_column]
    for split in (train_tokenized, test_tokenized, validation_tokenized)
]

# ... then drop the id column from the splits themselves.
train_tokenized, test_tokenized, validation_tokenized = [
    split.remove_columns(id_column)
    for split in (train_tokenized, test_tokenized, validation_tokenized)
]

# Training

## Data loader

In [11]:
BATCH_SIZE = Config.batch_size

# One loader per split, all with the same batch size. No `shuffle` argument is
# passed, so the DataLoader default (iterate in dataset order) applies to all
# three, including the training split.
train_dataloader, validation_dataloader, test_dataloader = [
    DataLoader(split, batch_size=BATCH_SIZE)
    for split in (train_tokenized, validation_tokenized, test_tokenized)
]

## Model, hyper-parameters and callbacks

In [12]:
# add the utility script from File->Add utility script
from train_utils import TrainUtil, ModelCheckPoint, EarlyStopping

# Model settings read from the config.
model_name, num_labels = Config.model_name, Config.num_labels

# Helper from the `train_utils` utility script, built from the id column name,
# the target column name and the selected device.
train_util = TrainUtil(Config.id_column, Config.target_column, device)

In [13]:
# Hyper-parameters read from the config.
LEARNING_RATE = Config.learning_rate
EPOCHS = Config.epochs

model = TrainUtil.load_pretrained_model(model_name, num_labels)

# Define optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-8,
)

# Lower the learning rate when the monitored value stops improving.
# https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ReduceLROnPlateau.html
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=1,
    verbose=True,
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Total parameters count: 109483778
Trainable parameters count: 7680002, percent 7.015


In [14]:
# Output directory; the empty string means the current working directory.
result_dir = ''
if result_dir:
    os.makedirs(result_dir, exist_ok=True)

# The best checkpoint is written to this path.
best_model_path = os.path.join(result_dir, 'model.pt')

check_point = ModelCheckPoint(filepath=best_model_path)
early_stopping = EarlyStopping(patience=3, min_delta=0)

## Loop

In [15]:
# Train for at most EPOCHS epochs. After every epoch the model is evaluated on
# the validation split, and the negated validation F1 drives the learning-rate
# scheduler, the checkpoint callback and early stopping.
start_epoch = 1
# load a previous model if there is any
# model, optimizer, lr_scheduler, start_epoch = load_model(model, optimizer, lr_scheduler, device, filepath=best_model_path)
model = model.to(device)

for epoch in range(start_epoch, EPOCHS+1):
    # Run a garbage-collection pass at the start of every epoch.
    gc.collect()
    
    # One pass over the training data, then one evaluation pass over the validation data.
    train_loss, train_result, train_probs = train_util.train(model, train_dataloader, optimizer, epoch)
    val_loss, val_result, val_probs = train_util.evaluate(model, validation_dataloader, epoch, 'Validation')

    print(
      f"Epoch: {epoch} | "
      f"Train loss: {train_loss:.3f} | "
      f"Train result: {train_result} |\n"
      f"Validation loss: {val_loss:.3f} | "
      f"Validation result: {val_result} | "
    )
    
    # The monitored quantity is the negated validation F1, so a higher F1
    # gives a lower "loss" (the logged "Loss improved from inf to -0.784"
    # is this value, not the validation loss).
    loss = -val_result['f1']
    # The scheduler is stepped before check_point is called with it.
    lr_scheduler.step(loss)
    check_point(model, optimizer, lr_scheduler, epoch, loss)
    
    # Stop once early_stopping sets its `early_stop` flag.
    early_stopping(loss)
    if early_stopping.early_stop:
        break
    print()
    # break  # debugging aid: uncommenting this stops after a single epoch

Epoch 1 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 1 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 1 | Train loss: 0.469 | Train result: {'accuracy': 0.7751, 'f1': 0.8088, 'auc': 0.8527} |
Validation loss: 0.531 | Validation result: {'accuracy': 0.7487, 'f1': 0.7845, 'auc': 0.823} | 

Loss improved from inf to -0.784. Saving model.



Epoch 2 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 2 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 2 | Train loss: 0.447 | Train result: {'accuracy': 0.791, 'f1': 0.8184, 'auc': 0.868} |
Validation loss: 0.513 | Validation result: {'accuracy': 0.7549, 'f1': 0.7768, 'auc': 0.8388} | 
Early stopping counter 1 of 3



Epoch 3 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 3 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 3 | Train loss: 0.430 | Train result: {'accuracy': 0.8029, 'f1': 0.8259, 'auc': 0.8793} |
Validation loss: 0.507 | Validation result: {'accuracy': 0.753, 'f1': 0.7712, 'auc': 0.8417} | 
Epoch 00003: reducing learning rate of group 0 to 1.0000e-05.
Early stopping counter 2 of 3



Epoch 4 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 4 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 4 | Train loss: 0.435 | Train result: {'accuracy': 0.8009, 'f1': 0.8215, 'auc': 0.8764} |
Validation loss: 0.470 | Validation result: {'accuracy': 0.7795, 'f1': 0.8054, 'auc': 0.8621} | 

Loss improved from -0.784 to -0.805. Saving model.



Epoch 5 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 5 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 5 | Train loss: 0.426 | Train result: {'accuracy': 0.8029, 'f1': 0.8237, 'auc': 0.8808} |
Validation loss: 0.460 | Validation result: {'accuracy': 0.7885, 'f1': 0.8152, 'auc': 0.8673} | 

Loss improved from -0.805 to -0.815. Saving model.



Epoch 6 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 6 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 6 | Train loss: 0.419 | Train result: {'accuracy': 0.8098, 'f1': 0.8297, 'auc': 0.8853} |
Validation loss: 0.455 | Validation result: {'accuracy': 0.7927, 'f1': 0.82, 'auc': 0.8701} | 

Loss improved from -0.815 to -0.820. Saving model.



Epoch 7 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 7 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 7 | Train loss: 0.413 | Train result: {'accuracy': 0.8107, 'f1': 0.8307, 'auc': 0.8884} |
Validation loss: 0.451 | Validation result: {'accuracy': 0.797, 'f1': 0.8235, 'auc': 0.8722} | 

Loss improved from -0.820 to -0.824. Saving model.



Epoch 8 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 8 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 8 | Train loss: 0.406 | Train result: {'accuracy': 0.8149, 'f1': 0.834, 'auc': 0.8919} |
Validation loss: 0.450 | Validation result: {'accuracy': 0.7949, 'f1': 0.8219, 'auc': 0.8727} | 
Early stopping counter 1 of 3



Epoch 9 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 9 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 9 | Train loss: 0.401 | Train result: {'accuracy': 0.8179, 'f1': 0.8369, 'auc': 0.8954} |
Validation loss: 0.447 | Validation result: {'accuracy': 0.7962, 'f1': 0.8215, 'auc': 0.875} | 
Epoch 00009: reducing learning rate of group 0 to 1.0000e-06.
Early stopping counter 2 of 3



Epoch 10 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 10 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 10 | Train loss: 0.415 | Train result: {'accuracy': 0.8076, 'f1': 0.8257, 'auc': 0.8878} |
Validation loss: 0.439 | Validation result: {'accuracy': 0.7998, 'f1': 0.8316, 'auc': 0.8803} | 

Loss improved from -0.824 to -0.832. Saving model.



Epoch 11 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 11 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 11 | Train loss: 0.411 | Train result: {'accuracy': 0.8096, 'f1': 0.8295, 'auc': 0.8898} |
Validation loss: 0.436 | Validation result: {'accuracy': 0.8043, 'f1': 0.8355, 'auc': 0.8825} | 

Loss improved from -0.832 to -0.836. Saving model.



Epoch 12 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 12 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 12 | Train loss: 0.410 | Train result: {'accuracy': 0.811, 'f1': 0.8307, 'auc': 0.891} |
Validation loss: 0.434 | Validation result: {'accuracy': 0.8036, 'f1': 0.8347, 'auc': 0.8838} | 
Early stopping counter 1 of 3



Epoch 13 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 13 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 13 | Train loss: 0.407 | Train result: {'accuracy': 0.8115, 'f1': 0.8312, 'auc': 0.8917} |
Validation loss: 0.432 | Validation result: {'accuracy': 0.8051, 'f1': 0.8359, 'auc': 0.8845} | 

Loss improved from -0.836 to -0.836. Saving model.



Epoch 14 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 14 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 14 | Train loss: 0.405 | Train result: {'accuracy': 0.8122, 'f1': 0.8315, 'auc': 0.893} |
Validation loss: 0.432 | Validation result: {'accuracy': 0.8053, 'f1': 0.8358, 'auc': 0.8849} | 
Early stopping counter 1 of 3



Epoch 15 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 15 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 15 | Train loss: 0.407 | Train result: {'accuracy': 0.8126, 'f1': 0.8322, 'auc': 0.8921} |
Validation loss: 0.431 | Validation result: {'accuracy': 0.8056, 'f1': 0.836, 'auc': 0.8853} | 

Loss improved from -0.836 to -0.836. Saving model.



Epoch 16 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 16 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 16 | Train loss: 0.403 | Train result: {'accuracy': 0.8154, 'f1': 0.8347, 'auc': 0.894} |
Validation loss: 0.431 | Validation result: {'accuracy': 0.8049, 'f1': 0.8357, 'auc': 0.8856} | 
Early stopping counter 1 of 3



Epoch 17 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 17 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 17 | Train loss: 0.402 | Train result: {'accuracy': 0.8139, 'f1': 0.8331, 'auc': 0.8944} |
Validation loss: 0.430 | Validation result: {'accuracy': 0.8073, 'f1': 0.8372, 'auc': 0.8859} | 

Loss improved from -0.836 to -0.837. Saving model.



Epoch 18 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 18 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 18 | Train loss: 0.401 | Train result: {'accuracy': 0.816, 'f1': 0.8348, 'auc': 0.8953} |
Validation loss: 0.430 | Validation result: {'accuracy': 0.806, 'f1': 0.8361, 'auc': 0.8861} | 
Early stopping counter 1 of 3



Epoch 19 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 19 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 19 | Train loss: 0.400 | Train result: {'accuracy': 0.8153, 'f1': 0.8341, 'auc': 0.8961} |
Validation loss: 0.430 | Validation result: {'accuracy': 0.8053, 'f1': 0.8356, 'auc': 0.8861} | 
Epoch 00019: reducing learning rate of group 0 to 1.0000e-07.
Early stopping counter 2 of 3



Epoch 20 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 20 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch: 20 | Train loss: 0.400 | Train result: {'accuracy': 0.8157, 'f1': 0.8343, 'auc': 0.896} |
Validation loss: 0.430 | Validation result: {'accuracy': 0.8056, 'f1': 0.8357, 'auc': 0.8863} | 
Early stopping counter 3 of 3
Early stopping..


In [16]:
# load the best model
# `best_epoch` is the epoch reported for the checkpoint stored at `best_model_path`.
model, _, _, best_epoch = TrainUtil.load_model(model, optimizer, lr_scheduler, device, filepath=best_model_path)

# Re-evaluate with the restored weights; the epoch number is passed along to evaluate().
train_loss, train_result, train_probs = train_util.evaluate(model, train_dataloader, best_epoch, 'Train')
# No need to re-evaluate on the validation set if the last trained model is the best one:
# in that case val_loss / val_result / val_probs from the final loop iteration are still valid.
# `epoch` is the loop variable left over from the training loop.
if best_epoch != epoch:
    val_loss, val_result, val_probs = train_util.evaluate(model, validation_dataloader, best_epoch, 'Validation')
test_loss, test_result, test_probs = train_util.evaluate(model, test_dataloader, best_epoch, 'Test')

Loaded best model from epoch 17


Epoch 17 (Train):   0%|          | 0/562 [00:00<?, ?it/s]

Epoch 17 (Validation):   0%|          | 0/74 [00:00<?, ?it/s]

Epoch 17 (Test):   0%|          | 0/74 [00:00<?, ?it/s]

## Dump results and others

In [17]:
# Reload the tokenized splits from disk. The in-memory copies had their id
# column removed before training; presumably dump_results needs the complete
# datasets (ids included) - TODO confirm against train_utils.
# NOTE(security): pickle.load can execute arbitrary code while unpickling; only
# load files you produced yourself or otherwise trust.
# Each `with` block closes its file on exit, so no explicit close() is needed.
with open(root + 'train.pkl', 'rb') as input_file:
    train_tokenized = pickle.load(input_file)

with open(root + 'validation.pkl', 'rb') as input_file:
    validation_tokenized = pickle.load(input_file)

with open(root + 'test.pkl', 'rb') as input_file:
    test_tokenized = pickle.load(input_file)

In [18]:
# Hand the predicted probabilities of each split, paired with its dataset, to
# the utility that writes the results under `result_dir`.
train_util.dump_results(
    result_dir,
    train_probs, train_tokenized,
    val_probs, validation_tokenized,
    test_probs, test_tokenized,
)

## Dump config

In [19]:
import json

# Collect the class attributes of Config, leaving out hidden (double
# underscore) entries such as __module__ or __doc__.
config_dict = {
    name: value
    for name, value in vars(Config).items()
    if not name.startswith('__')
}

# Store the settings next to the other results.
config_path = os.path.join(result_dir, 'config.json')
with open(config_path, 'w') as output:
    json.dump(config_dict, output, indent=4)