This notebook follows these tutorials:
* [Building text classifier with Differential Privacy](https://github.com/pytorch/opacus/blob/main/tutorials/building_text_classifier.ipynb)
* [Fine-tuning with custom datasets](https://huggingface.co/transformers/v3.4.0/custom_datasets.html#seq-imdb)

# Initial Setup
https://huggingface.co/docs/transformers/training

## Install

In [1]:
!pip install opacus
# !pip install transformers
!pip install datasets
import datasets

Collecting opacus
  Downloading opacus-1.1.2-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 KB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: opacus
Successfully installed opacus-1.1.2
[0m

## Import

In [2]:
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
import torch
from torch.utils.data import DataLoader

from opacus.utils.batch_memory_manager import BatchMemoryManager

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import gc

pd.set_option('display.max_columns', None)

In [3]:
import os

# Show every file mounted under the Kaggle input directory.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

/kaggle/input/tokenize-ucberkeley-using-bert/__results__.html
/kaggle/input/tokenize-ucberkeley-using-bert/validation.pkl
/kaggle/input/tokenize-ucberkeley-using-bert/train.pkl
/kaggle/input/tokenize-ucberkeley-using-bert/test.pkl
/kaggle/input/tokenize-ucberkeley-using-bert/validation.csv
/kaggle/input/tokenize-ucberkeley-using-bert/__notebook__.ipynb
/kaggle/input/tokenize-ucberkeley-using-bert/__output__.json
/kaggle/input/tokenize-ucberkeley-using-bert/train.csv
/kaggle/input/tokenize-ucberkeley-using-bert/test.csv
/kaggle/input/tokenize-ucberkeley-using-bert/custom.css


## Config

In [4]:
from dataclasses import dataclass

@dataclass
class Config:
    """Settings for this notebook.

    Groups model/dataset identifiers, column names, split options,
    optimisation hyperparameters and differential-privacy parameters.

    Every attribute is annotated so that ``@dataclass`` registers it as a
    field. All fields have defaults, so values can be read directly from the
    class (``Config.seed``) or overridden per instance (``Config(seed=1)``).
    """
    model_name: str = 'bert-base-uncased'
    dataset_name: str = 'ucberkeley-dlab/measuring-hate-speech'
    text_column: str = 'text'
    # if the raw id column is a string, replace it with an integer index during preprocessing
    id_column: str = 'comment_id'

    # target in the raw dataset; it is renamed to `labels` here to facilitate training setup
    raw_target_column: str = 'hatespeech'
    target_column: str = 'labels'

    # whether the data needs to be split into train/test/validation sets
    need_to_split: bool = False
    # if need_to_split is True, the test and validation sets each get 50% of this fraction
    test_size: float = 0.3
    max_seq_length: int = 128
    seed: int = 2022

    batch_size: int = 64
    learning_rate: float = 1e-3
    epochs: int = 15
    num_labels: int = 2

    # Private training config
    # One epsilon is reported per delta; a tuple keeps this shared default immutable.
    delta_list: tuple = (5e-2, 1e-3, 1e-5)
    noise_multiplier: float = 0.45
    max_grad_norm: float = 1
    max_physical_batch_size: int = 32

## Set seed

In [5]:
import random

def seed_torch(seed=7):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA), and
    configures cuDNN for reproducible kernel selection.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Exported for child processes; the running interpreter's hash seed is fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark autotuning may pick different kernels from run to run, so turn it off.
    torch.backends.cudnn.benchmark = False
    

# Seed all generators once with the seed configured for this run.
global_seed = Config.seed
seed_torch(global_seed)

## Get device

In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Load tokenized data

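The tokenized splits are loaded from the output of the `tokenize-ucberkeley-using-bert` notebook (attached as a Kaggle input), which matches `Config.dataset_name` (`ucberkeley-dlab/measuring-hate-speech`). See also my [other notebook](https://www.kaggle.com/code/khairulislam/tokenize-jigsaw-comments), which tokenizes the [Jigsaw competition](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification) data ([all_data.csv](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/data?select=all_data.csv)).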

In [7]:
# Column names taken from the run configuration.
text = Config.text_column
target = Config.target_column
# Kaggle input directory that holds the pre-tokenized train/validation/test pickles.
root = '/kaggle/input/tokenize-ucberkeley-using-bert/'

In [8]:
import pickle

# NOTE: unpickling can execute arbitrary code; only load files you trust.
# Each `with` block closes its file on exit, so no explicit close() is needed.
with open(root + 'train.pkl', 'rb') as input_file:
    train_tokenized = pickle.load(input_file)

with open(root + 'validation.pkl', 'rb') as input_file:
    validation_tokenized = pickle.load(input_file)

with open(root + 'test.pkl', 'rb') as input_file:
    test_tokenized = pickle.load(input_file)

In [9]:
# Sanity check of the loaded training split (feature names and row count).
print(train_tokenized)

Dataset({
    features: ['comment_id', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 26994
})


In [10]:
# The id column is not a model input, so drop it from every split before batching.
id_column = Config.id_column
train_tokenized, test_tokenized, validation_tokenized = (
    split.remove_columns(id_column)
    for split in (train_tokenized, test_tokenized, validation_tokenized)
)

# Private Training

## Data loader

[How to choose batch size in DP](https://github.com/pytorch/opacus/blob/main/tutorials/building_text_classifier.ipynb)

In [11]:
BATCH_SIZE = Config.batch_size

# One loader per split, all sharing the same logical batch size.
# NOTE(review): the training loader has no `shuffle=True`, so every epoch sees the
# same batch order. Result dumping presumably relies on this order when pairing
# training-set outputs with ids; confirm before enabling shuffling.
train_dataloader, validation_dataloader, test_dataloader = (
    DataLoader(split, batch_size=BATCH_SIZE)
    for split in (train_tokenized, validation_tokenized, test_tokenized)
)

## Model and optimizer

In [12]:
# add the utility script from File->Add utility script
from train_utils import TrainUtil, ModelCheckPoint, EarlyStopping

# Model settings pulled from the run configuration.
num_labels = Config.num_labels
model_name = Config.model_name
# Helper used below for DP training, evaluation and result dumping; it is given
# the id/target column names and the device.
train_util = TrainUtil(Config.id_column, Config.target_column, device)

In [13]:
# Hyperparameters from the run configuration.
LEARNING_RATE = Config.learning_rate
EPOCHS = Config.epochs

# Always start from a freshly loaded pretrained model.
model = TrainUtil.load_pretrained_model(model_name, num_labels)

# HuggingFace models load in eval mode: switch to train mode, then move to the device.
model = model.train()
model = model.to(device)

# AdamW over the model's parameters.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-8,
)

# Reduce the learning rate when the monitored value stops improving.
# https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ReduceLROnPlateau.html
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=1,
    verbose=True,
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Total parameters count: 109483778
Trainable parameters count: 7680002, percent 7.015


In [14]:
# Output directory for the checkpoint and results ('' means the current working directory).
result_dir = ''
if result_dir:
    os.makedirs(result_dir, exist_ok=True)

# File that holds the best checkpoint.
best_model_path = os.path.join(result_dir, 'model.pt')

# Callbacks invoked at the end of every epoch in the training loop.
check_point = ModelCheckPoint(filepath=best_model_path)
early_stopping = EarlyStopping(patience=3, min_delta=0)

## Privacy Engine

In [15]:
from opacus import PrivacyEngine

# Opacus entry point: used below to make the model/optimizer/loader private
# and to query the epsilon spent so far.
privacy_engine = PrivacyEngine()

In [16]:
# Differential-privacy settings for this run.
# NOTE(review): per the Opacus docs, poisson_sampling=False leaves the data loader
# unchanged and does not match the privacy accountant's sampling assumption, so the
# reported epsilon is an approximation. Confirm this is intended.
dp_kwargs = dict(
    noise_multiplier=Config.noise_multiplier,
    max_grad_norm=Config.max_grad_norm,
    poisson_sampling=False,
)

# Replace model, optimizer and training loader with their private counterparts.
model, optimizer, train_dataloader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=train_dataloader,
    **dp_kwargs,
)

## Loop

In [17]:
start_epoch = 1
# load a previous model if there is any
# model, optimizer, lr_scheduler, start_epoch = load_model(model, optimizer, lr_scheduler, device, filepath=best_model_path)

for epoch in range(start_epoch, EPOCHS+1):
    gc.collect()

    # Train on physical batches capped at Config.max_physical_batch_size.
    memory_manager = BatchMemoryManager(
        data_loader=train_dataloader,
        optimizer=optimizer,
        max_physical_batch_size=Config.max_physical_batch_size,
    )
    with memory_manager as memory_safe_data_loader:
        train_loss, train_result, train_probs = train_util.dp_train(
            model, optimizer, epoch, memory_safe_data_loader
        )

    val_loss, val_result, val_probs = train_util.evaluate(
        model, validation_dataloader, epoch, 'Validation'
    )

    # Privacy budget spent so far: one epsilon per configured delta.
    epsilons = [privacy_engine.get_epsilon(delta) for delta in Config.delta_list]

    epoch_summary = (
        f"Epoch: {epoch} | "
        f"ɛ: {np.round(epsilons, 2)} |"
        f"Train loss: {train_loss:.3f} | "
        f"Train result: {train_result} |\n"
        f"Validation loss: {val_loss:.3f} | "
        f"Validation result: {val_result} | "
    )
    print(epoch_summary)

    # The monitored quantity is the negated validation F1, so a lower value means a better model.
    loss = -val_result['f1']
    lr_scheduler.step(loss)
    check_point(model, optimizer, lr_scheduler, epoch, loss)

    early_stopping(loss)
    if early_stopping.early_stop:
        break
    print()

Epoch 1 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 1 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 1 | ɛ: [1.68 4.47 7.47] |Train loss: 1.084 | Train result: {'accuracy': 0.7227, 'f1': 0.0826, 'auc': 0.7088} |
Validation loss: 0.960 | Validation result: {'accuracy': 0.7573, 'f1': 0.4005, 'auc': 0.7915} | 

Loss improved from inf to -0.401. Saving model.



Epoch 2 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 2 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 2 | ɛ: [2.18 5.27 8.53] |Train loss: 0.920 | Train result: {'accuracy': 0.7674, 'f1': 0.4661, 'auc': 0.7933} |
Validation loss: 0.968 | Validation result: {'accuracy': 0.7787, 'f1': 0.4984, 'auc': 0.8125} | 

Loss improved from -0.401 to -0.498. Saving model.



Epoch 3 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 3 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 3 | ɛ: [2.57 5.86 9.3 ] |Train loss: 0.932 | Train result: {'accuracy': 0.7766, 'f1': 0.5127, 'auc': 0.8013} |
Validation loss: 0.988 | Validation result: {'accuracy': 0.7851, 'f1': 0.5283, 'auc': 0.8176} | 

Loss improved from -0.498 to -0.528. Saving model.



Epoch 4 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 4 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 4 | ɛ: [2.92 6.38 9.99] |Train loss: 0.934 | Train result: {'accuracy': 0.7867, 'f1': 0.543, 'auc': 0.8078} |
Validation loss: 1.012 | Validation result: {'accuracy': 0.7917, 'f1': 0.5626, 'auc': 0.8152} | 

Loss improved from -0.528 to -0.563. Saving model.



Epoch 5 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 5 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 5 | ɛ: [ 3.25  6.86 10.58] |Train loss: 0.946 | Train result: {'accuracy': 0.7888, 'f1': 0.5583, 'auc': 0.8029} |
Validation loss: 1.045 | Validation result: {'accuracy': 0.7895, 'f1': 0.5445, 'auc': 0.8202} | 
Early stopping counter 1 of 3



Epoch 6 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 6 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 6 | ɛ: [ 3.55  7.3  11.16] |Train loss: 0.945 | Train result: {'accuracy': 0.7915, 'f1': 0.5583, 'auc': 0.8022} |
Validation loss: 1.037 | Validation result: {'accuracy': 0.7915, 'f1': 0.55, 'auc': 0.8085} | 
Epoch 00006: reducing learning rate of group 0 to 1.0000e-04.
Early stopping counter 2 of 3



Epoch 7 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 7 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 7 | ɛ: [ 3.83  7.69 11.64] |Train loss: 0.947 | Train result: {'accuracy': 0.792, 'f1': 0.5504, 'auc': 0.7957} |
Validation loss: 1.042 | Validation result: {'accuracy': 0.7927, 'f1': 0.5642, 'auc': 0.8136} | 

Loss improved from -0.563 to -0.564. Saving model.



Epoch 8 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 8 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 8 | ɛ: [ 4.11  8.08 12.11] |Train loss: 0.943 | Train result: {'accuracy': 0.7926, 'f1': 0.5581, 'auc': 0.7976} |
Validation loss: 1.041 | Validation result: {'accuracy': 0.7939, 'f1': 0.5656, 'auc': 0.8145} | 

Loss improved from -0.564 to -0.566. Saving model.



Epoch 9 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 9 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 9 | ɛ: [ 4.37  8.47 12.59] |Train loss: 0.944 | Train result: {'accuracy': 0.7935, 'f1': 0.5578, 'auc': 0.799} |
Validation loss: 1.039 | Validation result: {'accuracy': 0.7936, 'f1': 0.5633, 'auc': 0.816} | 
Early stopping counter 1 of 3



Epoch 10 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 10 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 10 | ɛ: [ 4.61  8.8  13.05] |Train loss: 0.938 | Train result: {'accuracy': 0.7927, 'f1': 0.5613, 'auc': 0.8034} |
Validation loss: 1.035 | Validation result: {'accuracy': 0.7945, 'f1': 0.5709, 'auc': 0.8188} | 

Loss improved from -0.566 to -0.571. Saving model.



Epoch 11 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 11 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 11 | ɛ: [ 4.85  9.13 13.44] |Train loss: 0.950 | Train result: {'accuracy': 0.7923, 'f1': 0.5645, 'auc': 0.8015} |
Validation loss: 1.030 | Validation result: {'accuracy': 0.7938, 'f1': 0.5713, 'auc': 0.8204} | 

Loss improved from -0.571 to -0.571. Saving model.



Epoch 12 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 12 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 12 | ɛ: [ 5.1   9.46 13.83] |Train loss: 0.936 | Train result: {'accuracy': 0.7938, 'f1': 0.5698, 'auc': 0.8089} |
Validation loss: 1.028 | Validation result: {'accuracy': 0.7946, 'f1': 0.5754, 'auc': 0.8216} | 

Loss improved from -0.571 to -0.575. Saving model.



Epoch 13 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 13 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 13 | ɛ: [ 5.34  9.79 14.22] |Train loss: 0.941 | Train result: {'accuracy': 0.7921, 'f1': 0.5694, 'auc': 0.8073} |
Validation loss: 1.020 | Validation result: {'accuracy': 0.7946, 'f1': 0.5745, 'auc': 0.8218} | 
Early stopping counter 1 of 3



Epoch 14 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 14 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 14 | ɛ: [ 5.57 10.12 14.61] |Train loss: 0.932 | Train result: {'accuracy': 0.7934, 'f1': 0.5698, 'auc': 0.8077} |
Validation loss: 1.025 | Validation result: {'accuracy': 0.7943, 'f1': 0.5747, 'auc': 0.8222} | 
Epoch 00014: reducing learning rate of group 0 to 1.0000e-05.
Early stopping counter 2 of 3



Epoch 15 (Train):   0%|          | 0/844 [00:00<?, ?it/s]

Epoch 15 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch: 15 | ɛ: [ 5.78 10.41 15.  ] |Train loss: 0.929 | Train result: {'accuracy': 0.7945, 'f1': 0.5729, 'auc': 0.808} |
Validation loss: 1.023 | Validation result: {'accuracy': 0.7945, 'f1': 0.5752, 'auc': 0.8223} | 
Early stopping counter 3 of 3
Early stopping..


In [18]:
# Restore the best checkpoint; only the model and its epoch are used from here on.
model, _restored_optimizer, _restored_scheduler, best_epoch = TrainUtil.load_model(
    model, optimizer, lr_scheduler, device, filepath=best_model_path
)

train_loss, train_result, train_probs = train_util.evaluate(
    model, train_dataloader, best_epoch, 'Train'
)

# When the last trained epoch is the best one, the validation outputs from the
# training loop are reused instead of being recomputed.
last_epoch_is_best = best_epoch == epoch
if not last_epoch_is_best:
    val_loss, val_result, val_probs = train_util.evaluate(
        model, validation_dataloader, best_epoch, 'Validation'
    )

test_loss, test_result, test_probs = train_util.evaluate(
    model, test_dataloader, best_epoch, 'Test'
)

Loaded best model from epoch 12


Epoch 12 (Train):   0%|          | 0/422 [00:00<?, ?it/s]

Epoch 12 (Validation):   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 12 (Test):   0%|          | 0/91 [00:00<?, ?it/s]

## Dump results and others

In [19]:
# Reload the original tokenized files: the id columns were removed earlier
# and are needed again for the result dumping part.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# Each `with` block closes its file on exit, so no explicit close() is needed.
with open(root + 'train.pkl', 'rb') as input_file:
    train_tokenized = pickle.load(input_file)

with open(root + 'validation.pkl', 'rb') as input_file:
    validation_tokenized = pickle.load(input_file)

with open(root + 'test.pkl', 'rb') as input_file:
    test_tokenized = pickle.load(input_file)

In [20]:
# Save the results
# Each split's probabilities are passed with its re-loaded tokenized dataset
# (which still contains the id column).
# NOTE(review): an empty result_dir presumably writes to the working directory; confirm in train_utils.
train_util.dump_results(
    result_dir,train_probs, train_tokenized, 
    val_probs, validation_tokenized, test_probs, test_tokenized
)

## Save config

In [21]:
import json

# Keep only the user-defined settings; names starting with a double underscore are skipped.
config_dict = {
    name: value
    for name, value in Config.__dict__.items()
    if not name.startswith('__')
}

# Store the configuration next to the other run artefacts.
config_path = os.path.join(result_dir, 'config.json')
with open(config_path, 'w') as output:
    json.dump(config_dict, output, indent=4)