# Libraries
https://huggingface.co/docs/transformers/training

## Install

In [1]:
!pip install datasets
!pip install transformers
!pip install opacus

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 4.2 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 10.1 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 8.7 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 9.0 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.6 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-

## Import

In [2]:
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
from torch.utils.data import TensorDataset
import torch
from torch.nn.utils.rnn import pad_sequence
import gc
from opacus.utils.batch_memory_manager import BatchMemoryManager

import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np

def accuracy(labels, preds):
    return (np.array(labels) == np.array(preds)).mean()

## [Check GPU footprint](https://stackoverflow.com/questions/59789059/gpu-out-of-memory-error-message-on-google-colab)

In [4]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7411 sha256=90a2803cd36e1f69703f1bf7a06275b6baf1536f862e36cda150e6753afe23db
  Stored in directory: /root/.cache/pip/wheels/6e/f8/83/534c52482d6da64622ddbf72cd93c35d2ef2881b78fd08ff0c
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 12.0 GB  |     Proc size: 1.6 GB
GPU RAM Free: 15109MB | Used: 0MB | Util   0% | Total     15109MB


## Get device

In [5]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


# Load dataset

## Download

In [6]:
from datasets import load_dataset

# dataset = load_dataset("yelp_review_full")
imdb_dataset = load_dataset("imdb")

for key in imdb_dataset.keys():
  print(key, imdb_dataset[key].shape)

# positive or negative review
num_labels = 2

imdb_dataset["train"][100]

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

train (25000, 2)
test (25000, 2)
unsupervised (50000, 2)


{'label': 0,
 'text': "Terrible movie. Nuff Said.<br /><br />These Lines are Just Filler. The movie was bad. Why I have to expand on that I don't know. This is already a waste of my time. I just wanted to warn others. Avoid this movie. The acting sucks and the writing is just moronic. Bad in every way. The only nice thing about the movie are Deniz Akkaya's breasts. Even that was ruined though by a terrible and unneeded rape scene. The movie is a poorly contrived and totally unbelievable piece of garbage.<br /><br />OK now I am just going to rag on IMDb for this stupid rule of 10 lines of text minimum. First I waste my time watching this offal. Then feeling compelled to warn others I create an account with IMDb only to discover that I have to write a friggen essay on the film just to express how bad I think it is. Totally unnecessary."}

## Tokenize
In this experiment we are using the Bert-base-cased model and tokenizer.

In [32]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = imdb_dataset.map(tokenize_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [33]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

## Sample train, test set

In [34]:
# select a smaller subset for faster debugging
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2022).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2022).select(range(1000))

In [10]:
# check input data shapes
for f in small_train_dataset:
  for key in f.keys():
    print(key, f[key].shape, f[key].reshape((1,-1)).shape)
  break

labels torch.Size([]) torch.Size([1, 1])
input_ids torch.Size([512]) torch.Size([1, 512])
token_type_ids torch.Size([512]) torch.Size([1, 512])
attention_mask torch.Size([512]) torch.Size([1, 512])


# Model

In [11]:
def load_pretrained_model(model_name, num_labels, freeze_final_layers=True):
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    trainable_layers = [model.bert.encoder.layer[-1], model.bert.pooler, model.classifier]
    total_params = 0
    trainable_params = 0

    for p in model.parameters():
            p.requires_grad = False
            total_params += p.numel()

    for layer in trainable_layers:
        for p in layer.parameters():
            p.requires_grad = True
            trainable_params += p.numel()

    print(f"Total parameters count: {total_params}") # ~108M
    print(f"Trainable parameters count: {trainable_params}") # ~7M

    return model

# Non-private Training

## Load pretrained model

In [35]:
model = load_pretrained_model(model_name, num_labels, freeze_final_layers=True)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Total parameters count: 109483778
Trainable parameters count: 7680002


## Hyper-parameters

In [36]:
num_epochs = 2
learning_rate = 1e-4
batch_size = 4

## Dataloader

In [37]:
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=batch_size)

## Optimizer and learning rate scheduler

In [38]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

## Training utils

In [27]:
def train(model, dataloader, epoch):
    model.train()
    losses = 0.0
    total_preds = []
    total_labels = []

    progress_bar = tqdm(range(len(dataloader)), desc=f'Epoch {epoch+1}')
    for step, batch in enumerate(dataloader):
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        
        losses += loss.item()
        predictions = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        labels = batch['labels'].detach().cpu().numpy()
        
        total_preds.extend(predictions)
        total_labels.extend(labels)

        progress_bar.update(1)
        acc = accuracy(np.array(total_labels), np.array(total_preds))
        progress_bar.set_postfix(loss=losses/(step+1), accuracy=acc)

    loss = losses/len(dataloader)
    acc = accuracy(np.array(total_labels), np.array(total_preds))
    return losses/len(dataloader), acc

def evaluate(model, dataloader):
    model.eval()
    accuracies = 0.0
    losses = 0.0
    
    for batch in tqdm(dataloader, desc='Evaluating'):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        predictions = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
        labels = batch['labels'].detach().cpu().numpy()
        accuracies += accuracy(labels, predictions)

        losses += outputs.loss.item()

    return losses/len(dataloader), accuracies/len(dataloader)

## Training loop

In [39]:
gc.collect()
torch.cuda.empty_cache()

In [40]:
model.train().to(device)

for epoch in range(num_epochs):
    train_loss, train_accuracy = train(model, train_dataloader, epoch)
    print(f'Train loss {train_loss:0.3f}, accuracy {train_accuracy:0.6}')

    # evaluation
    # eval_loss, eval_accuracy = evaluate(model, eval_dataloader)
    # print(f'Evaluation loss {eval_loss:0.3f}, accuracy {eval_accuracy:0.6f}')
    
    gc.collect()

Epoch 1:   0%|          | 0/250 [00:00<?, ?it/s]

Train loss 0.431, accuracy 0.776


Epoch 2:   0%|          | 0/250 [00:00<?, ?it/s]

Train loss 0.211, accuracy 0.916


In [41]:
del model
gc.collect()
torch.cuda.empty_cache()

# Differentially Private Training

## Load pretrained model

In [42]:
model = load_pretrained_model(model_name, num_labels, freeze_final_layers=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Total parameters count: 109483778
Trainable parameters count: 7680002


## Hyper-parameters

In [44]:
# Training hyper-parameters
num_epochs = 2
learning_rate = 1e-4
batch_size = 8
max_physical_batch_size = 2

# Privacy engine hyper-parameters
max_grad_norm = 1.0
# Parameter for privacy accounting. Probability of not achieving privacy guarantees
delta = 1/small_train_dataset.shape[0] 
epsilon = 12.0


## Dataloader

In [45]:
# dynamic padding is more advantagious if you have varying size of inputs
# this is not needed for this data, as bert tokenizer already does the padding
# but the collating part is needed to make the tensors of same size
# def padded_collate(batch, padding_idx=0):
#     x = pad_sequence(
#         [elem["input_ids"] for elem in batch],
#         batch_first=True,
#         padding_value=padding_idx,
#     )
#     y = torch.stack([elem["labels"] for elem in batch]).long()
#     return x, y

# for bert this converts all ids into [1, 512] tensor from [512]. And the labels column to [1] from []
# so now the first dimension is 1 for all of them and there is no tensor mismatch
def collate_fn(batch):
   batch = list(filter(lambda x: x is not None, batch))
   return torch.utils.data.dataloader.default_collate(batch)   

def get_dataloader(data, batch_size):
  dataset = TensorDataset(
      data['input_ids'], data['attention_mask'], 
      data['token_type_ids'], data['labels']
  )

  return DataLoader(
      dataset,
      num_workers=1,
      batch_size=batch_size,
      collate_fn=collate_fn,
      pin_memory=True,
      shuffle=False,
  )

In [46]:
train_dataloader = get_dataloader(small_train_dataset, batch_size)
val_dataloader = get_dataloader(small_eval_dataset, batch_size)

In [None]:
# iter_loader = iter(train_dataloader)
# batch1 = next(iter_loader)
# print(batch1)

## Optimizer and learning rate scheduler

In [47]:
optimizer = AdamW(model.parameters(), lr=learning_rate)

num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

## Privacy Engine

In [48]:
gc.collect()
torch.cuda.empty_cache()

In [49]:
from opacus import PrivacyEngine
privacy_engine = PrivacyEngine(secure_mode=None)

In [50]:
# model, optimizer, train_dataloader = privacy_engine.make_private(
#     module=model,
#     optimizer=optimizer,
#     data_loader=train_dataloader,
#     noise_multiplier=0.1,
#     max_grad_norm=1.0,
#     poisson_sampling=False,
# )

model.train()
# can't be used for per batch training or have to use BatchMemoryManager
# or load the total data in memory and do per epoch training and prediction
model, optimizer, train_loader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=train_dataloader,
    max_grad_norm=max_grad_norm,
    target_delta=delta,
    target_epsilon=epsilon,
    epochs=num_epochs,
)

## Training utils

In [51]:
def private_evaluate(model, dataloader):    
    model.eval()

    losses = 0.0
    total_preds = []
    total_labels = []
    
    for batch in tqdm(dataloader, desc='Evaluating'):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels':         batch[3]}

            outputs = model(**inputs)
            loss, logits = outputs[:2]
            
            preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
            labels = inputs['labels'].detach().cpu().numpy()
            
            losses += loss
            total_preds.extend(preds)
            total_labels.extend(labels)

        gc.collect()
        torch.cuda.empty_cache()

    loss = losses / len(dataloader)
    acc = accuracy(np.array(total_labels), np.array(total_preds))

    return loss, acc

def private_train(model, dataloader, epoch):
    model.train()

    losses = 0.0
    total_preds = []
    total_labels = []

    with BatchMemoryManager(
        data_loader=dataloader, 
        max_physical_batch_size=max_physical_batch_size, 
        optimizer=optimizer
    ) as memory_safe_data_loader:
      progress = tqdm(range(len(memory_safe_data_loader)), desc=f"Epoch {epoch}")
      
      for step, batch in enumerate(memory_safe_data_loader):
          optimizer.zero_grad()
          
          # batch = {k: v.to(device) for k, v in batch.items()}
          batch = tuple(t.to(device) for t in batch)
          inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2],
                  'labels':         batch[3]}

          outputs = model(**inputs) # output = loss, logits, hidden_states, attentions

          loss = outputs.loss
          loss.backward()
          losses += loss.item()

          optimizer.step()
          lr_scheduler.step()

          predictions = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)
          labels = inputs['labels'].detach().cpu().numpy()
          
          total_preds.extend(predictions)
          total_labels.extend(labels)
          
          acc = accuracy(np.array(total_labels), np.array(total_preds))
          progress.set_postfix(loss= losses / (step+1), accuracy=acc)
          progress.update(1)

          gc.collect()
          torch.cuda.empty_cache()

    loss = losses / len(dataloader)
    acc = accuracy(np.array(total_labels), np.array(total_preds))
    
    return loss, acc

## Training loop

In [53]:
# training gets a warning like this
# /usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py:1025: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior.
#   warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes "

# https://github.com/pytorch/opacus/blob/main/tutorials/building_text_classifier.ipynb
model.to(device)

for epoch in range(1, num_epochs+1):
    train_loss, train_accuracy = private_train(model, train_dataloader, epoch)
  
    eps = privacy_engine.get_epsilon(delta)
    print_str = f"Epoch: {epoch}, ɛ {eps:.3f}, delta {delta} | Train loss: {train_loss:.3f}, accuracy: {train_accuracy:.3f} | "

    eval_loss, eval_accuracy = private_evaluate(model, val_dataloader)
    print_str += f"Eval loss: {eval_loss:.3f}, accuracy: {eval_accuracy:.3f}."
    print(print_str)

Epoch 1:   0%|          | 0/500 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 1, ɛ 9.648, delta 0.001 | Train loss: 2.805, accuracy: 0.544 | Eval loss: 0.696, accuracy: 0.543.


Epoch 2:   0%|          | 0/500 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 2, ɛ 11.991, delta 0.001 | Train loss: 2.806, accuracy: 0.542 | Eval loss: 0.696, accuracy: 0.543.


In [54]:
del model
gc.collect()
torch.cuda.empty_cache()