In [None]:
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides warnings raised by the libraries imported below;
# consider narrowing the filter to specific categories/modules.
warnings.simplefilter("ignore")

In [None]:
from datasets import load_dataset

from opacus.validators import ModuleValidator
from opacus.utils.batch_memory_manager import BatchMemoryManager
from opacus import PrivacyEngine

import torch
import torch.nn as nn
import numpy as np

from tqdm.notebook import tqdm
from torch.optim import SGD
from torch.utils.data import DataLoader

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, DataCollatorWithPadding
from transformers.modeling_outputs import TokenClassifierOutput

from sklearn.metrics import accuracy_score

In [None]:
# Checkpoint identifier handed to the tokenizer and model loaders.
model_name = "prajjwal1/bert-tiny"
# Number of epochs the privacy engine is told to calibrate its noise for.
EPOCHS = 4
# Batch size of the train and validation DataLoaders.
BATCH_SIZE = 32
# Learning rate of the SGD optimizer.
LR = 2e-5

In [None]:
# Prepare data
dataset = load_dataset("glue", "sst2")
num_labels = dataset["train"].features["label"].num_classes

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the ``"sentence"`` field of a batch of examples.

    Every sequence is truncated and padded to a fixed length of 128 tokens so
    that all encoded examples share the same shape.
    """
    encoded = tokenizer(
        examples["sentence"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Apply the tokenizer to every split, one batch of examples at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
def preprocess_function(examples):
    """Select the model inputs from a batch and expose ``label`` as ``labels``.

    Args:
        examples: mapping with at least the keys ``"input_ids"``,
            ``"attention_mask"`` and ``"label"``.

    Returns:
        A new dict with keys ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"`` that reference the corresponding input values.
    """
    return {
        "input_ids": examples["input_ids"],
        "attention_mask": examples["attention_mask"],
        "labels": examples["label"],
    }

# Add the "labels" column (copy of "label") to every split, batch by batch.
processed_datasets = tokenized_datasets.map(preprocess_function, batched=True)

In [None]:
# Return only the three model-input columns, formatted for torch, when items are read.
processed_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
# Drop the raw columns that are no longer needed.
# NOTE(review): this rebinds `processed_datasets`, so re-running the cell presumably
# fails because the columns are already gone — confirm.
processed_datasets = processed_datasets.remove_columns(["sentence", "idx", "label"])

In [None]:
# Training batches are shuffled; validation batches keep the dataset order.
train_dataloader = DataLoader(
    processed_datasets["train"],
    shuffle=True,
    batch_size=BATCH_SIZE,
)
test_dataloader = DataLoader(
    processed_datasets["validation"],
    shuffle=False,
    batch_size=BATCH_SIZE,
)

In [None]:
# Target privacy budget (ε) the privacy engine calibrates its noise for.
EPSILON = 8.0
# δ is derived from the number of training *examples*. len(train_dataloader)
# counts batches, which would make δ roughly BATCH_SIZE times too large.
DELTA = 1 / len(train_dataloader.dataset)
# Per-sample gradient clipping threshold (C).
MAX_GRAD_NORM = 0.5
# Upper bound on the batch size physically pushed through the model at once;
# never allowed to drop to 0 for small BATCH_SIZE values.
MAX_PHYSICAL_BATCH_SIZE = max(1, BATCH_SIZE // 4)

In [None]:
class ClassifierHeadLayer(nn.Module):
    """Frozen pre-trained encoder topped with a small trainable classifier.

    The wrapped Hugging Face model is loaded with hidden states and attentions
    enabled and all of its parameters are frozen; only the new dropout +
    linear head defined here receives gradients.

    Args:
        model_name: checkpoint identifier passed to ``from_pretrained``.
        num_labels: number of output classes of the new linear head.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.num_labels = num_labels

        # The config flags are plural (`output_attentions`, `output_hidden_states`);
        # misspelled names are not recognised, so the extra outputs would stay disabled.
        config = AutoConfig.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=True,
            output_hidden_states=True,
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

        # Freeze all original layers
        for param in self.model.parameters():
            param.requires_grad = False

        # New Layer: sized from the encoder config instead of a hard-coded width.
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Classify each sequence from the hidden state of its first token.

        Args:
            input_ids: token ids forwarded to the wrapped model.
            attention_mask: attention mask forwarded to the wrapped model.
            labels: optional class indices; when given, a cross-entropy loss
                is computed against the logits.

        Returns:
            TokenClassifierOutput with ``loss`` (``None`` without labels),
            ``logits`` of the new head, and the wrapped model's
            ``hidden_states`` and ``attentions``.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # `outputs[-1]` is NOT the last hidden state (it is the last non-None field
        # of the output object); take the final entry of `hidden_states` instead.
        last_hidden_state = outputs.hidden_states[-1]

        sequence_outputs = self.dropout(last_hidden_state)
        # Position 0 along the sequence axis is used as the sequence summary.
        logits = self.classifier(sequence_outputs[:, 0, :])

        loss = None

        if labels is not None:
            loss_func = nn.CrossEntropyLoss()
            loss = loss_func(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = ClassifierHeadLayer(num_labels=num_labels, model_name=model_name)

# Kept as the final expression so the cell still displays the module repr.
model.to(device)

In [None]:
# Check the model with Opacus' ModuleValidator and print whatever it reports.
# NOTE(review): with strict=False the validator presumably returns the issues
# instead of raising — confirm against the Opacus docs.
errors = ModuleValidator.validate(model, strict=False)
print(errors)

In [None]:
# Plain SGD over all model parameters (frozen backbone weights included) at learning rate LR.
optimizer = SGD(params=model.parameters(), lr=LR)

In [None]:
# Privacy engine configured with the "rdp" accountant.
privacy_engine = PrivacyEngine(accountant="rdp")

# Replace model, optimizer and train loader with the objects returned by the engine,
# targeting (EPSILON, DELTA) over EPOCHS epochs with clipping norm MAX_GRAD_NORM.
# NOTE(review): the three names are rebound in place, so re-running this cell
# would pass already-wrapped objects back into the engine — restart the kernel instead.
model, optimizer, train_dataloader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=train_dataloader,
    epochs=EPOCHS,
    target_epsilon=EPSILON,
    target_delta=DELTA,
    max_grad_norm=MAX_GRAD_NORM,
    batch_first=True,
)

In [None]:
# Report the noise multiplier, clipping norm and the privacy spent before training.
print(
    f"Using Sigma = {optimizer.noise_multiplier:.3f} | C = {optimizer.max_grad_norm} | "
    f"Initial DP (ε, δ) = ({privacy_engine.get_epsilon(DELTA)}, {DELTA})"
)

In [None]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, e.g. a ``torch.nn.Module``.

    Prints a single line with the number of trainable parameters, the number
    of all parameters and the trainable share in percent. A model without any
    parameters reports a share of 0.00 instead of raising ZeroDivisionError.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)

    # Guard the division for parameter-less modules.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"Trainable Parameters: {trainable_params} || All Parameters: {all_param} || Trainable Parameters (%): {trainable_pct:.2f}"
    )

# Report how many parameters of the current `model` object have requires_grad set.
print_trainable_parameters(model)

In [None]:
def train(model, train_dataloader, optimizer, epoch, device, log_interval=8000):
    """Run one pass over ``train_dataloader`` and update ``model`` in place.

    The loader is wrapped in a ``BatchMemoryManager`` capped at the module-level
    ``MAX_PHYSICAL_BATCH_SIZE``. The privacy budget spent so far is read from
    the module-level ``privacy_engine`` at ``DELTA``.

    Args:
        model: module called as ``model(**batch)``; its output must expose ``.loss``.
        train_dataloader: loader yielding dict batches of tensors.
        optimizer: optimizer stepped after every backward pass.
        epoch: epoch number, used only in the progress bar and log messages.
        device: device every batch tensor is moved to.
        log_interval: print the running mean loss and ε every this many
            iterations (including iteration 0); a falsy value disables logging.

    Returns:
        None.
    """
    model.train()

    losses = []

    with BatchMemoryManager(
        data_loader=train_dataloader,
        max_physical_batch_size=MAX_PHYSICAL_BATCH_SIZE,
        optimizer=optimizer,
    ) as memory_safe_data_loader:

        progress = tqdm(
            enumerate(memory_safe_data_loader),
            total=len(memory_safe_data_loader),
            desc=f"Training Epoch: {epoch}",
        )
        for i, batch in progress:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            loss = outputs.loss
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()
            losses.append(loss.item())

            # Falsy log_interval disables logging and avoids a modulo-by-zero.
            if log_interval and i % log_interval == 0:
                epsilon = privacy_engine.get_epsilon(DELTA)

                print(f"Training Epoch: {epoch} | Loss: {np.mean(losses):.6f} | ε = {epsilon:.2f}")

In [None]:
# Run a single training pass; the literal 1 is the epoch label shown in the logs.
# NOTE(review): the privacy engine was calibrated with epochs=EPOCHS, but only one
# epoch is run here — confirm whether the remaining epochs are trained elsewhere.
train(model, train_dataloader, optimizer, 1, device)