In [1]:
import warnings
warnings.simplefilter("ignore")

In [2]:
import datasets

from peft import PrefixTuningConfig, TaskType, get_peft_model

import opacus
from opacus.validators import ModuleValidator
from opacus.utils.batch_memory_manager import BatchMemoryManager
from opacus import PrivacyEngine

import torch
import torch.nn as nn
import numpy as np

from tqdm.notebook import tqdm
from torch.optim import SGD
from torch.utils.data import DataLoader

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, get_linear_schedule_with_warmup

from sklearn.metrics import accuracy_score

In [3]:
model_name = "prajjwal1/bert-tiny"
EPOCHS = 32
BATCH_SIZE = 1024
LR = 0.05

In [4]:
dataset = datasets.load_dataset('glue', 'mnli')
num_labels = dataset["train"].features["label"].num_classes

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [6]:
def preprocess_function(examples):
    result = {}
    result["input_ids"] = examples["input_ids"]
    result["attention_mask"] = examples["attention_mask"]
    result["labels"] = examples["label"]
    return result

In [7]:
processed_datasets = tokenized_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [8]:
processed_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
processed_datasets = processed_datasets.remove_columns(["premise", "hypothesis", "idx", "label"])

In [9]:
train_dataloader = DataLoader(processed_datasets["train"], batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(processed_datasets["validation_matched"], batch_size=BATCH_SIZE, shuffle=False)

In [10]:
EPSILON = 8.0
DELTA = 1/len(train_dataloader)
MAX_GRAD_NORM = 0.05
MAX_PHYSICAL_BATCH_SIZE = int(BATCH_SIZE/4)

In [11]:
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels

peft_config = PrefixTuningConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    num_virtual_tokens=50,
)

model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
model = get_peft_model(model, peft_config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 128, padding_idx=0)
        (position_embeddings): Embedding(512, 128)
        (token_type_embeddings): Embedding(2, 128)
        (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-1): 2 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=128, out_features=128, bias=True)
                (key): Linear(in_features=128, out_features=128, bias=True)
                (value): Linear(in_features=128, out_features=128, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features

In [12]:
errors = ModuleValidator.validate(model, strict=False)
print(errors)

[]


In [13]:
optimizer = SGD(params=model.parameters(), lr=LR)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader)*EPOCHS
)

In [14]:
privacy_engine = PrivacyEngine(accountant="rdp")

model, optimizer, train_dataloader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=train_dataloader,
    epochs=EPOCHS,
    target_epsilon=EPSILON,
    target_delta=DELTA,
    max_grad_norm=MAX_GRAD_NORM,
    batch_first=True,
)

In [15]:
print(f"Using Sigma = {optimizer.noise_multiplier:.3f} | C = {optimizer.max_grad_norm} | Initial DP (ε, δ) = ({privacy_engine.get_epsilon(DELTA)}, {DELTA})")

Using Sigma = 0.515 | C = 0.05 | Initial DP (ε, δ) = (0, 0.0026041666666666665)


In [16]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"Trainable Parameters: {trainable_params} || All Parameters: {all_param} || Trainable Parameters (%): {100 * trainable_params / all_param:.2f}"
    )

print_trainable_parameters(model)

Trainable Parameters: 25987 || All Parameters: 4412294 || Trainable Parameters (%): 0.59


In [17]:
def train(model, train_dataloader, optimizer, epoch, device):
    model.train()
    criterion = nn.CrossEntropyLoss()

    losses = []
    epsilon = []

    with BatchMemoryManager(
        data_loader=train_dataloader, 
        max_physical_batch_size=MAX_PHYSICAL_BATCH_SIZE,
        optimizer=optimizer,
        ) as memory_safe_data_loader:

        for i, batch in tqdm(enumerate(memory_safe_data_loader), total=len(memory_safe_data_loader), desc=f"Training Epoch: {epoch}"):
            
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()

            outputs = model(**batch)
            loss = criterion(outputs.logits, batch["labels"])
            loss.backward()

            optimizer.step()
            losses.append(loss.item())

            if i % 8000 == 0:
                epsilon = privacy_engine.get_epsilon(DELTA)

                print(f"Training Epoch: {epoch} | Loss: {np.mean(losses):.6f} | ε = {epsilon:.2f}")                   

In [18]:
def test(model, test_dataloader, device):
    model.eval()
    eval_loss = 0
    eval_preds = []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Test"):
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
        loss = outputs.loss

        eval_loss += loss.detach().float()
        preds = outputs.logits.argmax(dim=-1)

        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )
        acc = accuracy_score(preds.cpu().numpy(), batch["labels"].cpu().numpy())

    print(
        f"Test set: Loss: {eval_loss:.4f}, Accuracy: {acc*100:.2f}%"
    )

    return eval_loss, acc

In [19]:
for epoch in tqdm(range(EPOCHS), desc=f'Training {EPOCHS} Epochs'):
    train(model, train_dataloader, optimizer, epoch + 1, device)

Training 32 Epochs:   0%|          | 0/32 [00:00<?, ?it/s]

Training Epoch: 1:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 1 | Loss: 1.123367 | ε = 0.00


Training Epoch: 2:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 2 | Loss: 1.106358 | ε = 2.46


Training Epoch: 3:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 3 | Loss: 1.094918 | ε = 2.86


Training Epoch: 4:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 4 | Loss: 1.097218 | ε = 3.18


Training Epoch: 5:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 5 | Loss: 1.100776 | ε = 3.46


Training Epoch: 6:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 6 | Loss: 1.107612 | ε = 3.71


Training Epoch: 7:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 7 | Loss: 1.098192 | ε = 3.94


Training Epoch: 8:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 8 | Loss: 1.101162 | ε = 4.15


Training Epoch: 9:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 9 | Loss: 1.093347 | ε = 4.36


Training Epoch: 10:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 10 | Loss: 1.096795 | ε = 4.55


Training Epoch: 11:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 11 | Loss: 1.099161 | ε = 4.74


Training Epoch: 12:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 12 | Loss: 1.094879 | ε = 4.93


Training Epoch: 13:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 13 | Loss: 1.094972 | ε = 5.10


Training Epoch: 14:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 14 | Loss: 1.099452 | ε = 5.27


Training Epoch: 15:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 15 | Loss: 1.100907 | ε = 5.44


Training Epoch: 16:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 16 | Loss: 1.102894 | ε = 5.61


Training Epoch: 17:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 17 | Loss: 1.097755 | ε = 5.77


Training Epoch: 18:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 18 | Loss: 1.101599 | ε = 5.92


Training Epoch: 19:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 19 | Loss: 1.102873 | ε = 6.07


Training Epoch: 20:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 20 | Loss: 1.096473 | ε = 6.22


Training Epoch: 21:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 21 | Loss: 1.100459 | ε = 6.37


Training Epoch: 22:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 22 | Loss: 1.097930 | ε = 6.52


Training Epoch: 23:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 23 | Loss: 1.104749 | ε = 6.67


Training Epoch: 24:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 24 | Loss: 1.100429 | ε = 6.80


Training Epoch: 25:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 25 | Loss: 1.103464 | ε = 6.94


Training Epoch: 26:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 26 | Loss: 1.098615 | ε = 7.07


Training Epoch: 27:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 27 | Loss: 1.103375 | ε = 7.21


Training Epoch: 28:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 28 | Loss: 1.099708 | ε = 7.34


Training Epoch: 29:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 29 | Loss: 1.094832 | ε = 7.48


Training Epoch: 30:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 30 | Loss: 1.101023 | ε = 7.61


Training Epoch: 31:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 31 | Loss: 1.104648 | ε = 7.75


Training Epoch: 32:   0%|          | 0/1533 [00:00<?, ?it/s]

Training Epoch: 32 | Loss: 1.098444 | ε = 7.87


In [20]:
final_epsilon = privacy_engine.get_epsilon(DELTA)
print(f"Final DP Guarantee (ε, δ)-DP = ({final_epsilon:.2f}, {DELTA})")

Final DP Guarantee (ε, δ)-DP = (7.99, 0.0026041666666666665)


In [21]:
test(model, test_dataloader, device)

Test:   0%|          | 0/10 [00:00<?, ?it/s]

Test set: Loss: 1.0958, Accuracy: 36.23%


(tensor(1.0958, device='cuda:0'), 0.3622704507512521)