In [1]:
import warnings
warnings.simplefilter("ignore")

In [2]:
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model

import opacus
from opacus.validators import ModuleValidator
from opacus.utils.batch_memory_manager import BatchMemoryManager
from opacus import PrivacyEngine

import torch
import torch.nn as nn
import numpy as np

from tqdm.notebook import tqdm
from torch.optim import SGD
from torch.utils.data import DataLoader

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

from sklearn.metrics import accuracy_score

2023-11-15 09:07:35.516306: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-15 09:07:35.699112: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-15 09:07:35.705192: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/lib64:/opt/software/Java/1.8.0_152/lib:/opt/software/Python/3.

In [3]:
model_name = "prajjwal1/bert-tiny"
EPOCHS = 45
BATCH_SIZE = 2048
LR = 0.001

In [4]:
# Prepare data
dataset = load_dataset("glue", "mnli")
num_labels = dataset["train"].features["label"].num_classes

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
tokenized_dataset = dataset.map(
    lambda example: tokenizer(
        example["premise"],
        example["hypothesis"],
        max_length=128,
        padding='max_length',
        truncation=True
    ),
    batched=True
)

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

In [7]:
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

tokenized_dataset = tokenized_dataset.remove_columns(['idx'])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9847
    })
})

In [9]:
train_dataloader = DataLoader(tokenized_dataset["train"], shuffle=False, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(tokenized_dataset["validation_matched"], shuffle=False, batch_size=BATCH_SIZE)

In [10]:
EPSILON = np.inf
DELTA = 1
MAX_GRAD_NORM = 0.5
MAX_PHYSICAL_BATCH_SIZE = int(BATCH_SIZE/4)

In [11]:
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels

model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, 
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules = ['query', 'key', 'value'],
)

if peft_config is not None:
    model = get_peft_model(model, peft_config)
    model.register_full_backward_hook(True)

device = torch.device("cuda:1")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 128, padding_idx=0)
          (position_embeddings): Embedding(512, 128)
          (token_type_embeddings): Embedding(2, 128)
          (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-1): 2 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): Linear(
                    in_features=128, out_features=128, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=128, out_fe

In [12]:
errors = ModuleValidator.validate(model, strict=False)
print(errors)

[]


In [13]:
optimizer = SGD(params=model.parameters(), lr=LR)

In [14]:
privacy_engine = PrivacyEngine(accountant="rdp")

model, optimizer, train_dataloader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=train_dataloader,
    epochs=EPOCHS,
    target_epsilon=EPSILON,
    target_delta=DELTA,
    max_grad_norm=MAX_GRAD_NORM,
    batch_first=True,
)

In [15]:
print(f"Using Sigma = {optimizer.noise_multiplier:.3f} | C = {optimizer.max_grad_norm} | Initial DP (ε, δ) = ({privacy_engine.get_epsilon(DELTA)}, {DELTA})")

Using Sigma = 10.000 | C = 0.5 | Initial DP (ε, δ) = (0, 1)


In [16]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"Trainable Parameters: {trainable_params} || All Parameters: {all_param} || Trainable Parameters (%): {100 * trainable_params / all_param:.2f}"
    )

print_trainable_parameters(model)

Trainable Parameters: 12675 || All Parameters: 4398982 || Trainable Parameters (%): 0.29


In [17]:
def train(model, train_dataloader, optimizer, epoch, device):
    model.train()
    criterion = nn.CrossEntropyLoss()

    losses = []
    epsilon = []

    with BatchMemoryManager(
        data_loader=train_dataloader, 
        max_physical_batch_size=MAX_PHYSICAL_BATCH_SIZE,
        optimizer=optimizer,
        ) as memory_safe_data_loader:

        for i, batch in tqdm(enumerate(memory_safe_data_loader), total=len(memory_safe_data_loader), desc=f"Training Epoch: {epoch}"):
            
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()

            outputs = model(**batch)
            loss = criterion(outputs.logits, batch["labels"])
            loss.backward()

            optimizer.step()
            losses.append(loss.item())

            if i % 8000 == 0:
                epsilon = privacy_engine.get_epsilon(DELTA)

                print(f"Training Epoch: {epoch} | Loss: {np.mean(losses):.6f} | ε = {epsilon:.2f}")                    

In [18]:
def test(model, test_dataloader, device):
    model.eval()
    criterion = nn.CrossEntropyLoss()

    losses = []
    accuracies = []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Test"):
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = criterion(outputs.logits, batch["labels"])

            preds = outputs.logits.argmax(dim=-1)
            acc = accuracy_score(preds.cpu().numpy(), batch["labels"].cpu().numpy())

            losses.append(loss.item())
            accuracies.append(acc.item())

    acc = np.mean(accuracies)
    loss = np.mean(losses)

    print(
        f"Test set: Loss: {loss:.4f}, Accuracy: {acc*100:.2f}%"
    )

    return loss, acc

In [19]:
for epoch in tqdm(range(EPOCHS), desc=f'Training {EPOCHS} Epochs'):
    train(model, train_dataloader, optimizer, epoch + 1, device)

Training 45 Epochs:   0%|          | 0/45 [00:00<?, ?it/s]

Training Epoch: 1:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 1 | Loss: 1.098443 | ε = 0.00


Training Epoch: 2:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 2 | Loss: 1.120675 | ε = -3.35


Training Epoch: 3:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 3 | Loss: 1.100275 | ε = -3.35


Training Epoch: 4:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 4 | Loss: 1.092477 | ε = -3.35


Training Epoch: 5:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 5 | Loss: 1.104423 | ε = -3.35


Training Epoch: 6:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 6 | Loss: 1.098280 | ε = -3.35


Training Epoch: 7:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 7 | Loss: 1.096994 | ε = -3.35


Training Epoch: 8:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 8 | Loss: 1.096900 | ε = -3.35


Training Epoch: 9:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 9 | Loss: 1.100197 | ε = -3.35


Training Epoch: 10:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 10 | Loss: 1.107652 | ε = -3.35


Training Epoch: 11:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 11 | Loss: 1.100420 | ε = -3.35


Training Epoch: 12:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 12 | Loss: 1.098624 | ε = -3.35


Training Epoch: 13:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 13 | Loss: 1.098265 | ε = -3.35


Training Epoch: 14:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 14 | Loss: 1.097957 | ε = -3.35


Training Epoch: 15:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 15 | Loss: 1.101021 | ε = -3.35


Training Epoch: 16:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 16 | Loss: 1.104890 | ε = -3.35


Training Epoch: 17:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 17 | Loss: 1.103460 | ε = -3.35


Training Epoch: 18:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 18 | Loss: 1.092734 | ε = -3.35


Training Epoch: 19:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 19 | Loss: 1.102022 | ε = -3.35


Training Epoch: 20:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 20 | Loss: 1.097937 | ε = -3.35


Training Epoch: 21:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 21 | Loss: 1.098551 | ε = -3.35


Training Epoch: 22:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 22 | Loss: 1.101954 | ε = -3.35


Training Epoch: 23:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 23 | Loss: 1.098678 | ε = -3.35


Training Epoch: 24:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 24 | Loss: 1.111339 | ε = -3.35


Training Epoch: 25:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 25 | Loss: 1.104998 | ε = -3.35


Training Epoch: 26:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 26 | Loss: 1.100029 | ε = -3.35


Training Epoch: 27:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 27 | Loss: 1.096877 | ε = -3.35


Training Epoch: 28:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 28 | Loss: 1.098598 | ε = -3.35


Training Epoch: 29:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 29 | Loss: 1.110691 | ε = -3.35


Training Epoch: 30:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 30 | Loss: 1.100397 | ε = -3.35


Training Epoch: 31:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 31 | Loss: 1.100382 | ε = -3.35


Training Epoch: 32:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 32 | Loss: 1.098130 | ε = -3.35


Training Epoch: 33:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 33 | Loss: 1.101494 | ε = -3.35


Training Epoch: 34:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 34 | Loss: 1.098727 | ε = -3.35


Training Epoch: 35:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 35 | Loss: 1.093035 | ε = -3.35


Training Epoch: 36:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 36 | Loss: 1.105560 | ε = -3.35


Training Epoch: 37:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 37 | Loss: 1.100258 | ε = -3.35


Training Epoch: 38:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 38 | Loss: 1.103484 | ε = -3.35


Training Epoch: 39:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 39 | Loss: 1.098170 | ε = -3.35


Training Epoch: 40:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 40 | Loss: 1.105048 | ε = -3.35


Training Epoch: 41:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 41 | Loss: 1.097649 | ε = -3.35


Training Epoch: 42:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 42 | Loss: 1.103298 | ε = -3.35


Training Epoch: 43:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 43 | Loss: 1.097494 | ε = -3.35


Training Epoch: 44:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 44 | Loss: 1.095891 | ε = -3.35


Training Epoch: 45:   0%|          | 0/766 [00:00<?, ?it/s]

Training Epoch: 45 | Loss: 1.097350 | ε = -3.35


In [20]:
final_epsilon = privacy_engine.get_epsilon(DELTA)
print(f"Final DP Guarantee (ε, δ)-DP = ({final_epsilon:.2f}, {DELTA})")

Final DP Guarantee (ε, δ)-DP = (-3.35, 1)


In [21]:
test(model, test_dataloader, device)

Test:   0%|          | 0/5 [00:00<?, ?it/s]

Test set: Loss: 1.0983, Accuracy: 34.35%


(1.0983268737792968, 0.34345005150569935)