In [1]:
!pip install git+https://github.com/awslabs/fast-differential-privacy.git

Collecting git+https://github.com/awslabs/fast-differential-privacy.git
  Cloning https://github.com/awslabs/fast-differential-privacy.git to /tmp/pip-req-build-zcl4ykao
  Running command git clone --filter=blob:none --quiet https://github.com/awslabs/fast-differential-privacy.git /tmp/pip-req-build-zcl4ykao
  Resolved https://github.com/awslabs/fast-differential-privacy.git to commit af783b348e82516f7565802cf1144a8be95c69a5
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: fastDP
  Building wheel for fastDP (setup.py) ... [?25ldone
[?25h  Created wheel for fastDP: filename=fastDP-2.0.0-py3-none-any.whl size=133222 sha256=5c999f8c0a8da5ffe7e87f7e6dc80335b9c3e8ed3733c3ed68655224a2bfe65a
  Stored in directory: /tmp/pip-ephem-wheel-cache-zlq3mxs_/wheels/6a/32/f6/19f858522f7d03c4c8d5cabcf543389f86a227589eea066737
Successfully built fastDP
Installing collected packages: fastDP
Successfully installed fastDP-2.0.0

[1m[[0m[34;49mnotice[0m[1;39

In [2]:
!pip install transformers torch scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Found credentials from IAM Role: cld_2xaislbg524y9urhb7nvm96bpl-cluster-node-role


In [4]:
from fastDP import PrivacyEngine
import transformers, torch
import pandas as pd

In [5]:
# Absolute location of the balanced SST-2 sentence sample on the cluster node.
path = "/home/ray/default/sst2_balanced_sentences.csv"

# Load the CSV; later cells read the `sentence` and `label` columns from `df`.
df = pd.read_csv(filepath_or_buffer=path)

# Report (rows, columns), then leave the first five rows as the cell's output.
print(df.shape)
df.head(5)

(2014, 3)


Unnamed: 0,idx,sentence,label
0,55232,carried less by wow factors than by its funny ...,1
1,1862,sorry use of aaliyah in her one and only starr...,0
2,46429,"the obnoxious special effects , the obligatory...",0
3,52913,like the best of godard 's movies ... it is vi...,1
4,58934,"if you are willing to do this , then you so cr...",0


## SST-2: DP fine-tuning and synthetic sentence generation

In [6]:
import torch
import torch.nn.functional as F
from torch.optim import Adam
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from fastDP import PrivacyEngine  # Ensure fastDP is installed

# Run on the GPU when one is visible to torch, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load pretrained distilgpt2 weights and place the model on the chosen device
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model = model.to(device)

# Matching tokenizer for the same checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

# Batches are padded to a fixed length, so the tokenizer needs a pad token.
# When it has none, register '[PAD]' and resize the embedding matrix to the
# new vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Optimizer is created after the resize so it tracks the final parameter set
optimizer = Adam(params=model.parameters(), lr=3e-3)

# Dataset of label-conditioned text for causal language-model fine-tuning
class MedicalSpecialtyDataset(Dataset):
    """Turns (label, text) pairs into fixed-length causal-LM training examples.

    Each item is the string ``"[BOS]{label}[SEP]{text}"`` tokenized, truncated
    and padded to ``max_length``. ``[BOS]`` and ``[SEP]`` are written as literal
    text here; they only act as special tokens if the supplied tokenizer has
    them registered as such.

    Args:
        texts: Indexable sequence of input sentences.
        labels: Indexable sequence of labels, aligned with ``texts``.
        tokenizer: Tokenizer exposing ``encode_plus`` that returns
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of (label, text) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, target_ids)`` as 1-D tensors.

        ``target_ids`` is a copy of ``input_ids`` with every padded position
        set to -100 so those positions are ignored by the loss.
        """
        label_prefix = f"{self.labels[idx]}[SEP]"
        text = f"[BOS]{label_prefix}{self.texts[idx]}"

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # Targets are a plain copy of the inputs: no manual shift is applied
        # here (Hugging Face causal-LM heads shift `labels` internally).
        target_ids = input_ids.clone()
        # Mask padding via the attention mask rather than by comparing against
        # pad_token_id, so a real token that happens to share the pad id
        # (e.g. when pad == eos) still contributes to the loss.
        target_ids[attention_mask == 0] = -100

        return input_ids, attention_mask, target_ids


# Run configuration. PrivacyEngine is given the batch size and epoch count
# together with the target epsilon, so the same constants must also drive the
# DataLoader and the training loop; defining them once keeps them in sync.
BATCH_SIZE = 64
EPOCHS = 5
TARGET_EPSILON = 16  # change between 3, 8, 16
SPLIT_SEED = 42      # fixes the train/validation split across re-runs

# Extract texts and labels from the dataframe
texts = df['sentence'].tolist()
labels = df['label'].tolist()

# Split data into training and validation sets (5% held out, seeded)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.05, random_state=SPLIT_SEED
)

# Create Datasets and DataLoaders
train_dataset = MedicalSpecialtyDataset(train_texts, train_labels, tokenizer)
val_dataset = MedicalSpecialtyDataset(val_texts, val_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Define the PrivacyEngine
privacy_engine = PrivacyEngine(
    model,
    batch_size=BATCH_SIZE,
    sample_size=len(train_dataset),
    epochs=EPOCHS,
    target_epsilon=TARGET_EPSILON,
    clipping_fn='automatic',
    clipping_mode='MixOpt',
    origin_params=None,
    clipping_style='all-layer',
)

# Attach the PrivacyEngine to the optimizer
privacy_engine.attach(optimizer)

# Training loop: one optimizer step per mini-batch
for epoch in range(EPOCHS):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, target_ids = [x.to(device) for x in batch]

        # Passing labels makes the model return the language-modelling loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=target_ids)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1} completed.")

# Detach the privacy engine now that DP training is finished
# (nothing is written to disk here).
privacy_engine.detach()

# Function to generate one label-conditioned synthetic sentence
def generate_synthetic_sentence(model, tokenizer, label, prompt="", max_length=80):
    """Sample one sentence from `model`, conditioned on `label`.

    The text fed to the model is ``"[BOS]{label}[SEP]{prompt}"``.

    Args:
        model: Model exposing ``eval()``, ``generate(...)`` and a ``device``
            attribute (Hugging Face pretrained models provide all three).
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``pad_token_id``.
        label: Label to condition on; formatted into the prefix with ``str()``
            semantics.
        prompt: Optional text appended after the ``[SEP]`` marker. Defaults to
            the empty string, so callers may condition on the label alone.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded string for the first returned sequence.
        NOTE(review): the decoded text presumably still starts with the
        ``[BOS]{label}[SEP]`` prefix, because those markers are plain text
        unless registered as special tokens; confirm and strip downstream
        if needed.
    """
    model.eval()
    conditioning_text = f"[BOS]{label}[SEP]{prompt}"
    # Place the inputs on the model's own device instead of relying on a
    # notebook-level global.
    input_ids = tokenizer.encode(conditioning_text, return_tensors='pt').to(model.device)
    output = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        no_repeat_ngram_size=4,
        do_sample=True,
        top_k=100,   #change between 50,100
        top_p=0.95,
        temperature=1.0 #change between 0.6, 0.8, 1.0, 1.2 
    )
    generated_sentence = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_sentence

# Generate one synthetic sentence per dataframe row, conditioned on that row's
# label. An empty prompt is passed explicitly so generation starts right after
# the "[BOS]{label}[SEP]" prefix.
synthetic_sentences = [
    generate_synthetic_sentence(model, tokenizer, specialty, prompt="", max_length=80)
    for specialty in df['label'].tolist()
]

# Append the synthetic sentences to the dataframe
df['synthetic_sentence_dp_eps16_temp1_topk100'] = synthetic_sentences

# Save the dataframe with the synthetic sentences; the path is defined once so
# the confirmation message always names the file that was actually written.
output_path = 'synthetic_data_eps16.csv'
df.to_csv(output_path, index=False)

print(f"Synthetic sentences generated and saved to {output_path}")

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Number of trainable components:  77 ; Number of trainable layers:  40
>>>>>>>>>>>>>>>>> Applying  automatic  per-sample gradient clipping.
>>>>>>>>>>>>>>>>> Block heads for per-sample gradient clipping are defined as: ['transformer.wte']




Epoch 1 completed.
Epoch 2 completed.
Epoch 3 completed.
Epoch 4 completed.
Epoch 5 completed.
Synthetic sentences generated and saved to synthetic_data_bossep.csv


In [8]:
import os

# Define the directory to save the model and tokenizer
save_directory = "./trained_model_with_classifier_distilgpt2_sst2_eps16"

# Create the directory (and any missing parents); a no-op when it already
# exists, with no check-then-create race.
os.makedirs(save_directory, exist_ok=True)

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to ./trained_model_with_classifier_distilgpt2_sst2_eps16
