# Differentially Private Synthetic Tabular Data Generation

## Using DP-SGD with Variational Autoencoder

**Dataset:** Synthetic Telemetry Data (May-July 2024)  
**Objective:** Generate differentially private synthetic data while preserving statistical utility  
**Method:** DP-SGD applied to VAE training

---


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from opacus import PrivacyEngine
from opacus.utils.batch_memory_manager import BatchMemoryManager
from opacus.validators import ModuleValidator

# Plot appearance shared by every figure below.
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette("husl")

# Seed the NumPy and PyTorch random number generators with the same value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Record the runtime environment in the notebook output.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 1. Loading


In [None]:
# Read the raw telemetry CSV and report its dimensions.
DATA_PATH = "data/synthetic_telemetry_data.csv"
df_raw = pd.read_csv(DATA_PATH)
n_records = len(df_raw)
print(f"Shape: {df_raw.shape}, {n_records:,} records\n")
# Last expression of the cell: shows the first ten rows.
df_raw.head(10)

## 2. EDA


In [None]:
# Frequency of each category in the two categorical columns.
product_counts = df_raw["Product Type"].value_counts()
event_counts = df_raw["Event Type"].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# One bar chart per column: (counts, bar colour, title).
panels = [
    (product_counts, "skyblue", "Product Type Distribution"),
    (event_counts, "coral", "Event Type Distribution"),
]
for ax, (counts, bar_color, title) in zip(axes, panels):
    ax.bar(counts.index, counts.values, color=bar_color)
    ax.set_title(title)
    ax.grid(axis="y", alpha=0.3)
plt.tight_layout()

In [None]:
# Parse the event timestamp once and derive calendar features from it.
event_time = pd.to_datetime(df_raw["Time of Event"])
df_raw["Time of Event"] = event_time
df_raw["Date"] = event_time.dt.date
df_raw["Hour"] = event_time.dt.hour
df_raw["DayOfWeek"] = event_time.dt.day_name()

# Event counts per calendar day, per hour of day, and per weekday.
daily_counts = df_raw.groupby("Date").size()
hourly_counts = df_raw["Hour"].value_counts().sort_index()
day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
# reindex puts the weekdays in calendar order instead of frequency order.
dow_counts = df_raw["DayOfWeek"].value_counts().reindex(day_order)

fig, axes = plt.subplots(1, 3, figsize=(16, 4))
ax_daily, ax_hourly, ax_dow = axes

ax_daily.plot(daily_counts.index, daily_counts.values)
ax_daily.set(title="Events per Day", xlabel="Date", ylabel="Events")
ax_daily.tick_params(axis="x", rotation=45)

ax_hourly.bar(hourly_counts.index, hourly_counts.values, color="green")
ax_hourly.set(title="Events by Hour", xlabel="Hour", ylabel="Count")

ax_dow.bar(dow_counts.index, dow_counts.values, color="purple")
ax_dow.set(title="Events by Day of Week", xlabel="Day", ylabel="Count")
ax_dow.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Contingency table of product type against event type, including totals.
crosstab = pd.crosstab(df_raw["Product Type"], df_raw["Event Type"], margins=True)

# Strip the last row and column (the margins) so only category cells are drawn.
cell_counts = crosstab.iloc[:-1, :-1]

heatmap_options = {
    "annot": True,
    "fmt": "d",
    "cmap": "YlOrRd",
    "linewidths": 0.5,
    "cbar_kws": {"label": "Count"},
}

plt.figure(figsize=(10, 6))
sns.heatmap(cell_counts, **heatmap_options)
plt.title("Product Type × Event Type Heatmap")
plt.xlabel("Event Type")
plt.ylabel("Product Type")
plt.tight_layout()
plt.show()

## 3. Data Preprocessing

We prepare the data for the VAE by:

1. Dropping the `User ID` column (not needed for modeling)
2. Converting timestamps to Unix seconds for numerical representation
3. One-hot encoding categorical variables
4. Standardizing numerical features


In [None]:
# Work on a copy so the EDA frame stays untouched.
df = df_raw.copy()

# errors="ignore" keeps this cell re-runnable when the column is already gone.
df = df.drop(columns=["User ID"], errors="ignore")

# Seconds since the Unix epoch. Subtracting the epoch and floor-dividing by
# one second gives the same answer whatever resolution the datetime column is
# stored in; casting to int64 and dividing by 1e9 is only right for
# nanosecond-resolution data.
event_time = pd.to_datetime(df["Time of Event"])
unix_epoch = pd.Timestamp(
    "1970-01-01", tz="UTC" if event_time.dt.tz is not None else None
)
df["TimeSeconds"] = (event_time - unix_epoch) // pd.Timedelta(seconds=1)
# Keep the untransformed timestamp alongside the numeric encoding.
df["TimeOriginal"] = df["Time of Event"]

# Column groups consumed by the ColumnTransformer in the next cell.
categorical_cols = ["Product Type", "Event Type"]
numeric_cols = ["TimeSeconds"]

df.head()

In [None]:
# Per-column-group preprocessing steps.
one_hot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
standardize = StandardScaler()

# Columns not listed in either group are discarded (remainder="drop").
transformer = ColumnTransformer(
    transformers=[
        ("cat", one_hot, categorical_cols),
        ("num", standardize, numeric_cols),
    ],
    remainder="drop",
)

# NOTE(review): the transformer is fitted on the full frame, before the
# train/test split performed in the next cell — confirm this is intended.
feature_columns = categorical_cols + numeric_cols
X_transformed = transformer.fit_transform(df[feature_columns])

In [None]:
# Hold out 20% of the rows; the fixed random_state pins the shuffle.
X_train, X_test = train_test_split(
    X_transformed,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, X_test.shape, sep=", ")

## 4. Dataset and DataLoader Setup


In [None]:
class TelemetryDataset(Dataset):
    """Dataset over a feature matrix held as a single float32 tensor.

    Items are feature rows only; no label is returned.
    """

    def __init__(self, X):
        # torch.tensor copies the input, so later changes to ``X`` do not
        # affect the dataset.
        self.X = torch.tensor(X, dtype=torch.float32)

    def __len__(self) -> int:
        """Return the size of the first dimension of the stored tensor."""
        return self.X.size(0)

    def __getitem__(self, idx):
        """Return the entry of the stored tensor selected by ``idx``."""
        row = self.X[idx]
        return row


# Wrap each split in a TelemetryDataset so a DataLoader can batch it.
train_dataset, test_dataset = (
    TelemetryDataset(split) for split in (X_train, X_test)
)

## 5. Model Architecture: Variational Autoencoder (VAE)

The VAE consists of:

- **Encoder**: Maps input data to latent distribution parameters (μ, log σ²)
- **Latent space**: Lower-dimensional representation (default: 8 dimensions)
- **Decoder**: Reconstructs data from latent samples

For DP-SGD, we apply:

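- **Gradient clipping**: Rescales each per-sample gradient so that its L2 norm is at most the clipping threshold C
- **Gaussian noise**: Added to the sum of the clipped per-sample gradients before each optimizer step, with standard deviation σ · C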


In [None]:
class VAE(nn.Module):
    """Fully connected variational autoencoder.

    The encoder maps an input row to the mean and log-variance of a diagonal
    Gaussian over the latent space; the decoder maps a latent sample back to
    input space. The decoder mirrors the encoder's hidden widths in reverse.

    Args:
        input_dim: Number of features per input row.
        latent_dim: Size of the latent vector.
        hidden_dims: Widths of the encoder's hidden layers, in order. Any
            sequence is accepted, including an empty one, in which case the
            encoder is the identity and the decoder is a single linear layer.

    NOTE(review): BatchNorm1d normalises across the batch; the notebook calls
    ``ModuleValidator.fix`` on the model before making it private — confirm
    that this replaces these layers as intended.
    """

    # The default is an immutable tuple so it cannot be shared and mutated
    # across instances the way a list default can.
    def __init__(self, input_dim: int, latent_dim: int = 8, hidden_dims=(64, 32)):
        super().__init__()

        hidden_dims = list(hidden_dims)
        self.input_dim = input_dim
        self.latent_dim = latent_dim

        # Layer creation order (encoder, heads, decoder) is kept stable so
        # that seeded parameter initialisation stays reproducible.
        encoder_layers, encoder_out_dim = self._hidden_stack(input_dim, hidden_dims)
        self.encoder = nn.Sequential(*encoder_layers)
        self.fc_mu = nn.Linear(encoder_out_dim, latent_dim)
        self.fc_logvar = nn.Linear(encoder_out_dim, latent_dim)

        decoder_layers, decoder_out_dim = self._hidden_stack(
            latent_dim, hidden_dims[::-1]
        )
        # Final projection back to input space; no activation is applied.
        decoder_layers.append(nn.Linear(decoder_out_dim, input_dim))
        self.decoder = nn.Sequential(*decoder_layers)

    @staticmethod
    def _hidden_stack(in_dim, widths):
        """Build Linear -> ReLU -> BatchNorm1d triples; return (layers, out_dim)."""
        layers = []
        prev_dim = in_dim
        for width in widths:
            layers.extend(
                [nn.Linear(prev_dim, width), nn.ReLU(), nn.BatchNorm1d(width)]
            )
            prev_dim = width
        return layers, prev_dim

    def encode(self, x):
        """Return ``(mu, logvar)`` of the latent distribution for ``x``."""
        h = self.encoder(x)
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * std`` with ``eps`` drawn from a standard normal."""
        # logvar is log(sigma^2), so exp(0.5 * logvar) is sigma.
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent vectors ``z`` back to input space."""
        return self.decoder(z)

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        recon = self.decode(z)
        return recon, mu, logvar


# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# The network's input width is the number of transformed feature columns.
input_dim = X_train.shape[1]
model = VAE(input_dim=input_dim, latent_dim=8, hidden_dims=[64, 32])
model = model.to(device)

# Last expression of the cell: shows the module structure.
model

In [None]:
def vae_loss_function(recon_x, x, mu, logvar, beta=1.0):
    """Return ``(total, reconstruction, KL)`` losses for one batch.

    All three values are summed over every element of the batch, not
    averaged. ``beta`` scales the KL term inside the total.
    """
    F = nn.functional

    # Summed squared error between reconstruction and input.
    reconstruction_term = F.mse_loss(recon_x, x, reduction="sum")

    # KL divergence between N(mu, exp(logvar)) and a standard normal.
    kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * kl_elements.sum()

    combined = reconstruction_term + beta * kl_term
    return combined, reconstruction_term, kl_term

## 6. Differential Privacy Setup with Opacus

**Privacy Parameters:**
- **ε (epsilon)**: Privacy budget (lower = stronger privacy)
- **δ (delta)**: Probability that the ε guarantee fails to hold (should be much smaller than 1/n, where n is the number of training records)
- **Max grad norm (C)**: Clipping threshold for gradients
- **Noise multiplier (σ)**: Ratio of the Gaussian noise standard deviation to the clipping threshold C (noise std = σ · C)

We'll track privacy budget using Opacus's privacy accountant.

In [None]:
# Privacy budget the run aims for; printed next to the final ε after training.
TARGET_EPSILON = 8.0
# δ of the (ε, δ) guarantee; passed to the accountant whenever ε is queried.
TARGET_DELTA = 1e-5
# Gradient clipping threshold handed to the Opacus privacy engine.
MAX_GRAD_NORM = 1.0
# Batch size used for both the training and the test DataLoader.
BATCH_SIZE = 256
# Number of passes over the training loader.
EPOCHS = 50

In [None]:
# Let Opacus rewrite any layers it flags, then move the result to the
# training device.
model = ModuleValidator.fix(model).to(device)

In [None]:
# Pinned host memory is only requested when a CUDA device is present.
use_pinned_memory = torch.cuda.is_available()

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    pin_memory=use_pinned_memory,
)

# Evaluation keeps the stored order (no shuffling).
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

# Created after ModuleValidator.fix so it holds the final parameter set.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
privacy_engine = PrivacyEngine()

# Calibrate the noise multiplier from the declared budget instead of fixing
# it by hand, so that EPOCHS of training end at (TARGET_EPSILON, TARGET_DELTA).
# loss_reduction="sum" matches vae_loss_function, which sums the loss over the
# batch rather than averaging it; Opacus uses this setting to scale the
# per-sample gradients correctly before clipping.
model, optimizer, train_loader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    target_epsilon=TARGET_EPSILON,
    target_delta=TARGET_DELTA,
    epochs=EPOCHS,
    max_grad_norm=MAX_GRAD_NORM,
    loss_reduction="sum",
)

# Report the noise level that the calibration settled on.
print(f"Calibrated noise multiplier: {optimizer.noise_multiplier:.3f}")

## 7. Training Loop with Privacy Tracking

In [None]:
def train_epoch(model, train_loader, optimizer, device, beta=1.0):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module whose forward returns ``(recon, mu, logvar)``.
        train_loader: Iterable of input batches.
        optimizer: Optimizer stepped once per batch.
        device: Device each batch is moved to before the forward pass.
        beta: KL weight forwarded to ``vae_loss_function``.

    Returns:
        Tuple ``(total, reconstruction, KL)`` where each entry is the mean,
        over batches, of the value returned by ``vae_loss_function``.

    Raises:
        ZeroDivisionError: If ``train_loader`` yields no batches.
    """
    model.train()

    # Running sums in the order (total, reconstruction, KL).
    running = [0.0, 0.0, 0.0]
    n_batches = 0

    for n_batches, batch in enumerate(train_loader, start=1):
        batch = batch.to(device)

        optimizer.zero_grad()
        recon_batch, mu, logvar = model(batch)
        batch_losses = vae_loss_function(recon_batch, batch, mu, logvar, beta)

        # Only the total loss (first entry) is back-propagated.
        batch_losses[0].backward()
        optimizer.step()

        for slot, term in enumerate(batch_losses):
            running[slot] += term.item()

    return tuple(total / n_batches for total in running)


def evaluate(model, test_loader, device, beta=1.0):
    """Compute the losses over ``test_loader`` without updating the model.

    The model is put in eval mode and gradients are disabled for the pass.

    Args:
        model: Module whose forward returns ``(recon, mu, logvar)``.
        test_loader: Iterable of input batches.
        device: Device each batch is moved to before the forward pass.
        beta: KL weight forwarded to ``vae_loss_function``.

    Returns:
        Tuple ``(total, reconstruction, KL)`` where each entry is the mean,
        over batches, of the value returned by ``vae_loss_function``.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no batches.
    """
    model.eval()

    loss_sum, recon_sum, kl_sum = 0, 0, 0
    batches_seen = 0

    with torch.no_grad():
        for held_out in test_loader:
            held_out = held_out.to(device)
            reconstruction, mu, logvar = model(held_out)
            batch_total, batch_recon, batch_kl = vae_loss_function(
                reconstruction, held_out, mu, logvar, beta
            )

            loss_sum += batch_total.item()
            recon_sum += batch_recon.item()
            kl_sum += batch_kl.item()
            batches_seen += 1

    return tuple(
        total / batches_seen for total in (loss_sum, recon_sum, kl_sum)
    )

In [None]:
# Per-epoch metric history: "<split>_<metric>" lists plus the ε spent so far.
metric_names = ("loss", "recon", "kl")
history = {
    f"{split}_{name}": []
    for split in ("train", "test")
    for name in metric_names
}
history["epsilon"] = []

for epoch in range(1, EPOCHS + 1):
    train_metrics = train_epoch(model, train_loader, optimizer, device)
    test_metrics = evaluate(model, test_loader, device)
    train_loss, train_recon, train_kl = train_metrics
    test_loss, test_recon, test_kl = test_metrics

    # ε reported by the privacy accountant after this epoch.
    epsilon = privacy_engine.get_epsilon(delta=TARGET_DELTA)

    for split, metrics in (("train", train_metrics), ("test", test_metrics)):
        for name, value in zip(metric_names, metrics):
            history[f"{split}_{name}"].append(value)
    history["epsilon"].append(epsilon)

    # Log the first epoch and every fifth one after it.
    if epoch == 1 or epoch % 5 == 0:
        fields = [
            f"Epoch {epoch:3d}/{EPOCHS}",
            f"Train loss: {train_loss:8.2f}",
            f"Test loss: {test_loss:8.2f}",
            f"ε: {epsilon:.2f}",
        ]
        print(" | ".join(fields))

final_epsilon = history["epsilon"][-1]
print(f"Final ε: {final_epsilon:.4f} (target: {TARGET_EPSILON})")
print(f"Final δ: {TARGET_DELTA}")