In [1]:
import argparse
from dataclasses import dataclass
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
import torch
from opacus import PrivacyEngine
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from torch import nn
from torch.utils.data import DataLoader, Dataset

parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=Path, default=Path("../data/telemetry.csv"))
parser.add_argument("--output-path", type=Path, default=Path("../data/synthetic.csv"))
parser.add_argument("--batch-size", type=int, default=256)
parser.add_argument("--latent-dim", type=int, default=16)
parser.add_argument("--epochs", type=int, default=20)
parser.add_argument("--noise-multiplier", type=float, default=1.2)
parser.add_argument("--max-grad-norm", type=float, default=1.0)
parser.add_argument("--target-epsilon", type=float, default=2.0)
parser.add_argument("--delta", type=float, default=1e-5)
parser.add_argument("--synth-samples", type=int, default=10000)
parser.add_argument("--latent-jitter", type=float, default=0.1)
args, _ = parser.parse_known_args()


@dataclass
class Config:
    data_path: Path
    output_path: Path
    batch_size: int
    latent_dim: int
    epochs: int
    noise_multiplier: float
    max_grad_norm: float
    target_epsilon: float
    delta: float
    synth_samples: int
    latent_jitter: float


cfg = Config(**vars(args))

torch.manual_seed(0)
np.random.seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cfg, device

(Config(data_path=WindowsPath('../data/telemetry.csv'), output_path=WindowsPath('../data/synthetic.csv'), batch_size=256, latent_dim=16, epochs=20, noise_multiplier=1.2, max_grad_norm=1.0, target_epsilon=2.0, delta=1e-05, synth_samples=10000, latent_jitter=0.1),
 device(type='cpu'))

In [2]:
df = pd.read_csv(cfg.data_path)
if "User ID" in df.columns:
    df = df.drop(columns=["User ID"])

df["TimeSeconds"] = pd.to_datetime(df["Time of Event"]).astype("int64") // 1_000_000_000

categorical_cols = ["Product Type", "Event Type"]
numeric_cols = ["TimeSeconds"]

time_min, time_max = df["TimeSeconds"].min(), df["TimeSeconds"].max()
df.head(), time_min, time_max

(  Product Type Event Type        Time of Event  TimeSeconds
 0            C       open  2024-05-14 07:34:33   1715672073
 1            E      close  2024-06-17 14:43:26   1718635406
 2            C      close  2024-07-13 05:20:43   1720848043
 3            D       open  2024-06-11 17:39:05   1718127545
 4            C       save  2024-06-23 18:20:35   1719166835,
 np.int64(1714521637),
 np.int64(1722383984))

In [3]:
transformer = ColumnTransformer(
    transformers=[
        (
            "categorical",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_cols,
        ),
        ("numeric", StandardScaler(), numeric_cols),
    ]
)

X_real = transformer.fit_transform(df[categorical_cols + numeric_cols])
cat_encoder: OneHotEncoder = transformer.named_transformers_["categorical"]
cat_sizes: List[int] = [len(c) for c in cat_encoder.categories_]
cat_total = sum(cat_sizes)
num_scaler: StandardScaler = transformer.named_transformers_["numeric"]

X_real.shape, cat_sizes, cat_total

((152356, 13), [7, 5], 12)

In [4]:
cat_targets_list = []
start = 0
for size in cat_sizes:
    block = X_real[:, start:start + size]
    cat_targets_list.append(block.argmax(axis=1))
    start += size

cat_targets = np.stack(cat_targets_list, axis=1).astype(np.int64)
numeric_targets = X_real[:, cat_total:].astype(np.float32)

features = torch.tensor(X_real, dtype=torch.float32)
cat_targets_tensor = torch.tensor(cat_targets, dtype=torch.long)
num_targets_tensor = torch.tensor(numeric_targets, dtype=torch.float32)

class TelemetryDataset(Dataset):
    def __init__(self, features: torch.Tensor, cat_targets: torch.Tensor, num_targets: torch.Tensor):
        self.features = features
        self.cat_targets = cat_targets
        self.num_targets = num_targets

    def __len__(self) -> int:
        return self.features.size(0)

    def __getitem__(self, idx: int):
        return (
            self.features[idx],
            self.cat_targets[idx],
            self.num_targets[idx],
        )

dataset = TelemetryDataset(features, cat_targets_tensor, num_targets_tensor)
dataloader = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True, drop_last=True)

len(dataset), cfg.batch_size / len(dataset)

(152356, 0.0016802751450550027)

In [5]:
class TelemetryAutoencoder(nn.Module):
    def __init__(
        self, input_dim: int, latent_dim: int, cat_sizes: List[int], num_dim: int
    ) -> None:
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, latent_dim),
            nn.ReLU(),
        )
        self.cat_decoders = nn.ModuleList(
            [nn.Linear(latent_dim, size) for size in cat_sizes]
        )
        self.num_decoder = nn.Linear(latent_dim, num_dim)

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        return self.encoder(x)

    def decode_from_latent(self, z: torch.Tensor):
        cat_logits = [decoder(z) for decoder in self.cat_decoders]
        num_out = self.num_decoder(z)
        return cat_logits, num_out

    def forward(self, x: torch.Tensor):
        z = self.encode(x)
        cat_logits, num_out = self.decode_from_latent(z)
        return cat_logits, num_out, z


model = TelemetryAutoencoder(
    input_dim=features.shape[1],
    latent_dim=cfg.latent_dim,
    cat_sizes=cat_sizes,
    num_dim=len(numeric_cols),
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
cat_criterion = nn.CrossEntropyLoss()
num_criterion = nn.MSELoss()

model, sum(p.numel() for p in model.parameters())

(TelemetryAutoencoder(
   (encoder): Sequential(
     (0): Linear(in_features=13, out_features=128, bias=True)
     (1): ReLU()
     (2): Linear(in_features=128, out_features=16, bias=True)
     (3): ReLU()
   )
   (cat_decoders): ModuleList(
     (0): Linear(in_features=16, out_features=7, bias=True)
     (1): Linear(in_features=16, out_features=5, bias=True)
   )
   (num_decoder): Linear(in_features=16, out_features=1, bias=True)
 ),
 4077)

In [6]:
privacy_engine = PrivacyEngine()
model, optimizer, dataloader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=dataloader,
    noise_multiplier=cfg.noise_multiplier,
    max_grad_norm=cfg.max_grad_norm,
)

eps_history = []

for epoch in range(1, cfg.epochs + 1):
    model.train()
    running_loss = 0.0

    for batch_x, batch_cat_targets, batch_num_targets in dataloader:
        batch_x = batch_x.to(device)
        batch_cat_targets = batch_cat_targets.to(device)
        batch_num_targets = batch_num_targets.to(device)

        optimizer.zero_grad()
        cat_logits, num_pred, _ = model(batch_x)

        ce_loss = sum(
            cat_criterion(logits, batch_cat_targets[:, idx])
            for idx, logits in enumerate(cat_logits)
        )
        mse_loss = num_criterion(num_pred, batch_num_targets)

        loss = ce_loss + mse_loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * batch_x.size(0)

    epoch_loss = running_loss / len(dataset)
    try:
        epsilon = privacy_engine.accountant.get_epsilon(delta=cfg.delta)
    except ValueError:
        epsilon = float("nan")

    eps_history.append(epsilon)
    print(f"Epoch {epoch:02d} | loss={epoch_loss:.4f} | ε={epsilon:.3f}")

    if (
        np.isfinite(cfg.target_epsilon)
        and np.isfinite(epsilon)
        and epsilon >= cfg.target_epsilon
    ):
        print(f"Stopping early: ε target {cfg.target_epsilon} reached.")
        break

eps_history

  loss.backward()


Epoch 01 | loss=1.4746 | ε=0.153
Epoch 02 | loss=0.1129 | ε=0.212
Epoch 03 | loss=0.0345 | ε=0.259
Epoch 04 | loss=0.0166 | ε=0.299
Epoch 05 | loss=0.0094 | ε=0.335
Epoch 06 | loss=0.0069 | ε=0.367
Epoch 07 | loss=0.0048 | ε=0.397
Epoch 08 | loss=0.0039 | ε=0.425
Epoch 09 | loss=0.0036 | ε=0.452
Epoch 10 | loss=0.0032 | ε=0.477
Epoch 11 | loss=0.0029 | ε=0.501
Epoch 12 | loss=0.0025 | ε=0.525
Epoch 13 | loss=0.0022 | ε=0.547
Epoch 14 | loss=0.0022 | ε=0.569
Epoch 15 | loss=0.0024 | ε=0.590
Epoch 16 | loss=0.0017 | ε=0.610
Epoch 17 | loss=0.0017 | ε=0.630
Epoch 18 | loss=0.0019 | ε=0.649
Epoch 19 | loss=0.0018 | ε=0.667
Epoch 20 | loss=0.0017 | ε=0.686


[np.float64(0.15265187735659205),
 np.float64(0.2121759108021326),
 np.float64(0.25892096574296464),
 np.float64(0.298915489685312),
 np.float64(0.33454099001918874),
 np.float64(0.3670342195364031),
 np.float64(0.3971368664034298),
 np.float64(0.4253356043685584),
 np.float64(0.4519706285660922),
 np.float64(0.47729082950611196),
 np.float64(0.5014849720727611),
 np.float64(0.5247002452613473),
 np.float64(0.5470541468930574),
 np.float64(0.5686422243069001),
 np.float64(0.5895434585435075),
 np.float64(0.6098240037126326),
 np.float64(0.6295399921744355),
 np.float64(0.6487395116032068),
 np.float64(0.6674641264861636),
 np.float64(0.6857500656724325)]

In [8]:
inference_model = model._module if hasattr(model, "_module") else model
inference_model.eval()
latent_bank = []

latent_loader = DataLoader(dataset, batch_size=1024, shuffle=False)

with torch.no_grad():
    for batch_x, _, _ in latent_loader:
        z = inference_model.encode(batch_x.to(device))
        latent_bank.append(z.cpu())

latent_bank = torch.cat(latent_bank, dim=0)

indices = np.random.choice(latent_bank.shape[0], size=cfg.synth_samples, replace=True)
base_latent = latent_bank[indices]
jitter = torch.randn_like(base_latent) * cfg.latent_jitter
synth_latent = base_latent + jitter

with torch.no_grad():
    cat_logits, num_pred = inference_model.decode_from_latent(synth_latent.to(device))

In [9]:
decoded_categories = []
for col_name, logits, labels in zip(
    categorical_cols, cat_logits, cat_encoder.categories_
):
    indices = logits.argmax(dim=1).numpy()
    decoded_categories.append(pd.Series(labels[indices], name=col_name))

cat_df = pd.concat(decoded_categories, axis=1)

num_array = num_pred.numpy()
num_vals = num_scaler.inverse_transform(num_array)
num_df = pd.DataFrame(num_vals, columns=numeric_cols)
num_df["TimeSeconds"] = num_df["TimeSeconds"].clip(time_min, time_max).astype(np.int64)
num_df["Time of Event"] = pd.to_datetime(num_df["TimeSeconds"], unit="s")

synthetic_df = pd.concat(
    [cat_df.reset_index(drop=True), num_df.reset_index(drop=True)], axis=1
)
synthetic_df = synthetic_df[categorical_cols + numeric_cols + ["Time of Event"]]
synthetic_df.to_csv(cfg.output_path, index=False)

print(f"Synthetic data saved to {cfg.output_path.resolve()}")
synthetic_df.head()

Synthetic data saved to C:\Users\jason\Desktop\code\dsc-180a-q1\data\synthetic.csv


  num_df["TimeSeconds"] = num_df["TimeSeconds"].clip(time_min, time_max).astype(np.int64)


Unnamed: 0,Product Type,Event Type,TimeSeconds,Time of Event
0,B,save,1715474176,2024-05-12 00:36:16
1,D,open,1721368448,2024-07-19 05:54:08
2,C,close,1716754176,2024-05-26 20:09:36
3,Others,close,1716259456,2024-05-21 02:44:16
4,B,close,1720302208,2024-07-06 21:43:28


In [10]:
final_epsilon = eps_history[-1] if eps_history else float("nan")
print(f"Final ε (δ={cfg.delta}): {final_epsilon:.3f}")

def summarize_time(real: pd.Series, synth: pd.Series) -> None:
    stats = {
        "mean": (real.mean(), synth.mean()),
        "median": (real.median(), synth.median()),
        "std": (real.std(), synth.std()),
        "min": (real.min(), synth.min()),
        "max": (real.max(), synth.max()),
    }
    for k, (r, s) in stats.items():
        print(f"{k:>6}: real={r:.2f} | synth={s:.2f}")

print("\n=== TimeSeconds summary ===")
summarize_time(df["TimeSeconds"], synthetic_df["TimeSeconds"])

print("\n=== Category distributions ===")
for col in categorical_cols:
    counts = pd.DataFrame({
        "real": df[col].value_counts(normalize=True),
        "synthetic": synthetic_df[col].value_counts(normalize=True),
    })
    print(f"\n{col}")
    print(counts)

Final ε (δ=1e-05): 0.686

=== TimeSeconds summary ===
  mean: real=1718462125.60 | synth=1718431902.19
median: real=1718467899.50 | synth=1718434240.00
   std: real=2265577.72 | synth=2297103.45
   min: real=1714521637.00 | synth=1714521637.00
   max: real=1722383984.00 | synth=1722383984.00

=== Category distributions ===

Product Type
                  real  synthetic
Product Type                     
A             0.151592     0.1544
B             0.251549     0.2481
C             0.186865     0.1877
D             0.200393     0.2059
E             0.010377     0.0099
F             0.098874     0.1007
Others        0.100350     0.0933

Event Type
                real  synthetic
Event Type                     
open        0.382696     0.3827
close       0.307733     0.3066
save        0.209792     0.2112
reset       0.067454     0.0680
error       0.032326     0.0315
