In [1]:
import argparse
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Sequence

import numpy as np
import pandas as pd
import torch
from opacus import PrivacyEngine
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from torch import nn
from torch.utils.data import DataLoader, Dataset

parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=Path, default=Path("../data/telemetry.csv"))
parser.add_argument("--output-path", type=Path, default=Path("../data/synthetic.csv"))
parser.add_argument("--batch-size", type=int, default=256)
parser.add_argument("--latent-dim", type=int, default=16)
parser.add_argument("--epochs", type=int, default=20)
parser.add_argument("--noise-multiplier", type=float, default=1.2)
parser.add_argument("--max-grad-norm", type=float, default=1.0)
parser.add_argument("--target-epsilon", type=float, default=2.0)
parser.add_argument("--delta", type=float, default=1e-5)
parser.add_argument("--synth-samples", type=int, default=10000)
args, _ = parser.parse_known_args()

@dataclass
class Config:
    data_path: Path
    output_path: Path
    batch_size: int
    latent_dim: int
    epochs: int
    noise_multiplier: float
    max_grad_norm: float
    target_epsilon: float
    delta: float
    synth_samples: int

cfg = Config(**vars(args))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cfg, device

(Config(data_path=WindowsPath('../data/telemetry.csv'), output_path=WindowsPath('../data/synthetic.csv'), batch_size=256, latent_dim=16, epochs=20, noise_multiplier=1.2, max_grad_norm=1.0, target_epsilon=2.0, delta=1e-05, synth_samples=10000),
 device(type='cpu'))

In [2]:
df = pd.read_csv(cfg.data_path)
if "User ID" in df.columns:
    df = df.drop(columns=["User ID"])

df["TimeSeconds"] = pd.to_datetime(df["Time of Event"]).astype("int64") // 1_000_000_000
categorical_cols = ["Product Type", "Event Type"]
numeric_cols = ["TimeSeconds"]
time_min, time_max = df["TimeSeconds"].min(), df["TimeSeconds"].max()
df.head(), time_min, time_max

(  Product Type Event Type        Time of Event  TimeSeconds
 0            C       open  2024-05-14 07:34:33   1715672073
 1            E      close  2024-06-17 14:43:26   1718635406
 2            C      close  2024-07-13 05:20:43   1720848043
 3            D       open  2024-06-11 17:39:05   1718127545
 4            C       save  2024-06-23 18:20:35   1719166835,
 np.int64(1714521637),
 np.int64(1722383984))

In [3]:
transformer = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown="ignore",
         sparse_output=False), categorical_cols),
        ("numeric", StandardScaler(), numeric_cols),
    ]
)
X_real = transformer.fit_transform(df[categorical_cols + numeric_cols])
cat_encoder: OneHotEncoder = transformer.named_transformers_["categorical"]
cat_sizes = [len(c) for c in cat_encoder.categories_]
cat_total = sum(cat_sizes)
num_scaler: StandardScaler = transformer.named_transformers_["numeric"]

X_real.shape, cat_sizes, cat_total

((152356, 13), [7, 5], 12)

In [4]:
class TelemetryDataset(Dataset):
    def __init__(self, matrix: np.ndarray):
        self.tensor = torch.tensor(matrix, dtype=torch.float32)

    def __len__(self) -> int:
        return self.tensor.shape[0]

    def __getitem__(self, idx: int) -> torch.Tensor:
        return self.tensor[idx]

dataset = TelemetryDataset(X_real)
dataloader = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True, drop_last=True)
sample_rate = cfg.batch_size / len(dataset)

len(dataset), sample_rate

(152356, 0.0016802751450550027)

In [5]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim: int, latent_dim: int) -> None:
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
            nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        z = self.encoder(x)
        return self.decoder(z)

model = Autoencoder(X_real.shape[1], cfg.latent_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

model, sum(p.numel() for p in model.parameters())

(Autoencoder(
   (encoder): Sequential(
     (0): Linear(in_features=13, out_features=128, bias=True)
     (1): ReLU()
     (2): Linear(in_features=128, out_features=16, bias=True)
   )
   (decoder): Sequential(
     (0): Linear(in_features=16, out_features=128, bias=True)
     (1): ReLU()
     (2): Linear(in_features=128, out_features=13, bias=True)
     (3): Sigmoid()
   )
 ),
 7709)

In [7]:
privacy_engine = PrivacyEngine()
model, optimizer, dataloader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=dataloader,
    noise_multiplier=cfg.noise_multiplier,
    max_grad_norm=cfg.max_grad_norm,
)

eps_history = []

for epoch in range(1, cfg.epochs + 1):
    model.train()
    running_loss = 0.0
    for batch in dataloader:
        batch = batch.to(device)
        optimizer.zero_grad()
        recon = model(batch)
        loss = criterion(recon, batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch.size(0)

    epoch_loss = running_loss / len(dataset)
    try:
        epsilon = privacy_engine.accountant.get_epsilon(delta=cfg.delta)
    except ValueError:
        epsilon = float("nan")

    eps_history.append(epsilon)
    print(f"Epoch {epoch:02d} | loss={epoch_loss:.4f} | ε={epsilon:.3f}")
    if np.isfinite(cfg.target_epsilon) and np.isfinite(epsilon) and epsilon >= cfg.target_epsilon:
        print(f"Stopping early: ε target {cfg.target_epsilon} reached.")
        break

eps_history

  loss.backward()


Epoch 01 | loss=0.1094 | ε=0.153
Epoch 02 | loss=0.0711 | ε=0.212
Epoch 03 | loss=0.0594 | ε=0.259
Epoch 04 | loss=0.0511 | ε=0.299
Epoch 05 | loss=0.0485 | ε=0.335
Epoch 06 | loss=0.0459 | ε=0.367
Epoch 07 | loss=0.0445 | ε=0.397
Epoch 08 | loss=0.0436 | ε=0.425
Epoch 09 | loss=0.0436 | ε=0.452
Epoch 10 | loss=0.0438 | ε=0.477
Epoch 11 | loss=0.0430 | ε=0.501
Epoch 12 | loss=0.0434 | ε=0.525
Epoch 13 | loss=0.0434 | ε=0.547
Epoch 14 | loss=0.0428 | ε=0.569
Epoch 15 | loss=0.0427 | ε=0.590
Epoch 16 | loss=0.0428 | ε=0.610
Epoch 17 | loss=0.0431 | ε=0.630
Epoch 18 | loss=0.0427 | ε=0.649
Epoch 19 | loss=0.0424 | ε=0.667
Epoch 20 | loss=0.0426 | ε=0.686


[np.float64(0.15265187735663266),
 np.float64(0.21217591080216056),
 np.float64(0.2589209657430531),
 np.float64(0.2989154896853333),
 np.float64(0.33454099001913706),
 np.float64(0.36703421953641385),
 np.float64(0.39713686640341445),
 np.float64(0.4253356043686182),
 np.float64(0.4519706285661513),
 np.float64(0.47729082950604185),
 np.float64(0.5014849720728536),
 np.float64(0.5247002452613325),
 np.float64(0.5470541468930556),
 np.float64(0.5686422243069562),
 np.float64(0.5895434585435535),
 np.float64(0.609824003712563),
 np.float64(0.6295399921745031),
 np.float64(0.6487395116032678),
 np.float64(0.6674641264862889),
 np.float64(0.6857500656724668)]

In [8]:
model.eval()
with torch.no_grad():
    latent = torch.randn(cfg.synth_samples, cfg.latent_dim, device=device)
    synth_matrix = model.decoder(latent).cpu().numpy()

synth_matrix.shape, synth_matrix[:2]

((10000, 13),
 array([[0.42610097, 0.014578  , 0.9352448 , 0.34951293, 0.34734988,
         0.3107458 , 0.06762154, 0.14543477, 0.17739493, 0.10765409,
         0.81321365, 0.5616892 , 0.2926853 ],
        [0.7889601 , 0.12428693, 0.11767975, 0.36946914, 0.25241432,
         0.6883236 , 0.02759617, 0.24528891, 0.3882285 , 0.6869072 ,
         0.36680713, 0.04049201, 0.01743271]], dtype=float32))

In [11]:
decoded_cats = []
start = 0
for col_name, size, labels in zip(categorical_cols, cat_sizes, cat_encoder.categories_):
    block = synth_matrix[:, start:start + size]
    values = labels[block.argmax(axis=1)]
    decoded_cats.append(pd.Series(values, name=col_name))
    start += size

cat_df = pd.concat(decoded_cats, axis=1)
num_block = synth_matrix[:, cat_total:]
num_vals = num_scaler.inverse_transform(num_block)
num_df = pd.DataFrame(num_vals, columns=numeric_cols)
num_df["TimeSeconds"] = num_df["TimeSeconds"].clip(time_min, time_max)
num_df["Time of Event"] = pd.to_datetime(num_df["TimeSeconds"], unit="s")

synthetic_df = pd.concat([cat_df.reset_index(drop=True), num_df.reset_index(drop=True)], axis=1)
synthetic_df.to_csv(cfg.output_path, index=False)
print(f"Synthetic data saved to {cfg.output_path}")

synthetic_df.head()

Synthetic data saved to ..\data\synthetic.csv


Unnamed: 0,Product Type,Event Type,TimeSeconds,Time of Event
0,C,reset,1719125000.0,2024-06-23 06:47:28
1,A,open,1718502000.0,2024-06-16 01:33:52
2,C,close,1720718000.0,2024-07-11 17:08:16
3,A,save,1718951000.0,2024-06-21 06:21:52
4,Others,open,1720519000.0,2024-07-09 09:55:12


In [12]:
def summarize_time(real: pd.Series, synth: pd.Series) -> None:
    stats = {
        "mean": (real.mean(), synth.mean()),
        "median": (real.median(), synth.median()),
        "std": (real.std(), synth.std()),
        "min": (real.min(), synth.min()),
        "max": (real.max(), synth.max()),
    }
    for k, (r, s) in stats.items():
        print(f"{k:>6}: real={r:.2f} | synth={s:.2f}")


print("=== TimeSeconds summary ===")
summarize_time(df["TimeSeconds"], synthetic_df["TimeSeconds"])
print("\n=== Category distributions ===")
for col in categorical_cols:
    print(f"\n{col}")
    print(pd.DataFrame({
        "real": df[col].value_counts(normalize=True),
        "synthetic": synthetic_df[col].value_counts(normalize=True),
    }))

=== TimeSeconds summary ===
  mean: real=1718462125.60 | synth=1719405440.00
median: real=1718467899.50 | synth=1719257856.00
   std: real=2265577.72 | synth=753663.06
   min: real=1714521637.00 | synth=1718462208.00
   max: real=1722383984.00 | synth=1720727296.00

=== Category distributions ===

Product Type
                  real  synthetic
Product Type                     
A             0.151592     0.1613
B             0.251549     0.0931
C             0.186865     0.1196
D             0.200393     0.2906
E             0.010377     0.0153
F             0.098874     0.2248
Others        0.100350     0.0953

Event Type
                real  synthetic
Event Type                     
close       0.307733     0.1355
error       0.032326     0.1534
open        0.382696     0.2450
reset       0.067454     0.2686
save        0.209792     0.1975
