## Using SmartNoise Synthesizers to generate synthetic data

In [1]:
from snsynth import Synthesizer # TODO: GETTING WIERD ERROR HERE, ANYONE ELSE?
import pandas as pd

data = pd.read_csv("maternalHealthDataSet.csv")


# MST synthesizer is used here since it took 1st place in NIST's DP syntehtic data contest
synth = Synthesizer.create("mst", epsilon=1.0, delta=1e-5, verbose=True)
synth.fit(data, preprocessor_eps=0.2)
data_synth = synth.sample(1000)
data_synth



Spent 0.2 epsilon on preprocessor, leaving 0.8 for training
Fitting with 1751040000 dimensions
Getting cliques
Estimating marginals


Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,LowRisk,MidRisk,HighRisk,RiskLevelStr,RiskLevel
0,27,120,90,7.0,99.2,70,1,0,0,low risk,0
1,16,140,80,7.0,99.2,76,0,1,0,mid risk,1
2,46,120,80,7.0,99.2,76,1,0,0,low risk,0
3,15,90,89,7.8,99.2,90,0,1,0,mid risk,1
4,30,90,60,7.0,99.2,76,0,1,0,mid risk,1
...,...,...,...,...,...,...,...,...,...,...,...
995,63,120,50,6.2,99.2,70,1,0,0,low risk,0
996,35,75,70,7.8,67.2,80,1,0,0,low risk,0
997,27,120,50,7.4,99.2,66,1,0,0,low risk,0
998,49,120,90,7.0,99.2,60,1,0,1,high risk,2


## Method 3

Train DP public model on original low count data


In [15]:
#TODO: IMPLEMENT DP-SGD
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split

# Define MLP
class MLP(nn.Module):

    def __init__(self):
        super().__init__()
        # Define layers (hidden layer size = input size here)
        self.layers = nn.Sequential(
            # fully connected layer, 6 input to 6
            nn.Linear(6, 6),
            nn.ReLU(),
            # fully connected layer, 6 to 3 output
            nn.Linear(6, 3)
        )
        # handles typeErrors for Linear layers
        self.double()

    # forward propagation
    def forward(self, x):
        return self.layers(x)

# Create model
model = MLP()

# LOAD DATA
# Drop one-hot encoding and string label column
health_data = pd.read_csv("maternalHealthDataSet.csv").drop(["RiskLevelStr","MidRisk","LowRisk","HighRisk"], axis=1)

# data_y is labels, data_x is features
data_y = health_data.iloc[:, -1]
data_x = health_data.drop("RiskLevel", axis=1)
data_x = torch.tensor(data_x.values)
data_y = torch.tensor(data_y.values)

# Split dataset into training and validation sets
train_size = int(0.8 * len(data_x))  # 80% training
val_size = len(data_x) - train_size  # 20% validation
train_data, test_data = random_split(TensorDataset(data_x, data_y), [train_size, val_size])

# Split into batches
batch_size = 16
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# define loss function & optimizer
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# training loop
num_epochs = 150
best_accur = 0.0
for epoch in range(num_epochs):
    train_loss = 0.0

    # Make sure gradient tracking is on
    model.train(True)

    for batch_x, batch_y in train_loader:
        # Clear previous gradients
        optimizer.zero_grad()
        # Forward pass
        pred = model(batch_x)
        # Compute loss
        loss = criterion(pred, batch_y)
        # Back propagation
        loss.backward()
        # Update weights
        optimizer.step()
        # Track loss
        train_loss += loss.item()

    train_loss /= len(train_loader.dataset)  # Average loss

    # Set model to evaluation mode
    model.eval()
    test_loss = 0.0
    total = 0
    correct = 0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            # get prediction and calculate loss
            pred = model(batch_x)
            loss = criterion(pred, batch_y)
            test_loss += loss.item()

            # calculate accuracy
            predicted_class = torch.max(pred, dim=1)[1]
            total += batch_x.size(0)
            correct += (predicted_class == batch_y).float().sum()

        test_loss /= len(test_loader.dataset)  # Average validation loss
        test_accuracy = correct / total  # Validation accuracy

    # Print info every 5 epochs
    if (epoch % 5 == 0):
        print(f"Epoch {epoch+1}/{num_epochs}, "
          f"Train Loss: {train_loss:.4f}, "
          f"Validation Loss: {test_loss:.4f}, "
          f"Validation Accuracy: {test_accuracy:.4f}")
        
    
    # Track best performance, and save the model's state
    if best_accur < test_accuracy:
        best_accur = test_accuracy
        torch.save(model.state_dict(), f"m3_model_weights.pth")

Epoch 1/150, Train Loss: 0.3740, Validation Loss: 0.1735, Validation Accuracy: 0.3350
Epoch 6/150, Train Loss: 0.0712, Validation Loss: 0.0720, Validation Accuracy: 0.4236
Epoch 11/150, Train Loss: 0.0689, Validation Loss: 0.0699, Validation Accuracy: 0.3990
Epoch 16/150, Train Loss: 0.0675, Validation Loss: 0.0689, Validation Accuracy: 0.4286
Epoch 21/150, Train Loss: 0.0655, Validation Loss: 0.0678, Validation Accuracy: 0.4433
Epoch 26/150, Train Loss: 0.0632, Validation Loss: 0.0652, Validation Accuracy: 0.4926
Epoch 31/150, Train Loss: 0.0605, Validation Loss: 0.0630, Validation Accuracy: 0.5419
Epoch 36/150, Train Loss: 0.0588, Validation Loss: 0.0618, Validation Accuracy: 0.5517
Epoch 41/150, Train Loss: 0.0569, Validation Loss: 0.0607, Validation Accuracy: 0.5714
Epoch 46/150, Train Loss: 0.0560, Validation Loss: 0.0598, Validation Accuracy: 0.5517
Epoch 51/150, Train Loss: 0.0550, Validation Loss: 0.0584, Validation Accuracy: 0.5813
Epoch 56/150, Train Loss: 0.0547, Validation 

# Comparing non-dp decision tree vs dp decision tree

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import diffprivlib.models as dp

maternal_health = pd.read_csv('maternalHealthDataSet.csv')

In [39]:
X = maternal_health[['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'BodyTemp', 'HeartRate']]
y = maternal_health['RiskLevel']

# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Method 0 (No noise added)

In [146]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Evaluate the model 
accuracy = clf.score(X_test, y_test)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.84


## Method 1 (BASE CASE) (DP Decision Tree, Original Data)

In [142]:
dp_clf = dp.DecisionTreeClassifier(epsilon=1, bounds=bounds, classes=classes)
dp_clf.fit(X_train, y_train)

# Evaluate the model
accuracy = dp_clf.score(X_test, y_test)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.53


## Method 2 (NON DP Synthetic Data, DP Decision Tree)

In [None]:
#code

## Method 3 (DP Synthetic Data, Non DP Decision Tree

In [None]:
#code