## Using SmartNoise Synthesizers to generate synthetic data

In [88]:
from snsynth import Synthesizer # TODO: GETTING WIERD ERROR HERE, ANYONE ELSE?
import pandas as pd

# TODO: NOTE: ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
            # smartnoise-synth 1.0.4 requires opacus<0.15.0,>=0.14.0, but you have opacus 1.5.2 which is incompatible.

data = pd.read_csv("maternalHealthDataSet.csv")


# MST synthesizer is used here since it took 1st place in NIST's DP syntehtic data contest
synth = Synthesizer.create("mst", epsilon=1.0, delta=1e-5, verbose=True)
synth.fit(data, preprocessor_eps=1.0)
data_synth = synth.sample(1000)
data_synth

Please install mbi with:
   pip install git+https://github.com/ryan112358/private-pgm.git@01f02f17eba440f4e76c1d06fa5ee9eed0bd2bca


ImportError: 

## Justification for using a Decision Tree

Showing that an MLP does not have sufficient accuracy using DP-SGD


In [None]:
from opacus import PrivacyEngine
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split

# Define MLP
class MLP(nn.Module):

    def __init__(self):
        super().__init__()
        # Define layers (hidden layer size = input size here)
        self.layers = nn.Sequential(
            # fully connected layer, 6 input to 6
            nn.Linear(6, 6),
            nn.ReLU(),
            # fully connected layer, 6 to 3 output
            nn.Linear(6, 3)
        )
        # handles typeErrors for Linear layers
        self.double()

    # forward propagation
    def forward(self, x):
        return self.layers(x)
    
# Create model
model = MLP()

# Model params
batch_size = 16
learning_rate = 0.001
num_epochs = 150
    
# Define loss function & optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Load data
# Drop one-hot encoding and string label column
health_data = pd.read_csv("maternalHealthDataSet.csv").drop(["RiskLevelStr","MidRisk","LowRisk","HighRisk"], axis=1)
# data_y is labels, data_x is features
data_y = health_data.iloc[:, -1]
data_x = health_data.drop("RiskLevel", axis=1)
data_x = torch.tensor(data_x.values)
data_y = torch.tensor(data_y.values)

# Split dataset into randomly split training and validation sets
train_size = int(0.8 * len(data_x))  # 80% training
val_size = len(data_x) - train_size  # 20% validation
train_data, test_data = random_split(TensorDataset(data_x, data_y), [train_size, val_size])

# Split into batches and give to DataLoader
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# IMPLEMENT DIFFERENTIAL PRIVACY FOR MODEL
model.train()
privacy_engine = PrivacyEngine()
model, optimizer, train_loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    noise_multiplier=1.1,
    max_grad_norm=1.0,
)

# Training loop
best_accur = 0.0
final_output = ""
for epoch in range(num_epochs):
    train_loss = 0.0

    # Make sure gradient tracking is on
    model.train(True)

    for batch_x, batch_y in train_loader:
        # Clear previous gradients
        optimizer.zero_grad()
        # Forward pass
        pred = model(batch_x)
        # Compute loss
        loss = criterion(pred, batch_y)
        # Back propagation
        loss.backward()
        # Update weights
        optimizer.step()
        # Track loss
        train_loss += loss.item()

    train_loss /= len(train_loader.dataset)  # Average loss

    # Set model to evaluation mode
    model.eval()
    test_loss = 0.0
    total = 0
    correct = 0
    # Don't track grad for testing
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            # Get prediction and calculate loss
            pred = model(batch_x)
            loss = criterion(pred, batch_y)
            test_loss += loss.item()

            # Find most activated output node for each output given in tensor
            predicted_class = torch.max(pred, dim=1)[1] 
            # Calculate accuracy
            total += batch_x.size(0)
            correct += (predicted_class == batch_y).float().sum()

        test_loss /= len(test_loader.dataset)  # Average validation loss
        test_accuracy = correct / total  # Validation accuracy

    
    output = f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {test_loss:.4f}, Validation Accuracy: {test_accuracy:.4f}"
        
    
    # Track best performance, and save the model's state
    if best_accur < test_accuracy:
        best_accur = test_accuracy
        final_output = output

print("BEST MODEL:\n"+final_output)

  self._maybe_warn_non_full_backward_hook(args, result, grad_fn)


Epoch 120/150, Train Loss: 0.0872, Validation Loss: 0.0810, Validation Accuracy: 0.4877


# Comparing non-dp decision tree vs dp decision tree

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import diffprivlib.models as dp

maternal_health = pd.read_csv('maternalHealthDataSet.csv')

In [23]:
X = maternal_health[['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'BodyTemp', 'HeartRate']]
y = maternal_health['RiskLevel']

# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Method 3 (Model trained with original data)

In [91]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Evaluate the model 
accuracy = clf.score(X_test, y_test)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.81


## Model 4 (DP model trained with original data)

In [92]:
classes = (0, 1, 2) # encoding of low-risk, mid-risk, high-risk
bounds = ([], []) # TODO: Use above threshold to calculate these or just provide rough bounds
for col in X.columns:
    bounds[0].append(maternal_health[col].min())
    bounds[1].append(maternal_health[col].max())

In [94]:
dp_clf = dp.DecisionTreeClassifier(epsilon=1, bounds=bounds, classes=classes)
dp_clf.fit(X_train, y_train)

# Evaluate the model
accuracy = dp_clf.score(X_test, y_test)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.61
