## Using SmartNoise Synthesizers to generate synthetic data

In [None]:
from snsynth import Synthesizer  # TODO: GETTING WEIRD ERROR HERE, ANYONE ELSE?
import pandas as pd

# Load the original (non-private) records the synthesizer will be fitted on.
data = pd.read_csv("maternalHealthDataSet.csv")

# Privacy budget. The preprocessor's share is taken out of the total, so it
# must be strictly smaller than the total; handing it the full budget (as this
# cell previously did with 1.0 of 1.0) leaves nothing for training.
# NOTE(review): that may be the error mentioned in the TODO above - confirm.
total_epsilon = 1.0
preprocessor_epsilon = 0.2  # tunable; only used to infer column bounds

# MST synthesizer is used here since it took 1st place in NIST's DP synthetic data contest
synth = Synthesizer.create("mst", epsilon=total_epsilon, delta=1e-5, verbose=True)
synth.fit(data, preprocessor_eps=preprocessor_epsilon)

# Draw 1000 synthetic rows and display them as the cell output.
data_synth = synth.sample(1000)
data_synth

## Method 3
Train a differentially private (DP) public model on the original low-count data.

In [None]:
#TODO: IMPLEMENT DP-SGD
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split

# Define MLP
class MLP(nn.Module):
    """Small fully connected classifier: Linear -> ReLU -> Linear.

    The network outputs raw logits (no softmax), and all parameters are
    stored in double precision.

    Args:
        input_size: Number of input features. Defaults to 6.
        hidden_size: Width of the hidden layer. ``None`` (the default) makes
            it equal to ``input_size``.
        num_classes: Number of output logits. Defaults to 3.
    """

    def __init__(self, input_size: int = 6, hidden_size=None, num_classes: int = 3):
        super().__init__()
        if hidden_size is None:
            # Keep the original design: hidden layer as wide as the input.
            hidden_size = input_size

        # The Sequential layout (indices 0 and 2 are the Linear layers) is
        # kept as-is so state_dict keys stay "layers.0.*" / "layers.2.*".
        self.layers = nn.Sequential(
            # fully connected layer, input -> hidden
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            # fully connected layer, hidden -> output logits
            nn.Linear(hidden_size, num_classes),
        )
        # Cast parameters to float64 so the Linear layers accept
        # double-precision inputs without dtype errors.
        self.double()

    # forward propagation
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the logits produced by running ``x`` through the layers."""
        return self.layers(x)

# Create model
model = MLP()

# LOAD DATA
# Drop one-hot encoding and string label column
health_data = pd.read_csv("maternalHealthDataSet.csv").drop(
    ["RiskLevelStr", "MidRisk", "LowRisk", "HighRisk"], axis=1
)

# data_y is labels, data_x is features.
# The label column is selected by name (not by position) so labels and
# features stay consistent regardless of the column order in the CSV.
label_column = "RiskLevel"
data_y = health_data[label_column]
data_x = health_data.drop(label_column, axis=1)
# Explicit dtypes: float64 features match the model's double-precision
# weights; integer (long) class indices are what CrossEntropyLoss expects.
data_x = torch.tensor(data_x.values, dtype=torch.float64)
data_y = torch.tensor(data_y.values, dtype=torch.long)

# Split dataset into training and validation sets
train_size = int(0.8 * len(data_x))  # 80% training
val_size = len(data_x) - train_size  # 20% validation
train_data, test_data = random_split(TensorDataset(data_x, data_y), [train_size, val_size])

# Split into batches (only the training set is shuffled)
batch_size = 16
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# define loss function & optimizer
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# training loop
# Each epoch: one optimisation pass over train_loader, then an evaluation
# pass over test_loader. The weights of the best-scoring epoch (by validation
# accuracy) are written to "m3_model_weights.pth".
num_epochs = 150
best_accur = 0.0
for epoch in range(num_epochs):
    train_loss = 0.0

    # Put the model in training mode
    model.train(True)

    for batch_x, batch_y in train_loader:
        # Clear previous gradients
        optimizer.zero_grad()
        # Forward pass
        pred = model(batch_x)
        # Compute loss
        loss = criterion(pred, batch_y)
        # Back propagation
        loss.backward()
        # Update weights
        optimizer.step()
        # Track loss. The criterion yields the batch mean, so weight it by
        # the batch size; dividing by the dataset size below then gives the
        # true per-sample average (the last batch may be smaller).
        train_loss += loss.item() * batch_x.size(0)

    train_loss /= len(train_loader.dataset)  # Average per-sample loss

    # Set model to evaluation mode
    model.eval()
    test_loss = 0.0
    total = 0
    correct = 0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            # get prediction and calculate loss (weighted as above)
            pred = model(batch_x)
            loss = criterion(pred, batch_y)
            test_loss += loss.item() * batch_x.size(0)

            # calculate accuracy; .item() keeps the counters plain Python
            # numbers instead of 0-dim tensors
            _, predicted_class = torch.max(pred, dim=1)
            total += batch_x.size(0)
            correct += (predicted_class == batch_y).sum().item()

    test_loss /= len(test_loader.dataset)  # Average per-sample validation loss
    test_accuracy = correct / total  # Validation accuracy

    # Print info every 5 epochs
    if epoch % 5 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, "
          f"Train Loss: {train_loss:.4f}, "
          f"Validation Loss: {test_loss:.4f}, "
          f"Validation Accuracy: {test_accuracy:.4f}")

    # Track best performance, and save the model's state
    if best_accur < test_accuracy:
        best_accur = test_accuracy
        torch.save(model.state_dict(), "m3_model_weights.pth")




Epoch 1/150, Train Loss: 2.2981, Validation Loss: 0.9989, Validation Accuracy: 0.2118
Epoch 6/150, Train Loss: 0.1087, Validation Loss: 0.1070, Validation Accuracy: 0.3350
Epoch 11/150, Train Loss: 0.0780, Validation Loss: 0.0769, Validation Accuracy: 0.3842
Epoch 16/150, Train Loss: 0.0708, Validation Loss: 0.0724, Validation Accuracy: 0.4384
Epoch 21/150, Train Loss: 0.0681, Validation Loss: 0.0664, Validation Accuracy: 0.4433
Epoch 26/150, Train Loss: 0.0635, Validation Loss: 0.0648, Validation Accuracy: 0.4926
Epoch 31/150, Train Loss: 0.0610, Validation Loss: 0.0605, Validation Accuracy: 0.5123
Epoch 36/150, Train Loss: 0.0606, Validation Loss: 0.0599, Validation Accuracy: 0.4877
Epoch 41/150, Train Loss: 0.0599, Validation Loss: 0.0594, Validation Accuracy: 0.5074
Epoch 46/150, Train Loss: 0.0581, Validation Loss: 0.0569, Validation Accuracy: 0.5714
Epoch 51/150, Train Loss: 0.0575, Validation Loss: 0.0588, Validation Accuracy: 0.4877
Epoch 56/150, Train Loss: 0.0561, Validation 