## Using SmartNoise Synthesizers to generate synthetic data

In [2]:
from snsynth import Synthesizer # TODO: Exception: Expected exactly one binary to be present. Got: []
import pandas as pd

# Load the maternal-health records that the synthesizer will learn from.
data = pd.read_csv("maternalHealthDataSet.csv")

# MST synthesizer is used here since it took 1st place in NIST's DP synthetic data contest.
# The total privacy budget is epsilon=1.0; fit() sets aside 0.2 of it for the preprocessor.
synth = Synthesizer.create(
    "mst",
    epsilon=1.0,
    delta=1e-5,
    verbose=True,
)
synth.fit(data, preprocessor_eps=0.2)

# Draw 1000 synthetic rows; the bare name on the last line renders them as the cell output.
data_synth = synth.sample(1000)
data_synth



Spent 0.2 epsilon on preprocessor, leaving 0.8 for training
Fitting with 1751040000 dimensions
Getting cliques
Estimating marginals


Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,LowRisk,MidRisk,HighRisk,RiskLevelStr,RiskLevel
0,32,90,50,7.4,99.2,77,0,0,0,high risk,2
1,38,85,100,7.0,99.2,80,1,0,0,low risk,0
2,66,99,80,5.4,99.2,70,0,1,0,mid risk,1
3,35,120,90,7.0,124.8,66,1,0,0,low risk,0
4,66,130,49,7.8,99.2,66,0,1,0,mid risk,1
...,...,...,...,...,...,...,...,...,...,...,...
995,24,70,100,7.4,99.2,65,0,1,0,mid risk,1
996,23,120,90,7.0,99.2,77,0,1,0,mid risk,1
997,62,75,60,7.0,99.2,88,1,0,0,low risk,0
998,43,120,70,4.2,99.2,88,0,0,1,high risk,2


# Comparing non-DP decision tree vs DP decision tree

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import diffprivlib.models as dp

# Load the maternal-health records; the following cells take the feature
# columns and the RiskLevel label from this frame.
maternal_health = pd.read_csv('maternalHealthDataSet.csv')

In [19]:
# Model inputs are the six vital-sign columns; the target is the RiskLevel column.
feature_columns = ['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'BodyTemp', 'HeartRate']
X = maternal_health[feature_columns]
y = maternal_health['RiskLevel']

## Method 0 (No noise added)

In [20]:
# Non-private baseline: a plain scikit-learn decision tree on the original data.
# random_state is pinned because the tree randomly permutes features at every
# split, so an unseeded run can report a slightly different accuracy each time
# the notebook is re-executed.
clf = DecisionTreeClassifier(random_state=0)

# Evaluate the model: mean accuracy over 10 cross-validation folds
accuracy = cross_val_score(clf, X, y, cv=10).mean()
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.81


## Method 1 (BASE CASE) (DP Decision Tree, Original Data)

In [21]:
# Class labels handed to the DP classifier: 0=low risk, 1=mid risk, 2=high risk
classes = (0, 1, 2)

# TODO: BOUNDS NEED TO BE COMPUTED WITH DP
# Per-feature (lower, upper) limits, currently read straight from the data:
# first list holds each column's minimum, second list each column's maximum.
bounds = (
    [X[name].min() for name in X.columns],
    [X[name].max() for name in X.columns],
)

dp_clf = dp.DecisionTreeClassifier(epsilon=1, bounds=bounds, classes=classes)

# Evaluate the model: mean accuracy over 10 cross-validation folds
fold_scores = cross_val_score(dp_clf, X, y, cv=10)
accuracy = fold_scores.mean()
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.53


## Method 2 (NON DP Synthetic Data, DP Decision Tree)

In [None]:
#code

## Method 3 (DP Synthetic Data, Non-DP Decision Tree)

In [None]:
#code