In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import precision_score, recall_score, roc_auc_score

from table_evaluator import TableEvaluator

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

import os.path, sys
from tests.utils import load_adult

import warnings

# synthcity absolute
import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader

In [3]:
log.add(sink=sys.stderr, level="INFO")
sys.path.append(os.getcwd())

In [4]:
import PreProcessData

clean_df = PreProcessData.clean_df('data/adult.data')
#arr_X, arr_y, adult_df = load_adult() # arr_X np array without label/target, arr_y np array of just label/target

clean_df.head()

Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,0,39,1,0,2,0,8,40,5,3,1
1,0,50,1,0,0,0,4,13,1,2,1
2,0,38,1,0,1,3,6,40,0,3,1
3,4,53,1,0,0,2,6,40,0,2,1
4,4,28,0,12,0,0,5,40,0,0,1


In [5]:
loader = GenericDataLoader(
    clean_df,
    target_column="label",
    sensitive_columns=["race","sex","native-country"],
)

loader.dataframe().head()

Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,0,39,1,0,2,0,8,40,5,3,1
1,0,50,1,0,0,0,4,13,1,2,1
2,0,38,1,0,1,3,6,40,0,3,1
3,4,53,1,0,0,2,6,40,0,2,1
4,4,28,0,12,0,0,5,40,0,0,1


In [6]:
# train dpgan for e=0.1
from synthcity.plugins import Plugins
from synthcity.plugins.privacy import plugin_dpgan

#syn_model = Plugins().get("dpgan")

syn_model = plugin_dpgan.DPGANPlugin(n_iter = 10, n_iter_min=1, epsilon=0.1)

syn_model.fit(loader)

[2023-12-05T20:40:56.317990-0600][4948][INFO] Encoding race 3843332182083652118
[2023-12-05T20:40:56.323627-0600][4948][INFO] Encoding age 42226137574194105
[2023-12-05T20:40:58.134445-0600][4948][INFO] Encoding sex 7310119158062436415
[2023-12-05T20:40:58.134445-0600][4948][INFO] Encoding native-country 6190545641124984424
[2023-12-05T20:40:59.915332-0600][4948][INFO] Encoding marital-status 1207444760813777779
[2023-12-05T20:40:59.916845-0600][4948][INFO] Encoding education 7923692313984985242
[2023-12-05T20:41:01.865844-0600][4948][INFO] Encoding occupation 2060381146028151082
[2023-12-05T20:41:03.674912-0600][4948][INFO] Encoding hours-per-week 1474711948381798600
[2023-12-05T20:41:04.950257-0600][4948][INFO] Encoding workclass 6611077038566673260
[2023-12-05T20:41:04.950257-0600][4948][INFO] Encoding relationship 163436197888038013
[2023-12-05T20:41:04.950257-0600][4948][INFO] Encoding label 5779905084709826269
[2023-12-05T20:41:05.883135-0600][4948][INFO] Training GAN on device c

<synthcity.plugins.privacy.plugin_dpgan.DPGANPlugin at 0x15fbf363be0>

In [11]:
synth_df_e01 = syn_model.generate(count=75000).dataframe()
synth_df_e01.head()


Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,2,40,0,23,3,3,5,57,5,5,0
1,2,22,0,12,5,5,5,34,5,5,0
2,4,45,0,5,5,7,4,58,6,5,0
3,2,37,0,4,5,5,5,51,6,4,0
4,3,23,0,22,5,3,5,59,6,4,0


In [40]:
from Metrics import get_metrics

get_metrics("DP_e_01", clean_df, synth_df_e01)

Statistics for dataset for mode: DP_e_01
Precision: 0.12619863864460676
Recall: 0.9829721362229102
AUROC: 0.4999270094720678
FTU: 0.10390666666666666
DP: 0.10767639384585292


In [41]:
from synthcity.benchmark import Benchmarks

score_e_01 = Benchmarks.evaluate(
    [("test_eps_0.1", "dpgan", {"epsilon": 0.1, "n_iter": 10, "n_iter_min": 1})],
    loader,
    synthetic_size=75000,
    repeats=2,
    metrics={
        'privacy': ['delta-presence', 'identifiability_score', 'DomiasMIA_prior']
    }
    # synthetic_reuse_if_exists=False,
    # augmented_reuse_if_exists=False
)

[2023-12-05T22:04:42.822788-0600][4948][INFO] Testcase : test_eps_0.1
[2023-12-05T22:04:42.822788-0600][4948][INFO] [testcase] Experiment repeat: 1 task type: classification Train df hash = 8844315381081027915
[2023-12-05T22:04:43.022810-0600][4948][INFO] [testcase] Experiment repeat: 0 task type: classification Train df hash = 8844315381081027915


In [38]:
Benchmarks.print(score_e_01)


[4m[1mPlugin : test_eps_0.1[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
privacy.delta-presence.score,12.246154,14.740741,13.493447,1.247293,13.493447,1.247293,2,0,0.0
privacy.identifiability_score.score,0.010608,0.013758,0.012183,0.001575,0.012183,0.001575,2,0,0.01
privacy.identifiability_score.score_OC,0.030333,0.053539,0.041936,0.011603,0.041936,0.011603,2,0,0.01
privacy.DomiasMIA_prior.accuracy,0.499814,0.499814,0.499814,0.0,0.499814,0.0,2,0,0.02
privacy.DomiasMIA_prior.aucroc,0.370411,0.392045,0.381228,0.010817,0.381228,0.010817,2,0,0.02





In [42]:
# Evaluate dpgan for e=1
syn_dpgan_e1 = plugin_dpgan.DPGANPlugin(n_iter = 10, n_iter_min=1, epsilon=1)
syn_dpgan_e1.fit(loader)

[2023-12-05T22:13:19.024884-0600][4948][INFO] Encoding race 3843332182083652118
[2023-12-05T22:13:19.033886-0600][4948][INFO] Encoding age 42226137574194105
[2023-12-05T22:13:21.105030-0600][4948][INFO] Encoding sex 7310119158062436415
[2023-12-05T22:13:21.116982-0600][4948][INFO] Encoding native-country 6190545641124984424
[2023-12-05T22:13:23.460330-0600][4948][INFO] Encoding marital-status 1207444760813777779
[2023-12-05T22:13:23.460330-0600][4948][INFO] Encoding education 7923692313984985242
[2023-12-05T22:13:25.850629-0600][4948][INFO] Encoding occupation 2060381146028151082
[2023-12-05T22:13:27.719918-0600][4948][INFO] Encoding hours-per-week 1474711948381798600
[2023-12-05T22:13:29.018095-0600][4948][INFO] Encoding workclass 6611077038566673260
[2023-12-05T22:13:29.018095-0600][4948][INFO] Encoding relationship 163436197888038013
[2023-12-05T22:13:29.035325-0600][4948][INFO] Encoding label 5779905084709826269
[2023-12-05T22:13:30.054293-0600][4948][INFO] Training GAN on device c

<synthcity.plugins.privacy.plugin_dpgan.DPGANPlugin at 0x15fd49d0880>

In [48]:
synth_df_e1 = syn_dpgan_e1.generate(count=75000).dataframe()
synth_df_e1.head()

Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,2,17,0,26,6,3,9,57,3,5,1
1,2,17,0,26,6,8,6,3,6,5,0
2,4,24,0,12,4,8,0,3,6,5,0
3,2,17,0,12,5,6,3,57,6,0,0
4,2,17,0,26,5,8,5,57,6,0,0


In [52]:
get_metrics("DP_e_1", clean_df, synth_df_e1)

Statistics for dataset for mode: DP_e_1
Precision: 0.43505966863631096
Recall: 0.9797356061923813
AUROC: 0.49955473746637236
FTU: 0.02036
DP: 0.03346638585508788


In [53]:
score_e1 = Benchmarks.evaluate(
    [("test_eps_1", "dpgan", {"epsilon": 1, "n_iter": 10, "n_iter_min": 1})],
    loader,
    synthetic_size=75000,
    repeats=2,
    metrics={
        'privacy': ['delta-presence', 'identifiability_score', 'DomiasMIA_prior']
    }
)

[2023-12-05T22:38:54.510340-0600][4948][INFO] Testcase : test_eps_1
[2023-12-05T22:38:54.517730-0600][4948][INFO] [testcase] Experiment repeat: 0 task type: classification Train df hash = 8844315381081027915
[2023-12-05T22:38:54.624542-0600][4948][INFO] Encoding race 8839838288685913191
[2023-12-05T22:38:54.631570-0600][4948][INFO] Encoding age 7983650266753259497
[2023-12-05T22:38:56.240668-0600][4948][INFO] Encoding sex 4570725880838915145
[2023-12-05T22:38:56.252356-0600][4948][INFO] Encoding native-country 7154188036210242214
[2023-12-05T22:38:58.161183-0600][4948][INFO] Encoding marital-status 9060001245496613666
[2023-12-05T22:38:58.161183-0600][4948][INFO] Encoding education 4958767644402141762
[2023-12-05T22:38:59.416840-0600][4948][INFO] Encoding occupation 7487045939440258640
[2023-12-05T22:39:01.180538-0600][4948][INFO] Encoding hours-per-week 54294204929279679
[2023-12-05T22:39:02.502669-0600][4948][INFO] Encoding workclass 4227416048796935791
[2023-12-05T22:39:02.507005-06

In [55]:
Benchmarks.print(score_e1)


[4m[1mPlugin : test_eps_1[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
privacy.delta-presence.score,795.999992,2079.999979,1437.999986,641.999994,1437.999986,641.999994,2,0,0.36
privacy.identifiability_score.score,0.005138,0.006133,0.005636,0.001,0.005636,0.000497,2,0,0.44
privacy.identifiability_score.score_OC,0.016576,0.021051,0.018813,0.002238,0.018813,0.002238,2,0,0.44
privacy.DomiasMIA_prior.accuracy,0.499979,0.499979,0.499979,0.0,0.499979,0.0,2,0,83.65
privacy.DomiasMIA_prior.aucroc,0.49253,0.5546,0.523565,0.031035,0.523565,0.031035,2,0,83.65





In [56]:
# Evaluate dpgan for e=10
syn_dpgan_e10 = plugin_dpgan.DPGANPlugin(n_iter = 10, n_iter_min=1, epsilon=10)
syn_dpgan_e10.fit(loader)

[2023-12-05T23:33:57.769351-0600][4948][INFO] Encoding race 3843332182083652118
[2023-12-05T23:33:57.778319-0600][4948][INFO] Encoding age 42226137574194105
[2023-12-05T23:34:00.559492-0600][4948][INFO] Encoding sex 7310119158062436415
[2023-12-05T23:34:00.563770-0600][4948][INFO] Encoding native-country 6190545641124984424
[2023-12-05T23:34:03.210171-0600][4948][INFO] Encoding marital-status 1207444760813777779
[2023-12-05T23:34:03.229265-0600][4948][INFO] Encoding education 7923692313984985242
[2023-12-05T23:34:06.228075-0600][4948][INFO] Encoding occupation 2060381146028151082
[2023-12-05T23:34:08.401234-0600][4948][INFO] Encoding hours-per-week 1474711948381798600
[2023-12-05T23:34:09.648373-0600][4948][INFO] Encoding workclass 6611077038566673260
[2023-12-05T23:34:09.669715-0600][4948][INFO] Encoding relationship 163436197888038013
[2023-12-05T23:34:09.673920-0600][4948][INFO] Encoding label 5779905084709826269
[2023-12-05T23:34:10.600621-0600][4948][INFO] Training GAN on device c

<synthcity.plugins.privacy.plugin_dpgan.DPGANPlugin at 0x15fd8b4fdc0>

In [57]:
synth_df_e10 = syn_dpgan_e10.generate(count=75000).dataframe()
synth_df_e10.head()

Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,3,17,0,10,6,15,0,62,1,4,1
1,2,17,0,26,4,15,5,61,1,4,1
2,2,17,0,26,3,8,4,59,3,4,1
3,3,23,0,33,6,6,4,62,5,4,0
4,2,19,0,26,3,15,4,62,5,4,1


In [58]:
get_metrics("DP_e_10", clean_df, synth_df_e10)

Statistics for dataset for mode: DP_e_10
Precision: 0.8679549485224681
Recall: 0.9768768145997512
AUROC: 0.5018486941781646
FTU: 0.01188
DP: 0.0005263973408443645


In [59]:
score_e10 = Benchmarks.evaluate(
    [("test_eps_10", "dpgan", {"epsilon": 10, "n_iter": 10, "n_iter_min": 1})],
    loader,
    synthetic_size=75000,
    repeats=2,
    metrics={
        'privacy': ['delta-presence', 'identifiability_score', 'DomiasMIA_prior']
    }
)

[2023-12-05T23:58:14.290360-0600][4948][INFO] Testcase : test_eps_10
[2023-12-05T23:58:14.306651-0600][4948][INFO] [testcase] Experiment repeat: 0 task type: classification Train df hash = 8844315381081027915
[2023-12-05T23:58:14.432198-0600][4948][INFO] Encoding race 8839838288685913191
[2023-12-05T23:58:14.443166-0600][4948][INFO] Encoding age 7983650266753259497
[2023-12-05T23:58:16.007113-0600][4948][INFO] Encoding sex 4570725880838915145
[2023-12-05T23:58:16.007113-0600][4948][INFO] Encoding native-country 7154188036210242214
[2023-12-05T23:58:17.863755-0600][4948][INFO] Encoding marital-status 9060001245496613666
[2023-12-05T23:58:17.873144-0600][4948][INFO] Encoding education 4958767644402141762
[2023-12-05T23:58:19.186818-0600][4948][INFO] Encoding occupation 7487045939440258640
[2023-12-05T23:58:20.993519-0600][4948][INFO] Encoding hours-per-week 54294204929279679
[2023-12-05T23:58:22.218453-0600][4948][INFO] Encoding workclass 4227416048796935791
[2023-12-05T23:58:22.234054-0

In [61]:
Benchmarks.print(score_e10)


[4m[1mPlugin : test_eps_10[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
privacy.delta-presence.score,88.444444,795.999992,442.222218,353.777774,442.222218,353.777774,2,0,0.34
privacy.identifiability_score.score,0.004973,0.014421,0.009697,0.004724,0.009697,0.004724,2,0,0.4
privacy.identifiability_score.score_OC,0.013923,0.03083,0.022377,0.008454,0.022377,0.008454,2,0,0.4
privacy.DomiasMIA_prior.accuracy,0.499979,0.500145,0.500062,0.001,0.500062,8.3e-05,2,0,83.86
privacy.DomiasMIA_prior.aucroc,0.487627,0.618005,0.552816,0.065189,0.552816,0.065189,2,0,83.86



