In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import precision_score, recall_score, roc_auc_score

from table_evaluator import TableEvaluator

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

import os.path, sys
from tests.utils import load_adult

import warnings

# synthcity absolute
import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader

    The default C++ compiler could not be found on your system.
    You need to either define the CXX environment variable or a symlink to the g++ command.
    For example if g++-8 is the command you can do
      import os
      os.environ['CXX'] = 'g++-8'
    


In [2]:
# Send synthcity log records at INFO level and above to stderr so they appear
# directly beneath the cell that triggered them.
# NOTE(review): re-running this cell presumably registers an additional sink
# (duplicated log lines) — confirm against synthcity.logger and restart the
# kernel if that happens.
log.add(sink=sys.stderr, level="INFO")

# Make modules that live in the working directory importable from later cells.
# Guarded so that re-running the cell does not keep appending the same
# directory to sys.path.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

In [3]:
# Local helper module; found via the working directory.
import PreProcessData

# Build the cleaned Adult frame from the raw data file.
# Disabled alternative kept for reference: the test-suite loader
#   arr_X, arr_y, adult_df = load_adult()
# where arr_X is an np array without the label/target and arr_y is an np array
# holding just the label/target.
clean_df = PreProcessData.clean_df(
    'data/adult.data'
)

# Bare last expression so the notebook renders the frame.
clean_df

Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,0,39,1,0,2,0,8,40,5,3,1
1,0,50,1,0,0,0,4,13,1,2,1
2,0,38,1,0,1,3,6,40,0,3,1
3,4,53,1,0,0,2,6,40,0,2,1
4,4,28,0,12,0,0,5,40,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
32556,0,27,0,0,0,5,0,38,0,0,1
32557,0,40,1,0,0,3,7,40,0,2,0
32558,0,58,0,0,4,3,8,40,0,5,1
32559,0,22,1,0,2,3,8,20,0,1,1


In [4]:
# Wrap the cleaned frame in a synthcity data loader.
#   * "label" is the column used as the downstream prediction target.
#   * race, sex and native-country are flagged as sensitive attributes.
#
# The constructor keyword for sensitive attributes is `sensitive_features`.
# The earlier spelling `sensitive_columns=` is not a declared parameter; it is
# absorbed by the constructor's **kwargs, so the loader ended up with no
# sensitive features at all.
# NOTE(review): confirm the parameter name against the installed synthcity
# version (GenericDataLoader.__init__).
loader = GenericDataLoader(
    clean_df,
    target_column="label",
    sensitive_features=["race", "sex", "native-country"],
)

# Bare last expression so the notebook renders the loader's contents.
loader

Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,0,39,1,0,2,0,8,40,5,3,1
1,0,50,1,0,0,0,4,13,1,2,1
2,0,38,1,0,1,3,6,40,0,3,1
3,4,53,1,0,0,2,6,40,0,2,1
4,4,28,0,12,0,0,5,40,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
32556,0,27,0,0,0,5,0,38,0,0,1
32557,0,40,1,0,0,3,7,40,0,2,0
32558,0,58,0,0,4,3,8,40,0,5,1
32559,0,22,1,0,2,3,8,20,0,1,1


In [7]:
# synthcity absolute
from synthcity.plugins import Plugins
from synthcity.plugins.privacy import plugin_dpgan

# The plugin class is instantiated directly. The registry lookup
#   syn_model = Plugins().get("dpgan")
# is the alternative that was left disabled.
# n_iter / n_iter_min presumably bound the number of training iterations —
# TODO confirm against the DPGANPlugin documentation.
syn_model = plugin_dpgan.DPGANPlugin(
    n_iter=10,
    n_iter_min=1,
)

# Train on the loader built earlier; the call's return value is the cell output.
syn_model.fit(loader)

[2023-12-05T16:18:54.907739-0600][16324][INFO] Encoding race 3843332182083652118
[2023-12-05T16:18:54.907739-0600][16324][INFO] Encoding age 42226137574194105
[2023-12-05T16:18:56.854956-0600][16324][INFO] Encoding sex 7310119158062436415
[2023-12-05T16:18:56.871456-0600][16324][INFO] Encoding native-country 6190545641124984424
[2023-12-05T16:18:58.637670-0600][16324][INFO] Encoding marital-status 1207444760813777779
[2023-12-05T16:18:58.637670-0600][16324][INFO] Encoding education 7923692313984985242
[2023-12-05T16:19:01.086982-0600][16324][INFO] Encoding occupation 2060381146028151082
[2023-12-05T16:19:03.022476-0600][16324][INFO] Encoding hours-per-week 1474711948381798600
[2023-12-05T16:19:04.189262-0600][16324][INFO] Encoding workclass 6611077038566673260
[2023-12-05T16:19:04.203080-0600][16324][INFO] Encoding relationship 163436197888038013
[2023-12-05T16:19:04.209106-0600][16324][INFO] Encoding label 5779905084709826269
[2023-12-05T16:19:05.082095-0600][16324][INFO] Training GAN

<synthcity.plugins.privacy.plugin_dpgan.DPGANPlugin at 0x24134168ee0>

In [8]:
# Draw 75,000 synthetic rows from the fitted model and unwrap them into a
# plain DataFrame for the metric helpers used later.
synth_df = (
    syn_model
    .generate(count=75000)
    .dataframe()
)

# Bare last expression so the notebook renders the synthetic frame.
synth_df


Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,2,17,0,26,6,8,6,57,1,0,0
1,2,17,0,26,5,8,9,57,6,0,1
2,2,17,0,40,6,6,9,57,6,0,1
3,4,32,0,40,6,6,5,57,6,5,0
4,1,17,0,25,3,6,10,57,5,0,1
...,...,...,...,...,...,...,...,...,...,...,...
74995,2,18,0,40,5,8,7,57,6,4,1
74996,2,17,0,40,6,8,5,1,1,5,0
74997,2,20,0,26,4,8,9,1,6,0,0
74998,1,19,1,22,6,6,8,16,6,0,0


In [9]:
# Local helper module; found via the working directory.
from Metrics import get_metrics

# Report metrics for the real frame versus the synthetic frame.
# The first argument is the mode label echoed in the printed header.
get_metrics(
    "DP-None",
    clean_df,
    synth_df,
)

Statistics for dataset for mode: DP-None
Precision: 0.4333269635008599
Recall: 0.9910839160839161
AUROC: 0.499190860023747
FTU: 0.00728
DP: 0.015998434447435872


In [5]:
# synthcity absolute
# (A local variant, `from SynthBenchmarks import Benchmarks`, was tried and is
# left disabled.)
from synthcity.benchmark import Benchmarks

# Each test case is a (name, plugin, plugin-kwargs) tuple. Only a single
# epsilon value is benchmarked here, with the same shortened n_iter settings
# used for the standalone DP-GAN fit.
#
# Disabled full sweep, kept for reference: iterate eps over [0.1, 1, 10] with
# only {"epsilon": eps} as plugin kwargs, optionally passing
# synthetic_cache=False or synthetic_reuse_if_exists=True to evaluate().
score = Benchmarks.evaluate(
    [
        (
            f"test_eps_{eps}",
            "dpgan",
            {"epsilon": eps, "n_iter": 10, "n_iter_min": 1},
        )
        for eps in [0.1]
    ],
    loader,
    synthetic_size=1000,
    repeats=2,
)

[2023-12-05T17:46:53.084164-0600][2640][INFO] Testcase : test_eps_0.1
[2023-12-05T17:46:53.086162-0600][2640][INFO] [testcase] Experiment repeat: 0 task type: classification Train df hash = 8844315381081027915


[2023-12-05T17:47:11.497119-0600][2640][INFO] Encoding race 8839838288685913191
[2023-12-05T17:47:11.511757-0600][2640][INFO] Encoding age 7983650266753259497
[2023-12-05T17:47:12.754385-0600][2640][INFO] Encoding sex 4570725880838915145
[2023-12-05T17:47:12.757999-0600][2640][INFO] Encoding native-country 7154188036210242214
[2023-12-05T17:47:13.875947-0600][2640][INFO] Encoding marital-status 9060001245496613666
[2023-12-05T17:47:13.875947-0600][2640][INFO] Encoding education 4958767644402141762
[2023-12-05T17:47:14.705408-0600][2640][INFO] Encoding occupation 7487045939440258640
[2023-12-05T17:47:15.871829-0600][2640][INFO] Encoding hours-per-week 54294204929279679
[2023-12-05T17:47:16.670592-0600][2640][INFO] Encoding workclass 4227416048796935791
[2023-12-05T17:47:16.676615-0600][2640][INFO] Encoding relationship 5307621220283089396
[2023-12-05T17:47:16.681590-0600][2640][INFO] Encoding label 426742775345651831
[2023-12-05T17:47:17.372721-0600][2640][INFO] Training GAN on device c

In [6]:
# Render the per-metric summary table for each benchmarked test case in `score`.
Benchmarks.print(score)


[4m[1mPlugin : test_eps_0.1[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
sanity.data_mismatch.score,0.583333,0.583333,0.583333,0.0,0.583333,0.0,2,0,0.0
sanity.common_rows_proportion.score,0.0,0.0,0.0,0.0,0.0,0.0,2,0,0.0
sanity.nearest_syn_neighbor_distance.mean,0.167348,0.171346,0.169347,0.001999,0.169347,0.001999,2,0,0.01
sanity.close_values_probability.score,0.716,0.744,0.73,0.014,0.73,0.014,2,0,0.01
sanity.distant_values_probability.score,0.001,0.003,0.002,0.001,0.002,0.001,2,0,0.0
stats.jensenshannon_dist.marginal,0.072213,0.084152,0.078182,0.005969,0.078182,0.005969,2,0,0.08
stats.chi_squared_test.marginal,0.090169,0.10227,0.096219,0.006051,0.096219,0.006051,2,0,0.01
stats.inv_kl_divergence.marginal,0.431634,0.451789,0.441711,0.010078,0.441711,0.010078,2,0,0.02
stats.ks_test.marginal,0.386455,0.465273,0.425864,0.039409,0.425864,0.039409,2,0,0.02
stats.max_mean_discrepancy.joint,0.002181,0.002228,0.002204,0.001,0.002204,2.4e-05,2,0,0.11



