In [54]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import precision_score, recall_score, roc_auc_score

from table_evaluator import TableEvaluator
from ctgan import CTGAN

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

import os.path,sys

import warnings

# synthcity absolute
import synthcity.logger as log
from synthcity.benchmark import Benchmarks
from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.plugins.privacy import plugin_dpgan

import PreProcessData

from Metrics import get_metrics

In [3]:
# Route synthcity log output to stderr at level INFO.
log.add(sink=sys.stderr, level="INFO")
# Make modules in the working directory importable. The guard keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

In [4]:
# Columns that are passed to CTGAN as discrete (categorical) columns.
discrete_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'label'
]

# Node order contains the order in which to generate the data, starting with the root nodes
node_order = [['race','age','sex','native-country'],['marital-status'],['education'],['occupation','hours-per-week','workclass','relationship'],['label']]
# Same order, flattened into a single list (derived so the two can never drift apart).
node_order_nl = [node for level in node_order for node in level]

# Incoming edges that are identical in every graph below; key is the receiving node.
_shared_node_connections = {
    'occupation':['race','age','sex','marital-status','education'],
    'hours-per-week':['race','age','marital-status','native-country','education','sex'],
    'workclass':['age','marital-status','sex','education','native-country'],
    'relationship':['marital-status','education','age','sex','native-country'],
    'education':['race','age','marital-status','sex','native-country'],
    'marital-status':['race','age','sex','native-country'],
}


def _build_node_connections(label_parents):
    """Return a new graph: `label_parents` feeding 'label', plus copies of the shared edges.

    Every list is copied so the resulting graphs never share list objects.
    """
    graph = {'label': list(label_parents)}
    for node, parents in _shared_node_connections.items():
        graph[node] = list(parents)
    return graph


# List of connections; key is receiving node
node_connections_normal = _build_node_connections(
    ['occupation','race','hours-per-week','age','marital-status','education','sex','workclass','native-country','relationship'])

# For the graphs below, connections into 'label' are removed according to the
# fairness criterion; all other edges are the same as in the normal graph.

# FTU: 'sex' no longer feeds 'label'.
node_connections_FTU = _build_node_connections(
    ['occupation','race','hours-per-week','age','marital-status','education','workclass','native-country','relationship'])

# DP: only 'race', 'age' and 'native-country' feed 'label'.
node_connections_DP = _build_node_connections(
    ['race','age','native-country'])

# CF: 'sex', 'marital-status' and 'relationship' no longer feed 'label'.
node_connections_CF = _build_node_connections(
    ['occupation','race','hours-per-week','age','education','workclass','native-country'])

In [5]:
@ignore_warnings(category=ConvergenceWarning)
def generate_data(df,mode):
    """Generate a synthetic dataset node by node with CTGAN models.

    A first model is fitted on the root columns and sampled; then, for every
    remaining node in `node_order_nl`, a model is fitted on that node together
    with its parents (taken from the connection graph selected by `mode`) and
    the node's column is sampled and appended to the synthetic frame.
    Fitted models are cached under 'Models/' and re-loaded when present.

    Parameters
    ----------
    df : pandas.DataFrame
        Original data; must contain every column listed in `node_order_nl`.
    mode : str
        One of 'FTU', 'DP' or 'CF'; selects the connection graph.

    Returns
    -------
    pandas.DataFrame
        Synthetic data with `factor * len(df)` rows.

    Raises
    ------
    ValueError
        If `mode` is not one of the supported values.
    """
    # Define the privacy measure. Validate before any training so a typo in
    # `mode` fails immediately instead of after the root model has been fitted.
    graphs_by_mode = {
        'FTU': node_connections_FTU,
        'DP': node_connections_DP,
        'CF': node_connections_CF,
    }
    if mode not in graphs_by_mode:
        raise ValueError(
            f"Mode is not correct! Expected one of {list(graphs_by_mode)}, got {mode!r}"
        )
    node_connections = graphs_by_mode[mode]

    print("Generating Data for CTGAN-",mode,"...")
    ctgan = CTGAN(epochs=10, verbose=True)
    # How much more data the synthetic dataset should contain than the original data
    # (this is to ensure we can take a sample that looks like the original data)
    factor = 4

    # Make sure the cache directory exists so save() cannot fail after a long fit.
    os.makedirs('Models', exist_ok=True)

    path = 'Models/CTGANrootnodes' + str(mode) + '.pkl'
    if os.path.isfile(path):
        # NOTE(review): .pkl model files are presumably pickle-based; only load files you trust.
        ctgan = ctgan.load(path)
    else:
        # DF to fit the first model on. Every root column (including 'age')
        # is passed to CTGAN as discrete, as before.
        root_columns = ['race','age','sex','native-country']
        start_df = df[root_columns]

        print("Fitting root nodes...")
        ctgan.fit(start_df, root_columns)
        ctgan.save(path)

    print("Sampling root...")
    synth_df = ctgan.sample(factor * len(df.index))

    # Iteratively generate the data
    for node in node_order_nl:
        # Skip nodes that have already been generated
        if node in synth_df.columns:
            continue

        # Grab the attributes that need to be looked at when generating data
        if node in node_connections:
            attributes = node_connections[node]
        else:
            # No explicit parents: fall back to every node that precedes this
            # one in the generation order (the node itself is excluded,
            # otherwise the join below would fail on overlapping columns).
            attributes = node_order_nl[:node_order_nl.index(node)]

        path = 'Models/CTGAN' + str(node) + str(mode) + '.pkl'
        if os.path.isfile(path):
            ctgan = ctgan.load(path)
        else:
            # Training frame: the node's original values next to its parents' original values
            train_df = df[[node]].join(df[attributes])

            # Discrete columns among everything being fitted. This has to look
            # at the training frame (not at the synthetic columns generated so
            # far), otherwise the node itself is never marked as discrete.
            temp_discrete = [d for d in discrete_columns if d in train_df.columns]

            print("Fitting for node ", node, "...")
            ctgan.fit(train_df, temp_discrete)
            ctgan.save(path)

        print("Sampling for node ", node, "...")
        # NOTE(review): this is an unconditional sample. Its rows are not matched
        # to the parent values already present in synth_df; only the columns that
        # are still missing (in practice the node) are copied over.
        generated_data = ctgan.sample(len(synth_df.index))

        # Add the generated data to the output
        for attribute in attributes + [node]:
            if attribute not in synth_df.columns:
                synth_df[attribute] = generated_data[attribute].values

    return synth_df

In [6]:
def run_experiment_CTGAN(mode, data_path='data/adult.data'):
    """Run one CTGAN experiment for the given mode.

    Loads and cleans the dataset, generates synthetic data for `mode`,
    and calls `get_metrics` on the result.

    Parameters
    ----------
    mode : str
        Mode forwarded to `generate_data` and `get_metrics` (e.g. 'FTU', 'DP', 'CF').
    data_path : str, optional
        File passed to `PreProcessData.clean_df`; defaults to the adult dataset.

    Returns
    -------
    tuple
        `(df, synthetic)`: the cleaned original frame and the synthetic frame.
    """
    # Generate the synthetic data
    df = PreProcessData.clean_df(data_path)
    synthetic = generate_data(df,mode)
    get_metrics(mode,df,synthetic)
    return df, synthetic

In [7]:
# Run the experiments with the three fairness definitions
# FTU mode: keeps the cleaned original frame and the synthetic frame.
orig_ftu, ctgan_ftu_synth = run_experiment_CTGAN('FTU')

Generating Data for CTGAN- FTU ...
Fitting root nodes...


Gen. (0.50) | Discrim. (-0.06): 100%|██████████| 10/10 [00:49<00:00,  4.97s/it]


Sampling root...
Fitting for node  marital-status ...


Gen. (-0.66) | Discrim. (0.05): 100%|██████████| 10/10 [00:37<00:00,  3.75s/it]


Sampling for node  marital-status ...
Fitting for node  education ...


Gen. (-0.63) | Discrim. (0.02): 100%|██████████| 10/10 [00:43<00:00,  4.32s/it]


Sampling for node  education ...
Fitting for node  occupation ...


Gen. (-0.85) | Discrim. (0.02): 100%|██████████| 10/10 [00:36<00:00,  3.67s/it]


Sampling for node  occupation ...
Fitting for node  hours-per-week ...


Gen. (-0.70) | Discrim. (0.05): 100%|██████████| 10/10 [00:50<00:00,  5.02s/it]


Sampling for node  hours-per-week ...
Fitting for node  workclass ...


Gen. (-0.95) | Discrim. (0.01): 100%|██████████| 10/10 [00:47<00:00,  4.70s/it]


Sampling for node  workclass ...
Fitting for node  relationship ...


Gen. (-0.57) | Discrim. (-0.09): 100%|██████████| 10/10 [00:44<00:00,  4.40s/it]


Sampling for node  relationship ...
Fitting for node  label ...


Gen. (-0.93) | Discrim. (0.05): 100%|██████████| 10/10 [00:48<00:00,  4.82s/it]


Sampling for node  label ...
Statistics for dataset for mode: FTU
Precision: 0.9578636712300271
Recall: 0.9226684700254223
AUROC: 0.5042115269732189
FTU: 0.054373052184868376
DP: 0.05287693127330839


In [10]:
# DP mode: keeps the cleaned original frame and the synthetic frame.
orig_dp, ctgan_dp_synth = run_experiment_CTGAN('DP')

Generating Data for CTGAN- DP ...
Fitting root nodes...


Gen. (0.12) | Discrim. (-0.01): 100%|██████████| 10/10 [00:54<00:00,  5.43s/it]


Sampling root...
Fitting for node  marital-status ...


Gen. (-0.46) | Discrim. (0.01): 100%|██████████| 10/10 [00:41<00:00,  4.14s/it]


Sampling for node  marital-status ...
Fitting for node  education ...


Gen. (-0.79) | Discrim. (0.07): 100%|██████████| 10/10 [00:39<00:00,  3.92s/it]


Sampling for node  education ...
Fitting for node  occupation ...


Gen. (-0.72) | Discrim. (-0.02): 100%|██████████| 10/10 [00:47<00:00,  4.73s/it]


Sampling for node  occupation ...
Fitting for node  hours-per-week ...


Gen. (-0.69) | Discrim. (0.07): 100%|██████████| 10/10 [00:48<00:00,  4.81s/it]


Sampling for node  hours-per-week ...
Fitting for node  workclass ...


Gen. (-0.70) | Discrim. (-0.06): 100%|██████████| 10/10 [00:44<00:00,  4.47s/it]


Sampling for node  workclass ...
Fitting for node  relationship ...


Gen. (-0.32) | Discrim. (-0.16): 100%|██████████| 10/10 [00:46<00:00,  4.63s/it]


Sampling for node  relationship ...
Fitting for node  label ...


Gen. (-0.49) | Discrim. (-0.03): 100%|██████████| 10/10 [00:39<00:00,  3.97s/it]


Sampling for node  label ...
Statistics for dataset for mode: DP
Precision: 0.9353737854982468
Recall: 0.9317451641715436
AUROC: 0.4977647883672763
FTU: 0.036138187122869835
DP: 0.0266382525306339


In [13]:
# CF mode: keeps the cleaned original frame and the synthetic frame.
orig_cf, ctgan_cf_synth = run_experiment_CTGAN('CF')

Generating Data for CTGAN- CF ...
Fitting root nodes...


Gen. (0.51) | Discrim. (-0.07): 100%|██████████| 10/10 [00:59<00:00,  5.95s/it]


Sampling root...
Fitting for node  marital-status ...


Gen. (-0.57) | Discrim. (0.11): 100%|██████████| 10/10 [00:44<00:00,  4.42s/it]


Sampling for node  marital-status ...
Fitting for node  education ...


Gen. (-0.62) | Discrim. (0.04): 100%|██████████| 10/10 [00:45<00:00,  4.56s/it]


Sampling for node  education ...
Fitting for node  occupation ...


Gen. (-0.62) | Discrim. (0.04): 100%|██████████| 10/10 [00:35<00:00,  3.58s/it]


Sampling for node  occupation ...
Fitting for node  hours-per-week ...


Gen. (-0.93) | Discrim. (0.01): 100%|██████████| 10/10 [00:43<00:00,  4.38s/it]


Sampling for node  hours-per-week ...
Fitting for node  workclass ...


Gen. (-0.59) | Discrim. (-0.04): 100%|██████████| 10/10 [00:40<00:00,  4.05s/it]


Sampling for node  workclass ...
Fitting for node  relationship ...


Gen. (-0.73) | Discrim. (-0.04): 100%|██████████| 10/10 [00:44<00:00,  4.40s/it]


Sampling for node  relationship ...
Fitting for node  label ...


Gen. (-0.64) | Discrim. (0.07): 100%|██████████| 10/10 [00:57<00:00,  5.76s/it]


Sampling for node  label ...
Statistics for dataset for mode: CF
Precision: 0.9699164685080178
Recall: 0.9608823813665797
AUROC: 0.5004801011891264
FTU: 0.03092467343014389
DP: 0.02970165157165683


In [16]:
# train and test DPGAN on FACTGAN synthetic data
# Wrap the FTU-mode CTGAN output in a synthcity data loader: 'label' is the
# target column; race, sex and native-country are declared as sensitive.
#clean_df = PreProcessData.clean_df('data/adult.data')
ftu_loader = GenericDataLoader(
    ctgan_ftu_synth,
    target_column="label",
    sensitive_columns=["race","sex","native-country"],
)

# Preview the first rows held by the loader.
ftu_loader.dataframe().head()

Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,0,39,0,29,2,3,3,40,0,3,1
1,0,46,1,0,1,14,7,35,1,1,0
2,0,33,0,0,5,0,3,33,0,5,1
3,0,44,1,0,0,3,2,67,0,2,1
4,0,36,1,20,0,1,4,57,5,4,1


# Evaluate dpgan ftu for e=0.1

In [17]:
# Build a DPGAN plugin with epsilon=0.1 (10 iterations) and fit it on the
# FTU-mode CTGAN data. `plugin_dpgan` is imported in the top import cell.
ftu_dpgan_e01 = plugin_dpgan.DPGANPlugin(n_iter = 10, n_iter_min=1, epsilon=0.1)

ftu_dpgan_e01.fit(ftu_loader)

[2023-12-06T08:17:31.462531-0600][7452][INFO] Encoding race 2062340901651914458
[2023-12-06T08:17:31.503179-0600][7452][INFO] Encoding age 7970369381195217701
[2023-12-06T08:17:38.363703-0600][7452][INFO] Encoding sex 2653782988807700700
[2023-12-06T08:17:38.379325-0600][7452][INFO] Encoding native-country 7988686125322825174
[2023-12-06T08:17:46.360337-0600][7452][INFO] Encoding marital-status 3633256886493563371
[2023-12-06T08:17:46.374337-0600][7452][INFO] Encoding education 5486981979674986280
[2023-12-06T08:17:53.391947-0600][7452][INFO] Encoding occupation 7682068271151260256
[2023-12-06T08:17:57.010806-0600][7452][INFO] Encoding hours-per-week 4936022831509916494
[2023-12-06T08:18:01.452568-0600][7452][INFO] Encoding workclass 1081804396797573889
[2023-12-06T08:18:01.468195-0600][7452][INFO] Encoding relationship 4705601766387559297
[2023-12-06T08:18:01.482751-0600][7452][INFO] Encoding label 8356302148815017471
[2023-12-06T08:18:09.489312-0600][7452][INFO] Training GAN on devic

<synthcity.plugins.privacy.plugin_dpgan.DPGANPlugin at 0x1d083236370>

In [18]:
# Draw 75000 rows from the fitted epsilon=0.1 DPGAN and preview them.
ftu_dpgan_e01_df = ftu_dpgan_e01.generate(count=75000).dataframe()
ftu_dpgan_e01_df.head()

Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,3,17,0,40,6,15,0,68,2,3,0
1,3,17,1,15,6,15,2,78,5,1,0
2,3,17,0,20,6,3,10,78,1,0,0
3,1,30,0,26,5,2,0,61,2,5,0
4,3,17,0,26,6,3,2,71,1,3,0


In [23]:
# Print the metrics for the epsilon=0.1 DPGAN samples; the frame the DPGAN was
# fitted on (ftu_loader) is passed as the reference frame.
get_metrics("ftu_dpgan_e01", ftu_loader.dataframe(), ftu_dpgan_e01_df)

Statistics for dataset for mode: ftu_dpgan_e01
Precision: 0.17123935666982024
Recall: 0.9983452840595698
AUROC: 0.4997997084988306
FTU: 0.0016133333333333334
DP: 0.0013837180481198752


In [24]:
# Benchmark DPGAN (epsilon=0.1) on ftu_loader with the three privacy metrics,
# using 2 repeats and 75000 synthetic rows. `Benchmarks` is imported in the
# top import cell.
ftu_dpgan_e01_score = Benchmarks.evaluate(
    [("test_eps_0.1", "dpgan", {"epsilon": 0.1, "n_iter": 10, "n_iter_min": 1})],
    ftu_loader,
    synthetic_size=75000,
    repeats=2,
    metrics={
        'privacy': ['delta-presence', 'identifiability_score', 'DomiasMIA_prior']
    }
)

[2023-12-06T10:05:56.143486-0600][7452][INFO] Testcase : test_eps_0.1
[2023-12-06T10:05:56.153017-0600][7452][INFO] [testcase] Experiment repeat: 0 task type: classification Train df hash = 6103139906204711075
[2023-12-06T10:06:15.819518-0600][7452][INFO] Encoding race 8527038700282154753
[2023-12-06T10:06:15.835146-0600][7452][INFO] Encoding age 6087873872986842804
[2023-12-06T10:06:20.651815-0600][7452][INFO] Encoding sex 368158753243843503
[2023-12-06T10:06:20.651815-0600][7452][INFO] Encoding native-country 470440733214006603
[2023-12-06T10:06:26.342530-0600][7452][INFO] Encoding marital-status 6020789638600839186
[2023-12-06T10:06:26.355202-0600][7452][INFO] Encoding education 8506752458204124020
[2023-12-06T10:06:31.895574-0600][7452][INFO] Encoding occupation 6236357910713540756
[2023-12-06T10:06:34.311425-0600][7452][INFO] Encoding hours-per-week 535253690936423005
[2023-12-06T10:06:37.545511-0600][7452][INFO] Encoding workclass 3878359790808055015
[2023-12-06T10:06:37.561185-0

In [25]:
# Show the benchmark results for the epsilon=0.1 run.
Benchmarks.print(ftu_dpgan_e01_score)


[4m[1mPlugin : test_eps_0.1[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
privacy.delta-presence.score,966.99999,2090.999979,1528.999985,561.999994,1528.999985,561.999994,2,0,1.03
privacy.identifiability_score.score,0.013344,0.022047,0.017696,0.004351,0.017696,0.004351,2,0,16.68
privacy.identifiability_score.score_OC,0.025114,0.04053,0.032822,0.007708,0.032822,0.007708,2,0,16.68
privacy.DomiasMIA_prior.accuracy,0.499979,0.5,0.49999,0.001,0.49999,1e-05,2,0,346.6
privacy.DomiasMIA_prior.aucroc,0.412077,0.414749,0.413413,0.001336,0.413413,0.001336,2,0,346.6





# Evaluate dpgan ftu for e=1

In [26]:
# Evaluate DPGAN on the FTU-mode CTGAN data for epsilon=1:
# build the plugin (10 iterations) and fit it on ftu_loader.
ftu_dpgan_e1 = plugin_dpgan.DPGANPlugin(n_iter = 10, n_iter_min=1, epsilon=1)
ftu_dpgan_e1.fit(ftu_loader)

[2023-12-06T12:25:50.228877-0600][7452][INFO] Encoding race 2062340901651914458
[2023-12-06T12:25:50.249083-0600][7452][INFO] Encoding age 7970369381195217701
[2023-12-06T12:25:56.742707-0600][7452][INFO] Encoding sex 2653782988807700700
[2023-12-06T12:25:56.858199-0600][7452][INFO] Encoding native-country 7988686125322825174
[2023-12-06T12:26:05.832699-0600][7452][INFO] Encoding marital-status 3633256886493563371
[2023-12-06T12:26:05.854187-0600][7452][INFO] Encoding education 5486981979674986280
[2023-12-06T12:26:14.085762-0600][7452][INFO] Encoding occupation 7682068271151260256
[2023-12-06T12:26:17.832521-0600][7452][INFO] Encoding hours-per-week 4936022831509916494
[2023-12-06T12:26:22.697423-0600][7452][INFO] Encoding workclass 1081804396797573889
[2023-12-06T12:26:22.713086-0600][7452][INFO] Encoding relationship 4705601766387559297
[2023-12-06T12:26:22.736213-0600][7452][INFO] Encoding label 8356302148815017471
[2023-12-06T12:26:30.483745-0600][7452][INFO] Training GAN on devic

<synthcity.plugins.privacy.plugin_dpgan.DPGANPlugin at 0x1d0a2ea3b20>

In [27]:
# Draw 75000 rows from the fitted epsilon=1 DPGAN and preview them.
ftu_dpgan_e1_df = ftu_dpgan_e1.generate(count=75000).dataframe()
ftu_dpgan_e1_df.head()

Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,4,38,0,40,5,8,0,47,3,0,0
1,2,38,1,40,5,8,0,47,3,0,0
2,2,38,1,40,5,8,0,15,3,0,0
3,2,38,1,27,5,7,0,89,4,4,0
4,2,38,1,40,5,17,0,49,3,0,0


In [35]:
# Print the metrics for the epsilon=1 DPGAN samples; the frame the DPGAN was
# fitted on (ftu_loader) is passed as the reference frame.
get_metrics("ftu_dpgan_e1", ftu_loader.dataframe(), ftu_dpgan_e1_df)

Statistics for dataset for mode: ftu_dpgan_e1
Precision: 0.010563193375444556
Recall: 0.995
AUROC: 0.49865077878285075
FTU: 0.0012133333333333334
DP: 0.00318956438929352


In [36]:
# Benchmark DPGAN (epsilon=1) on ftu_loader with the three privacy metrics,
# using 2 repeats and 75000 synthetic rows.
ftu_dpgan_e1_score = Benchmarks.evaluate(
    [("test_eps_1", "dpgan", {"epsilon": 1, "n_iter": 10, "n_iter_min": 1})],
    ftu_loader,
    synthetic_size=75000,
    repeats=2,
    metrics={
        'privacy': ['delta-presence', 'identifiability_score', 'DomiasMIA_prior']
    }
)

[2023-12-06T14:04:52.500019-0600][7452][INFO] Testcase : test_eps_1
[2023-12-06T14:04:52.504024-0600][7452][INFO] [testcase] Experiment repeat: 0 task type: classification Train df hash = 6103139906204711075
[2023-12-06T14:04:52.673518-0600][7452][INFO] Encoding race 8527038700282154753
[2023-12-06T14:04:52.685150-0600][7452][INFO] Encoding age 6087873872986842804
[2023-12-06T14:04:58.203123-0600][7452][INFO] Encoding sex 368158753243843503
[2023-12-06T14:04:58.203509-0600][7452][INFO] Encoding native-country 470440733214006603
[2023-12-06T14:05:04.547558-0600][7452][INFO] Encoding marital-status 6020789638600839186
[2023-12-06T14:05:04.551751-0600][7452][INFO] Encoding education 8506752458204124020
[2023-12-06T14:05:10.457156-0600][7452][INFO] Encoding occupation 6236357910713540756
[2023-12-06T14:05:13.125128-0600][7452][INFO] Encoding hours-per-week 535253690936423005
[2023-12-06T14:05:16.416691-0600][7452][INFO] Encoding workclass 3878359790808055015
[2023-12-06T14:05:16.427918-060

In [37]:
# Show the benchmark results for the epsilon=1 run.
Benchmarks.print(ftu_dpgan_e1_score)


[4m[1mPlugin : test_eps_1[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
privacy.delta-presence.score,696.999998,2090.999979,1393.999988,696.999991,1393.999988,696.999991,2,0,0.96
privacy.identifiability_score.score,0.012723,0.020099,0.016411,0.003688,0.016411,0.003688,2,0,2.53
privacy.identifiability_score.score_OC,0.046332,0.053212,0.049772,0.00344,0.049772,0.00344,2,0,2.53
privacy.DomiasMIA_prior.accuracy,0.499959,0.499979,0.499969,0.001,0.499969,1e-05,2,0,347.21
privacy.DomiasMIA_prior.aucroc,0.348236,0.369448,0.358842,0.010606,0.358842,0.010606,2,0,347.21





# Evaluate dpgan ftu for e=10

In [39]:
# Build a DPGAN plugin with epsilon=10 (10 iterations) and fit it on the
# FTU-mode CTGAN data.
ftu_dpgan_e10 = plugin_dpgan.DPGANPlugin(n_iter = 10, n_iter_min=1, epsilon=10)
ftu_dpgan_e10.fit(ftu_loader)

[2023-12-06T16:33:12.972701-0600][7452][INFO] Encoding race 2062340901651914458
[2023-12-06T16:33:12.995196-0600][7452][INFO] Encoding age 7970369381195217701
[2023-12-06T16:33:20.010117-0600][7452][INFO] Encoding sex 2653782988807700700
[2023-12-06T16:33:20.035100-0600][7452][INFO] Encoding native-country 7988686125322825174
[2023-12-06T16:33:28.629035-0600][7452][INFO] Encoding marital-status 3633256886493563371
[2023-12-06T16:33:28.637978-0600][7452][INFO] Encoding education 5486981979674986280
[2023-12-06T16:33:35.964569-0600][7452][INFO] Encoding occupation 7682068271151260256
[2023-12-06T16:33:39.746238-0600][7452][INFO] Encoding hours-per-week 4936022831509916494
[2023-12-06T16:33:44.758529-0600][7452][INFO] Encoding workclass 1081804396797573889
[2023-12-06T16:33:44.774151-0600][7452][INFO] Encoding relationship 4705601766387559297
[2023-12-06T16:33:44.796543-0600][7452][INFO] Encoding label 8356302148815017471
[2023-12-06T16:33:53.085114-0600][7452][INFO] Training GAN on devic

<synthcity.plugins.privacy.plugin_dpgan.DPGANPlugin at 0x1d0a6ee18e0>

In [40]:
# Draw 75000 rows from the fitted epsilon=10 DPGAN and preview them.
ftu_dpgan_e10_df = ftu_dpgan_e10.generate(count=75000).dataframe()
ftu_dpgan_e10_df.head()

Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,4,38,1,20,4,15,14,15,5,1,0
1,4,38,1,20,4,3,14,15,5,1,0
2,4,38,1,19,4,3,14,15,5,1,0
3,4,38,1,19,4,3,14,15,2,1,0
4,3,39,1,20,3,3,11,15,2,4,0


In [41]:
# Print the metrics for the epsilon=10 DPGAN samples; the frame the DPGAN was
# fitted on (ftu_loader) is passed as the reference frame.
get_metrics("ftu_dpgan_e10", ftu_loader.dataframe(), ftu_dpgan_e10_df)

Statistics for dataset for mode: ftu_dpgan_e10
Precision: 0.008799601527478001
Recall: 0.9695121951219512
AUROC: 0.5057510159883626
FTU: 0.023906666666666666
DP: 0.025211269398056402


In [42]:
# Benchmark DPGAN (epsilon=10) on ftu_loader with the three privacy metrics
# and 75000 synthetic rows.
# NOTE(review): repeats=1 here, while the epsilon=0.1 and epsilon=1 benchmarks
# use repeats=2, so the stddev/iqr columns are not comparable across runs.
ftu_dpgan_e10_score = Benchmarks.evaluate(
    [("test_eps_10", "dpgan", {"epsilon": 10, "n_iter": 10, "n_iter_min": 1})],
    ftu_loader,
    synthetic_size=75000,
    repeats=1,
    metrics={
        'privacy': ['delta-presence', 'identifiability_score', 'DomiasMIA_prior']
    }
)

[2023-12-06T18:08:05.755818-0600][7452][INFO] Testcase : test_eps_10
[2023-12-06T18:08:05.796728-0600][7452][INFO] [testcase] Experiment repeat: 0 task type: classification Train df hash = 6103139906204711075
[2023-12-06T18:08:06.071243-0600][7452][INFO] Encoding race 8527038700282154753
[2023-12-06T18:08:06.099506-0600][7452][INFO] Encoding age 6087873872986842804
[2023-12-06T18:08:11.914890-0600][7452][INFO] Encoding sex 368158753243843503
[2023-12-06T18:08:11.914890-0600][7452][INFO] Encoding native-country 470440733214006603
[2023-12-06T18:08:18.733559-0600][7452][INFO] Encoding marital-status 6020789638600839186
[2023-12-06T18:08:18.749561-0600][7452][INFO] Encoding education 8506752458204124020
[2023-12-06T18:08:25.447785-0600][7452][INFO] Encoding occupation 6236357910713540756
[2023-12-06T18:08:28.211521-0600][7452][INFO] Encoding hours-per-week 535253690936423005
[2023-12-06T18:08:31.576929-0600][7452][INFO] Encoding workclass 3878359790808055015
[2023-12-06T18:08:31.593552-06

In [43]:
# Show the benchmark results for the epsilon=10 run.
Benchmarks.print(ftu_dpgan_e10_score)


[4m[1mPlugin : test_eps_10[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
privacy.delta-presence.score,102.294118,102.294118,102.294118,0.0,102.294118,0.0,1,0,1.3
privacy.identifiability_score.score,0.026937,0.026937,0.026937,0.0,0.026937,0.0,1,0,2.56
privacy.identifiability_score.score_OC,0.054372,0.054372,0.054372,0.0,0.054372,0.0,1,0,2.56
privacy.DomiasMIA_prior.accuracy,0.499979,0.499979,0.499979,0.0,0.499979,0.0,1,0,346.51
privacy.DomiasMIA_prior.aucroc,0.415778,0.415778,0.415778,0.0,0.415778,0.0,1,0,346.51





In [38]:
# #save csv

# orig_cf.to_csv('orig_ct.csv', index=False)
# ctgan_ftu_synth.to_csv('ctgan_ftu_synth.csv', index=False)
# ctgan_dp_synth.to_csv('ctgan_dp_synth.csv', index=False)
# ctgan_cf_synth.to_csv('ctgan_cf_synth.csv', index=False)


In [None]:
# TODO: re-run the benchmarks and the metrics: benchmark against the original test data, and compute the metrics with a classifier trained on the synthetic data.

In [63]:
def get_metrics2(mode,df,synthetic, max_group_size=7508, random_state=None):
    """Train a classifier on synthetic data and print its scores on the data in `df`.

    An MLP classifier is fitted on a 70% split of `synthetic` and then
    evaluated on rows taken from `df`. Precision, recall, AUROC, FTU and DP
    are printed; nothing is returned.

    Parameters
    ----------
    mode : str
        Label printed in the statistics header.
    df : pandas.DataFrame
        Evaluation data; must contain 'label', 'sex' and every feature column
        of `synthetic`.
    synthetic : pandas.DataFrame
        Training data for the classifier; must contain 'label'.
    max_group_size : int, optional
        Maximum number of rows per `sex` group used for the DP score.
    random_state : int or None, optional
        Seed forwarded to the split, the classifier and the row sampling;
        the default `None` keeps the runs unseeded.
    """
    # Fit on a 70% split of the synthetic data; the remaining 30% is not used here.
    traindf, _ = train_test_split(synthetic, test_size=0.3, random_state=random_state)
    feature_columns = [col for col in traindf.columns if col != 'label']
    X_train = traindf[feature_columns]
    y_train = traindf['label']

    clf_df = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',
                           learning_rate='constant', learning_rate_init=0.001,
                           random_state=random_state).fit(X_train, y_train)

    # Evaluation set drawn from `df`: 75% of the label==1 rows plus 25% of the label==0 rows.
    df_balanced = pd.concat(
        [
            df[df['label'] == 1].sample(frac=0.75, random_state=random_state),
            df[df['label'] == 0].sample(frac=0.25, random_state=random_state),
        ],
        ignore_index=True,
    )

    # Select the features by name, in the order used at fit time, so that a
    # different column order in `df` cannot feed values into the wrong inputs.
    X_eval = df_balanced[feature_columns]
    y_eval = df_balanced['label']
    y_pred_eval = clf_df.predict(X_eval)

    # FTU: change in prediction when only the `sex` column is flipped for every row of `df`.
    pos = clf_df.predict(df.assign(sex=0)[feature_columns])
    neg = clf_df.predict(df.assign(sex=1)[feature_columns])
    FTU = np.abs(np.mean(pos-neg))

    # DP: difference in mean prediction between the two `sex` groups,
    # each capped at `max_group_size` rows.
    x_pos_eval = df_balanced.loc[df_balanced['sex'] == 0, feature_columns].iloc[:max_group_size]
    x_neg_eval = df_balanced.loc[df_balanced['sex'] == 1, feature_columns].iloc[:max_group_size]
    pred_pos_eval = clf_df.predict(x_pos_eval)
    pred_neg_eval = clf_df.predict(x_neg_eval)
    DP = np.mean(pred_pos_eval)-np.mean(pred_neg_eval)

    # Print the obtained statistics
    # NOTE(review): AUROC is computed from hard class predictions, not from
    # predicted probabilities — confirm this matches how get_metrics computes it.
    print('Statistics for dataset for mode:',mode)
    print('Precision:',precision_score(y_eval, y_pred_eval, average='binary'))
    print('Recall:',recall_score(y_eval, y_pred_eval, average='binary'))
    print('AUROC:',roc_auc_score(y_eval, y_pred_eval))
    print('FTU:',FTU)
    print('DP:',DP)


# Reload the cleaned adult data and split it; test_size=0.80 puts 80% of the
# rows in the test part.
clean_orig_df = PreProcessData.clean_df('data/adult.data')
traindf_orig_df, test_orig_df = train_test_split(clean_orig_df, test_size=0.80)

# Wrap the test part of the original data in a synthcity data loader, with the
# same target and sensitive columns as ftu_loader.
#clean_df = PreProcessData.clean_df('data/adult.data')
test_loader = GenericDataLoader(
    test_orig_df,
    target_column="label",
    sensitive_columns=["race","sex","native-country"],
)

#test_loader.dataframe().head()

In [62]:
# CTGAN_FTU,DP,CF metrics 2
# Classifier trained on the FTU-mode synthetic data, scored on the cleaned original data.
get_metrics2(" ctgan_ftu_synth metric 2", clean_orig_df, ctgan_ftu_synth)
# get_metrics2(" ctgan_dp_synth metric 2", clean_orig_df, ctgan_dp_synth)
# get_metrics2(" ctgan_cf_synth metric 2", clean_orig_df, ctgan_cf_synth)

Statistics for dataset for mode:  ctgan_ftu_synth metric 2
Precision: 0.9005141251921344
Recall: 1.0
AUROC: 0.5
FTU: 3.315430011272462e-05
DP: 0.0


In [64]:
#DPGAN_FTU,DP, CF benchmarks against test = orig
# Same epsilon=0.1 benchmark as before, but with test_loader (original data)
# passed after ftu_loader, a single repeat, and the metric cache disabled.
ftu_dpgan_e01_score2 = Benchmarks.evaluate(
    [("test_eps_0.1", "dpgan", {"epsilon": 0.1, "n_iter": 10, "n_iter_min": 1})],
    ftu_loader,
    test_loader,
    synthetic_size=75000,
    repeats=1,
    metrics={
        'privacy': ['delta-presence', 'identifiability_score', 'DomiasMIA_prior']
    },
    use_metric_cache=False
)
Benchmarks.print(ftu_dpgan_e01_score2)

[2023-12-06T19:43:21.657487-0600][7452][INFO] Testcase : test_eps_0.1
[2023-12-06T19:43:21.671685-0600][7452][INFO] [testcase] Experiment repeat: 0 task type: classification Train df hash = 6103139906204711075



[4m[1mPlugin : test_eps_0.1[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
privacy.delta-presence.score,629.499999,629.499999,629.499999,0.0,629.499999,0.0,1,0,0.97
privacy.identifiability_score.score,0.008579,0.008579,0.008579,0.0,0.008579,0.0,1,0,35.89
privacy.identifiability_score.score_OC,0.02039,0.02039,0.02039,0.0,0.02039,0.0,1,0,35.89
privacy.DomiasMIA_prior.accuracy,0.500021,0.500021,0.500021,0.0,0.500021,0.0,1,0,349.47
privacy.DomiasMIA_prior.aucroc,0.501266,0.501266,0.501266,0.0,0.501266,0.0,1,0,349.47





In [65]:
# Epsilon=1 benchmark with test_loader (original data) passed after
# ftu_loader, a single repeat, and the metric cache disabled.
ftu_dpgan_e1_score2 = Benchmarks.evaluate(
    [("test_eps_1", "dpgan", {"epsilon": 1, "n_iter": 10, "n_iter_min": 1})],
    ftu_loader,
    test_loader,
    synthetic_size=75000,
    repeats=1,
    metrics={
        'privacy': ['delta-presence', 'identifiability_score', 'DomiasMIA_prior']
    },
    use_metric_cache=False
)
Benchmarks.print(ftu_dpgan_e1_score2)

[2023-12-06T19:57:03.043738-0600][7452][INFO] Testcase : test_eps_1
[2023-12-06T19:57:03.047551-0600][7452][INFO] [testcase] Experiment repeat: 0 task type: classification Train df hash = 6103139906204711075



[4m[1mPlugin : test_eps_1[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
privacy.delta-presence.score,2048.49999,2048.49999,2048.49999,0.0,2048.49999,0.0,1,0,0.91
privacy.identifiability_score.score,0.005802,0.005802,0.005802,0.0,0.005802,0.0,1,0,32.04
privacy.identifiability_score.score_OC,0.020017,0.020017,0.020017,0.0,0.020017,0.0,1,0,32.04
privacy.DomiasMIA_prior.accuracy,0.500041,0.500041,0.500041,0.0,0.500041,0.0,1,0,348.66
privacy.DomiasMIA_prior.aucroc,0.675551,0.675551,0.675551,0.0,0.675551,0.0,1,0,348.66





In [66]:
# Epsilon=10 benchmark with test_loader (original data) passed after
# ftu_loader, a single repeat, and the metric cache disabled.
ftu_dpgan_e10_score2 = Benchmarks.evaluate(
    [("test_eps_10", "dpgan", {"epsilon": 10, "n_iter": 10, "n_iter_min": 1})],
    ftu_loader,
    test_loader,
    synthetic_size=75000,
    repeats=1,
    metrics={
        'privacy': ['delta-presence', 'identifiability_score', 'DomiasMIA_prior']
    },
    use_metric_cache=False
)
Benchmarks.print(ftu_dpgan_e10_score2)

[2023-12-06T20:03:25.016934-0600][7452][INFO] Testcase : test_eps_10
[2023-12-06T20:03:25.026263-0600][7452][INFO] [testcase] Experiment repeat: 0 task type: classification Train df hash = 6103139906204711075



[4m[1mPlugin : test_eps_10[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
privacy.delta-presence.score,209.833333,209.833333,209.833333,0.0,209.833333,0.0,1,0,0.88
privacy.identifiability_score.score,0.014173,0.014173,0.014173,0.0,0.014173,0.0,1,0,32.93
privacy.identifiability_score.score_OC,0.034563,0.034563,0.034563,0.0,0.034563,0.0,1,0,32.93
privacy.DomiasMIA_prior.accuracy,0.500062,0.500062,0.500062,0.0,0.500062,0.0,1,0,348.02
privacy.DomiasMIA_prior.aucroc,0.762871,0.762871,0.762871,0.0,0.762871,0.0,1,0,348.02



