In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import precision_score, recall_score, roc_auc_score

from table_evaluator import TableEvaluator
from ctgan import CTGAN

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

import os.path

In [3]:
'''
Cleans the dataset for the CTGAN experiment
'''

def clean_df(direc):
    names = [
            "age",
            "workclass",
            "fnlwgt",
            "education",
            "education-num",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "capital-gain",
            "capital-loss",
            "hours-per-week",
            "native-country",
            "label",
        ]
    df = pd.read_csv(direc, names=names, index_col=False)
    df = df.applymap(lambda x: x.strip() if type(x) is str else x)

    for col in df:
        if df[col].dtype == "object":
            df = df[df[col] != "?"]

    replace = [
        [
            "Private",
            "Self-emp-not-inc",
            "Self-emp-inc",
            "Federal-gov",
            "Local-gov",
            "State-gov",
            "Without-pay",
            "Never-worked",
        ],
        [
            "Bachelors",
            "Some-college",
            "11th",
            "HS-grad",
            "Prof-school",
            "Assoc-acdm",
            "Assoc-voc",
            "9th",
            "7th-8th",
            "12th",
            "Masters",
            "1st-4th",
            "10th",
            "Doctorate",
            "5th-6th",
            "Preschool",
        ],
        [
            "Married-civ-spouse",
            "Divorced",
            "Never-married",
            "Separated",
            "Widowed",
            "Married-spouse-absent",
            "Married-AF-spouse",
        ],
        [
            "Tech-support",
            "Craft-repair",
            "Other-service",
            "Sales",
            "Exec-managerial",
            "Prof-specialty",
            "Handlers-cleaners",
            "Machine-op-inspct",
            "Adm-clerical",
            "Farming-fishing",
            "Transport-moving",
            "Priv-house-serv",
            "Protective-serv",
            "Armed-Forces",
        ],
        [
            "Wife",
            "Own-child",
            "Husband",
            "Not-in-family",
            "Other-relative",
            "Unmarried",
        ],
        ["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"],
        ["Female", "Male"],
        [
            "United-States",
            "Cambodia",
            "England",
            "Puerto-Rico",
            "Canada",
            "Germany",
            "Outlying-US(Guam-USVI-etc)",
            "India",
            "Japan",
            "Greece",
            "South",
            "China",
            "Cuba",
            "Iran",
            "Honduras",
            "Philippines",
            "Italy",
            "Poland",
            "Jamaica",
            "Vietnam",
            "Mexico",
            "Portugal",
            "Ireland",
            "France",
            "Dominican-Republic",
            "Laos",
            "Ecuador",
            "Taiwan",
            "Haiti",
            "Columbia",
            "Hungary",
            "Guatemala",
            "Nicaragua",
            "Scotland",
            "Thailand",
            "Yugoslavia",
            "El-Salvador",
            "Trinadad&Tobago",
            "Peru",
            "Hong",
            "Holand-Netherlands",
        ],
        [">50K", "<=50K"],
    ]

    for row in replace:
        df = df.replace(row, range(len(row)))

    # Create a dataframe to store the synthetic data
    df = df[['race','age','sex','native-country','marital-status','education','occupation','hours-per-week','workclass','relationship','label']]
    return df

In [4]:
discrete_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'label'
]

# Node order contains the order in which to generate the data, starting with the root nodes
node_order = [['race','age','sex','native-country'],['marital-status'],['education'],['occupation','hours-per-week','workclass','relationship'],['label']]
node_order_nl = ['race','age','sex','native-country','marital-status','education','occupation','hours-per-week','workclass','relationship','label']

# List of connections; key is receiving node
node_connections_normal = {'label':['occupation','race','hours-per-week','age','marital-status','education','sex','workclass','native-country','relatinship'],
                    'occupation':['race','age','sex','marital-status','education'],
                    'hours-per-week':['race','age','marital-status','native-country','education','sex'],
                    'workclass':['age','marital-status','sex','education','native-country'],
                    'relationship':['marital-status','education','age','sex','native-country'],
                    'education':['race','age','marital-status','sex','native-country'],
                    'marital-status':['race','age','sex','native-country']
                    }

'''
Connections are removed according to the privacy criterion
'''
node_connections_FTU = {'label':['occupation','race','hours-per-week','age','marital-status','education','workclass','native-country','relationship'],
                    'occupation':['race','age','sex','marital-status','education'],
                    'hours-per-week':['race','age','marital-status','native-country','education','sex'],
                    'workclass':['age','marital-status','sex','education','native-country'],
                    'relationship':['marital-status','education','age','sex','native-country'],
                    'education':['race','age','marital-status','sex','native-country'],
                    'marital-status':['race','age','sex','native-country']
                    }

node_connections_DP = {'label':['race','age','native-country'],
                    'occupation':['race','age','sex','marital-status','education'],
                    'hours-per-week':['race','age','marital-status','native-country','education','sex'],
                    'workclass':['age','marital-status','sex','education','native-country'],
                    'relationship':['marital-status','education','age','sex','native-country'],
                    'education':['race','age','marital-status','sex','native-country'],
                    'marital-status':['race','age','sex','native-country']
                    }

node_connections_CF = {'label':['occupation','race','hours-per-week','age','education','workclass','native-country',],
                    'occupation':['race','age','sex','marital-status','education'],
                    'hours-per-week':['race','age','marital-status','native-country','education','sex'],
                    'workclass':['age','marital-status','sex','education','native-country'],
                    'relationship':['marital-status','education','age','sex','native-country'],
                    'education':['race','age','marital-status','sex','native-country'],
                    'marital-status':['race','age','sex','native-country']
                    }

In [15]:
@ignore_warnings(category=ConvergenceWarning)
def generate_data(df,mode):
    print("Generating Data for CTGAN-",mode,"...")
    ctgan = CTGAN(epochs=10, verbose=True)
    # How much more data the synthetic dataset should contain that the OG data (This is to ensure we can
    # take a sample that looks like the original data)
    factor = 50
    
    # Define the privacy measure
    if mode == 'FTU':
        node_connections = node_connections_FTU
    elif mode == 'DP':
        node_connections = node_connections_DP
    elif mode == 'CF':
        node_connections = node_connections_CF
    else:
        print('Mode is not correct!')
    
    model_name = 'CTGANrootnodes' + str(mode) + '.pkl'
    path = 'Models/' + model_name
    if os.path.isfile(path):
        ctgan = ctgan.load(path)
    else:
        # DF to fit the first model on
        start_df = df[['race','age','sex','native-country']]
        temp_discrete = ['race','age','sex','native-country']

        print("Fitting root nodes...")
        ctgan.fit(start_df, temp_discrete)
        ctgan.save('Models/'+model_name)
    
    print("Sampling root...")
    synth_df = ctgan.sample(factor * len(df.index))
    #print('Initial nodes loaded for mode',mode)
    
    # Iteratively generate the data
    for node in node_order_nl:
         # If the node has not been generated yet
        if node not in synth_df.columns:
            # Grab the old data
            empty_df = df[[node]]

            # Grab the attributes that need to be looked at when generating data
            if node in node_connections.keys():
                attributes = node_connections[node]
            else:
                attributes = []
                for n in node_order_nl:
                    attributes.append(n)
                    if n == node:
                        break
                            
            model_name = 'CTGAN' + str(node) + str(mode) + '.pkl'
            path = 'Models/' + model_name
            if os.path.isfile(path):
                ctgan = ctgan.load(path)
                print("Sampling for node ", node, "...")
                generated_data = ctgan.sample(len(synth_df.index))
            else:
                # Grab the attributes from the final df
                gen_df = synth_df.loc[:,synth_df.columns.isin(attributes)]

                # Add the old attribute to the current dataframe
                at = df[attributes]
                empty_df = empty_df.join(at)

                temp_discrete = []
                for d in discrete_columns:
                    if d in gen_df.columns:
                        temp_discrete.append(d)

                print("Fitting for node ", node, "...")
                ctgan.fit(empty_df, temp_discrete)

                model_name = str(node) + str(mode)
                ctgan.save('Models/CTGAN' + model_name + '.pkl')
                print("Sampling for node ", node, "...")
                generated_data = ctgan.sample(len(synth_df.index))
                
            # Add the generated data to the output
            for attribute in attributes + [node]:
                if attribute not in synth_df.columns:
                    synth_df[attribute] = generated_data[attribute].values
            #print('Finished node',node,'for',mode)
    # Finally, we have to manually add the label
    return synth_df

In [7]:
def get_metrics(mode,df,synthetic):

    # Split the data into train,test
    traindf, testdf = train_test_split(df, test_size=0.3)
    X_train = traindf.loc[:, traindf.columns != 'label']
    y_train = traindf['label']
    X_test = testdf.loc[:, testdf.columns != 'label']
    y_test = testdf['label']

    clf_df = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',
                                     learning_rate='constant', learning_rate_init=0.001).fit(X_train, y_train)
    '''
    SYNTHETIC DATASET
    '''
    # Make sure the data is representative of the original dataset
    synthetic_balanced_1 = synthetic[synthetic.label == 1].sample(22654)
    synthetic_balanced_0 = synthetic[synthetic.label == 0].sample(7508)
    #synthetic_balanced = synthetic_balanced_1.append(synthetic_balanced_0)
    synthetic_balanced = pd.concat([synthetic_balanced_1, synthetic_balanced_0], ignore_index=True)

    # Split the data into train,test
    X_syn = synthetic_balanced.loc[:, synthetic_balanced.columns != 'label']
    y_syn = synthetic_balanced['label']

    y_pred_syn = clf_df.predict(X_syn)

    synthetic_pos = synthetic.assign(sex=0)
    synthetic_neg = synthetic.assign(sex=1)
    
    x_pos_syn = synthetic_balanced[synthetic_balanced['sex'] == 0].drop(['label'], axis = 1)[:7508]
    x_neg_syn = synthetic_balanced[synthetic_balanced['sex'] == 1].drop(['label'], axis = 1)[:7508]
    
    pos = clf_df.predict(synthetic_pos.drop('label',axis=1))
    neg = clf_df.predict(synthetic_neg.drop('label',axis=1))

    pred_pos_syn = clf_df.predict(x_pos_syn)
    pred_neg_syn = clf_df.predict(x_neg_syn)
    
    FTU = np.abs(np.mean(pos-neg))
    DP = np.mean(pred_pos_syn)-np.mean(pred_neg_syn)
    
    # Print the obtained statistics
    print('Statistics for dataset for mode:',mode)
    print('Precision:',precision_score(y_syn, y_pred_syn, average='binary'))
    print('Recall:',recall_score(y_syn, y_pred_syn, average='binary'))
    print('AUROC:',roc_auc_score(y_syn, y_pred_syn))
    print('FTU:',FTU)
    print('DP:',DP)

In [8]:
'''
FTU results for adult dataset
'''
def run_experiment_CTGAN(mode):
    # Generate the synthetic a data
    df = clean_df('data/adult.data')
    synthetic = generate_data(df,mode)
    get_metrics(mode,df,synthetic)
    return df, synthetic

In [9]:
#Main imports
import pytest
from typing import Tuple
import pytorch_lightning as pl
from xgboost import XGBClassifier



In [10]:
#Add files to sys 
import sys
sys.path.append(os.getcwd())

In [11]:

from tests.utils import load_adult
from tests.test_decaf import test_run_experiments

X, y, df = load_adult()

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country   label  
0          2174             0              40   United-States   <=50

In [12]:
# %run ./CTGAN/FACT_GAN.ipynb

# Run the experiments with the three privacy definitions
orig_ftu, ctgan_ftu_synth = run_experiment_CTGAN('FTU')
# run_experiment_CTGAN('CF') 
# run_experiment_CTGAN('DP')

Generating Data for CTGAN- FTU ...
Sampling root...
Statistics for dataset for mode: FTU
Precision: 0.7501313439108963
Recall: 0.9453959565639622
AUROC: 0.4976047443981239
FTU: 0.08495723095285458
DP: 0.08564198188598826


In [13]:
orig_ftu.info()
orig_ftu.head()

<class 'pandas.core.frame.DataFrame'>
Index: 30162 entries, 0 to 32560
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   race            30162 non-null  int64
 1   age             30162 non-null  int64
 2   sex             30162 non-null  int64
 3   native-country  30162 non-null  int64
 4   marital-status  30162 non-null  int64
 5   education       30162 non-null  int64
 6   occupation      30162 non-null  int64
 7   hours-per-week  30162 non-null  int64
 8   workclass       30162 non-null  int64
 9   relationship    30162 non-null  int64
 10  label           30162 non-null  int64
dtypes: int64(11)
memory usage: 2.8 MB


Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,0,39,1,0,2,0,8,40,5,3,1
1,0,50,1,0,0,0,4,13,1,2,1
2,0,38,1,0,1,3,6,40,0,3,1
3,4,53,1,0,0,2,6,40,0,2,1
4,4,28,0,12,0,0,5,40,0,0,1


In [14]:
ctgan_ftu_synth.info()
ctgan_ftu_synth.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1508100 entries, 0 to 1508099
Data columns (total 11 columns):
 #   Column          Non-Null Count    Dtype
---  ------          --------------    -----
 0   race            1508100 non-null  int64
 1   age             1508100 non-null  int64
 2   sex             1508100 non-null  int64
 3   native-country  1508100 non-null  int64
 4   marital-status  1508100 non-null  int64
 5   education       1508100 non-null  int64
 6   occupation      1508100 non-null  int64
 7   hours-per-week  1508100 non-null  int64
 8   workclass       1508100 non-null  int64
 9   relationship    1508100 non-null  int64
 10  label           1508100 non-null  int64
dtypes: int64(11)
memory usage: 126.6 MB


Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,1,55,1,0,2,4,4,40,0,3,1
1,0,30,1,0,1,10,5,15,0,2,1
2,4,39,1,0,0,10,9,15,0,5,1
3,0,27,1,0,2,3,5,12,0,3,1
4,0,47,1,0,2,6,3,48,0,3,0


In [16]:
orig_dp, ctgan_dp_synth = run_experiment_CTGAN('DP')

Generating Data for CTGAN- DP ...
Fitting root nodes...


Gen. (-0.02) | Discrim. (-0.04): 100%|██████████| 10/10 [01:16<00:00,  7.65s/it]


Sampling root...
Fitting for node  marital-status ...


Gen. (-0.76) | Discrim. (0.04): 100%|██████████| 10/10 [00:52<00:00,  5.24s/it]


Sampling for node  marital-status ...
Fitting for node  education ...


Gen. (-0.83) | Discrim. (0.07): 100%|██████████| 10/10 [00:54<00:00,  5.48s/it]


Sampling for node  education ...
Fitting for node  occupation ...


Gen. (-0.93) | Discrim. (0.02): 100%|██████████| 10/10 [00:45<00:00,  4.56s/it]


Sampling for node  occupation ...
Fitting for node  hours-per-week ...


Gen. (-0.70) | Discrim. (0.04): 100%|██████████| 10/10 [01:02<00:00,  6.25s/it]


Sampling for node  hours-per-week ...
Fitting for node  workclass ...


Gen. (-1.00) | Discrim. (-0.03): 100%|██████████| 10/10 [00:59<00:00,  5.90s/it]


Sampling for node  workclass ...
Fitting for node  relationship ...


Gen. (-0.69) | Discrim. (0.01): 100%|██████████| 10/10 [00:58<00:00,  5.81s/it]


Sampling for node  relationship ...
Fitting for node  label ...


Gen. (-0.58) | Discrim. (-0.04): 100%|██████████| 10/10 [00:50<00:00,  5.07s/it]


Sampling for node  label ...
Statistics for dataset for mode: DP
Precision: 0.7511004947961099
Recall: 0.9716164915688179
AUROC: 0.5000597108882981
FTU: 0.04346595053378423
DP: 0.04395311667554602


In [17]:
orig_dp.info()
orig_dp.head()

<class 'pandas.core.frame.DataFrame'>
Index: 30162 entries, 0 to 32560
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   race            30162 non-null  int64
 1   age             30162 non-null  int64
 2   sex             30162 non-null  int64
 3   native-country  30162 non-null  int64
 4   marital-status  30162 non-null  int64
 5   education       30162 non-null  int64
 6   occupation      30162 non-null  int64
 7   hours-per-week  30162 non-null  int64
 8   workclass       30162 non-null  int64
 9   relationship    30162 non-null  int64
 10  label           30162 non-null  int64
dtypes: int64(11)
memory usage: 3.8 MB


Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,0,39,1,0,2,0,8,40,5,3,1
1,0,50,1,0,0,0,4,13,1,2,1
2,0,38,1,0,1,3,6,40,0,3,1
3,4,53,1,0,0,2,6,40,0,2,1
4,4,28,0,12,0,0,5,40,0,0,1


In [18]:
ctgan_dp_synth.info()
ctgan_dp_synth.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1508100 entries, 0 to 1508099
Data columns (total 11 columns):
 #   Column          Non-Null Count    Dtype
---  ------          --------------    -----
 0   race            1508100 non-null  int64
 1   age             1508100 non-null  int64
 2   sex             1508100 non-null  int64
 3   native-country  1508100 non-null  int64
 4   marital-status  1508100 non-null  int64
 5   education       1508100 non-null  int64
 6   occupation      1508100 non-null  int64
 7   hours-per-week  1508100 non-null  int64
 8   workclass       1508100 non-null  int64
 9   relationship    1508100 non-null  int64
 10  label           1508100 non-null  int64
dtypes: int64(11)
memory usage: 126.6 MB


Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,0,23,0,0,0,5,0,50,2,2,1
1,0,23,1,0,2,0,5,40,0,3,1
2,0,39,1,0,2,3,4,22,0,2,1
3,1,45,0,0,2,10,1,34,0,3,1
4,0,29,0,0,0,13,10,44,0,4,1


In [19]:
orig_cf, ctgan_cf_synth = run_experiment_CTGAN('CF')

Generating Data for CTGAN- CF ...
Fitting root nodes...


Gen. (0.23) | Discrim. (0.04): 100%|██████████| 10/10 [01:13<00:00,  7.31s/it]


Sampling root...
Fitting for node  marital-status ...


Gen. (-0.68) | Discrim. (-0.09): 100%|██████████| 10/10 [00:50<00:00,  5.03s/it]


Sampling for node  marital-status ...
Fitting for node  education ...


Gen. (-0.61) | Discrim. (-0.00): 100%|██████████| 10/10 [00:53<00:00,  5.37s/it]


Sampling for node  education ...
Fitting for node  occupation ...


Gen. (-1.02) | Discrim. (0.01): 100%|██████████| 10/10 [00:44<00:00,  4.47s/it]


Sampling for node  occupation ...
Fitting for node  hours-per-week ...


Gen. (-0.82) | Discrim. (-0.04): 100%|██████████| 10/10 [00:58<00:00,  5.89s/it]


Sampling for node  hours-per-week ...
Fitting for node  workclass ...


Gen. (-0.59) | Discrim. (0.05): 100%|██████████| 10/10 [00:56<00:00,  5.62s/it]


Sampling for node  workclass ...
Fitting for node  relationship ...


Gen. (-0.51) | Discrim. (0.03): 100%|██████████| 10/10 [00:55<00:00,  5.57s/it]


Sampling for node  relationship ...
Fitting for node  label ...


Gen. (-0.97) | Discrim. (-0.01): 100%|██████████| 10/10 [01:04<00:00,  6.44s/it]


Sampling for node  label ...
Statistics for dataset for mode: CF
Precision: 0.7507642073086008
Recall: 0.9540478502692681
AUROC: 0.4992002703663868
FTU: 0.042562164312711356
DP: 0.04555141182738409


In [20]:
orig_cf.info()
orig_cf.head()

<class 'pandas.core.frame.DataFrame'>
Index: 30162 entries, 0 to 32560
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   race            30162 non-null  int64
 1   age             30162 non-null  int64
 2   sex             30162 non-null  int64
 3   native-country  30162 non-null  int64
 4   marital-status  30162 non-null  int64
 5   education       30162 non-null  int64
 6   occupation      30162 non-null  int64
 7   hours-per-week  30162 non-null  int64
 8   workclass       30162 non-null  int64
 9   relationship    30162 non-null  int64
 10  label           30162 non-null  int64
dtypes: int64(11)
memory usage: 3.8 MB


Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,0,39,1,0,2,0,8,40,5,3,1
1,0,50,1,0,0,0,4,13,1,2,1
2,0,38,1,0,1,3,6,40,0,3,1
3,4,53,1,0,0,2,6,40,0,2,1
4,4,28,0,12,0,0,5,40,0,0,1


In [21]:
ctgan_cf_synth.info()
ctgan_cf_synth.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1508100 entries, 0 to 1508099
Data columns (total 11 columns):
 #   Column          Non-Null Count    Dtype
---  ------          --------------    -----
 0   race            1508100 non-null  int64
 1   age             1508100 non-null  int64
 2   sex             1508100 non-null  int64
 3   native-country  1508100 non-null  int64
 4   marital-status  1508100 non-null  int64
 5   education       1508100 non-null  int64
 6   occupation      1508100 non-null  int64
 7   hours-per-week  1508100 non-null  int64
 8   workclass       1508100 non-null  int64
 9   relationship    1508100 non-null  int64
 10  label           1508100 non-null  int64
dtypes: int64(11)
memory usage: 126.6 MB


Unnamed: 0,race,age,sex,native-country,marital-status,education,occupation,hours-per-week,workclass,relationship,label
0,0,46,1,0,0,10,4,43,0,2,1
1,0,39,0,20,1,12,2,49,0,3,1
2,0,34,1,0,0,2,9,40,0,2,1
3,4,39,1,0,2,3,6,40,0,0,1
4,0,44,1,0,0,14,5,31,0,5,1
