In [1]:
#Main imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.neural_network import MLPClassifier

import pytorch_lightning as pl
from xgboost import XGBClassifier

import pytest
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import networkx as nx

from typing import Tuple

ModuleNotFoundError: No module named 'pytorch_lightning'

## Import dataset (adult.data)

In [None]:
# from google.colab import drive
# drive.mount('/gdrive')

In [None]:
# Location of the raw Adult census file, relative to the notebook's working
# directory. On Colab with Drive mounted, use '../gdrive/MyDrive/adult.data'.
adult_dir = "data/adult.data"

In [None]:
# adult.data has no header row, so the column names are supplied explicitly.
names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "label",
]

# The text-valued columns, listed in the same order as the vocabularies in
# `replace` below (categorical_columns[i] is encoded with replace[i]).
categorical_columns = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
    "label",
]

# One vocabulary per categorical column. A value's position inside its list is
# the integer code it is replaced with (e.g. ">50K" -> 0, "<=50K" -> 1).
replace = [
    [
        "Private",
        "Self-emp-not-inc",
        "Self-emp-inc",
        "Federal-gov",
        "Local-gov",
        "State-gov",
        "Without-pay",
        "Never-worked",
    ],
    [
        "Bachelors",
        "Some-college",
        "11th",
        "HS-grad",
        "Prof-school",
        "Assoc-acdm",
        "Assoc-voc",
        "9th",
        "7th-8th",
        "12th",
        "Masters",
        "1st-4th",
        "10th",
        "Doctorate",
        "5th-6th",
        "Preschool",
    ],
    [
        "Married-civ-spouse",
        "Divorced",
        "Never-married",
        "Separated",
        "Widowed",
        "Married-spouse-absent",
        "Married-AF-spouse",
    ],
    [
        "Tech-support",
        "Craft-repair",
        "Other-service",
        "Sales",
        "Exec-managerial",
        "Prof-specialty",
        "Handlers-cleaners",
        "Machine-op-inspct",
        "Adm-clerical",
        "Farming-fishing",
        "Transport-moving",
        "Priv-house-serv",
        "Protective-serv",
        "Armed-Forces",
    ],
    [
        "Wife",
        "Own-child",
        "Husband",
        "Not-in-family",
        "Other-relative",
        "Unmarried",
    ],
    ["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"],
    ["Female", "Male"],
    [
        "United-States",
        "Cambodia",
        "England",
        "Puerto-Rico",
        "Canada",
        "Germany",
        "Outlying-US(Guam-USVI-etc)",
        "India",
        "Japan",
        "Greece",
        "South",
        "China",
        "Cuba",
        "Iran",
        "Honduras",
        "Philippines",
        "Italy",
        "Poland",
        "Jamaica",
        "Vietnam",
        "Mexico",
        "Portugal",
        "Ireland",
        "France",
        "Dominican-Republic",
        "Laos",
        "Ecuador",
        "Taiwan",
        "Haiti",
        "Columbia",
        "Hungary",
        "Guatemala",
        "Nicaragua",
        "Scotland",
        "Thailand",
        "Yugoslavia",
        "El-Salvador",
        "Trinadad&Tobago",
        "Peru",
        "Hong",
        "Holand-Netherlands",
    ],
    [">50K", "<=50K"],
]

# Fixed seed so the train/test split is the same on every Restart & Run All.
RANDOM_STATE = 42

df = pd.read_csv(adult_dir, names=names, index_col=False)

# Remove the padding around the text fields. Done per column with the .str
# accessor instead of DataFrame.applymap, which newer pandas deprecates.
for col in categorical_columns:
    df[col] = df[col].str.strip()

# Drop every record that has "?" in any categorical column.
# .copy() detaches the filtered frame so the column assignments below do not
# trigger SettingWithCopyWarning.
has_unknown = df[categorical_columns].eq("?").any(axis=1)
df = df.loc[~has_unknown].copy()

# Integer-encode each categorical column with its own vocabulary. Mapping per
# column (rather than a frame-wide replace) keeps a vocabulary from touching
# other columns and yields an explicit int64 dtype instead of relying on
# pandas' implicit downcast of object columns.
for col, levels in zip(categorical_columns, replace):
    codes = {level: code for code, level in enumerate(levels)}
    unexpected = set(df[col].unique()) - set(codes)
    if unexpected:
        # Fail here with a clear message rather than later inside a model fit.
        raise ValueError(
            f"Column {col!r} contains values missing from its vocabulary: "
            f"{sorted(map(str, unexpected))}"
        )
    df[col] = df[col].map(codes).astype("int64")

index = df.index
print('TOTAL DATAPOINTS AFTER CLEANING:', len(index))

# Split the data into train,test (70% / 30%), then separate features and label.
train, test = train_test_split(df, test_size=0.3, random_state=RANDOM_STATE)
X_train = train.drop(columns='label')
y_train = train['label']
X_test = test.drop(columns='label')
y_test = test['label']

# fig, ax = plt.subplots(5, 3, figsize = (30, 20))
# fig.tight_layout(pad = 2.0)

# ax[0,0].hist(df['age'])
# ax[0,0].set_title('age')
# ax[0,1].hist(df['workclass'])
# ax[0,1].set_title('workclass')
# ax[0,2].hist(df['fnlwgt'])
# ax[0,2].set_title('fnlwgt')

# ax[1,0].hist(df['education'])
# ax[1,0].set_title('education')
# ax[1,1].hist(df['education-num'])
# ax[1,1].set_title('education-num')
# ax[1,2].hist(df['marital-status'])
# ax[1,2].set_title('marital-status')

# ax[2,0].hist(df['occupation'])
# ax[2,0].set_title('occupation')
# ax[2,1].hist(df['relationship'])
# ax[2,1].set_title('relationship')
# ax[2,2].hist(df['race'])
# ax[2,2].set_title('race')

# ax[3,0].hist(df['sex'])
# ax[3,0].set_title('sex')
# ax[3,1].hist(df['capital-gain'])
# ax[3,1].set_title('capital-gain')
# ax[3,2].hist(df['capital-loss'])
# ax[3,2].set_title('capital-loss')

# ax[4,0].hist(df['hours-per-week'])
# ax[4,0].set_title('hours-per-week')
# ax[4,1].hist(df['native-country'])
# ax[4,1].set_title('native-country')
# ax[4,2].hist(df['label'])
# ax[4,2].set_title('label')


# plt.show()

## DECAF Model

In [None]:
from DECAF.decaf import DECAF, DataModule
from DECAF.utils import gen_data_nonlinear, load_adult

In [None]:
def generate_baseline(size: int = 100) -> Tuple[torch.Tensor, DataModule, list, dict]:
    """Build the small synthetic fixture shared by the sanity tests.

    Args:
        size: passed to ``gen_data_nonlinear`` as its ``SIZE`` argument.

    Returns:
        A 4-tuple of
        (the generated data as a ``torch.Tensor``,
         a ``DataModule`` wrapping the same values,
         the edge list describing the causal graph,
         the edge-removal dictionary).
    """
    # Causal structure as a list of [source, target] edges.
    edges = [
        [1, 2], [1, 3], [1, 4],
        [2, 5], [2, 0],
        [3, 0], [3, 6], [3, 7],
        [6, 9],
        [0, 8], [0, 9],
    ]
    # Edge-removal dictionary: removes the edge into 6 from 3.
    removed_edges = {6: [3]}

    # Generate data that follows the graph above.
    graph = nx.DiGraph(edges)
    samples = gen_data_nonlinear(graph, SIZE=size)
    datamodule = DataModule(samples.values)
    samples_tensor = torch.Tensor(np.asarray(samples))

    return samples_tensor, datamodule, edges, removed_edges

In [None]:
def test_sanity_params() -> None:
    """Check that a freshly built DECAF model exposes its parts and dimensions."""
    baseline = generate_baseline()
    dummy_dm, seed = baseline[1], baseline[2]
    n_features = dummy_dm.dims[0]

    model = DECAF(n_features, dag_seed=seed)

    # Both sub-networks must exist, and both dimensions must equal the
    # feature count reported by the data module.
    assert model.generator is not None
    assert model.discriminator is not None
    assert model.x_dim == n_features
    assert model.z_dim == n_features

In [None]:
def test_sanity_train() -> None:
    """Smoke test: fitting DECAF for two epochs on the baseline data must not raise."""
    baseline = generate_baseline()
    dummy_dm, seed = baseline[1], baseline[2]

    model = DECAF(dummy_dm.dims[0], dag_seed=seed)

    # Logging is switched off; only completion of the fit matters here.
    pl.Trainer(max_epochs=2, logger=False).fit(model, dummy_dm)

In [None]:
def test_sanity_generate() -> None:
    """Train briefly on ten rows and check that ten synthetic rows come back."""
    n_rows = 10
    raw_data, dummy_dm, seed, bias_dict = generate_baseline(size=n_rows)

    model = DECAF(dummy_dm.dims[0], dag_seed=seed)
    pl.Trainer(max_epochs=2, logger=False).fit(model, dummy_dm)

    # Generate in the order the model reports, with the edges listed in
    # bias_dict passed as the biased edges.
    generation_order = model.get_gen_order()
    synth_tensor = model.gen_synthetic(
        raw_data, gen_order=generation_order, biased_edges=bias_dict
    )
    synth_data = synth_tensor.detach().numpy()

    assert synth_data.shape[0] == n_rows

In [None]:
@pytest.mark.parametrize("X,y", [load_adult()])
@pytest.mark.slow
def test_run_experiments(X: pd.DataFrame, y: pd.DataFrame) -> None:
    """Train DECAF on the adult data and compare downstream classifiers.

    Steps:
      1. Standardize ``X`` and fit a baseline MLP classifier on it.
      2. Fit DECAF on the standardized data and generate a synthetic copy.
      3. Label the synthetic rows with the baseline classifier, fit a second
         MLP on the synthetic data, and score its predictions against ``y``.

    Args:
        X: feature frame returned by ``load_adult``.
        y: labels returned by ``load_adult``.

    Note:
        ``load_adult()`` is evaluated when the decorator runs, i.e. when this
        cell is executed, not when the test is called.
    """
    print(X.shape)

    # Standardize every column to zero mean and unit standard deviation.
    X_normalized = (X - X.mean(axis=0)) / X.std(axis=0)

    # Baseline: an MLP trained and evaluated on the real (standardized) data.
    baseline_clf = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',
                                 learning_rate='constant', learning_rate_init=0.001)
    baseline_clf.fit(X_normalized, y)

    y_pred = baseline_clf.predict(X_normalized)

    print(
        "baseline scores",
        precision_score(y, y_pred),
        recall_score(y, y_pred),
        roc_auc_score(y, y_pred),
    )

    dm = DataModule(X_normalized)

    # causal structure is in dag_seed, as [source, target] pairs
    # NOTE(review): the integers presumably index the columns of X - confirm
    # against load_adult's column order.
    dag_seed = [
        [0, 6],
        [0, 12],
        [0, 1],
        [0, 5],
        [0, 3],
        [3, 6],
        [3, 12],
        [3, 1],
        [3, 7],
        [5, 6],
        [5, 12],
        [5, 1],
        [5, 7],
        [5, 3],
        [8, 6],
        [8, 12],
        [8, 3],
        [8, 5],
        [9, 6],
        [9, 5],
        [9, 12],
        [9, 1],
        [9, 3],
        [9, 7],
        [13, 5],
        [13, 12],
        [13, 3],
        [13, 1],
        [13, 7],
    ]
    # edge removal dictionary; empty means no edge is removed at generation time.
    # To cut every edge leaving node 9 instead, use:
    #   bias_dict = {6: [9], 5: [9], 12: [9], 1: [9], 3: [9], 7: [9]}
    bias_dict = {}

    model = DECAF(
        dm.dims[0],
        dag_seed=dag_seed,
        use_mask=True,
        grad_dag_loss=False,
        lambda_privacy=0,
        lambda_gp=10,
        weight_decay=1e-2,
        l1_g=0,
        p_gen=-1,
        batch_size=100,
    )

    trainer = pl.Trainer(max_epochs=10, logger=False)

    trainer.fit(model, dm)

    X_synth = (
        model.gen_synthetic(
            dm.dataset.x,
            gen_order=model.get_gen_order(), biased_edges=bias_dict
        )
            .detach()
            .numpy()
    )

    # Preview the first ten real and synthetic rows side by side. The previous
    # [10:] slice printed everything except the first ten rows.
    print(X_normalized[:10])
    print(X_synth[:10])

    # Label the synthetic rows with the classifier trained on real data.
    y_synth = baseline_clf.predict(X_synth)

    print(y_synth)
    # True when the synthetic labels contain more than one class. The previous
    # expression compared the class values themselves with 1.
    print('y_synth unique?', len(np.unique(y_synth)) > 1)

    # Downstream classifier trained purely on synthetic data.
    synth_clf = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam',
                              learning_rate='constant', learning_rate_init=0.001)
    synth_clf.fit(X_synth, y_synth)
    y_pred = synth_clf.predict(X_synth)

    # Score the synthetic-data classifier's predictions against the real
    # labels, in sklearn's (y_true, y_pred) order. Previously y_pred was
    # computed but unused and y_synth was scored against y directly.
    print(
        "synth scores",
        precision_score(y, y_pred),
        recall_score(y, y_pred),
        roc_auc_score(y, y_pred),
    )

## Benchmark results for comparison - CTGAN

In [None]:
from table_evaluator import TableEvaluator
from ctgan import CTGANSynthesizer

In [None]:
# Columns handed to CTGAN as discrete (categorical) features.
discrete_columns = [
    'workclass', 'education', 'marital-status',
    'occupation', 'relationship', 'race',
    'sex', 'native-country', 'label',
]

# Work on an independent copy so the benchmark cannot alter the cleaned frame.
dfc = df.copy(deep=True)

#ctgan = CTGANSynthesizer(epochs=10,verbose=True)
#ctgan.fit(dfc, discrete_columns)

# Generate the exact same amount
#df1 = ctgan.sample(len(dfc.index))

# fig, ax = plt.subplots(5, 3, figsize = (30, 20))
# fig.tight_layout(pad = 2.0)

# ax[0,0].hist(df1['age'])
# ax[0,0].set_title('age')
# ax[0,1].hist(df1['workclass'])
# ax[0,1].set_title('workclass')
# ax[0,2].hist(df1['fnlwgt'])
# ax[0,2].set_title('fnlwgt')

# ax[1,0].hist(df1['education'])
# ax[1,0].set_title('education')
# ax[1,1].hist(df1['education-num'])
# ax[1,1].set_title('education-num')
# ax[1,2].hist(df1['marital-status'])
# ax[1,2].set_title('marital-status')

# ax[2,0].hist(df1['occupation'])
# ax[2,0].set_title('occupation')
# ax[2,1].hist(df1['relationship'])
# ax[2,1].set_title('relationship')
# ax[2,2].hist(df1['race'])
# ax[2,2].set_title('race')

# ax[3,0].hist(df1['sex'])
# ax[3,0].set_title('sex')
# ax[3,1].hist(df1['capital-gain'])
# ax[3,1].set_title('capital-gain')
# ax[3,2].hist(df1['capital-loss'])
# ax[3,2].set_title('capital-loss')

# ax[4,0].hist(df1['hours-per-week'])
# ax[4,0].set_title('hours-per-week')
# ax[4,1].hist(df1['native-country'])
# ax[4,1].set_title('native-country')
# ax[4,2].hist(df1['label'])
# ax[4,2].set_title('label')


# plt.show()

In [None]:
# indexc = dfc.index
# print('TOTAL DATAPOINTS AFTER CLEANING:',len(indexc))
# index1 = df1.index
# print('TOTAL DATAPOINTS AFTER CLEANING:',len(index1))
# table_evaluator = TableEvaluator(dfc,df1)
# table_evaluator.visual_evaluation() 

In [None]:
# Columns handed to CTGAN as discrete (categorical) features.
discrete_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'label'
]

# Node order contains the order in which to generate the data, starting with the root nodes.
# `node_order` groups the nodes by generation stage; `node_order_nl` is the same order flattened.
node_order = [['race','age','sex','native-country'],['marital-status'],['education'],['occupation','hours-per-week','workclass','relationship'],['label']]
node_order_nl = ['race','age','sex','native-country','marital-status','education','occupation','hours-per-week','workclass','relationship','label']

# Restrict the real data to the columns that take part in the graph, in
# generation order (the remaining columns of the cleaned frame are dropped).
df = df[node_order_nl]

# Parents of every non-label node; key is the receiving node. These lists are
# identical in all four graphs below - only the parents of 'label' differ.
_shared_connections = {
    'occupation':['race','age','sex','marital-status','education'],
    'hours-per-week':['race','age','marital-status','native-country','education','sex'],
    'workclass':['age','marital-status','sex','education','native-country'],
    'relationship':['marital-status','education','age','sex','native-country'],
    'education':['race','age','marital-status','sex','native-country'],
    'marital-status':['race','age','sex','native-country'],
}


def _connections_with_label_parents(label_parents):
    """Return a full connection table whose 'label' entry is `label_parents`.

    Every list is copied so the resulting tables do not share list objects.
    """
    table = {'label': list(label_parents)}
    for receiving_node, parents in _shared_connections.items():
        table[receiving_node] = list(parents)
    return table


# List of connections; key is receiving node. Full graph, no edge removed.
# ('relationship' was previously misspelled 'relatinship' here, which would
# raise a KeyError as soon as the list was used to select columns.)
node_connections_normal = _connections_with_label_parents(
    ['occupation','race','hours-per-week','age','marital-status','education','sex','workclass','native-country','relationship']
)

# Connections are removed according to the privacy criterion.
# NOTE(review): FTU / DP / CF presumably name the criterion each graph
# implements - confirm the exact definitions with the author.

# 'sex' is removed from the parents of 'label'.
node_connections_FTU = _connections_with_label_parents(
    ['occupation','race','hours-per-week','age','marital-status','education','workclass','native-country','relationship']
)

# Only 'race', 'age' and 'native-country' remain as parents of 'label'.
node_connections_DP = _connections_with_label_parents(
    ['race','age','native-country']
)

# 'sex', 'marital-status' and 'relationship' are removed from the parents of 'label'.
node_connections_CF = _connections_with_label_parents(
    ['occupation','race','hours-per-week','age','education','workclass','native-country']
)

In [None]:
# A single synthesizer object is reused for every fit performed below.
ctgan = CTGANSynthesizer(epochs=10)


def generate_data(mode):
    """Generate a synthetic dataset node by node, following a connection table.

    The root nodes are synthesized jointly first. Every remaining node in
    ``node_order_nl`` is then synthesized by fitting CTGAN on the real columns
    ``[node] + parents(node)`` and keeping the sampled ``node`` column.

    Args:
        mode: which connection table to use - 'FTU', 'DP' or 'CF'.

    Returns:
        A DataFrame with one column per entry of ``node_order_nl`` and as many
        rows as ``df``.

    Raises:
        ValueError: if ``mode`` is not one of the supported names.

    Reads the module-level ``df``, ``ctgan``, ``node_order_nl``,
    ``discrete_columns`` and ``node_connections_*`` objects.
    """
    # Define the privacy measure
    connections_by_mode = {
        'FTU': node_connections_FTU,
        'DP': node_connections_DP,
        'CF': node_connections_CF,
    }
    if mode not in connections_by_mode:
        # Previously this only printed a message and then crashed further down
        # on the unassigned `node_connections`.
        raise ValueError(
            f'Mode is not correct! Got {mode!r}, expected one of {sorted(connections_by_mode)}'
        )
    node_connections = connections_by_mode[mode]

    # Generate the initial (root) nodes from a model fitted on them alone.
    # NOTE(review): 'age' is treated as discrete here but as continuous in the
    # per-node fits below (it is not in `discrete_columns`) - kept as it was,
    # confirm this is intended.
    root_nodes = ['race', 'age', 'sex', 'native-country']
    start_df = df[root_nodes]
    ctgan.fit(start_df, root_nodes)
    synth_df = ctgan.sample(len(start_df.index))
    print('Done generating the root nodes.')

    # Iteratively generate the remaining nodes in generation order
    for node in node_order_nl:
        # Skip nodes that have already been generated
        if node in synth_df.columns:
            continue

        # Real data for this node together with the parents it may depend on
        attributes = node_connections[node]
        train_df = df[[node] + list(attributes)]

        # Every categorical column in the training frame - including the node
        # itself - must be declared discrete. Previously the list was derived
        # from the parents already synthesized, so the node was never in it.
        temp_discrete = [col for col in train_df.columns if col in discrete_columns]

        print('Started training node', node)
        ctgan.fit(train_df, temp_discrete)
        generated_data = ctgan.sample(len(synth_df.index))

        # Keep the column produced by this node's own model. Previously a node
        # only reached the output when a later node listed it as a parent, so
        # its values came from the child's model and some nodes never appeared.
        # NOTE(review): the sample is drawn from the jointly fitted model, not
        # conditioned row by row on the parent values already in synth_df.
        synth_df[node] = generated_data[node].values
        print('Finished training node', node)

    return synth_df

# Build the synthetic dataset with the FTU connection table and preview ten rows.
synthetic = generate_data(mode='FTU')
print(synthetic.head(n=10))