In [None]:
#Main imports
import pytest
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import networkx as nx

from typing import Tuple

import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

from xgboost import XGBClassifier

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

import os.path

In [None]:
# Make modules that live next to the notebook importable.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries on sys.path.
import os
import sys

if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

## Import dataset (adult.data)

In [None]:
# from google.colab import drive
# drive.mount('/gdrive')

In [None]:
# Path of the Adult dataset file that the loading cell passes to pd.read_csv.
# It is a relative path, so it is resolved against the current working directory.
# Alternative location (kept for runs with a mounted Google Drive):
#adult_dir = '../gdrive/MyDrive/adult.data'
adult_dir = 'data/adult.data'

In [None]:
# ---------------------------------------------------------------------------
# Load and clean the Adult dataset, integer-encode its categorical columns,
# and expose both a train/test split (DataFrames) and full X / y arrays.
# ---------------------------------------------------------------------------

# Column names in file order; they are passed to read_csv because the cell
# supplies the header itself.
names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "label",
]

# Fixed seed so that "Restart & Run All" always yields the same train/test split.
SPLIT_SEED = 42


def _strip_if_str(value):
    """Return ``value`` without surrounding whitespace if it is a string, else unchanged."""
    return value.strip() if isinstance(value, str) else value


df = pd.read_csv(adult_dir, names=names, index_col=False)

# Element-wise whitespace stripping. Column-wise Series.map replaces the
# deprecated DataFrame.applymap and produces the same frame.
df = df.apply(lambda column: column.map(_strip_if_str))

# Drop every row that has the missing-value marker "?" in any non-numeric column.
# Selecting "everything that is not a number" covers object columns as well as
# dedicated string dtypes.
text_columns = df.select_dtypes(exclude="number").columns
df = df[~df[text_columns].eq("?").any(axis=1)]

# Category vocabularies: every value is replaced by its position in its list.
# The lists appear to follow the categorical columns in `names` order
# (workclass, education, marital-status, occupation, relationship, race, sex,
# native-country, label).
# Note the label encoding that results from the last list: ">50K" -> 0, "<=50K" -> 1.
replace = [
    [
        "Private",
        "Self-emp-not-inc",
        "Self-emp-inc",
        "Federal-gov",
        "Local-gov",
        "State-gov",
        "Without-pay",
        "Never-worked",
    ],
    [
        "Bachelors",
        "Some-college",
        "11th",
        "HS-grad",
        "Prof-school",
        "Assoc-acdm",
        "Assoc-voc",
        "9th",
        "7th-8th",
        "12th",
        "Masters",
        "1st-4th",
        "10th",
        "Doctorate",
        "5th-6th",
        "Preschool",
    ],
    [
        "Married-civ-spouse",
        "Divorced",
        "Never-married",
        "Separated",
        "Widowed",
        "Married-spouse-absent",
        "Married-AF-spouse",
    ],
    [
        "Tech-support",
        "Craft-repair",
        "Other-service",
        "Sales",
        "Exec-managerial",
        "Prof-specialty",
        "Handlers-cleaners",
        "Machine-op-inspct",
        "Adm-clerical",
        "Farming-fishing",
        "Transport-moving",
        "Priv-house-serv",
        "Protective-serv",
        "Armed-Forces",
    ],
    [
        "Wife",
        "Own-child",
        "Husband",
        "Not-in-family",
        "Other-relative",
        "Unmarried",
    ],
    ["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"],
    ["Female", "Male"],
    [
        "United-States",
        "Cambodia",
        "England",
        "Puerto-Rico",
        "Canada",
        "Germany",
        "Outlying-US(Guam-USVI-etc)",
        "India",
        "Japan",
        "Greece",
        "South",
        "China",
        "Cuba",
        "Iran",
        "Honduras",
        "Philippines",
        "Italy",
        "Poland",
        "Jamaica",
        "Vietnam",
        "Mexico",
        "Portugal",
        "Ireland",
        "France",
        "Dominican-Republic",
        "Laos",
        "Ecuador",
        "Taiwan",
        "Haiti",
        "Columbia",
        "Hungary",
        "Guatemala",
        "Nicaragua",
        "Scotland",
        "Thailand",
        "Yugoslavia",
        "El-Salvador",
        "Trinadad&Tobago",
        "Peru",
        "Hong",
        "Holand-Netherlands",
    ],
    [">50K", "<=50K"],
]

# The replacement is applied frame-wide, exactly as before; the vocabularies do
# not share any value, so each list only affects the cells that contain it.
for row in replace:
    df = df.replace(row, list(range(len(row))))

# Make the integer dtypes explicit instead of relying on `replace` silently
# downcasting object columns (that implicit downcast is deprecated in pandas).
df = df.infer_objects()

index = df.index
print('TOTAL DATAPOINTS AFTER CLEANING:',len(index))

# Split the data into train and test sets (70% / 30%), reproducibly.
train, test = train_test_split(df, test_size=0.3, random_state=SPLIT_SEED)
X_train = train.drop(columns="label")
y_train = train["label"]
X_test = test.drop(columns="label")
y_test = test["label"]

# Full dataset as plain arrays. NOTE: from here on `df` is a NumPy array, not a
# DataFrame; the name is rebound on purpose so that code relying on the
# original cell's variables keeps working.
label_position = names.index("label")  # 14: the label is the last column
df = df.values
X = df[:, :label_position].astype(np.uint32)
y = df[:, label_position].astype(np.uint8)

## DECAF Model

In [None]:
import decaf.logger as log
from decaf.DECAF import DECAF
from decaf.data import DataModule
from tests.utils import gen_data_nonlinear

In [None]:
def generate_baseline(size: int = 100) -> Tuple[torch.Tensor, DataModule, list, dict]:
    """Build the toy dataset and causal description used by the sanity checks.

    Args:
        size: Forwarded to ``gen_data_nonlinear`` as ``SIZE`` (presumably the
            number of generated rows -- confirm against ``tests.utils``).

    Returns:
        A 4-tuple ``(tensor, data_module, dag_seed, bias_dict)``:
        the generated data as a ``torch.Tensor``, a ``DataModule`` wrapping the
        same data, the edge list describing the causal graph, and the
        edge-removal dictionary.
    """
    # Causal structure: one [source, target] pair per directed edge.
    dag_seed = [
        [1, 2], [1, 3], [1, 4],
        [2, 5], [2, 0],
        [3, 0], [3, 6], [3, 7],
        [6, 9],
        [0, 8], [0, 9],
    ]

    # Edge removal dictionary: this removes the edge into 6 from 3.
    bias_dict = {6: [3]}

    # Generate the data according to the graph spanned by dag_seed.
    causal_graph = nx.DiGraph(dag_seed)
    samples = gen_data_nonlinear(causal_graph, SIZE=size)
    data_module = DataModule(samples.values)
    sample_tensor = torch.Tensor(np.asarray(samples))

    return sample_tensor, data_module, dag_seed, bias_dict

In [None]:
def test_sanity_params() -> None:
    """Check that a freshly constructed DECAF model exposes the expected parts.

    The model must have a generator and a discriminator, and both ``x_dim`` and
    ``z_dim`` must equal the first entry of the data module's ``dims``.
    Prints ``pass`` when every assertion holds.
    """
    _, dummy_dm, seed, _ = generate_baseline()
    expected_dim = dummy_dm.dims[0]

    model = DECAF(expected_dim, dag_seed=seed)

    for attribute in ("generator", "discriminator"):
        assert getattr(model, attribute) is not None
    for attribute in ("x_dim", "z_dim"):
        assert getattr(model, attribute) == expected_dim
    print('pass')

test_sanity_params()

In [None]:
def test_sanity_train() -> None:
    """Check that a DECAF model can be fitted for two epochs on the baseline data.

    Prints ``pass`` once ``Trainer.fit`` returns without raising.
    """
    baseline = generate_baseline()
    dummy_dm, seed = baseline[1], baseline[2]

    model = DECAF(dummy_dm.dims[0], dag_seed=seed)

    # Short run without a logger: this only verifies that training executes.
    pl.Trainer(max_epochs=2, logger=False).fit(model, dummy_dm)
    print('pass')

test_sanity_train()

In [None]:
def test_sanity_generate() -> None:
    """Check that a briefly trained DECAF model can generate synthetic data.

    Fits the model for two epochs on a baseline built with ``size=10``, generates
    synthetic data with the baseline's biased edges, and asserts that the
    result has 10 rows. Prints ``pass`` on success.
    """
    raw_data, dummy_dm, seed, bias_dict = generate_baseline(size=10)

    model = DECAF(dummy_dm.dims[0], dag_seed=seed)
    trainer = pl.Trainer(max_epochs=2, logger=False)
    trainer.fit(model, dummy_dm)

    generation_order = model.get_gen_order()
    synthetic = model.gen_synthetic(
        raw_data, gen_order=generation_order, biased_edges=bias_dict
    )
    synth_data = synthetic.detach().numpy()

    assert synth_data.shape[0] == 10
    print('pass')

test_sanity_generate()

# CTGAN - COMPARISON TO DECAF
This section automatically loads the pretrained models and calculates the appropriate metrics.
It also prints its progress (this will still take some time!). These tests output approximate metrics.

**Note:** this takes significant time even with saved models, because the sampling takes time too.

In [None]:
from table_evaluator import TableEvaluator
from ctgan import CTGANSynthesizer

In [None]:
%run ./CTGAN/FACT_GAN.ipynb

# Run the experiments with the three privacy definitions
run_experiment_CTGAN('FTU')
run_experiment_CTGAN('CF') 
run_experiment_CTGAN('DP')

## Reproduced results: bias removal experiment on the Adult dataset

In [None]:
# TODO: showcase a table here with all compared data-quality and fairness scores