In [6]:
#Main imports
import pytest
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import networkx as nx

from typing import Tuple

import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

from xgboost import XGBClassifier

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

import os.path

In [7]:
# Put the notebook's current working directory on the module search path so
# that project-local packages can be imported by the cells below
# (presumably the `tests` package lives here — confirm the launch directory).
import os
import sys

_cwd = os.getcwd()
# Guard the append: re-running this cell must not pile up duplicate entries
# on sys.path.
if _cwd not in sys.path:
    sys.path.append(_cwd)

## DECAF Model

In [8]:
# Project-local helpers: these imports resolve only when the directory that
# contains the `tests` package is on sys.path.
from tests.utils import load_adult
from tests.test_decaf import test_run_experiments

# Load the Adult dataset once; X, y and df are passed unchanged to every
# DECAF run below.
# presumably X = feature matrix, y = labels, df = full DataFrame — confirm in tests/utils.py
X, y, df = load_adult()

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country   label  
0          2174             0              40   United-States   <=50

In [4]:
# Run the DECAF experiment in 'ftu' mode
# (presumably "fairness through unawareness" — confirm in tests/test_decaf.py).
# NOTE(review): the saved output of this cell stops at "Training: 0it" and has
# no "Getting metrics..." block, and its log timestamps (2023-12-03) differ
# from the 'dp'/'cf' runs (2023-12-02) — re-run to completion before sharing.
test_run_experiments(X, y, df, 'ftu')

[32m2023-12-03 13:12:35.843[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1m***** DATA ****[0m
[32m2023-12-03 13:12:35.843[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1mn_samples = 30162[0m
[32m2023-12-03 13:12:35.843[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1mdag_seed [[0, 6], [0, 12], [0, 1], [0, 5], [0, 3], [3, 6], [3, 12], [3, 1], [3, 7], [5, 6], [5, 12], [5, 1], [5, 7], [5, 3], [8, 6], [8, 12], [8, 3], [8, 5], [9, 6], [9, 5], [9, 12], [9, 1], [9, 3], [9, 7], [13, 5], [13, 12], [13, 3], [13, 1], [13, 7]][0m
[32m2023-12-03 13:12:35.843[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1mSetting up network with x_dim = 14, z_dim = 14, h_dim = 200[0m
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


baseline scores 0.8711112942436854 0.9332126776728171 0.7582952040468507
Initialised adjacency matrix as parsed:
 Parameter containing:
tensor([[0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      

  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name          | Type             | Params
---------------------------------------------------
0 | generator     | Generator_causal | 128 K 
1 | discriminator | Discriminator    | 43.4 K
---------------------------------------------------
171 K     Trainable params
196       Non-trainable params
171 K     Total params
0.686     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [5]:
# Run the DECAF experiment in 'dp' mode
# (presumably "demographic parity" — confirm in tests/test_decaf.py).
# The saved output reports Precision, Recall, AUROC, FTU and DP for this mode.
# NOTE(review): saved AUROC is ~0.506 with recall ~0.999, which looks close to
# chance-level ranking — worth interpreting in a markdown cell next to the result.
test_run_experiments(X, y, df, 'dp')

[32m2023-12-02 13:18:48.299[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1m***** DATA ****[0m
[32m2023-12-02 13:18:48.300[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1mn_samples = 30162[0m
[32m2023-12-02 13:18:48.301[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1mdag_seed [[0, 6], [0, 12], [0, 1], [0, 5], [0, 3], [3, 6], [3, 12], [3, 1], [3, 7], [5, 6], [5, 12], [5, 1], [5, 7], [5, 3], [8, 6], [8, 12], [8, 3], [8, 5], [9, 6], [9, 5], [9, 12], [9, 1], [9, 3], [9, 7], [13, 5], [13, 12], [13, 3], [13, 1], [13, 7]][0m
[32m2023-12-02 13:18:48.302[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1mSetting up network with x_dim = 14, z_dim = 14, h_dim = 200[0m


baseline scores 0.8854017065134747 0.925267061004679 0.7819595827133146
Initialised adjacency matrix as parsed:
 Parameter containing:
tensor([[0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name          | Type             | Params
---------------------------------------------------
0 | generator     | Generator_causal | 128 K 
1 | discriminator | Discriminator    | 43.4 K
---------------------------------------------------
171 K     Trainable params
196       Non-trainable params
171 K     Total params
0.686     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


Getting metrics...
Statistics for dataset for mode: dp
Precision: 0.8582917912927883
Recall: 0.999226185870154
AUROC: 0.5056371893206192
FTU: 0.0052052251176977656
DP: 0.0035961640916355453


In [6]:
# Run the DECAF experiment in 'cf' mode
# (presumably "conditional fairness" — confirm in tests/test_decaf.py).
# The saved output reports Precision, Recall, AUROC, FTU and DP for this mode.
test_run_experiments(X, y, df, 'cf')

[32m2023-12-02 13:23:20.689[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1m***** DATA ****[0m
[32m2023-12-02 13:23:20.690[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1mn_samples = 30162[0m
[32m2023-12-02 13:23:20.692[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1mdag_seed [[0, 6], [0, 12], [0, 1], [0, 5], [0, 3], [3, 6], [3, 12], [3, 1], [3, 7], [5, 6], [5, 12], [5, 1], [5, 7], [5, 3], [8, 6], [8, 12], [8, 3], [8, 5], [9, 6], [9, 5], [9, 12], [9, 1], [9, 3], [9, 7], [13, 5], [13, 12], [13, 3], [13, 1], [13, 7]][0m
[32m2023-12-02 13:23:20.692[0m | [1mINFO    [0m | [36mdecaf.logger[0m:[36mlog_and_print[0m:[36m64[0m - [1mSetting up network with x_dim = 14, z_dim = 14, h_dim = 200[0m
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You 

baseline scores 0.8716496945010184 0.9446013948971484 0.7624578631385049
Initialised adjacency matrix as parsed:
 Parameter containing:
tensor([[0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


Getting metrics...
Statistics for dataset for mode: cf
Precision: 0.8786849884157546
Recall: 0.8753979624323464
AUROC: 0.6360501929762049
FTU: 0.22269743385717128
DP: 0.2330847096430474


# CTGAN - COMPARISON TO DECAF
This section automatically loads the pretrained CTGAN models and calculates the appropriate metrics.
Progress is printed while it runs. (This will still take some time!) The metrics reported by these tests are approximate.

**Note:** this takes significant time even with saved models, because sampling is slow as well.

In [9]:
from table_evaluator import TableEvaluator
from ctgan import CTGAN

In [5]:
# Execute the CTGAN notebook inside this kernel; presumably this is what
# defines run_experiment_CTGAN used below (confirm in ./CTGAN/FACT_GAN.ipynb).
# NOTE(review): the saved output of this cell is
# "ImportError: attempted relative import with no known parent package",
# so as saved the cell does not run to completion.
%run ./CTGAN/FACT_GAN.ipynb

# Run the experiments with the three fairness definitions (FTU, CF, DP);
# in DECAF these are fairness notions, not privacy ones.
# Only 'FTU' is enabled; uncomment the lines below to also run 'CF' and 'DP'.
run_experiment_CTGAN('FTU')
# run_experiment_CTGAN('CF') 
# run_experiment_CTGAN('DP')

ImportError: attempted relative import with no known parent package

ImportError: attempted relative import with no known parent package