# Example: PATE-CTGAN and tabular data

In [None]:
# Session setup. These are IPython magics, so this cell only runs inside an
# IPython/Jupyter kernel.
# Work around broken tab-completion by switching off the jedi-based completer.
%config Completer.use_jedi = False

# Load the autoreload extension; mode 2 reloads all modules before each cell
# is executed (see the IPython autoreload docs).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [None]:
import os
import pandas as pd
from pathlib import Path
import sys

from sklearn.model_selection import train_test_split

## Load a dataset

In [None]:
from ctgan import load_demo

# Load the demo table shipped with ctgan. Later cells call `.to_csv(...)` on it
# and stratify on its "income" column, so it is presumably a pandas DataFrame
# -- TODO confirm against the ctgan docs.
data = load_demo()

# Columns of `data` that are to be treated as discrete; this list is handed to
# `pate_model.train` together with the table itself.
discrete_columns = [
    "workclass", "education", "marital-status",
    "occupation", "relationship", "race",
    "sex", "native-country", "income",
]

## Synthesize using PATE-CTGAN

In [None]:
from privgem import tabular_patectgan

In [None]:
# --- Run configuration -------------------------------------------------------
# Parent directory that receives the `orig_data/` export and the training log.
par_dir_name = "test"

# The remaining values are forwarded unchanged to `tabular_patectgan(...)` and
# echoed into the header line of the training log.
batch_size = 64
epsilon = 2               # presumably the differential-privacy budget -- confirm in privgem docs
noise_multiplier = 0.002  # meaning defined by privgem; TODO confirm
moments_order = 1_000     # meaning defined by privgem; TODO confirm

In [None]:
# Prepare the working directory: export the original table plus a stratified
# train/test split (first run only), then start a fresh training log.
orig_data_dir = Path(par_dir_name) / "orig_data"

# Decide *before* creating the directory whether the CSVs still need writing,
# so an export left by an earlier run is never overwritten.
needs_export = not orig_data_dir.is_dir()
orig_data_dir.mkdir(parents=True, exist_ok=True)

# Build the split unconditionally (it is deterministic via `random_state`), so
# `adult_data_train` / `adult_data_test` are defined after a kernel restart
# even when `orig_data/` already exists on disk.
adult_data_train, adult_data_test = train_test_split(
    data, test_size=0.25, random_state=42, stratify=data["income"]
)

if needs_export:
    data.to_csv(orig_data_dir / "orig_data.csv", index=False)
    adult_data_train.to_csv(orig_data_dir / "orig_train.csv", index=False)
    adult_data_test.to_csv(orig_data_dir / "orig_test.csv", index=False)

# (Re)create the training log; mode "w" truncates any log from a previous run.
with open(Path(par_dir_name) / "patectgan_training.csv", "w") as fio:
    # `write`, not `writelines`: the header is one string, not a list of lines.
    fio.write(
        f"PATE-CTGAN, epsilon: {epsilon}, noise_multiplier: {noise_multiplier}, "
        f"moments order: {moments_order}, batch_size: {batch_size}\n"
    )

In [None]:
# Collect the constructor arguments in one place, then build the model.
# `output_save_path` is the same CSV whose header line is written by the
# previous cell.
patectgan_settings = dict(
    verbose=True,
    epsilon=epsilon,
    batch_size=batch_size,
    noise_multiplier=noise_multiplier,
    moments_order=moments_order,
    output_save_path=f"{par_dir_name}/patectgan_training.csv",
)
pate_model = tabular_patectgan(**patectgan_settings)

In [None]:
# Train the model on the demo table, passing the list of discrete columns.
# NOTE(review): this uses the full `data`, not the `adult_data_train` split
# created earlier -- confirm that is intended.
pate_model.train(data, discrete_columns)

In [None]:
# Request as many synthetic rows as the original table has, then display them.
n_synthetic_rows = len(data)
synth_output = pate_model.sample(n_synthetic_rows)
synth_output
# Optional persistence, left disabled.
# NOTE(review): `i_inp` is not defined in the visible notebook, so the first
# line below needs adapting before it can be enabled.
#Path(f"{par_dir_name}/pate_ensemble_{i_inp:05d}").mkdir(parents=True, exist_ok=True)
#synth_output.to_csv(os.path.join(par_dir_name, "synthetic_output.csv"), index=False)