In [1]:
import pandas as pd
from sdv.single_table import TVAESynthesizer

from domias.evaluator import evaluate_performance
from domias.models.generator import GeneratorInterface
from domias.models.ctgan import CTGAN



from scipy import stats
from scipy.stats import multivariate_normal


import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing, fetch_covtype, load_digits

import warnings
warnings.filterwarnings("ignore")

    The default C++ compiler could not be found on your system.
    You need to either define the CXX environment variable or a symlink to the g++ command.
    For example if g++-8 is the command you can do
      import os
      os.environ['CXX'] = 'g++-8'
    


In [2]:
#Read in UCI dataset
df_uci = pd.read_csv(r'C:\Users\jordy\OneDrive\MSc_Python\Individual_Project\Data\UCI_Credit_Card.csv')

#convert dataframe to array
arr_uci = np.array(df_uci.iloc[:, 1:-1])

In [3]:
#Define data loader
def get_dataset() -> np.ndarray:
    def data_loader() -> np.ndarray:
        scaler = StandardScaler()
        X =arr_uci
        np.random.shuffle(X)
        return scaler.fit_transform(X)

    return data_loader()

In [4]:
def get_generator(
    gan_method: str = "CTGAN",
    epochs: int = 1000,
    seed: int = 0,
) -> GeneratorInterface:
    class LocalGenerator(GeneratorInterface):
        def __init__(self) -> None:
            if gan_method == "TVAE":
                syn_model = TVAESynthesizer(metadata, epochs=epochs)
            elif gan_method == "CTGAN":
                syn_model = CTGAN(epochs=epochs)
            elif gan_method == "KDE":
                syn_model = None
            else:
                raise RuntimeError()
            self.method = gan_method
            self.model = syn_model
                

        def fit(self, data: pd.DataFrame) -> "LocalGenerator":
            if self.method == "KDE":
                self.model = stats.gaussian_kde(np.transpose(data))
            else:
                self.model.fit(data)
            return self

        def generate(self, count: int) -> pd.DataFrame:
            
            if gan_method == "KDE":
                samples = pd.DataFrame(self.model.resample(count).transpose(1, 0))
            elif gan_method == "TVAE":
                samples = self.model.sample(count)
            elif gan_method == "CTGAN":
                samples = self.model.generate(count)
            else:
                raise RuntimeError()

            return samples
            
            #return self.model.sample(count)

    return LocalGenerator()


#Loading metadata from dataset for use in TVAESynthesizer
dataset = get_dataset()
# df_dataset = pd.DataFrame(dataset)
# df_dataset.rename(columns={0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7'}, inplace = True)

# from sdv.metadata import SingleTableMetadata

# metadata = SingleTableMetadata()
# metadata.detect_from_dataframe(data=df_dataset)






"""
Args:
    generator: GeneratorInterface
        Generator with the `fit` and `generate` methods. The generator MUST not be fitted.
    dataset: int
        The evaluation dataset, used to derive the training and test datasets.
    training_size: int
        The split for the training (member) dataset out of `dataset`
    reference_size: int
        The split for the reference dataset out of `dataset`.
    training_epochs: int
        Training epochs
    synthetic_sizes: List[int]
        For how many synthetic samples to test the attacks.

"""

"""
CTGAN Args:

embedding_dim: int = 128,
generator_dim: Tuple = (256, 256),
discriminator_dim: Tuple = (256, 256),
generator_lr: float = 2e-4,
generator_decay: float = 1e-6,
discriminator_lr: float = 2e-4,
discriminator_decay: float = 1e-6,
batch_size: int = 500,
discriminator_steps: int = 1,
log_frequency: bool = True,
verbose: bool = False,
epochs: int = 300,
pac: int = 1,
cuda: bool = True,



"""


# mem_set_size = 1000 -> originally what training size was
reference_set_sizes = [500, 3000, 5000, 10000] #held out set
training_epochs = [2000]
training_sizes = [10000] #D-mem
#synthetic_sizes = [200]
density_estimator = "kde"  # prior, kde, bnaf
gen_size = 200 #same as synthetic_sizes -> D_syn

method = "CTGAN"

# Create a dictionary to store the results
results = {}

# Set the number of iterations
num_iterations = 5

for iteration in range(1, num_iterations+1):
    # Initialize the result dictionary for the current iteration
    iteration_results = {}
    
    
    for reference_set_size in reference_set_sizes:
        size_results = {}
    
        for training_size in training_sizes:
            # Initialize the result dictionary for the current training size

            for training_epoch in training_epochs:
                generator = get_generator(
                    gan_method=method,
                    epochs=training_epoch,
                )

                perf = evaluate_performance(
                    generator,
                    dataset,
                    training_size,
                    reference_set_size,
                    training_epochs=training_epoch,
                    synthetic_sizes=[gen_size],
                    density_estimator=density_estimator,
                )

                # Store the MIA performance for the current training size and epoch
                size_results[training_epoch] = perf[gen_size]["MIA_performance"]

            # Store the results for the current training size
            iteration_results[reference_set_size] = size_results

        # Store the results for the current iteration
        results[iteration] = iteration_results

# Print the results
for iteration, iteration_results in results.items():
    print(f"Iteration {iteration}:")
    for reference_set_size, size_results in iteration_results.items():
        print(f"Training Size {training_size}:")
        for training_epoch, mia_performance in size_results.items():
            print(f"Training Epoch {training_epoch}: MIA Performance = {mia_performance}")
        print()


p_G_evaluated
[8.56210447e-10 1.27711534e-21 1.14720543e-08 ... 4.71084868e-11
 1.59193213e-10 6.12044460e-08]
p_R_evaluated
[2.65501820e-09 7.63233019e-17 4.26169949e-07 ... 2.38814615e-05
 1.03883879e-08 2.86568877e-05]
Synth_set
            0         1         2         3         4         5         6  \
0   -0.752594 -1.233369  0.188964  0.857094 -0.982685  0.015563  0.111930   
1   -1.144229 -1.234724  3.882781  0.861018 -1.493336  0.015215  0.111941   
2   -1.147652 -1.234892  0.185166  0.859880  0.413968  0.015209  0.109622   
3   -0.263282 -1.234032 -1.074786 -1.053971 -0.914772  0.014520  0.111313   
4   -1.204826 -1.235408  0.187923  0.861846  0.357179  0.014337  0.110352   
..        ...       ...       ...       ...       ...       ...       ...   
195  0.190333  0.810610  1.456021 -1.057737  0.211480  0.012891  0.108774   
196 -0.936785 -1.234568  0.187251 -1.059164 -0.277007  0.900439  1.778536   
197  0.641363  0.809198  0.187886 -1.057298 -1.258840  0.016577  0.109717  

p_G_evaluated
[7.17162410e-12 1.75574122e-13 1.07107606e-07 ... 1.26120343e-08
 1.27152382e-10 5.11313502e-08]
p_R_evaluated
[6.47700302e-07 2.73833161e-10 1.55178288e-07 ... 7.51309089e-06
 3.07700545e-08 9.65060308e-06]
Synth_set
            0         1         2         3         4         5         6  \
0   -0.966719 -1.236337  0.186252  0.854046 -1.175903  1.797976  1.791744   
1   -0.899626  0.809492  0.186470  0.853101 -0.792964  0.011279  0.112119   
2    1.927857 -1.234915  0.181738 -1.058875  1.244050 -1.756133 -1.553212   
3   -0.948918  0.806905  0.187252  0.856286  1.610378  0.013585  0.113262   
4    0.434898  0.809798 -1.079024  0.856260 -0.779833 -0.879260 -0.718022   
..        ...       ...       ...       ...       ...       ...       ...   
195 -0.021427  0.810911 -1.083314  0.854419  0.009588 -0.885956  1.774766   
196  1.125636  0.808513 -1.080460 -1.061160  1.771087  0.012275  0.112779   
197  0.766170  0.808928 -1.081526  0.857668  0.287802 -1.766065 -1.549824  

p_G_evaluated
[2.03016886e-08 2.39602574e-20 2.19803499e-06 ... 1.47534477e-09
 8.00268440e-11 2.21289805e-07]
p_R_evaluated
[2.76647482e-07 3.17093222e-10 2.03247935e-07 ... 1.87801278e-06
 4.73665613e-09 1.16372133e-05]
Synth_set
            0         1         2         3         4         5         6  \
0    0.625399  0.811929 -1.078820  0.858396 -1.016716 -0.872350 -0.723110   
1   -0.146757 -1.234268  0.185699  0.861046 -1.390907  0.015495  0.109893   
2    0.019631  0.811674  0.184516  0.858053 -0.502374  0.015626  0.109442   
3   -0.858474  0.810685  0.189016 -1.054743 -1.376715  0.014754  0.109893   
4    1.918693  0.811568  0.186538  0.856560 -0.672580 -0.875524 -0.724942   
..        ...       ...       ...       ...       ...       ...       ...   
195  0.108015 -1.234484  0.184665 -1.055502 -0.845864  0.011913  0.109200   
196  0.622142  0.811124  0.186383  0.858444 -0.437672 -0.877515 -0.729354   
197 -0.838788  0.809759  1.454473  2.738451  0.458760  0.897657  1.774924  

p_G_evaluated
[3.81157520e-08 5.34574947e-14 4.63997327e-09 ... 3.88376848e-09
 3.66482481e-12 1.95001630e-07]
p_R_evaluated
[1.75271276e-09 7.10874452e-10 9.99260165e-08 ... 5.14790088e-07
 3.97883512e-09 4.24496697e-06]
Synth_set
            0         1         2         3         4         5         6  \
0    0.641420  0.810027  0.185105 -1.057609 -0.586884  0.014895  0.111574   
1    0.151097 -1.236182  0.184199  0.858245 -0.644664  0.015988  0.112736   
2    2.694727 -1.232211 -1.078748 -1.057104  0.095340 -0.872888  1.774523   
3   -0.821951 -1.234091  0.181725  0.859257  2.281779  0.015216  0.112083   
4    0.513418 -1.233959 -1.082107 -1.055799  2.465685 -0.877145  1.774552   
..        ...       ...       ...       ...       ...       ...       ...   
195 -0.543164 -1.232218  1.441814 -1.056902  1.040604  0.014278  0.110778   
196 -0.865367  0.811660  0.183566  2.763261 -1.149350  0.016522  0.112749   
197 -0.768765  0.809917 -1.088607  0.860952  1.377420 -1.771842 -1.563642  

p_G_evaluated
[9.65471318e-11 1.30565810e-18 3.89989275e-09 ... 4.81653312e-12
 2.33701544e-12 1.13956780e-07]
p_R_evaluated
[2.65501820e-09 7.63233019e-17 4.26169949e-07 ... 2.38814615e-05
 1.03883879e-08 2.86568877e-05]
Synth_set
            0         1         2         3         4         5         6  \
0   -0.976989 -1.235378  1.448860  0.855432 -0.731676  1.803431  1.772249   
1   -0.818042 -1.234270  0.188002  0.860086 -1.040444  0.015604  0.111149   
2   -1.018819  0.810286  0.186900  0.856941 -1.402869  0.909879 -0.726130   
3   -0.826336 -1.236807  0.186536  0.859835  0.104396  0.017331  0.110944   
4    0.097527  0.809735  1.454219 -1.057582  1.720976  0.015506  0.110153   
..        ...       ...       ...       ...       ...       ...       ...   
195 -0.835950  0.810083  0.185809 -1.057502  0.878784  0.911134 -1.552726   
196  1.702676 -1.235179 -1.078629  0.857542 -1.097767 -1.745968 -1.556540   
197  0.526941  0.811650 -1.076039  0.858462 -0.952463 -0.871713 -1.565472  

p_G_evaluated
[4.27430665e-10 6.17183518e-26 2.89575323e-09 ... 3.67959805e-10
 2.57753239e-11 1.08056470e-06]
p_R_evaluated
[6.47700302e-07 2.73833161e-10 1.55178288e-07 ... 7.51309089e-06
 3.07700545e-08 9.65060308e-06]
Synth_set
            0         1         2         3         4         5         6  \
0    0.479194  0.810403 -1.079659 -1.053866  2.270949 -0.875138 -0.721496   
1   -0.056431 -1.233170  0.187414  0.860549 -0.358177  0.016901  0.111752   
2   -0.891281 -1.234766  0.185452 -1.056542  3.072920  0.015304  0.114230   
3   -0.065380  0.809209  0.186446  0.861875  0.236091  0.015660  0.112872   
4   -0.115288 -1.234399  0.186743 -1.054964  2.321932  1.808431  0.114308   
..        ...       ...       ...       ...       ...       ...       ...   
195 -0.999601  0.811117  0.189180 -1.057751 -1.126801  0.914574  1.774268   
196 -0.069952 -1.233224 -1.074577 -1.053810  2.414355 -0.877689 -0.720055   
197 -0.181917 -1.236446  0.187442 -1.054061  0.187577  0.015799  0.114012  

p_G_evaluated
[3.59376280e-09 3.60898844e-19 2.79615332e-08 ... 2.48452521e-08
 1.26469063e-09 2.50239401e-08]
p_R_evaluated
[2.76647482e-07 3.17093222e-10 2.03247935e-07 ... 1.87801278e-06
 4.73665613e-09 1.16372133e-05]
Synth_set
            0         1         2         3         4         5         6  \
0   -0.914578  0.811274 -1.084768  0.857046 -0.501004 -0.876130 -0.726900   
1   -0.437582 -1.228442  3.692078  0.856711 -0.025674 -1.763326 -1.568931   
2   -1.023341 -1.229389  1.442676 -1.058821  2.484901  0.903172  1.787603   
3    1.045927  0.809786  0.182490  0.857562 -0.844037  0.013877  0.110252   
4   -0.721253 -1.233024  1.438388 -1.059310  2.260860 -0.876430  0.110133   
..        ...       ...       ...       ...       ...       ...       ...   
195  0.037474  0.810941 -1.084346  0.856180 -0.524698  0.015366  0.112169   
196 -0.000174  0.810604 -1.084963 -1.058353 -0.184579 -0.875254 -0.727312   
197 -0.763957  0.811873  1.439339 -1.059293 -1.046715  1.787316  1.784850  

In [7]:
training_size = training_sizes[0]
training_epoch = training_epochs[0]
gen_size = gen_size

ref_output = pd.DataFrame([], columns=["iteration", "epoch", "training_size", "reference_size", "gen_size,", "src", "aucroc"])

for iteration in range(1, num_iterations + 1):
    for reference_set_size in reference_set_sizes:
        reference_set_size_res = results[iteration][reference_set_size][training_epoch]
        perf = reference_set_size_res  # Assuming perf is the correct value for accuracy

        for key in perf:
            ref_output = pd.concat(
                [
                    ref_output,
                    pd.DataFrame(
                        [
                            [iteration, training_epoch, training_size, reference_set_size, gen_size, key, perf[key]["aucroc"]]
                        ],
                        columns=["iteration", "epoch", "training_size", "reference_size", "gen_size", "src", "aucroc"],
                    ),
                ]
            )
ref_output

Unnamed: 0,iteration,epoch,training_size,reference_size,"gen_size,",src,aucroc,gen_size
0,1,2000,10000,500,,ablated_eq1,0.506013,200.0
0,1,2000,10000,500,,ablated_eq2,0.505087,200.0
0,1,2000,10000,500,,LOGAN_D1,0.492027,200.0
0,1,2000,10000,500,,MC,0.506301,200.0
0,1,2000,10000,500,,gan_leaks,0.506142,200.0
...,...,...,...,...,...,...,...,...
0,5,2000,10000,10000,,gan_leaks,0.508675,200.0
0,5,2000,10000,10000,,gan_leaks_cal,0.507597,200.0
0,5,2000,10000,10000,,LOGAN_0,0.496254,200.0
0,5,2000,10000,10000,,eq1,0.507097,200.0


In [8]:
#results df to csv
ref_output.to_csv('kde_ref_output_UCI.csv')