In [2]:
# Baselines 
# Use sdv_new 
#Save the models
from sdv.single_table import TVAESynthesizer, GaussianCopulaSynthesizer, CTGANSynthesizer
import pandas as pd
import numpy as np
from sdv.metadata import SingleTableMetadata
import warnings
warnings.filterwarnings('ignore')


datasets = ["travelcustomer", "adult", "creditdefault", "heloc"]
synthesizers = [TVAESynthesizer, GaussianCopulaSynthesizer, CTGANSynthesizer]
synthesizer_names = ["tvae", "copula_gan", "ctgan"]
random_states = [42,52,62,72]

for dataset in datasets:
    for random_state in random_states:
        filepath = f'./{dataset}/original_{dataset}_train_{random_state}.csv'
        train_df = pd.read_csv(filepath)
        metadata = SingleTableMetadata()
        metadata.detect_from_dataframe(data=train_df)
        i = 0
        for synthesizer in synthesizers:
            print(f'working on {dataset}/{synthesizer_names[i]}_{dataset}_train_{random_state}')
            s = synthesizer(metadata)
            s.fit(train_df)
            synth_samples_size = train_df.shape[0]
            synthetic_samples = s.sample(synth_samples_size)
            synthetic_samples.to_csv(f'./{dataset}/{synthesizer_names[i]}_{dataset}_train_{random_state}.csv', index=False)
            print(f'Save csv on {dataset}/{synthesizer_names[i]}_{dataset}_train_{random_state}')
            i +=1 



working on travelcustomer/tvae_travelcustomer_train_42
Save csv on travelcustomer/tvae_travelcustomer_train_42
working on travelcustomer/copula_gan_travelcustomer_train_42
Save csv on travelcustomer/copula_gan_travelcustomer_train_42
working on travelcustomer/ctgan_travelcustomer_train_42
Save csv on travelcustomer/ctgan_travelcustomer_train_42
working on travelcustomer/tvae_travelcustomer_train_52
Save csv on travelcustomer/tvae_travelcustomer_train_52
working on travelcustomer/copula_gan_travelcustomer_train_52
Save csv on travelcustomer/copula_gan_travelcustomer_train_52
working on travelcustomer/ctgan_travelcustomer_train_52
Save csv on travelcustomer/ctgan_travelcustomer_train_52
working on travelcustomer/tvae_travelcustomer_train_62
Save csv on travelcustomer/tvae_travelcustomer_train_62
working on travelcustomer/copula_gan_travelcustomer_train_62
Save csv on travelcustomer/copula_gan_travelcustomer_train_62
working on travelcustomer/ctgan_travelcustomer_train_62
Save csv on trav

In [4]:
#Generate Samples and save
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, auc, precision_recall_curve
from rdt import HyperTransformer

# Define the list of datasets, synthesizer names, and random states
datasets = ["travelcustomer", "adult", "creditdefault", "heloc"]
# synthesizer_names = ["tvae", "copula_gan", "ctgan"]
# synthesizer_names = ["original",]
synthesizer_names = ["ctvae",]
# random_states = [42, 52, 62, 72]
random_states = [42]


results = []

for dataset in datasets:
    for synthesizer_name in synthesizer_names:
        for random_state in random_states:
            # Read the training and test data
            # train_df = pd.read_csv(f'./{dataset}/{synthesizer_name}_{dataset}_train_{random_state}.csv')

            train_df = pd.read_csv(f'./{dataset}/{synthesizer_name}_{dataset}_synth_{random_state}_False_4_way.csv')
            test_df = pd.read_csv(f'./{dataset}/original_{dataset}_test_{random_state}.csv')

            # Splitting features and labels
            ht = HyperTransformer()
            X_train = train_df.iloc[:, :-1]
            ht.detect_initial_config(data=X_train)
            X_train = ht.fit_transform(X_train)
            y_train = train_df.iloc[:, -1]

            X_test = test_df.iloc[:, :-1]
            ht.detect_initial_config(data=X_test)
            X_test = ht.fit_transform(X_test)
            y_test = test_df.iloc[:, -1]

            # Initialize classifiers
            lr = LogisticRegression(max_iter=1000)
            dt = DecisionTreeClassifier()
            rf = RandomForestClassifier()
            svc = SVC(probability=True)  # Set probability to True for SVC

            classifiers = [lr, dt, rf, svc]

            for classifier in classifiers:
                # Fit and predict using the classifier
                classifier.fit(X_train, y_train)
                y_pred = classifier.predict(X_test)

                # Calculate accuracy
                accuracy = accuracy_score(y_test, y_pred)

                # Calculate ROC AUC
                if hasattr(classifier, "predict_proba"):
                    roc_auc = roc_auc_score(y_test, classifier.predict_proba(X_test)[:, 1])
                else:
                    roc_auc = None

                # Calculate Precision-Recall AUC
                if hasattr(classifier, "predict_proba"):
                    precision, recall, _ = precision_recall_curve(y_test, classifier.predict_proba(X_test)[:, 1])
                    pr_auc = auc(recall, precision)
                else:
                    pr_auc = None

                # Add results to the array
                results.append([dataset, synthesizer_name, random_state, str(classifier), accuracy, roc_auc, pr_auc])

# Convert results to a DataFrame
results_df = pd.DataFrame(results, columns=["Dataset", "Synthesizer", "Random_State", "Classifier", "Accuracy", "ROC_AUC", "PR_AUC"])
print(results_df)
# Export results as a CSV file
# results_df.to_csv("original_tabular.csv", index=False)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


           Dataset Synthesizer  Random_State  \
0   travelcustomer       ctvae            42   
1   travelcustomer       ctvae            42   
2   travelcustomer       ctvae            42   
3   travelcustomer       ctvae            42   
4            adult       ctvae            42   
5            adult       ctvae            42   
6            adult       ctvae            42   
7            adult       ctvae            42   
8    creditdefault       ctvae            42   
9    creditdefault       ctvae            42   
10   creditdefault       ctvae            42   
11   creditdefault       ctvae            42   
12           heloc       ctvae            42   
13           heloc       ctvae            42   
14           heloc       ctvae            42   
15           heloc       ctvae            42   

                           Classifier  Accuracy   ROC_AUC    PR_AUC  
0   LogisticRegression(max_iter=1000)  0.773519  0.706621  0.481236  
1            DecisionTreeClassifier()  0.60

In [2]:
import pandas as pd

# Load the results from the CSV file
# results_df = pd.read_csv("tabular_results.csv")
#Original Data Average
results_df = pd.read_csv("original_tabular.csv")
# Group by Random_State and compute the average accuracy
average_accuracies = results_df.groupby(["Dataset","Synthesizer", "Classifier"])["Accuracy"].mean()

print("Average Accuracy for Different Random States:")
print(average_accuracies)


Average Accuracy for Different Random States:
Dataset         Synthesizer  Classifier                       
adult           original     DecisionTreeClassifier()             0.747373
                             LogisticRegression(max_iter=1000)    0.800297
                             RandomForestClassifier()             0.821709
                             SVC(probability=True)                0.800604
creditdefault   original     DecisionTreeClassifier()             0.725583
                             LogisticRegression(max_iter=1000)    0.781250
                             RandomForestClassifier()             0.816167
                             SVC(probability=True)                0.781250
heloc           original     DecisionTreeClassifier()             0.632409
                             LogisticRegression(max_iter=1000)    0.717097
                             RandomForestClassifier()             0.719168
                             SVC(probability=True)                

# Statistical Metric

In [19]:
# Model Evaluate with Statistical Evaluation

#Generate Samples and save
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, auc, precision_recall_curve
from rdt import HyperTransformer
from sdv.metrics.tabular import ContinuousKLDivergence, DiscreteKLDivergence,  KSComplement
import warnings
warnings.filterwarnings('ignore')
from rdt import HyperTransformer

# Define the list of datasets, synthesizer names, and random states
datasets = ["travelcustomer", "adult", "creditdefault", "heloc"]
synthesizer_names = ["tvae", "copula_gan", "ctgan"]
random_states = [42, 52, 62, 72]

results = []

for dataset in datasets:
    for synthesizer_name in synthesizer_names:
        for random_state in random_states:
            # Read the original and synth data
            original_df = pd.read_csv(f'./{dataset}/original_{dataset}_train_{random_state}.csv')
            synth_df = pd.read_csv(f'./{dataset}/{synthesizer_name}_{dataset}_train_{random_state}.csv')
            ht = HyperTransformer()
            ht.detect_initial_config(data=original_df)

            cont_KLD = ContinuousKLDivergence.compute(original_df,synth_df)
            try:
                cat_KLD = DiscreteKLDivergence.compute(original_df, synth_df)
            except Exception as e:
                cat_KLD = cont_KLD
            KLD = (cat_KLD + cont_KLD)/2

            ks_test = KSComplement.compute(original_df, synth_df)
            print(f'{dataset}_{synthesizer_name}_{random_state}: KLD: {KLD},  KS_test: {ks_test}')
            

        
# Convert results to a DataFrame
# results_df = pd.DataFrame(results, columns=["Dataset", "Synthesizer", "Random_State", "Classifier", "Accuracy", "ROC_AUC", "PR_AUC"])

# # Export results as a CSV file
# results_df.to_csv("results.csv", index=False)


travelcustomer_tvae_42: KLD: 0.7512895213238691,  KS_test: 0.7636181909045477
travelcustomer_tvae_52: KLD: 0.7000539963832053,  KS_test: 0.6956521739130435
travelcustomer_tvae_62: KLD: 0.6614089802809908,  KS_test: 0.6881559220389805
travelcustomer_tvae_72: KLD: 0.6590372887928703,  KS_test: 0.6786606696651672
travelcustomer_copula_gan_42: KLD: 0.6374969301632873,  KS_test: 0.824087956021989
travelcustomer_copula_gan_52: KLD: 0.6814920012308561,  KS_test: 0.8500749625187406
travelcustomer_copula_gan_62: KLD: 0.6586030462994104,  KS_test: 0.8160919540229884
travelcustomer_copula_gan_72: KLD: 0.6496891875980255,  KS_test: 0.8105947026486757
travelcustomer_ctgan_42: KLD: 0.7691260301982077,  KS_test: 0.896551724137931
travelcustomer_ctgan_52: KLD: 0.7410068169137854,  KS_test: 0.7981009495252374
travelcustomer_ctgan_62: KLD: 0.6930600968060188,  KS_test: 0.8405797101449276
travelcustomer_ctgan_72: KLD: 0.65215685161038,  KS_test: 0.8045977011494253
adult_tvae_42: KLD: 0.9255663120991164, 

In [24]:
#Returns the average KLD and KS Test
# This is for CT_VAE

datasets = ["travelcustomer", "adult", "creditdefault", "heloc"]
synthesizer_names = ["tvae", "copula_gan", "ctgan"]
random_states = [42, 52, 62, 72]
dps = [True, False]
results = []

for dataset in datasets:
    for dp in dps:
        total_KLD = 0
        total_KS_test = 0
        num_random_states = len(random_states)
        
        for random_state in random_states:
            # Read the original and synth data
            original_df = pd.read_csv(f'./{dataset}/original_{dataset}_train_{random_state}.csv')
            synth_df = pd.read_csv(f'./{dataset}/ctvae_{dataset}_synth_42_{dp}_4_way.csv')
            ht = HyperTransformer()
            ht.detect_initial_config(data=original_df)

            cont_KLD = ContinuousKLDivergence.compute(original_df, synth_df)
            try:
                cat_KLD = DiscreteKLDivergence.compute(original_df, synth_df)
            except Exception as e:
                cat_KLD = cont_KLD
            KLD = (cat_KLD + cont_KLD) / 2

            ks_test = KSComplement.compute(original_df, synth_df)
            
            total_KLD += KLD
            total_KS_test += ks_test

        avg_KLD = total_KLD / num_random_states
        avg_KS_test = total_KS_test / num_random_states
        
        print(f'Average for {dataset}_{dp}: KLD: {avg_KLD }, KS_test: {avg_KS_test}')


Average for travelcustomer_True: KLD: 0.813742498401042, KS_test: 0.8831637326860249
Average for travelcustomer_False: KLD: 0.8527883008816997, KS_test: 0.8866855841293968
Average for adult_True: KLD: 0.8589174029181508, KS_test: 0.7099127739852209
Average for adult_False: KLD: 0.8377275144177441, KS_test: 0.771715887210191
Average for creditdefault_True: KLD: 0.8169601369112383, KS_test: 0.8049167251708322
Average for creditdefault_False: KLD: 0.8395375985164992, KS_test: 0.8961905680592541
Average for heloc_True: KLD: 0.829303712890199, KS_test: 0.8581186768822671
Average for heloc_False: KLD: 0.9096828363301184, KS_test: 0.9183692987334953


In [1]:
#Returns the average KLD and KS Test
# This is for DP-TVE

#Generate Samples and save
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, auc, precision_recall_curve
from rdt import HyperTransformer
from sdv.metrics.tabular import ContinuousKLDivergence, DiscreteKLDivergence,  KSComplement
import warnings
warnings.filterwarnings('ignore')
from rdt import HyperTransformer


datasets = ["travelcustomer", "adult", "creditdefault", "heloc"]
synthesizer_names = ["tvae", "copula_gan", "ctgan"]
random_states = [42, 52, 62, 72]
dps = [True]
results = []

for dataset in datasets:
    for dp in dps:
        total_KLD = 0
        total_KS_test = 0
        num_random_states = len(random_states)
        
        for random_state in random_states:
            # Read the original and synth data
            original_df = pd.read_csv(f'./{dataset}/original_{dataset}_train_{random_state}.csv')
            synth_df = pd.read_csv(f'./{dataset}/dptvae_{dataset}_synth_42_{dp}_4_way.csv')
            ht = HyperTransformer()
            ht.detect_initial_config(data=original_df)

            cont_KLD = ContinuousKLDivergence.compute(original_df, synth_df)
            try:
                cat_KLD = DiscreteKLDivergence.compute(original_df, synth_df)
            except Exception as e:
                cat_KLD = cont_KLD
            KLD = (cat_KLD + cont_KLD) / 2

            ks_test = KSComplement.compute(original_df, synth_df)
            
            total_KLD += KLD
            total_KS_test += ks_test

        avg_KLD = total_KLD / num_random_states
        avg_KS_test = total_KS_test / num_random_states
        
        print(f'Average for {dataset}_{dp}: KLD: {avg_KLD }, KS_test: {avg_KS_test}')


Average for travelcustomer_True: KLD: 0.635456788373729, KS_test: 0.6893078690012792
Average for adult_True: KLD: 0.8180808629885759, KS_test: 0.5592157585698175
Average for creditdefault_True: KLD: 0.7335236083448036, KS_test: 0.7404680537352556
Average for heloc_True: KLD: 0.712894349938328, KS_test: 0.7311110931592385


# Privacy Metric

In [5]:
# Model Evaluate with Statistical Evaluation
#For Baselines
#Generate Samples and save
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, auc, precision_recall_curve
from rdt import HyperTransformer
from sdv.metrics.tabular import NumericalPrivacyMetric, CategoricalPrivacyMetric
from sdmetrics.single_table import CategoricalKNN, NumericalMLP
import warnings
from sdv.metadata import SingleTableMetadata
warnings.filterwarnings('ignore')
from rdt import HyperTransformer

# Define the list of datasets, synthesizer names, and random states
datasets = ["travelcustomer", "adult", "creditdefault", "heloc"]
synthesizer_names = ["tvae", "copula_gan", "ctgan"]
random_states = [42]
results = []

for dataset in datasets:
    for synthesizer_name in synthesizer_names:
        for random_state in random_states:
            # Read the original and synth data
            original_df = pd.read_csv(f'./{dataset}/original_{dataset}_train_{random_state}.csv')
            synth_df = pd.read_csv(f'./{dataset}/{synthesizer_name}_{dataset}_train_{random_state}.csv')
            ht = HyperTransformer()
            ht.detect_initial_config(data=original_df)
            metadata = SingleTableMetadata()
            metadata.detect_from_dataframe(data=original_df)
            categorical_keys = []
            continuous_keys = []
            target_key = []
            columns_metadata = metadata.columns
            last_key = list(columns_metadata.keys())[-1]
            target_key.append(last_key)

            for key, value in columns_metadata.items():
                if key == last_key:
                    continue  # Skip the last key since it's already in the "other" category
                if value["sdtype"] == "categorical":
                    categorical_keys.append(key)
                elif value["sdtype"] == "numerical":
                    continuous_keys.append(key)
            # print(categorical_keys, continuous_keys, target_key)
            # continue
            num_privacy = NumericalMLP.compute(
                    real_data = original_df, 
                    synthetic_data= synth_df,
                    key_fields=continuous_keys,
                    sensitive_fields= target_key)
            try:
                cat_privacy = CategoricalKNN.compute(
                    real_data = original_df, 
                    synthetic_data= synth_df,
                    key_fields=categorical_keys,
                    sensitive_fields= target_key)
            except Exception as e:
                cat_privacy = num_privacy
            privacy_score = (num_privacy + cat_privacy)/2


            print(f'{dataset}_{synthesizer_name}_{random_state}: Numerical Privacy: {num_privacy}, Categorical Privacy: {cat_privacy}, Privacy: {privacy_score}')
            

        
# Convert results to a DataFrame
# results_df = pd.DataFrame(results, columns=["Dataset", "Synthesizer", "Random_State", "Classifier", "Accuracy", "ROC_AUC", "PR_AUC"])

# # Export results as a CSV file
# results_df.to_csv("results.csv", index=False)


travelcustomer_tvae_42: Numerical Privacy: 0.06891647738049293, Categorical Privacy: 0.23838080959520236, Privacy: 0.15364864348784765
travelcustomer_copula_gan_42: Numerical Privacy: 0.09554602875970127, Categorical Privacy: 0.26086956521739135, Privacy: 0.1782077969885463
travelcustomer_ctgan_42: Numerical Privacy: 0.0696759617287764, Categorical Privacy: 0.2278860569715142, Privacy: 0.1487810093501453
adult_tvae_42: Numerical Privacy: 0.2882446797377596, Categorical Privacy: 0.25341484103074086, Privacy: 0.2708297603842502
adult_copula_gan_42: Numerical Privacy: 0.2958557026473682, Categorical Privacy: 0.27558571470355964, Privacy: 0.28572070867546395
adult_ctgan_42: Numerical Privacy: 0.26388660622896, Categorical Privacy: 0.2194565503524526, Privacy: 0.2416715782907063
creditdefault_tvae_42: Numerical Privacy: 0.27161795990174176, Categorical Privacy: 0.27161795990174176, Privacy: 0.27161795990174176
creditdefault_copula_gan_42: Numerical Privacy: 0.29038983181339667, Categorical 

In [6]:
# Model Evaluate with Statistical Evaluation
#For CT-VAE with DP and non DP
#Generate Samples and save
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, auc, precision_recall_curve
from rdt import HyperTransformer
from sdv.metrics.tabular import NumericalPrivacyMetric, CategoricalPrivacyMetric
from sdmetrics.single_table import CategoricalKNN, NumericalMLP
import warnings
from sdv.metadata import SingleTableMetadata
warnings.filterwarnings('ignore')
from rdt import HyperTransformer

# Define the list of datasets, synthesizer names, and random states
datasets = ["travelcustomer", "adult", "creditdefault", "heloc"]
synthesizer_names = ["tvae", "copula_gan", "ctgan"]
random_states = [42]
results = []
dps = [True, False]

for dataset in datasets:
    for dp in dps:
        for random_state in random_states:
            # Read the original and synth data
            original_df = pd.read_csv(f'./{dataset}/original_{dataset}_train_{random_state}.csv')
            synth_df = pd.read_csv(f'./{dataset}/ctvae_{dataset}_synth_42_{dp}_4_way.csv')
            ht = HyperTransformer()
            ht.detect_initial_config(data=original_df)
            metadata = SingleTableMetadata()
            metadata.detect_from_dataframe(data=original_df)
            categorical_keys = []
            continuous_keys = []
            target_key = []
            columns_metadata = metadata.columns
            last_key = list(columns_metadata.keys())[-1]
            target_key.append(last_key)

            for key, value in columns_metadata.items():
                if key == last_key:
                    continue  # Skip the last key since it's already in the "other" category
                if value["sdtype"] == "categorical":
                    categorical_keys.append(key)
                elif value["sdtype"] == "numerical":
                    continuous_keys.append(key)
            # print(categorical_keys, continuous_keys, target_key)
            # continue
            num_privacy = NumericalMLP.compute(
                    real_data = original_df, 
                    synthetic_data= synth_df,
                    key_fields=continuous_keys,
                    sensitive_fields= target_key)
            try:
                cat_privacy = CategoricalKNN.compute(
                    real_data = original_df, 
                    synthetic_data= synth_df,
                    key_fields=categorical_keys,
                    sensitive_fields= target_key)
            except Exception as e:
                cat_privacy = num_privacy
            privacy_score = (num_privacy + cat_privacy)/2


            print(f'{dataset}_{dp}_{random_state}: Numerical Privacy: {num_privacy}, Categorical Privacy: {cat_privacy}, Privacy: {privacy_score}')
            

        
# Convert results to a DataFrame
# results_df = pd.DataFrame(results, columns=["Dataset", "Synthesizer", "Random_State", "Classifier", "Accuracy", "ROC_AUC", "PR_AUC"])

# # Export results as a CSV file
# results_df.to_csv("results.csv", index=False)


travelcustomer_True_42: Numerical Privacy: 0.1490787091588749, Categorical Privacy: 0.26686656671664166, Privacy: 0.20797263793775828
travelcustomer_False_42: Numerical Privacy: 0.09247190185164218, Categorical Privacy: 0.23838080959520236, Privacy: 0.16542635572342226
adult_True_42: Numerical Privacy: 0.23108616972739043, Categorical Privacy: 0.6998449793793325, Privacy: 0.46546557455336146
adult_False_42: Numerical Privacy: 0.2313228472715006, Categorical Privacy: 0.4195501477083272, Privacy: 0.3254364974899139
creditdefault_True_42: Numerical Privacy: 0.30063922252040404, Categorical Privacy: 0.30063922252040404, Privacy: 0.30063922252040404
creditdefault_False_42: Numerical Privacy: 0.29699793995894735, Categorical Privacy: 0.29699793995894735, Privacy: 0.29699793995894735
heloc_True_42: Numerical Privacy: 0.17292430094296862, Categorical Privacy: 0.17292430094296862, Privacy: 0.17292430094296862
heloc_False_42: Numerical Privacy: 0.12696824319191005, Categorical Privacy: 0.1269682

In [2]:
# Model Evaluate with Statistical Evaluation
#For DP-TVAE 
#Generate Samples and save
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, auc, precision_recall_curve
from rdt import HyperTransformer
from sdv.metrics.tabular import NumericalPrivacyMetric, CategoricalPrivacyMetric
from sdmetrics.single_table import CategoricalKNN, NumericalMLP
import warnings
from sdv.metadata import SingleTableMetadata
warnings.filterwarnings('ignore')
from rdt import HyperTransformer

# Define the list of datasets, synthesizer names, and random states
datasets = ["travelcustomer", "adult", "creditdefault", "heloc"]
synthesizer_names = ["tvae", "copula_gan", "ctgan"]
random_states = [42]
results = []
dps = [True]

for dataset in datasets:
    for dp in dps:
        for random_state in random_states:
            # Read the original and synth data
            original_df = pd.read_csv(f'./{dataset}/original_{dataset}_train_{random_state}.csv')
            synth_df = pd.read_csv(f'./{dataset}/dptvae_{dataset}_synth_42_{dp}_4_way.csv')
            ht = HyperTransformer()
            ht.detect_initial_config(data=original_df)
            metadata = SingleTableMetadata()
            metadata.detect_from_dataframe(data=original_df)
            categorical_keys = []
            continuous_keys = []
            target_key = []
            columns_metadata = metadata.columns
            last_key = list(columns_metadata.keys())[-1]
            target_key.append(last_key)

            for key, value in columns_metadata.items():
                if key == last_key:
                    continue  # Skip the last key since it's already in the "other" category
                if value["sdtype"] == "categorical":
                    categorical_keys.append(key)
                elif value["sdtype"] == "numerical":
                    continuous_keys.append(key)
            # print(categorical_keys, continuous_keys, target_key)
            # continue
            num_privacy = NumericalMLP.compute(
                    real_data = original_df, 
                    synthetic_data= synth_df,
                    key_fields=continuous_keys,
                    sensitive_fields= target_key)
            try:
                cat_privacy = CategoricalKNN.compute(
                    real_data = original_df, 
                    synthetic_data= synth_df,
                    key_fields=categorical_keys,
                    sensitive_fields= target_key)
            except Exception as e:
                cat_privacy = num_privacy
            privacy_score = (num_privacy + cat_privacy)/2


            print(f'{dataset}_{dp}_{random_state}: Numerical Privacy: {num_privacy}, Categorical Privacy: {cat_privacy}, Privacy: {privacy_score}')
            

        
# Convert results to a DataFrame
# results_df = pd.DataFrame(results, columns=["Dataset", "Synthesizer", "Random_State", "Classifier", "Accuracy", "ROC_AUC", "PR_AUC"])

# # Export results as a CSV file
# results_df.to_csv("results.csv", index=False)


travelcustomer_True_42: Numerical Privacy: 0.06849191236680591, Categorical Privacy: 0.24437781109445278, Privacy: 0.15643486173062934
adult_True_42: Numerical Privacy: 0.3206637924281312, Categorical Privacy: 0.39530258270203866, Privacy: 0.35798318756508496
creditdefault_True_42: Numerical Privacy: 0.28736415998678083, Categorical Privacy: 0.28736415998678083, Privacy: 0.28736415998678083
heloc_True_42: Numerical Privacy: 0.20185080700781463, Categorical Privacy: 0.20185080700781463, Privacy: 0.20185080700781463
