In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from metrics.pairwise_corr import *
from metrics.JSD import JSD
from metrics.pra_measures import *
from metrics.record_linkage import *
from metrics.DCR_NNDR import *
from metrics.pMSE_python import *
from metrics.automatic_categorization import *
from metrics.calculate_IDR import *
from metrics.calculate_CAP import *
from metrics.Population_uniqueness import *
import time

## Simulation Data

In [None]:
def create_var_info_and_type_dict(p):
    """Return the column-role dictionaries for a simulated dataset with ``p`` variables.

    Parameters
    ----------
    p : int
        Number of variables in the dataset; must compare equal to 5, 10, 15 or 20.

    Returns
    -------
    tuple of (dict, dict)
        ``var_info`` lists column names under the keys 'numeric', 'categorical'
        and 'ordinal'. ``type_dict`` lists the same columns under 'cont', 'cat',
        'ord' and 'bin'; the 'categorical' columns of ``var_info`` are split
        between 'cat' and 'bin' there.

    Raises
    ------
    ValueError
        If ``p`` is not one of the supported sizes.
    """
    # One entry per supported size: (p, numeric, categorical, ordinal, cat, bin).
    layouts = (
        (
            5,
            ['Var4', 'Var9'],
            ['Var1', 'Var2'],
            ['Var10'],
            ['Var2'],
            ['Var1'],
        ),
        (
            10,
            ['Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'Var9'],
            ['Var1', 'Var2', 'Var3'],
            ['Var10'],
            ['Var2', 'Var3'],
            ['Var1'],
        ),
        (
            15,
            ['Var4', 'Var9', 'Var14', 'Var15', 'Var16', 'Var17', 'Var18', 'Var19'],
            ['Var1', 'Var2', 'Var11', 'Var12', 'Var13'],
            ['Var10', 'Var20'],
            ['Var2', 'Var12', 'Var13'],
            ['Var1', 'Var11'],
        ),
        (
            20,
            ['Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'Var9',
             'Var14', 'Var15', 'Var16', 'Var17', 'Var18', 'Var19'],
            ['Var1', 'Var2', 'Var3', 'Var11', 'Var12', 'Var13'],
            ['Var10', 'Var20'],
            ['Var2', 'Var3', 'Var12', 'Var13'],
            ['Var1', 'Var11'],
        ),
    )

    for size, numeric, categorical, ordinal, multi_cat, binary in layouts:
        if p == size:
            # Hand out independent list objects so callers can mutate one
            # dictionary without affecting the other.
            var_info = {
                'numeric': list(numeric),
                'categorical': list(categorical),
                'ordinal': list(ordinal),
            }
            type_dict = {
                'cont': list(numeric),
                'cat': list(multi_cat),
                'ord': list(ordinal),
                'bin': list(binary),
            }
            return var_info, type_dict

    raise ValueError("Unsupported value of p. Only p=5, p=10, p=15, p=20 are supported.")

In [None]:
# Simulation driver: for every (method, n, p) combination, load an original and a
# comparison dataset, compute each metric, and write one result CSV per combination
# to simulation_data/result/metric_gaussian_{method}_{n}x{p}.csv.
n_list = [5000, 10000, 50000, 100000]
p_list = [5, 10, 15, 20]

# For 'orig' the comparison file is a second original file (orig_data_2_*), not a
# synthetic one; every other entry reads syn_data_* from its own folder.
syn_method = ['orig', 'syn_default', 'syn_spline', 'syn_density', 'ctgan']

# Metrics recomputed on every repetition vs. metrics computed once per dataset.
# The result CSV stores them in this order, after the 'type' and 'np' columns.
repeat_metrics = ['pMSE_LR_V', 'pMSE_LR_R', 'pMSE_LR_S', 'pMSE_T_V', 'pMSE_T_R', 'pMSE_T_S', 'IDR', 'CAP_0', 'CAP_NA', 'Pop_Uniq']
fixed_metrics = ['JSD', 'Dpa', 'Dcb', 'Authenticity', 'Record_Linkage', 'DCR', 'NNDR', 'Pairwise Correlation']
all_metrics = repeat_metrics + fixed_metrics

# Column passed to calculate_IDR; every other column is passed to CAP as attributes.
sensitive = ['Var10']

for method in syn_method:
    for n in n_list:
        for p in p_list:

            # Number of repetitions of the repeated metrics (fewer for the largest n).
            step = 10 if n == 100000 else 50

            start_time = time.time()

            orig_data_1 = pd.read_csv(f"simulation_data/{method}/orig_data_1_{n}_{p}.csv")  # file path
            if method == 'orig':
                syn_data_1 = pd.read_csv(f"simulation_data/{method}/orig_data_2_{n}_{p}.csv")  # file path
            else:
                syn_data_1 = pd.read_csv(f"simulation_data/{method}/syn_data_{n}_{p}.csv") # file path

            data_len = len(orig_data_1)
            dim = f'{n}x{p}'
            result_path = f"simulation_data/result/metric_gaussian_{method}_{dim}.csv"

            # One row per repetition; fixed metrics are repeated on every row.
            metric = pd.DataFrame(columns=['type', 'np'] + all_metrics)
            metric['np'] = [dim] * step
            metric['type'] = [method] * step

            var_info, type_dict = create_var_info_and_type_dict(p)

            ### Fixed metrics evaluation ###

            # Pairwise Correlation (computed before the string cast below)
            corr_orig, corr_syn, norm_ = compare_pairwise_corr(orig_data_1, syn_data_1, type_dict)
            metric['Pairwise Correlation'] = [norm_] * step

            # Cast categorical and ordinal columns to strings for the remaining metrics.
            # Whole-column assignment replaces the columns; `df.loc[:, cols] = ...`
            # would instead try to write strings into the existing (possibly numeric)
            # arrays, which pandas deprecates as an incompatible-dtype set.
            str_cols = var_info['categorical'] + var_info['ordinal']
            orig_data_1[str_cols] = orig_data_1[str_cols].astype('str')
            syn_data_1[str_cols] = syn_data_1[str_cols].astype('str')

            # JSD
            jsd = JSD(orig_data_1, syn_data_1, var_info)[0]
            metric['JSD'] = [jsd] * step

            # pra_measures
            compute_syn = compute_metrics(orig_data_1, syn_data_1)
            metric['Dpa'] = [compute_syn[0]['Dpa']] * step
            metric['Dcb'] = [compute_syn[0]['Dcb']] * step
            metric['Authenticity'] = [compute_syn[0]['mean_aut']] * step

            # Record_Linkage, divided by the squared number of original rows
            data_len2 = data_len ** 2
            rl = record_linakge(orig_data_1, syn_data_1, var_info) / data_len2
            metric['Record_Linkage'] = [rl] * step

            # DCR, NNDR (means of the first and second outputs of DCR_NNDR)
            syn_DN = DCR_NNDR(orig_data_1, syn_data_1, var_info)
            syn_data_DCR = np.mean(syn_DN[0])
            syn_data_NNDR = np.mean(syn_DN[1])
            metric['DCR'] = [syn_data_DCR] * step
            metric['NNDR'] = [syn_data_NNDR] * step

            # Checkpoint the fixed metrics before the repeated evaluations start.
            metric.to_csv(result_path, sep=",", index = False)

            ### Repeated metrics evaluation ###
            for i in range(step):
                # pMSE: row 0 -> pMSE_LR_*, row 1 -> pMSE_T_*; columns -> V, R, S
                pMSE = pMSE_compare(orig_data_1, syn_data_1)[0]
                metric.at[i, 'pMSE_LR_V'] = pMSE.iloc[0,0]
                metric.at[i, 'pMSE_LR_R'] = pMSE.iloc[0,1]
                metric.at[i, 'pMSE_LR_S'] = pMSE.iloc[0,2]
                metric.at[i, 'pMSE_T_V'] = pMSE.iloc[1,0]
                metric.at[i, 'pMSE_T_R'] = pMSE.iloc[1,1]
                metric.at[i, 'pMSE_T_S'] = pMSE.iloc[1,2]

                # IDR, CAP on the k-means-categorized data
                attribute = [col for col in orig_data_1.columns if col != 'Var10']
                data_orig_cat, data_syn_cat = categorize_df_kmeans(orig_data_1, syn_data_1, var_info['numeric'])
                IDR, mean_IDR = calculate_IDR(data_orig_cat, data_syn_cat, sensitive)
                metric.at[i, 'IDR'] = mean_IDR

                CAP_values = CAP(data_orig_cat, data_syn_cat, attribute)
                metric.at[i, 'CAP_0'] = CAP_values['CAP_0']
                metric.at[i, 'CAP_NA'] = CAP_values['CAP_NA']

                # Population Uniqueness
                pop = pop_uni(orig_data_1, syn_data_1, var_info)
                metric.at[i, 'Pop_Uniq'] = pop

                # Checkpoint after every repetition so partial results survive an
                # interruption. The frame is not read back from disk: that would
                # re-parse every float each iteration for no benefit.
                metric.to_csv(result_path, sep=",", index = False)
                print(metric.iloc[i])
                print(f'Simulation {i+1} : finished')
                print('--------------------------------------------------')

            end_time = time.time()
            execution_time = (end_time - start_time)/60
            print(f"Code executed in: {execution_time:.1f} minutes")
                            

### Aggregation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Aggregation: summarise every per-combination result file into one table with a
# row per (n, p, method, statistic) and a column per metric.
all_metrics = ['pMSE_LR_V', 'pMSE_LR_R', 'pMSE_LR_S', 'pMSE_T_V', 'pMSE_T_R', 'pMSE_T_S', 'IDR', 'CAP_0', 'CAP_NA', 'Pop_Uniq',
                  'JSD', 'Dpa', 'Dcb', 'Authenticity', 'Record_Linkage', 'DCR', 'NNDR', 'Pairwise Correlation']
# Not used in this cell; now includes 'pMSE_LR_V' so it matches the leading
# entries of all_metrics.
repeat_metrics = ['pMSE_LR_V', 'pMSE_LR_R', 'pMSE_LR_S', 'pMSE_T_V', 'pMSE_T_R', 'pMSE_T_S', 'IDR', 'CAP_0', 'CAP_NA', 'Pop_Uniq']

n_list = [5000, 10000, 50000, 100000]
p_list = [5, 10, 15, 20]
syn_method = ['orig', 'syn_default', 'syn_spline', 'syn_density', 'ctgan']
stat_list = ['mean', 'sd', 'min', 'max']

# Output label -> column-wise reduction producing it ('sd' is the sample std).
stat_funcs = {
    'mean': pd.DataFrame.mean,
    'sd': pd.DataFrame.std,
    'min': pd.DataFrame.min,
    'max': pd.DataFrame.max,
}

records = []
for n in n_list:
    # Number of leading rows of each result file that are summarised.
    step = 10 if n == 100000 else 50

    for p in p_list:
        dim = f'{n}x{p}'

        for syn in syn_method:
            # Read each result file once and reuse it for all statistics.
            df = pd.read_csv(f"simulation_data/result/metric_gaussian_{syn}_{dim}.csv")

            # Select metric columns by name so values always land under the
            # matching output column, then keep the first `step` rows.
            numeric_df = df[all_metrics].iloc[0:step]

            for stat in stat_list:
                values = stat_funcs[stat](numeric_df)
                row = {'type': syn, 'np': dim, 'stat': stat}
                row.update(values.to_dict())
                records.append(row)

# Build the frame once from the collected rows. `.at` is a scalar accessor and
# cannot assign a slice of columns, so no row-by-row filling is done.
metric = pd.DataFrame(records, columns=['type', 'np', 'stat'] + all_metrics)

metric.to_csv(f"simulation_data/result/metric_total_cw_toy_plus.csv", sep=",", index = False)