In [None]:
# !pip install numpy
# !pip install pandas

In [None]:
import numpy as np
import pandas as pd
from faker import Faker
import random

#Code for generating a baby dataset



# Shared fake-name generator and the number of generated rows.
name_gen = Faker()
num_samples = 9

# One height per row, drawn from a normal distribution (mean 50, std 1).
height = list(np.random.normal(size=num_samples, loc=50, scale=1))

# Pool of flavours that favourites are drawn from.
classic_icecreams = [
    "Vanilla",
    "Chocolate",
    "Strawberry",
    "Mint Chocolate Chip",
    "Cookies and Cream",
    "Rocky Road",
    "Butter Pecan",
    "Neapolitan",
    "Pistachio",
    "French Vanilla",
]

# One flavour per row, sampled with replacement (random.choices returns a list).
fav_icecream = random.choices(classic_icecreams, k=num_samples)


In [None]:
# Draw the fake names column by column: every first name, then every last name.
first_names = [name_gen.first_name() for _ in range(num_samples)]
last_names = [name_gen.last_name() for _ in range(num_samples)]
name_df = pd.DataFrame({'First_Name': first_names, 'Last_Name': last_names})

# Single-column frames for the numeric and categorical attributes.
height_df = pd.DataFrame({'Height': height})
icecream_df = pd.DataFrame({'Flavour': fav_icecream})

# Place the three frames side by side to form the full table.
full_df = pd.concat([name_df, height_df, icecream_df], axis=1)

In [None]:
from sklearn.manifold import TSNE
from saiph.projection import fit_transform
from sklearn.preprocessing import LabelEncoder

# Hand-written record appended after the generated rows.
sens_individual = ['John', 'Davies', 182.5, 'Vanilla']
new_individual_df = pd.DataFrame(data=[sens_individual], columns=full_df.columns)
all_individuals = pd.concat([full_df, new_individual_df], ignore_index=True)
print(all_individuals)

# Two-dimensional saiph projection of the real rows; the fitted model is kept
# so other data can be projected into the same space.
coord_real_pca, model_pca = fit_transform(all_individuals, nf=2)

In [None]:
#Make a synthetic dataset
from synthesis.synthesizers.privbayes import PrivBayes
from saiph.projection import transform

# Privacy budget handed to PrivBayes; infinity is used for this demo
# (presumably meaning "no privacy noise" -- confirm against the library docs).
eps = np.inf
epsilon = eps

# Fit the synthesizer on the real rows and draw a synthetic sample.
pb = PrivBayes(epsilon=epsilon, verbose=False)
pb.fit(all_individuals)
gen_data = pb.sample()

# Rebuild the sample as a frame indexed 0..n-1, n being the number of real rows.
final_data = pd.DataFrame(
    gen_data.values,
    columns=gen_data.columns,
    index=range(all_individuals.shape[0]),
)
print(final_data)

import matplotlib.pyplot as plt
    
def scatter_plot(coord_real, coord_synth):
    """Plot real and synthetic 2-D coordinates on a single scatter plot.

    Both arguments are indexed by the keys 'Dim. 1' and 'Dim. 2'.
    Real points are drawn in blue, synthetic points in red.
    """
    plt.figure()

    # (coordinates, colour, legend label, opacity): real first, then synthetic.
    layers = (
        (coord_real, 'blue', 'Real', 0.7),
        (coord_synth, 'red', 'Synthetic', 0.5),
    )
    for coords, colour, legend_label, opacity in layers:
        plt.scatter(coords['Dim. 1'], coords['Dim. 2'], color=colour, label=legend_label, alpha=opacity)

    plt.title('Scatter Plot of real and synthetic data')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.legend()
    plt.grid(True)

    plt.show()

# Project the synthetic rows with the model fitted on the real rows,
# then draw both point clouds together.
syn_coords_pca = transform(final_data, model_pca)
scatter_plot(coord_real=coord_real_pca, coord_synth=syn_coords_pca)

In [None]:
# For every dataset variant and privacy budget, print how many distinct rows
# appear in both the real and the synthetic file.
eps_list = [0.02, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5]
for liquorice in [0, 1]:
    real_dir = f'sample_data_{liquorice}.csv'
    real_data = pd.read_csv(real_dir)
    for eps in eps_list:
        syn = pd.read_csv(f"demo_syn_new/syn_no_{liquorice}_{eps}.csv")
        # Inner merge on the shared columns keeps only rows present in both frames.
        shared_rows = real_data.merge(syn, how="inner", indicator=False)
        intersection = shared_rows.drop_duplicates()
        print(len(intersection))

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def scatter_plot_tsne(coord_real, coord_synth):
    """Plot two sets of 2-D embedding coordinates on one scatter plot.

    Parameters
    ----------
    coord_real, coord_synth
        Array-likes accepted by ``pd.DataFrame`` whose first two columns
        (positions 0 and 1) hold the x and y coordinates.
    """
    real = pd.DataFrame(coord_real)
    syn = pd.DataFrame(coord_synth)

    plt.figure()

    # Real points in blue, synthetic points in red.
    plt.scatter(real[0], real[1], color='blue', label='Real', alpha=0.5)
    plt.scatter(syn[0], syn[1], color='red', label='Synthetic', alpha=0.5)

    plt.title('Scatter Plot of real and synthetic data')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.legend()
    plt.grid(True)

    plt.show()


# t-SNE only accepts numeric input, so the text columns are one-hot encoded.
# Real and synthetic rows are encoded together so both share one feature space.
combined = pd.concat([all_individuals, final_data], ignore_index=True)
# The synthetic frame is rebuilt from raw values, so Height may not be numeric yet.
combined['Height'] = pd.to_numeric(combined['Height'])
encoded = pd.get_dummies(combined).astype(float)

# scikit-learn requires perplexity < n_samples; the default (30) exceeds this tiny dataset.
tsne = TSNE(n_components=2, perplexity=min(30.0, len(encoded) - 1))

# Fit ONE embedding for all rows: two independent t-SNE fits produce unrelated
# coordinate systems, which cannot be meaningfully overlaid on the same axes.
embedded = tsne.fit_transform(encoded.to_numpy())
n_real = len(all_individuals)
tsne_coord_real = embedded[:n_real]
tsne_coord_syn = embedded[n_real:]

scatter_plot_tsne(tsne_coord_real, tsne_coord_syn)

In [None]:
from saiph.projection import fit_transform
from saiph.projection import transform

def scatter_plot(coord_real, coord_synth):
    """Overlay the real and synthetic projections in a single figure.

    Each argument is indexed by 'Dim. 1' (x axis) and 'Dim. 2' (y axis).
    """
    plt.figure()

    # Real points first (blue), then synthetic points (red, more transparent).
    real_x, real_y = coord_real['Dim. 1'], coord_real['Dim. 2']
    plt.scatter(real_x, real_y, color='blue', label='Real', alpha=0.7)

    synth_x, synth_y = coord_synth['Dim. 1'], coord_synth['Dim. 2']
    plt.scatter(synth_x, synth_y, color='red', label='Synthetic', alpha=0.5)

    # Figure decoration.
    plt.title('Scatter Plot of real and synthetic data')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.legend()
    plt.grid(True)

    plt.show()
    
# Fit the two-dimensional saiph projection on the real rows ...
pca_coord_real, model = fit_transform(all_individuals, nf=2)
# ... reuse the fitted model for the synthetic rows ...
pca_coord_syn = transform(final_data, model)

# ... and draw both sets of coordinates together.
scatter_plot(coord_real=pca_coord_real, coord_synth=pca_coord_syn)

In [None]:
#With label encoding

import pandas as pd
import numpy as np
from faker import Faker
import random
from DataSynthesizer1.DataDescriber import DataDescriber
from DataSynthesizer1.DataGenerator import DataGenerator
from synthesis.synthesizers.privbayes import PrivBayes
from datetime import datetime
import os

def generate_real_data(num_samples, liquorice):
    """Generate a toy survey dataset plus one hand-written individual and save it.

    Parameters
    ----------
    num_samples : int
        Number of randomly generated rows (the saved file has one extra row).
    liquorice : int
        Selects the hand-written individual appended as the last row:
        0 -> UK resident who does not like liquorice,
        1 -> Sweden resident who likes liquorice.

    Raises
    ------
    ValueError
        If ``liquorice`` is neither 0 nor 1.

    Side effects
    ------------
    Prints the resulting frame and writes it to ``sample_data_{liquorice}.csv``.
    """
    # Validate first: without this, an unsupported value only fails after all
    # rows were generated, with an unhelpful unbound-variable error.
    if liquorice not in (0, 1):
        raise ValueError(f"liquorice must be 0 or 1, got {liquorice!r}")

    name_gen = Faker()
    heights = np.around(list(np.random.normal(loc=170, scale=10, size=num_samples)), 2)

    # Generate random first and last names
    name_df = pd.DataFrame({
        'First_Name': [name_gen.first_name() for _ in range(num_samples)],
        'Last_Name': [name_gen.last_name() for _ in range(num_samples)]
    })
    height_df = pd.DataFrame({'Height': heights})
    basic_df = pd.concat([name_df, height_df], axis=1)

    # Country pool used by the correlation rules below
    countries = ["USA", "Canada", "Germany", "France", "Italy", "China", "Brazil", "Australia", "Japan", "UK", "Sweden", "Norway", "Denmark", "Finland"]

    # Ice cream preferences (default: uniform random choice)
    ice_creams = ["Vanilla", "Chocolate", "Strawberry", "Mint", "Pistachio", "Stracciatella"]

    # Generate one record per person
    data = []
    for _ in range(num_samples):
        person = {}

        # Assign country
        person["Country of Origin"] = random.choice(countries)

        # Assign favorite ice cream with correlation (Italy → Stracciatella preference)
        if person["Country of Origin"] == "Italy":
            person["Favorite Ice Cream"] = np.random.choice(ice_creams, p=[0.1, 0.1, 0.1, 0.1, 0.2, 0.4])
        else:
            person["Favorite Ice Cream"] = random.choice(ice_creams)

        # Assign liking for liquorice (Nordic countries → higher probability)
        if person["Country of Origin"] in ["Sweden", "Norway", "Denmark", "Finland"]:
            person["Likes Liquorice"] = np.random.choice([1, 0], p=[0.9, 0.1])  # 90% chance for Nordic countries
        else:
            person["Likes Liquorice"] = np.random.choice([1, 0], p=[0.2, 0.8])  # 20% for others

        # Assign number of times visited Italy (Poisson count, higher mean for the listed European countries)
        if person["Country of Origin"] in ["Germany", "France", "UK", "Sweden", "Norway", "Denmark", "Finland", "Italy"]:
            person["Times Visited Italy"] = np.random.poisson(2)  # Higher average visits
        else:
            person["Times Visited Italy"] = np.random.poisson(0.5)  # Lower average visits

        # First time in London: always 1 for UK residents, otherwise 1 with probability 0.2
        person["First Time in London"] = 1 if person["Country of Origin"] == "UK" else np.random.choice([1, 0], p=[0.2, 0.8])

        # Number of steps per day (normal draw, floored at 1000 to avoid negative steps)
        person["Steps per Day"] = max(1000, int(np.random.normal(8000, 3000)))

        data.append(person)

    # Create DataFrame
    df = pd.DataFrame(data)

    # ignore_index=True on axis=1 replaces the column labels with 0..8;
    # the final names are assigned further down.
    full_df = pd.concat([basic_df, df], axis=1, ignore_index=True)

    if liquorice == 0:
        # Sample row: UK resident who does NOT like liquorice
        indiv = [
            "James", "Smith", round(random.gauss(175, 10), 2), "UK", "Strawberry", 0, 2, 0, 7500
        ]
    else:
        # Sample row: Sweden resident who LIKES liquorice
        indiv = [
            "Lars", "Andersson", round(random.gauss(185, 10), 2), "Sweden", "Chocolate", 1, 3, 0, 9200
        ]
    indiv_df = pd.DataFrame([indiv], columns=full_df.columns)

    full_df = pd.concat([full_df, indiv_df], ignore_index=True)
    full_df.columns = ["First Name", "Last Name", "Height", "Nationality", "Favorite Icecream", "Like Liquorice", "Times Been to Italy", "First Time London", "Steps per Day"]
    print(full_df)
    # Save to CSV
    full_df.to_csv(f"sample_data_{liquorice}.csv", index=False)

def synthesize_no_bin(real_data, eps):
    """Fit PrivBayes on ``real_data`` with budget ``eps`` and return one sample.

    The sample is returned as a new DataFrame that keeps the sampled columns
    and is indexed 0..n-1, where n is the number of rows in ``real_data``.
    """
    synthesizer = PrivBayes(epsilon=eps, verbose=False)
    synthesizer.fit(real_data)

    sampled = synthesizer.sample()

    # Re-wrap the raw values so the result carries a plain positional index.
    row_index = range(real_data.shape[0])
    return pd.DataFrame(sampled.values, columns=sampled.columns, index=row_index)

def synthesize_bin(real_data, eps):
    """Synthesize ``real_data`` with DataSynthesizer after label-encoding text columns.

    The four text columns ('First Name', 'Last Name', 'Nationality',
    'Favorite Icecream') are label-encoded, the encoded table is described
    and sampled in correlated-attribute mode (k=2) with budget ``eps``, and
    the sampled codes are decoded back to the original labels.

    Parameters
    ----------
    real_data : pandas.DataFrame
        Frame with the nine survey columns. It is not modified.
    eps : float
        Privacy budget passed to the describer as ``epsilon``.

    Returns
    -------
    pandas.DataFrame
        Synthetic rows (``len(real_data)`` requested), numeric values rounded
        to 2 decimals, text columns decoded.
    """
    # Work on a converted copy of the column instead of writing into the caller's frame.
    heights = real_data['Height'].astype(float)

    fn_encoder = LabelEncoder()
    ln_encoder = LabelEncoder()
    fl_encoder = LabelEncoder()
    na_encoder = LabelEncoder()
    r_fn = fn_encoder.fit_transform(real_data['First Name'])
    r_ln = ln_encoder.fit_transform(real_data['Last Name'])
    r_na = na_encoder.fit_transform(real_data['Nationality'])
    r_fl = fl_encoder.fit_transform(real_data['Favorite Icecream'])
    all_labels = pd.DataFrame({'First Name':r_fn, 'Last Name': r_ln, 'Height': heights,'Nationality': r_na, 'Favorite Icecream':r_fl, 'Like Liquorice': real_data['Like Liquorice'], 'Times Been to Italy': real_data['Times Been to Italy'], 'First Time London': real_data['First Time London'], 'Steps per Day': real_data['Steps per Day']})

    # DataSynthesizer exchanges data through files; names are derived from the
    # current timestamp and removed again in the ``finally`` clause.
    timestamp = datetime.now().timestamp()
    input_path = f'{timestamp}.csv'
    description = f'{timestamp}.json'
    syn_path = f'syn_{timestamp}.csv'
    try:
        all_labels.to_csv(input_path, index=False)
        describer = DataDescriber()
        # The keys of all three dicts must equal the column names exactly
        # ('First Name', not 'First_Name'); a key that matches no column is
        # presumably never applied to that column -- TODO confirm in DataSynthesizer1.
        describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_path,
                                                                epsilon=eps,
                                                                k=2,
                                                                attribute_to_is_candidate_key={"First Name": False, "Last Name": False, "Height": False, "Nationality": False, "Favorite Icecream": False, "Like Liquorice": False, "Times Been to Italy": False, "First Time London": False, "Steps per Day": False},
                                                                attribute_to_datatype={"First Name": "Integer", "Last Name": "Integer", "Height": "Float", "Nationality": "Integer","Favorite Icecream": "Integer","Like Liquorice": "Integer","Times Been to Italy": "Integer", "First Time London": "Integer","Steps per Day": "Integer"},
                                                                attribute_to_is_categorical={'First Name': True, 'Last Name': True, 'Height': False, "Nationality": True, "Favorite Icecream": True,"Like Liquorice": True,"Times Been to Italy": True, "First Time London": True,"Steps per Day": False},
                                                                )
        describer.save_dataset_description_to_file(description)
        generator = DataGenerator()
        generator.generate_dataset_in_correlated_attribute_mode(n=len(all_labels), description_file=description, seed=timestamp)
        generator.save_synthetic_data(syn_path)
        result = pd.read_csv(syn_path, index_col=False).round(2)
    finally:
        # Clean up whatever was written, even when one of the steps above failed.
        for path in (input_path, description, syn_path):
            if os.path.exists(path):
                os.remove(path)

    # Decode the sampled integer codes back to the original text labels.
    result['First Name'] = fn_encoder.inverse_transform(result['First Name'].astype(int))
    result['Last Name'] = ln_encoder.inverse_transform(result['Last Name'].astype(int))
    result['Nationality'] = na_encoder.inverse_transform(result['Nationality'].astype(int))
    result['Favorite Icecream'] = fl_encoder.inverse_transform(result['Favorite Icecream'].astype(int))
    return result

    
#generate_real_data(1499, 1)
#generate_real_data(1499, 0)

if __name__ == '__main__':
    # Synthesize every real dataset variant at every privacy budget and store
    # the result under demo_syn_new/.
    eps_list = [0.02, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5]
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs("demo_syn_new", exist_ok=True)
    for liquorice in [0,1]:
        real_dir = f'sample_data_{liquorice}.csv'
        real_data = pd.read_csv(real_dir, index_col=False)
        for eps in eps_list:
            synthesize_bin(real_data, eps).to_csv(f"demo_syn_new/syn_no_{liquorice}_{eps}.csv", index=False)


In [None]:
from Metrics import AttributeInference as AIR
from Metrics import CGeneralizedCAP as GCAP
from Metrics import CZeroCAP as CZCAP
from Metrics import DCR
from Metrics import NNDR
from Metrics import Hidden_rate
from Metrics import NNAA
from Metrics import MemInf as MIR
from Metrics import Hitting_rate
from Metrics import MDCR
from sklearn.preprocessing import LabelEncoder
from Metrics import All_synthcity
import pandas as pd

def get_metric_results(real_data, syn_data):
    """Compute the full set of privacy metrics for one real/synthetic pair.

    The four text columns are label-encoded with encoders fitted on the
    concatenation of both frames, so real and synthetic rows share one code
    space. The synthcity suite and the attribute-inference metric receive
    the raw frames; every other metric receives the label-encoded frames.

    Parameters
    ----------
    real_data, syn_data : pandas.DataFrame
        Frames with the nine survey columns. Neither frame is modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'Metric' and 'Result', one row per metric, results rounded
        to 2 decimals.
    """
    all_data = pd.concat([real_data, syn_data])
    fn_encoder = LabelEncoder()
    ln_encoder = LabelEncoder()
    fl_encoder = LabelEncoder()
    na_encoder = LabelEncoder()
    r_fn = fn_encoder.fit_transform(all_data['First Name'])
    r_ln = ln_encoder.fit_transform(all_data['Last Name'])
    r_na = na_encoder.fit_transform(all_data['Nationality'])
    r_fl = fl_encoder.fit_transform(all_data['Favorite Icecream'])
    all_labels = pd.DataFrame({'First Name':r_fn, 'Last Name': r_ln, 'Height': all_data['Height'],'Nationality': r_na, 'Favorite Icecream':r_fl, 'Like Liquorice': all_data['Like Liquorice'], 'Times Been to Italy': all_data['Times Been to Italy'], 'First Time London': all_data['First Time London'], 'Steps per Day': all_data['Steps per Day']})
    real_labels = all_labels[:len(real_data)]
    # Everything after the real rows is synthetic, whatever the synthetic row count is.
    syn_labels = all_labels[len(real_data):]

    metrics = {
                    'sanity': ['common_rows_proportion', 'nearest_syn_neighbor_distance', 'close_values_probability', 'distant_values_probability'],
                    'stats': ['alpha_precision'],
                    'detection': ['detection_mlp'],
                    'privacy': ['identifiability_score'],
                }
    # The synthetic frame must be passed as _synthetic; comparing the real
    # data against itself would make every synthcity score meaningless.
    synthcity_results = All_synthcity.calculate_metric(args = None, _real_data=real_data, _synthetic=syn_data, _metrics=metrics)
    # Positions within the 'mean' column of the synthcity result table.
    # NOTE(review): the positions are taken as-is from the original code and
    # depend on the row order produced by All_synthcity -- confirm if `metrics` changes.
    crp = synthcity_results['mean'][1]
    nsnd = 1-synthcity_results['mean'][2]
    cvp = synthcity_results['mean'][3]
    dvp = 1-synthcity_results['mean'][4]
    auth = synthcity_results['mean'][10]
    mlp = synthcity_results['mean'][11]
    id_score = synthcity_results['mean'][12]
    air = AIR.calculate_metric(args = None, _real_data=real_data, _synthetic=syn_data)
    gcap = GCAP.calculate_metric(args = None, _real_data=real_labels, _synthetic=syn_labels)
    zcap = CZCAP.calculate_metric(args = None, _real_data=real_labels, _synthetic=syn_labels)
    mdcr = MDCR.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)
    hitR = Hitting_rate.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)
    mir = MIR.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)
    nnaa = NNAA.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)
    # NOTE: the author flagged the next three metrics as very slow to run.
    dcr = DCR.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)
    nndr = NNDR.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)
    hidd = Hidden_rate.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)

    # Order here must match `metric_list` below.
    priv_results = np.around([air, gcap, zcap,
                            mdcr, hitR, mir,
                            nnaa, crp, nsnd,
                            cvp, dvp, auth,
                            mlp, id_score,
                            dcr, nndr, hidd
                            ], 2).tolist()

    metric_list = ["Attribute Inference Risk", "GeneralizedCAP", "ZeroCAP",
                   "Median Distance to Closest Record", "Hitting Rate",
                   "Membership Inference Risk", "Nearest Neighbour Adversarial Accuracy",
                   "Common Row Proportion", "Nearest Synthetic Neighbour Distance",
                   "Close Value Probability", "Distant Value Probability",
                   "Authenticity", "DetectionMLP", "Identifiability Score"
                   , "Distance to Closest Record", "Nearest Neighbour Distance Ratio", "Hidden Rate"
                   ]

    results = pd.DataFrame({'Metric':metric_list, 'Result':priv_results})

    return results


# Compute the metric table for both synthetic variants of every dataset and
# privacy budget. Each result file carries the same variant tag ("no"/"bin")
# as the synthetic file it was computed from.
eps_list = [0.02, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5]
for liquorice in [0,1]:
    real_dir = f'sample_data_{liquorice}.csv'
    real_data = pd.read_csv(real_dir)
    for eps in eps_list:
        syn_no = pd.read_csv(f"demo_syn/syn_no_{liquorice}_{eps}.csv")
        syn_bin = pd.read_csv(f"demo_syn/syn_bin_{liquorice}_{eps}.csv")
        get_metric_results(real_data, syn_no).to_csv(f"metric_results/syn_no_{liquorice}_{eps}.csv")
        get_metric_results(real_data, syn_bin).to_csv(f"metric_results/syn_bin_{liquorice}_{eps}.csv")

In [None]:
from Metrics import All_synthcity
from Metrics import AttributeInference1 as AIR
from Metrics import CGeneralizedCAP as GCAP
from Metrics import CZeroCAP as CZCAP
from Metrics import DCR
from Metrics import NNDR
from Metrics import Hidden_rate
from Metrics import NNAA
from Metrics import MemInf as MIR
from Metrics import Hitting_rate
from Metrics import MDCR
from sklearn.preprocessing import LabelEncoder
#from Metrics import All_synthcity
import pandas as pd
import numpy as np
def get_metric_results(real_data, syn_data):
    """Compute the currently enabled privacy metrics for one real/synthetic pair.

    Only the hitting rate is enabled; it is computed on the raw frames. The
    label-encoded frames and the synthcity metric selection are still
    prepared so the commented-out metrics can be switched back on.

    Returns
    -------
    pandas.DataFrame
        Columns 'Metric' and 'Result', one row per enabled metric, results
        rounded to 2 decimals.
    """
    all_data = pd.concat([real_data, syn_data])
    fn_encoder = LabelEncoder()
    ln_encoder = LabelEncoder()
    fl_encoder = LabelEncoder()
    na_encoder = LabelEncoder()
    r_fn = fn_encoder.fit_transform(all_data['First Name'])
    r_ln = ln_encoder.fit_transform(all_data['Last Name'])
    r_na = na_encoder.fit_transform(all_data['Nationality'])
    r_fl = fl_encoder.fit_transform(all_data['Favorite Icecream'])
    all_labels = pd.DataFrame({'First Name':r_fn, 'Last Name': r_ln, 'Height': all_data['Height'],'Nationality': r_na, 'Favorite Icecream':r_fl, 'Like Liquorice': all_data['Like Liquorice'], 'Times Been to Italy': all_data['Times Been to Italy'], 'First Time London': all_data['First Time London'], 'Steps per Day': all_data['Steps per Day']})
    real_labels = all_labels[:len(real_data)]
    # Everything after the real rows is synthetic, whatever the synthetic row count is.
    syn_labels = all_labels[len(real_data):]

    metrics = {
                    'sanity': ['common_rows_proportion', 'nearest_syn_neighbor_distance', 'close_values_probability', 'distant_values_probability'],
                    'stats': ['alpha_precision'],
                    'detection': ['detection_mlp'],
                    'privacy': ['identifiability_score'],
                }
    #air = AIR.calculate_metric(args = None, _real_data=real_data, _synthetic=syn_data)
    # synthcity_results = All_synthcity.calculate_metric(args = None, _real_data=real_data, _synthetic=syn_data, _metrics=metrics)
    # crp = synthcity_results['mean'][1]
    # nsnd = 1-synthcity_results['mean'][2]
    # cvp = synthcity_results['mean'][3]
    # dvp = 1-synthcity_results['mean'][4]
    # auth = synthcity_results['mean'][10]
    # mlp = synthcity_results['mean'][11]
    # id_score = synthcity_results['mean'][12]
    #air = AIR.calculate_metric(args = None, _real_data=real_data, _synthetic=syn_data)
    # gcap = GCAP.calculate_metric(args = None, _real_data=real_labels, _synthetic=syn_labels)
    # zcap = CZCAP.calculate_metric(args = None, _real_data=real_labels, _synthetic=syn_labels)
    # mdcr = MDCR.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)
    # Unlike the disabled metrics, the hitting rate is given the raw (un-encoded) frames.
    hitR = Hitting_rate.calculate_metric(args=None, _real_data=real_data, _synthetic=syn_data)
    # mir = MIR.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)
    # nnaa = NNAA.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)
    #---These metrics simply take too long to run
    # dcr = DCR.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)
    # nndr = NNDR.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)
    # hidd = Hidden_rate.calculate_metric(args=None, _real_data=real_labels, _synthetic=syn_labels)

    # Order here must match `metric_list` below.
    priv_results = np.around([#air, gcap, zcap,
                            #mdcr,
                            hitR,
                            # mir,
                            #nnaa,
                            #crp, nsnd,
                            #cvp, dvp, auth,
                            #mlp, id_score,
                            #dcr, nndr, hidd
                            ], 2).tolist()

    metric_list = [#"Attribute Inference Risk", "GeneralizedCAP", "ZeroCAP", "Median Distance to Closest Record",
                   "Hitting Rate",
                   #"Membership Inference Risk", "Nearest Neighbour Adversarial Accuracy",
                   #"Common Row Proportion", "Nearest Synthetic Neighbour Distance",
                   #"Close Value Probability", "Distant Value Probability",
                   #"Authenticity", "DetectionMLP", "Identifiability Score"
                   #, "Distance to Closest Record", "Nearest Neighbour Distance Ratio", "Hidden Rate"
                   ]

    results = pd.DataFrame({'Metric':metric_list, 'Result':priv_results})

    return results
    

# Recompute the enabled metrics for the "no" synthetic files and store them
# under metric_full/.
eps_list = [0.02, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5]
for liquorice in [0, 1]:
    real_dir = f'sample_data_{liquorice}.csv'
    real_data = pd.read_csv(real_dir, index_col=False)
    for eps in eps_list:
        # The stored synthetic files carry a saved index column; drop it before scoring.
        syn_no = pd.read_csv(f"demo_syn/syn_no_{liquorice}_{eps}.csv", index_col=False)
        syn_no = syn_no.drop(columns=['Unnamed: 0'])
        metric_table = get_metric_results(real_data, syn_no)
        metric_table.to_csv(f"metric_full/syn_no_{liquorice}_{eps}.csv")

In [None]:
import pandas as pd
# Overwrite results in metric_results/ with the values from metric_full_new/
# wherever the metric names match; other rows keep their stored result.
eps_list = [0.02, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5]
for liquorice in [0, 1]:
    for eps in eps_list:
        # Load all four tables up front, before anything is written.
        syn_no = pd.read_csv(f"metric_full_new/syn_no_{liquorice}_{eps}.csv")
        syn_bin = pd.read_csv(f"metric_full_new/syn_bin_{liquorice}_{eps}.csv")
        syn_no_more = pd.read_csv(f"metric_results/syn_no_{liquorice}_{eps}.csv")
        syn_bin_more = pd.read_csv(f"metric_results/syn_bin_{liquorice}_{eps}.csv")

        # (table with new values, table to update, output path) -- "no" first, then "bin".
        merge_jobs = (
            (syn_no, syn_no_more, f"metric_results/syn_no_{liquorice}_{eps}.csv"),
            (syn_bin, syn_bin_more, f"metric_results/syn_bin_{liquorice}_{eps}.csv"),
        )
        for fresh, stored, out_path in merge_jobs:
            # Normalise metric names so the lookup is not defeated by stray whitespace.
            stored["Metric"] = stored["Metric"].str.strip()
            fresh["Metric"] = fresh["Metric"].str.strip()
            result_map = dict(zip(fresh["Metric"], fresh["Result"]))
            looked_up = stored["Metric"].map(result_map)
            # Metrics missing from `fresh` map to NaN and fall back to the stored value.
            stored["Result"] = looked_up.combine_first(stored["Result"])
            stored.drop(columns=['Unnamed: 0.1']).to_csv(out_path)
        


In [None]:
import pandas as pd
# Stack the metric_full/ table on top of the metric_full_new/ table and store
# the combined table under metric_results/.
eps_list = [0.02, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5]
for liquorice in [0,1]:
    for eps in eps_list:
        syn_no = pd.read_csv(f"metric_full_new/syn_no_{liquorice}_{eps}.csv")
        syn_no_more = pd.read_csv(f"metric_full/syn_no_{liquorice}_{eps}.csv")
        # Not named `all`: in a notebook that would shadow the builtin for
        # every cell executed afterwards.
        all_metrics = pd.concat([syn_no_more, syn_no], axis=0)
        all_metrics.to_csv(f"metric_results/syn_no_{liquorice}_{eps}.csv")

In [None]:
# Refresh metric_results/ with the values stored in metric_full/ for every
# metric name present in both tables.
eps_list = [0.02, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5]
for liquorice in [0, 1]:
    for eps in eps_list:
        df1 = pd.read_csv(f"metric_results/syn_no_{liquorice}_{eps}.csv")
        df2 = pd.read_csv(f"metric_full/syn_no_{liquorice}_{eps}.csv")

        # Strip whitespace from the metric names of both tables before matching.
        for frame in (df1, df2):
            frame["Metric"] = frame["Metric"].str.strip()

        # Lookup table built from df2: metric name -> result.
        result_map = dict(zip(df2["Metric"], df2["Result"]))

        # Names without a match map to NaN and keep df1's original result.
        replacement = df1["Metric"].map(result_map)
        df1["Result"] = replacement.combine_first(df1["Result"])

        df1.to_csv(f"metric_results/syn_no_{liquorice}_{eps}.csv")
        print(df1)

In [None]:
#without labelencoding
import pandas as pd
import numpy as np
from faker import Faker
import random
from DataSynthesizer1.DataDescriber import DataDescriber
from DataSynthesizer1.DataGenerator import DataGenerator
from synthesis.synthesizers.privbayes import PrivBayes
from datetime import datetime
import os

def generate_real_data(num_samples, liquorice):
    """Generate a toy survey dataset plus one hand-written individual and save it.

    Parameters
    ----------
    num_samples : int
        Number of randomly generated rows (the saved file has one extra row).
    liquorice : int
        Selects the hand-written individual appended as the last row:
        0 -> UK resident who does not like liquorice,
        1 -> Sweden resident who likes liquorice.

    Raises
    ------
    ValueError
        If ``liquorice`` is neither 0 nor 1.

    Side effects
    ------------
    Prints the resulting frame and writes it to ``sample_data_{liquorice}.csv``.
    """
    # Validate first: without this, an unsupported value only fails after all
    # rows were generated, with an unhelpful unbound-variable error.
    if liquorice not in (0, 1):
        raise ValueError(f"liquorice must be 0 or 1, got {liquorice!r}")

    name_gen = Faker()
    heights = np.around(list(np.random.normal(loc=170, scale=10, size=num_samples)), 2)

    # Generate random first and last names
    name_df = pd.DataFrame({
        'First_Name': [name_gen.first_name() for _ in range(num_samples)],
        'Last_Name': [name_gen.last_name() for _ in range(num_samples)]
    })
    height_df = pd.DataFrame({'Height': heights})
    basic_df = pd.concat([name_df, height_df], axis=1)

    # Country pool used by the correlation rules below
    countries = ["USA", "Canada", "Germany", "France", "Italy", "China", "Brazil", "Australia", "Japan", "UK", "Sweden", "Norway", "Denmark", "Finland"]

    # Ice cream preferences (default: uniform random choice)
    ice_creams = ["Vanilla", "Chocolate", "Strawberry", "Mint", "Pistachio", "Stracciatella"]

    # Generate one record per person
    data = []
    for _ in range(num_samples):
        person = {}

        # Assign country
        person["Country of Origin"] = random.choice(countries)

        # Assign favorite ice cream with correlation (Italy → Stracciatella preference)
        if person["Country of Origin"] == "Italy":
            person["Favorite Ice Cream"] = np.random.choice(ice_creams, p=[0.1, 0.1, 0.1, 0.1, 0.2, 0.4])
        else:
            person["Favorite Ice Cream"] = random.choice(ice_creams)

        # Assign liking for liquorice (Nordic countries → higher probability)
        if person["Country of Origin"] in ["Sweden", "Norway", "Denmark", "Finland"]:
            person["Likes Liquorice"] = np.random.choice([1, 0], p=[0.9, 0.1])  # 90% chance for Nordic countries
        else:
            person["Likes Liquorice"] = np.random.choice([1, 0], p=[0.2, 0.8])  # 20% for others

        # Assign number of times visited Italy (Poisson count, higher mean for the listed European countries)
        if person["Country of Origin"] in ["Germany", "France", "UK", "Sweden", "Norway", "Denmark", "Finland", "Italy"]:
            person["Times Visited Italy"] = np.random.poisson(2)  # Higher average visits
        else:
            person["Times Visited Italy"] = np.random.poisson(0.5)  # Lower average visits

        # First time in London: always 1 for UK residents, otherwise 1 with probability 0.2
        person["First Time in London"] = 1 if person["Country of Origin"] == "UK" else np.random.choice([1, 0], p=[0.2, 0.8])

        # Number of steps per day (normal draw, floored at 1000 to avoid negative steps)
        person["Steps per Day"] = max(1000, int(np.random.normal(8000, 3000)))

        data.append(person)

    # Create DataFrame
    df = pd.DataFrame(data)

    # ignore_index=True on axis=1 replaces the column labels with 0..8;
    # the final names are assigned further down.
    full_df = pd.concat([basic_df, df], axis=1, ignore_index=True)

    if liquorice == 0:
        # Sample row: UK resident who does NOT like liquorice
        indiv = [
            "James", "Smith", round(random.gauss(175, 10), 2), "UK", "Strawberry", 0, 2, 0, 7500
        ]
    else:
        # Sample row: Sweden resident who LIKES liquorice
        indiv = [
            "Lars", "Andersson", round(random.gauss(185, 10), 2), "Sweden", "Chocolate", 1, 3, 0, 9200
        ]
    indiv_df = pd.DataFrame([indiv], columns=full_df.columns)

    full_df = pd.concat([full_df, indiv_df], ignore_index=True)
    full_df.columns = ["First Name", "Last Name", "Height", "Nationality", "Favorite Icecream", "Like Liquorice", "Times Been to Italy", "First Time London", "Steps per Day"]
    print(full_df)
    # Save to CSV
    full_df.to_csv(f"sample_data_{liquorice}.csv", index=False)

def synthesize_no_bin(real_data, eps):
    """Return one PrivBayes sample fitted on ``real_data`` with budget ``eps``.

    The sampled values are wrapped in a fresh DataFrame that keeps the sampled
    column names and uses the positions 0..n-1 as index, n being the number
    of rows in ``real_data``.
    """
    model = PrivBayes(epsilon=eps, verbose=False)
    model.fit(real_data)
    drawn = model.sample()

    n_rows = real_data.shape[0]
    result = pd.DataFrame(drawn.values, columns=drawn.columns, index=range(n_rows))
    return result

def synthesize_bin(real_data, eps):
    """Synthesize ``real_data`` with DataSynthesizer, keeping text columns as strings.

    The frame is described and sampled in correlated-attribute mode (k=2)
    with budget ``eps``; text columns are declared as categorical strings.

    Parameters
    ----------
    real_data : pandas.DataFrame
        Frame with the nine survey columns.
    eps : float
        Privacy budget passed to the describer as ``epsilon``.

    Returns
    -------
    pandas.DataFrame
        Synthetic rows (``len(real_data)`` requested), numeric values rounded
        to 2 decimals.
    """
    # DataSynthesizer exchanges data through files; names are derived from the
    # current timestamp and removed again in the ``finally`` clause.
    timestamp = datetime.now().timestamp()
    input_path = f'{timestamp}.csv'
    description = f'{timestamp}.json'
    syn_path = f'syn_{timestamp}.csv'
    try:
        real_data.to_csv(input_path, index=False)
        describer = DataDescriber()
        describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_path,
                                                                epsilon=eps,
                                                                k=2,
                                                                attribute_to_is_candidate_key={"First Name": False, "Last Name": False, "Height": False, "Nationality": False, "Favorite Icecream": False, "Like Liquorice": False, "Times Been to Italy": False, "First Time London": False, "Steps per Day": False},
                                                                attribute_to_datatype={"First Name": "String", "Last Name": "String", "Height": "Float", "Nationality": "String","Favorite Icecream": "String","Like Liquorice": "Integer","Times Been to Italy": "Integer", "First Time London": "Integer","Steps per Day": "Integer"},
                                                                attribute_to_is_categorical={'First Name': True, 'Last Name': True, 'Height': False, "Nationality": True, "Favorite Icecream": True,"Like Liquorice": True,"Times Been to Italy": True, "First Time London": True,"Steps per Day": False},
                                                                )
        describer.save_dataset_description_to_file(description)
        generator = DataGenerator()
        generator.generate_dataset_in_correlated_attribute_mode(n=len(real_data), description_file=description, seed=timestamp)
        generator.save_synthetic_data(syn_path)
        result = pd.read_csv(syn_path, index_col=False).round(2)
    finally:
        # Clean up whatever was written, even when one of the steps above failed.
        for path in (input_path, description, syn_path):
            if os.path.exists(path):
                os.remove(path)

    return result

    
#generate_real_data(1499, 1)
#generate_real_data(1499, 0)

if __name__ == '__main__':
    # Synthesize every real dataset variant at every privacy budget and store
    # the result under demo_syn_new/.
    eps_list = [0.02, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5]
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs("demo_syn_new", exist_ok=True)
    for liquorice in [0,1]:
        real_dir = f'sample_data_{liquorice}.csv'
        real_data = pd.read_csv(real_dir, index_col=False)
        for eps in eps_list:
            synthesize_bin(real_data, eps).to_csv(f"demo_syn_new/syn_no_{liquorice}_{eps}.csv", index=False)

In [53]:
# Copy the "Hitting Rate" result from metric_full/ into the matching row of
# metric_results/ for every dataset variant and privacy budget.
eps_list = [0.02, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5]
for liquorice in [0, 1]:
    for eps in eps_list:
        df1 = pd.read_csv(f"metric_results/syn_no_{liquorice}_{eps}.csv")
        df2 = pd.read_csv(f"metric_full/syn_no_{liquorice}_{eps}.csv")

        # First "Hitting Rate" value found in the source table.
        source_rows = df2["Metric"] == "Hitting Rate"
        hitting_rate = df2.loc[source_rows, "Result"].values[0]

        # Write it into every "Hitting Rate" row of the target table.
        target_rows = df1["Metric"] == "Hitting Rate"
        df1.loc[target_rows, "Result"] = hitting_rate

        df1.to_csv(f"metric_results/syn_no_{liquorice}_{eps}.csv")