# Code used to generate the synthetic data and compute the metric results

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from synthesis.synthesizers.privbayes import PrivBayes

In [None]:
#Real dataset generation definition

def generate_real_data(num_samples, seed=42):
    """Generate a reproducible toy "real" dataset with built-in correlations.

    Every random source used here (``random``, ``np.random`` and the Faker
    instance) is seeded before the first value is drawn, so two calls with
    the same arguments draw from identically seeded generators.

    Args:
        num_samples: Number of rows (people) to generate.
        seed: Seed applied to ``random``, ``np.random`` and the Faker
            instance. Defaults to 42.

    Returns:
        A ``pandas.DataFrame`` with ``num_samples`` rows and the columns
        ``First Name``, ``Last Name``, ``Height``, ``Flavour``,
        ``Country of Origin``, ``Favorite Icecream``, ``Like Liquorice``,
        ``Times Visited Italy``, ``First Time London`` and ``Steps per Day``.
    """
    # Seed BEFORE any draw. Seeding only after the heights, flavours and
    # names have been generated would leave those columns different on
    # every run.
    random.seed(seed)
    np.random.seed(seed)
    name_gen = Faker()
    name_gen.seed_instance(seed)

    # --- Independent attributes (no correlation with the other columns) ---
    heights = np.around(np.random.normal(loc=170, scale=10, size=num_samples), 2)
    classic_icecreams = [
        "Vanilla", "Chocolate", "Strawberry", "Mint Chocolate Chip",
        "Cookies and Cream", "Rocky Road", "Butter Pecan", "Neapolitan",
        "Pistachio", "French Vanilla"
    ]
    fav_icecream = random.choices(classic_icecreams, k=num_samples)

    basic_df = pd.DataFrame({
        'First Name': [name_gen.first_name() for _ in range(num_samples)],
        'Last Name': [name_gen.last_name() for _ in range(num_samples)],
        'Height': heights,
        'Flavour': fav_icecream,
    })

    # --- Correlated attributes, all driven by the country of origin ---
    countries = ["USA", "Canada", "Germany", "France", "Italy", "China", "Brazil", "Australia", "Japan", "UK", "Sweden", "Norway", "Denmark", "Finland"]
    nordic = {"Sweden", "Norway", "Denmark", "Finland"}
    european = {"Germany", "France", "UK", "Italy"} | nordic
    ice_creams = ["Vanilla", "Chocolate", "Strawberry", "Mint", "Pistachio", "Stracciatella"]

    data = []
    for _ in range(num_samples):
        person = {}

        country = random.choice(countries)
        person["Country of Origin"] = country

        # Italy is skewed towards Stracciatella (40%) and Pistachio (20%);
        # every other country picks uniformly.
        if country == "Italy":
            person["Favorite Icecream"] = np.random.choice(ice_creams, p=[0.1, 0.1, 0.1, 0.1, 0.2, 0.4])
        else:
            person["Favorite Icecream"] = random.choice(ice_creams)

        # Liquorice: 90% of people from Nordic countries like it, 20% elsewhere.
        like_prob = 0.9 if country in nordic else 0.2
        person["Like Liquorice"] = np.random.choice([1, 0], p=[like_prob, 1 - like_prob])

        # Visits to Italy: Poisson with mean 2 for European countries, 0.5 otherwise.
        person["Times Visited Italy"] = np.random.poisson(2 if country in european else 0.5)

        # UK rows are always 1; everyone else is 1 with probability 0.2.
        # NOTE(review): for a "first time in London" flag one might expect
        # the opposite for UK residents -- confirm the intended semantics.
        if country == "UK":
            person["First Time London"] = 1
        else:
            person["First Time London"] = np.random.choice([1, 0], p=[0.2, 0.8])

        # Steps per day: normal(8000, 3000) truncated to int, floored at 1000
        # so the value can never be negative.
        person["Steps per Day"] = max(1000, int(np.random.normal(8000, 3000)))

        data.append(person)

    df = pd.DataFrame(data)

    # Column-wise join: independent attributes first, correlated ones after.
    full_df = pd.concat([basic_df, df], axis=1)

    return full_df

# Build the "real" dataset and persist it so the later cells can reload it
# from disk instead of regenerating it.
num_samples = 1500
real_data = generate_real_data(num_samples)
real_data.to_csv(
    "sample_data_1.csv",
    index=False,
)

In [None]:
#Synthesize dataset with PrivBayes
def synthesize_no_bin(real_data, eps):
    """Fit PrivBayes on ``real_data`` with ``epsilon=eps`` and return one sample.

    Nothing is written to disk here; the caller decides where to store the
    returned frame.

    Args:
        real_data: DataFrame the synthesizer is fitted on.
        eps: Value passed to ``PrivBayes`` as ``epsilon``.

    Returns:
        A new ``pandas.DataFrame`` holding the sampled values under the
        sampled column names, indexed ``0 .. len(real_data) - 1``.
    """
    synthesizer = PrivBayes(epsilon=eps, verbose=False)
    synthesizer.fit(real_data)
    sampled = synthesizer.sample()

    # Rebuild the frame with an explicit index sized after the real data.
    # Presumably sample() yields as many rows as were fitted -- the explicit
    # index relies on that.
    n_rows = real_data.shape[0]
    return pd.DataFrame(sampled.values, index=range(n_rows), columns=sampled.columns)

import os

# Generate one synthetic dataset per epsilon value and store each as a CSV.
eps_list = [0.02, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("demo_syn", exist_ok=True)

for eps in eps_list:
    # index=False keeps the meaningless 0..n-1 row index out of the file, so
    # readers do not get a spurious "Unnamed: 0" column. This matches how the
    # other CSVs in this notebook are written.
    synthesize_no_bin(real_data, eps).to_csv(f"demo_syn/syn_no_1_{eps}.csv", index=False)


In [None]:
#Get metric results
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from get_metric_results import get_metric_results

# Real data is loaded once; each synthetic dataset is compared against it.
df = pd.read_csv("sample_data_1.csv")
real_len = len(df)

epsilon = [1, 2.5, 5]
for eps in epsilon:
    # Load the synthetic data for this epsilon, discarding a stored row-index
    # column if the file happens to contain one.
    syn_df = pd.read_csv(f"demo_syn/syn_no_1_{eps}.csv", index_col=False)
    syn_df = syn_df.drop(columns=['Unnamed: 0'], errors='ignore')
    print(syn_df.head())

    # Stack real on top of synthetic so both are encoded with the same
    # category -> integer mapping.
    all_data = pd.concat([df, syn_df], ignore_index=True)
    cat_cols = all_data.select_dtypes(include=['object', 'bool']).columns
    num_cols = all_data.select_dtypes(exclude=['object', 'bool']).columns

    # Encoded columns: categorical ones first, numeric ones after.
    encoded_data = {}
    for col in cat_cols:
        column = all_data[col]
        if column.dtype == 'bool':
            # Booleans become 0/1 directly.
            encoded_data[col] = column.astype(int)
            continue
        # Everything else is label-encoded on its string representation.
        le = LabelEncoder()
        encoded_data[col] = le.fit_transform(column.astype(str))
    # Numeric columns are passed through untouched.
    encoded_data.update({col: all_data[col] for col in num_cols})

    # Split the jointly encoded frame back into its real and synthetic parts.
    all_labels = pd.DataFrame(encoded_data)
    real_labels, syn_labels = all_labels[:real_len], all_labels[real_len:]

    metric_results = get_metric_results(
        df,
        syn_df,
        real_labels,
        syn_labels,
        sensitive_attributes=['Like Liquorice'],
    )
    print("Metric Results:")
    print(metric_results)

    metric_results.to_csv(f"metric_results/syn_no_1_{eps}.csv", index=False)


# Restore the full epsilon list and reload the real data from disk
# (presumably for use by later cells).
eps_list = [0.02, 0.05, 0.1, 0.2, 0.5, 1, 2.5, 5]
real_data = pd.read_csv("sample_data_1.csv")


## Get Metric Results for TabSyn

The code below computes the metric results for the synthetic data generation (SDG) mechanism [TabSyn](https://doi.org/10.48550/arXiv.2310.09656) on the dataset provided by its authors, namely the [Shoppers](https://doi.org/10.24432/C5F88Q) dataset.

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from get_metric_results import get_metric_results

# Load the real data and the synthetic data to compare.
df = pd.read_csv("Data/real.csv", index_col=False)
syn_df = pd.read_csv("Data/tabsyn.csv", index_col=False)
real_len = len(df)

# Stack real on top of synthetic so both are encoded with the same
# category -> integer mapping.
all_data = pd.concat([df, syn_df], ignore_index=True)
cat_cols = all_data.select_dtypes(include=['object', 'bool']).columns
num_cols = all_data.select_dtypes(exclude=['object', 'bool']).columns

# Encoded columns: categorical ones first, numeric ones after.
encoded_data = {}
for col in cat_cols:
    column = all_data[col]
    if column.dtype == 'bool':
        # Booleans become 0/1 directly.
        encoded_data[col] = column.astype(int)
        continue
    # Everything else is label-encoded on its string representation.
    le = LabelEncoder()
    encoded_data[col] = le.fit_transform(column.astype(str))
# Numeric columns are passed through untouched.
encoded_data.update({col: all_data[col] for col in num_cols})

# Split the jointly encoded frame back into its real and synthetic parts.
all_labels = pd.DataFrame(encoded_data)
real_labels, syn_labels = all_labels[:real_len], all_labels[real_len:]

metric_results = get_metric_results(
    df,
    syn_df,
    real_labels,
    syn_labels,
    sensitive_attributes=['Revenue'],
)
print("Metric Results:")
print(metric_results)

metric_results.to_csv("metric_results/tabsyn_metric_results.csv", index=False)


## Get Metric Results for PrivBayes

The code below computes the metric results for the synthetic data generation (SDG) mechanism [PrivBayes](https://doi.org/10.1145/3134428) on the same dataset that was used for TabSyn, namely the [Shoppers](https://doi.org/10.24432/C5F88Q) dataset.

In [None]:
#Synthesize the dataset with PrivBayes
from synthesis.synthesizers.privbayes import PrivBayes

def synthesize_no_bin(real_data, eps):
    """Fit PrivBayes on ``real_data`` with ``epsilon=eps`` and return one sample.

    Nothing is written to disk here; the caller decides where to store the
    returned frame.

    Args:
        real_data: DataFrame the synthesizer is fitted on.
        eps: Value passed to ``PrivBayes`` as ``epsilon``.

    Returns:
        A new ``pandas.DataFrame`` holding the sampled values under the
        sampled column names, indexed ``0 .. len(real_data) - 1``.
    """
    synthesizer = PrivBayes(epsilon=eps, verbose=False)
    synthesizer.fit(real_data)
    sampled = synthesizer.sample()

    # Rebuild the frame with an explicit index sized after the real data.
    # Presumably sample() yields as many rows as were fitted -- the explicit
    # index relies on that.
    n_rows = real_data.shape[0]
    return pd.DataFrame(sampled.values, index=range(n_rows), columns=sampled.columns)

# Fit PrivBayes on the real data once per epsilon and store every synthetic
# sample in its own CSV (without the row index).
df = pd.read_csv('Data/real.csv')
epsilon = [0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.5, 5.0]
for eps in epsilon:
    synthesized_data = synthesize_no_bin(df, eps)
    synthesized_data.to_csv(
        f'Data/privbayes(e={eps}).csv',
        index=False,
    )

In [None]:
#Get metric results for PrivBayes
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from get_metric_results import get_metric_results

# Real data is loaded once; each synthetic dataset is compared against it.
df = pd.read_csv("Data/real.csv", index_col=False)
real_len = len(df)

epsilon = [0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.5, 5.0]
for eps in epsilon:
    syn_df = pd.read_csv(f"Data/privbayes(e={eps}).csv", index_col=False)

    # Stack real on top of synthetic so both are encoded with the same
    # category -> integer mapping.
    all_data = pd.concat([df, syn_df], ignore_index=True)
    cat_cols = all_data.select_dtypes(include=['object', 'bool']).columns
    num_cols = all_data.select_dtypes(exclude=['object', 'bool']).columns

    # Encoded columns: categorical ones first, numeric ones after.
    encoded_data = {}
    for col in cat_cols:
        column = all_data[col]
        if column.dtype == 'bool':
            # Booleans become 0/1 directly.
            encoded_data[col] = column.astype(int)
            continue
        # Everything else is label-encoded on its string representation.
        le = LabelEncoder()
        encoded_data[col] = le.fit_transform(column.astype(str))
    # Numeric columns are passed through untouched.
    encoded_data.update({col: all_data[col] for col in num_cols})

    # Split the jointly encoded frame back into its real and synthetic parts.
    all_labels = pd.DataFrame(encoded_data)
    real_labels, syn_labels = all_labels[:real_len], all_labels[real_len:]

    metric_results = get_metric_results(
        df,
        syn_df,
        real_labels,
        syn_labels,
        sensitive_attributes=['Revenue'],
    )
    print("Metric Results:")
    print(metric_results)

    metric_results.to_csv(f"metric_results/privbayes(e={eps})_metric_results.csv", index=False)