### Generate synthetic data using private marginals


In [22]:
# get dataset
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return ``v`` perturbed by zero-centred Laplace noise.

    The noise scale is ``sensitivity / epsilon``; one sample is drawn from
    NumPy's global random state per call.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return ``v`` perturbed by zero-mean Gaussian noise.

    The standard deviation is
    ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``; one sample is
    drawn from NumPy's global random state per call.
    """
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Return a list holding each element of ``vec`` plus Gaussian noise.

    Every element receives its own independent draw with standard deviation
    ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.
    """
    noisy = []
    for value in vec:
        sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
        noisy.append(value + np.random.normal(loc=0, scale=sigma))
    return noisy

def pct_error(orig, priv):
    """Return the absolute percentage error of ``priv`` relative to ``orig``.

    The denominator is ``|orig|`` so the result is non-negative even when the
    reference value is negative. For positive ``orig`` the result is the same
    as dividing by ``orig`` directly.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Load the North America bear killings CSV from GitHub into a DataFrame
# (needs network access when this cell runs).
bear = pd.read_csv('https://raw.githubusercontent.com/jbennett979/Data_Privacy_FP/refs/heads/main/north_america_bear_killings.csv')

In [26]:
# Tidy the dataset: normalise column labels and make Age numeric.
# Strip leading whitespace from every column label.
bear.columns = [label.lstrip() for label in bear.columns]
# Capitalise the age and gender column names in one rename.
bear = bear.rename(columns={'age': 'Age', 'gender': 'Gender'})

# Entries that cannot be parsed as numbers become NaN.
bear['Age'] = pd.to_numeric(bear['Age'], errors='coerce')

In [28]:
# calculates a differentially private one-way marginal for a given column
def dp_marginal(col, epsilon):
    """Return a noisy, normalised one-way marginal of ``bear[col]``.

    Each observed value's count gets independent Laplace noise with
    sensitivity 1 and the given ``epsilon``. Negative noisy counts are
    clipped to 0 and the result is normalised to sum to 1.

    If every noisy count is clipped to 0, the normalisation would be 0/0 and
    give an all-NaN marginal; a uniform distribution over the observed
    values is returned instead.

    NOTE(review): ``value_counts`` only lists values that occur in the data,
    so values with a true count of 0 get no noisy bin.
    """
    hist = bear[col].value_counts()
    noisy_hist = hist.apply(lambda x: laplace_mech(x, sensitivity=1, epsilon=epsilon))

    # clip once and reuse for both the numerator and the normaliser
    clipped = noisy_hist.clip(lower=0)
    total = clipped.sum()
    if total > 0:
        return clipped / total

    # no positive mass left after clipping: fall back to a uniform marginal
    uniform = np.full(len(clipped), 1.0 / max(len(clipped), 1))
    return pd.Series(uniform, index=clipped.index, name=clipped.name)

# Example: noisy one-way marginal of the Age column with epsilon = 0.1
dp_marginal('Age', 0.1)

Age
40    0.073666
24    0.015750
19    0.000000
37    0.000000
31    0.009343
        ...   
23    0.017472
43    0.000000
8     0.001103
76    0.000000
52    0.021447
Name: count, Length: 64, dtype: float64

In [30]:
# draws n synthetic rows from a marginal table
def gen_samples(n, marginal):
    """Sample ``n`` rows from ``marginal`` with replacement.

    Rows are weighted by the ``'probability'`` column, which is removed from
    the returned frame.
    """
    drawn = marginal.sample(n=n, weights='probability', replace=True)
    return drawn.drop(columns=['probability'])

In [31]:
# builds a 2-way marginal with differential privacy.
def dp_two_marginal(col1, col2, epsilon):
    """Return a noisy two-way marginal of ``bear[[col1, col2]]`` as a frame.

    Each observed (col1, col2) pair's count gets independent Laplace noise
    with sensitivity 1 and the given ``epsilon``. Negative noisy counts are
    clipped to 0 and the result is normalised to sum to 1. The returned
    DataFrame has columns ``col1``, ``col2`` and ``'probability'``.

    If every noisy count is clipped to 0, the normalisation would be 0/0 and
    give all-NaN probabilities; a uniform distribution over the observed
    pairs is returned instead.
    """
    # make 2D histogram
    hist = bear[[col1, col2]].value_counts()
    dp_hist = hist.apply(lambda x: laplace_mech(x, sensitivity=1, epsilon=epsilon))
    syn_rep = dp_hist.clip(lower=0)

    total = syn_rep.sum()
    if total > 0:
        marginal = syn_rep / total
    else:
        # no positive mass left after clipping: fall back to uniform weights
        uniform = np.full(len(syn_rep), 1.0 / max(len(syn_rep), 1))
        marginal = pd.Series(uniform, index=syn_rep.index)
    return marginal.to_frame(name='probability').reset_index()

In [86]:
def pick_b(epsilon, s):
    """Choose an upper clipping bound for ``s`` from noisy clipped sums.

    Candidate bounds 10, 20, ..., 90 are tried in increasing order and
    ``epsilon`` is divided evenly among them. For each candidate the sum of
    ``s`` clipped at that bound is released through the Laplace mechanism
    with sensitivity equal to the bound. The first candidate whose noisy sum
    is lower than the previous one is returned; if the noisy sum never
    decreases, the largest candidate is returned.

    NOTE(review): using the bound as the sensitivity presumes ``s`` is
    non-negative, since only the upper side is clipped. Confirm for inputs
    other than ages.
    """
    # b = 0 is not a candidate: clipping at 0 with sensitivity 0 adds no noise
    # and always yields exactly 0, so it would only take a share of epsilon
    # away from the informative candidates.
    bs = range(10, 100, 10)
    last_result = 0
    epsilon_i = epsilon / len(bs)

    for b in bs:
        clipped_sum = s.clip(upper=b).sum()
        result = laplace_mech(clipped_sum, sensitivity=b, epsilon=epsilon_i)
        if result < last_result:
            return b
        last_result = result
    return bs[-1]

def dp_sum_age(epsilon):
    """Return a differentially private sum of ``bear['Age']``.

    The budget is split in two: half goes to choosing the clipping bound
    with ``pick_b`` and the rest to the Laplace release of the clipped sum,
    so the function as a whole spends ``epsilon`` rather than
    ``2 * epsilon``. The chosen bound is printed.
    """
    eps_bound = epsilon / 2
    eps_sum = epsilon - eps_bound

    clipping_param = pick_b(eps_bound, bear['Age'])
    print("clipping parameter: ", clipping_param)
    age_dp = bear['Age'].clip(upper=clipping_param).sum()

    return laplace_mech(age_dp, sensitivity=clipping_param, epsilon=eps_sum)

# compute a differentially private average of age column
def dp_avg_age(epsilon):
    """Return a noisy average of ``bear['Age']``.

    Half of ``epsilon`` is passed to ``dp_sum_age`` for the noisy sum and the
    other half is used for a Laplace-noised count with sensitivity 1.
    """
    dp_sum = dp_sum_age(epsilon/2)
    # Count only non-missing ages: the sum skips NaN, so counting every row
    # (shape[0]) would inflate the denominator and bias the average low.
    n_ages = bear['Age'].count()
    dp_count = laplace_mech(n_ages, sensitivity=1, epsilon=epsilon/2)

    return dp_sum / dp_count

# TODO: remove actual querying of dataset
# NOTE: the true mean below is computed directly on the raw column with no
# noise; it is printed only to compare against the private estimate.
print("real avg: ", bear['Age'].mean())
# Differentially private estimate of the average age with epsilon = 1.0
dp_avg_age(1.0)

real avg:  36.09756097560975
clipping parameter:  40


np.float64(29.323384772134208)