## Generate synthetic data using private marginals
---

#### Clean and get dataset and include important functions and packages

In [69]:
# get dataset
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    return [v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)
            for v in vec]

def pct_error(orig, priv):
    return np.abs(orig - priv)/orig * 100.0

bear = pd.read_csv('https://raw.githubusercontent.com/jbennett979/Data_Privacy_FP/refs/heads/main/north_america_bear_killings.csv')

In [70]:
# Cleaning up the dataset
bear.columns = [c.lstrip() for c in bear]
# capitalizing columns name for age and gender
bear= bear.rename(columns={'age': 'Age'})
bear= bear.rename(columns={'gender': 'Gender'})

bear['Age'] = pd.to_numeric(bear['Age'], errors = 'coerce')
bear

Unnamed: 0,Name,Age,Gender,Date,Month,Year,Type,Location,Description,Type of bear,Hunter,Grizzly,Hikers,Only one killed
0,Erin Johnson,27,female,6/19/2017,6,2017,Wild,"Pogo mine, Alaska","Johnson, a contract employee for Pogo Mine, wa...",Black bear,0,0,0,1
1,Patrick Cooper,16,male,6/18/2017,6,2017,Wild,"Indian, Alaska",Cooper was chased and mauled by a bear while r...,Black bear,0,0,0,1
2,Daniel Ward O'Connor,27,male,5/10/2015,5,2015,Wild,"near Mackenzie, British Columbia",Ward was killed by a bear while he slept near ...,Black bear,0,0,0,1
3,Darsh Patel,22,male,9/21/2014,9,2014,Wild,"near West Milford, New Jersey",Patel was about to begin hiking with four frie...,Black bear,0,0,1,1
4,Lorna Weafer,36,female,5/7/2014,5,2014,Wild,"near Fort McMurray, Alberta","Weafer, a Suncor worker was attacked at the re...",Black bear,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,Baby Laird,1,,10/5/1908,10,1908,Captive,"Tucson, Arizona",After a bear escaped from a cage at Elysian Gr...,Black bear,0,0,0,1
160,John Dicht,18,male,11/24/1906,11,1906,Wild,"Elk County, Pennsylvania","Thinking the bear was dead, Dicht began skinni...",Black bear,0,0,0,1
161,Mary Porterfield,3,female,5/19/1901,5,1901,Wild,"Job, West Virginia",The children were gathering flowers near their...,Black bear,0,0,0,0
162,Wilie Porterfield,5,male,5/19/1901,5,1901,Wild,"Job, West Virginia",The children were gathering flowers near their...,Black bear,0,0,0,0


#### Finding a differentially private average from the data set in order to safely compare outputs to the data

In [71]:
# Differentially private value to compare synthetic data against
def pick_b(epsilon, s):
    bs = range(0, 100, 10)
    last_result = 0
    epsilon_i = epsilon / len(bs)

    for b in bs:
        clipped_sum = s.clip(upper=b).sum()
        result = laplace_mech(clipped_sum, sensitivity=b, epsilon=epsilon_i)
        if result < last_result:
            return b
        else:
            last_result = result
    return bs[-1]

def dp_sum_age(epsilon):
    clipping_param = pick_b(epsilon, bear['Age'])
    # print("clipping parameter: ", clipping_param)
    age_dp = bear['Age'].clip(upper=clipping_param).sum()

    return laplace_mech(age_dp, sensitivity=clipping_param, epsilon=epsilon)

# compute a differentially private average of age column
def dp_avg_age(epsilon):
    dp_sum = dp_sum_age(epsilon/2)
    age_dp = bear['Age'].shape[0]
    dp_count = laplace_mech(age_dp, sensitivity=1, epsilon=epsilon/2)

    dp_avg = dp_sum/dp_count
    return dp_avg

# TODO: remove actual querying of dataset
print("real avg: ", bear['Age'].mean())
print("dp_avg: ", dp_avg_age(1.0))

print('Percent error: {}'.format(pct_error( bear['Age'].mean(), dp_avg_age(1.0))))


real avg:  36.09756097560975
dp_avg:  33.287856510876544
Percent error: 6.0337827302018345


#### Functions to generate differentially private marginals

In [72]:
# calculates a differentially private one-way marginal for a given column
def dp_marginal(col, epsilon):
    ### BEGIN SOLUTION
    hist = bear[col].value_counts()
    dp_hist = hist.apply(lambda x: laplace_mech(x, 1, epsilon))
    clipped = dp_hist.clip(lower=0)
    marginal = clipped / clipped.sum()
    return marginal

* Start off by generating a synthetic representation of the age column
* From the age column, generate a two way marginal for age and something else
    * Type of bear
    * Hiker
    * Hunter
* Compare the accuracies


In [76]:
# Generate a two way synthetic rep for age and type of bear
def two_way_bear_hist(col1, col2, epsilon):
    # goal: 2-dimensional histogram
    hist = bear[[col1, col2]].value_counts()
    dp_hist = hist.apply(lambda x: laplace_mech(x, sensitivity=1, epsilon=epsilon))
    return dp_hist.clip(lower=0)

age_type = two_way_bear_hist('Age', 'Year', 1.0)
print(age_type)
type_age = two_way_bear_hist('Year', 'Age', 1.0)
print(type_age)

Age  Year
40   1998    2.334363
53   1999    0.949446
40   1992    2.721106
37   2018    1.311569
19   1980    0.694373
               ...   
74   2009    0.710810
76   1934    1.102089
77   1995    5.542383
     2002    0.170565
93   2001    1.458748
Name: count, Length: 158, dtype: float64
Year  Age
2018  37     3.510982
1999  53     1.890983
1998  40     1.923136
1992  40     1.151846
1980  19     2.132767
               ...   
2017  16     0.327681
2018  1      2.423375
      18     0.000000
      31     0.696084
      44     0.000000
Name: count, Length: 158, dtype: float64


In [81]:
def two_marginal(col1, col2, epsilon):
    # 1. generate the synthetic representation
    syn_rep = two_way_bear_hist(col1, col2, epsilon)
    # 2. normalize to get a probability distribution
    marginal = syn_rep / syn_rep.sum()
    return marginal.to_frame(name='probability').reset_index()

def gen_samples(n, marginal):
    samples = marginal.sample(n=n, replace=True, weights='probability')
    return samples.drop(columns='probability')

marginal = two_marginal('Hunter','Age', 1.0)
gen_samples(15, marginal)


Unnamed: 0,Hunter,Age
25,0,6
68,1,31
38,0,56
74,1,51
8,0,1
61,0,71
28,0,68
49,0,20
64,0,93
31,0,27


In [None]:
|