## Generate synthetic data using private marginals
---

#### Import packages, define helper functions, and load and clean the dataset

In [450]:
# get dataset
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` perturbed by one draw of zero-centred Laplace noise.

    The noise scale is ``sensitivity / epsilon``.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` perturbed by one draw of zero-mean Gaussian noise.

    The standard deviation is
    ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.
    """
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Add independent zero-mean Gaussian noise to every element of `vec`.

    Each element gets its own draw with standard deviation
    ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.
    Returns a plain list with one noisy value per input element, in order.
    """
    def _perturb(entry):
        sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
        return entry + np.random.normal(loc=0, scale=sigma)

    return list(map(_perturb, vec))

def pct_error(orig, priv):
    """Return the absolute percent error of `priv` relative to `orig`.

    Computes ``|orig - priv| / |orig| * 100``. Works on scalars and,
    element-wise, on NumPy arrays / pandas Series.

    The denominator uses ``|orig|`` so the result is non-negative even when
    the reference value is negative. ``orig == 0`` is not special-cased; the
    division is left to NumPy (expect ``inf``/``nan`` there).
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Load the North America bear-killings CSV directly from GitHub (needs network access).
bear = pd.read_csv('https://raw.githubusercontent.com/jbennett979/Data_Privacy_FP/refs/heads/main/north_america_bear_killings.csv')

In [451]:
# Tidy the raw frame: strip leading whitespace from the column labels
bear.columns = [label.lstrip() for label in bear.columns]
# Capitalise 'age'/'gender' and relabel 'Type' as 'Captivity' in a single pass
# (rename matches exact labels only, so 'Type of bear' is left alone)
bear = bear.rename(columns={'age': 'Age', 'gender': 'Gender', 'Type': 'Captivity'})

# Ages that cannot be parsed as numbers become NaN
bear['Age'] = pd.to_numeric(bear['Age'], errors='coerce')
bear

Unnamed: 0,Name,Age,Gender,Date,Month,Year,Captivity,Location,Description,Type of bear,Hunter,Grizzly,Hikers,Only one killed
0,Erin Johnson,27,female,6/19/2017,6,2017,Wild,"Pogo mine, Alaska","Johnson, a contract employee for Pogo Mine, wa...",Black bear,0,0,0,1
1,Patrick Cooper,16,male,6/18/2017,6,2017,Wild,"Indian, Alaska",Cooper was chased and mauled by a bear while r...,Black bear,0,0,0,1
2,Daniel Ward O'Connor,27,male,5/10/2015,5,2015,Wild,"near Mackenzie, British Columbia",Ward was killed by a bear while he slept near ...,Black bear,0,0,0,1
3,Darsh Patel,22,male,9/21/2014,9,2014,Wild,"near West Milford, New Jersey",Patel was about to begin hiking with four frie...,Black bear,0,0,1,1
4,Lorna Weafer,36,female,5/7/2014,5,2014,Wild,"near Fort McMurray, Alberta","Weafer, a Suncor worker was attacked at the re...",Black bear,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,Baby Laird,1,,10/5/1908,10,1908,Captive,"Tucson, Arizona",After a bear escaped from a cage at Elysian Gr...,Black bear,0,0,0,1
160,John Dicht,18,male,11/24/1906,11,1906,Wild,"Elk County, Pennsylvania","Thinking the bear was dead, Dicht began skinni...",Black bear,0,0,0,1
161,Mary Porterfield,3,female,5/19/1901,5,1901,Wild,"Job, West Virginia",The children were gathering flowers near their...,Black bear,0,0,0,0
162,Wilie Porterfield,5,male,5/19/1901,5,1901,Wild,"Job, West Virginia",The children were gathering flowers near their...,Black bear,0,0,0,0


#### Functions to generate differentially private marginals

In [452]:
# calculates a differentially private one-way marginal for a given column
def dp_marginal(col, epsilon, data=None):
    """Estimate a differentially private one-way marginal for one column.

    Steps: count each observed value, add Laplace noise with scale
    ``1 / epsilon`` (sensitivity 1) to every count, clip negatives to zero,
    and normalise so the result sums to 1.

    Parameters
    ----------
    col : column label to summarise.
    epsilon : privacy parameter used for the Laplace noise scale.
    data : optional DataFrame to read from; defaults to the notebook-level
        `bear` frame so existing two-argument calls behave as before.

    Returns
    -------
    pandas.Series indexed by the observed values of `col`, holding
    non-negative weights that sum to 1 (empty if the column has no values).

    NOTE(review): only values that actually occur in the column get a cell,
    because the cells come from ``value_counts()``; the set of cells is
    therefore data-dependent. Confirm whether a fixed public domain should
    be used instead.
    """
    source = bear if data is None else data
    hist = source[col].value_counts()

    # One vectorised draw instead of a Python-level call per cell.
    noise = np.random.laplace(loc=0, scale=1 / epsilon, size=len(hist))
    clipped = (hist + noise).clip(lower=0)

    total = clipped.sum()
    if total > 0:
        return clipped / total
    if len(clipped) == 0:
        return clipped
    # Every noisy count was clipped to zero: dividing would give NaN for all
    # cells (and break weighted sampling later), so fall back to uniform.
    return pd.Series(1.0 / len(clipped), index=clipped.index, name=clipped.name)

In [453]:
# Compare the DP one-way marginal for Age with the true relative frequencies
age_marg = dp_marginal('Age', 1.0)
# NOTE(review): len() also counts rows whose Age is missing, while
# value_counts() skips them — confirm Age has no NaN after cleaning.
hist = bear['Age'].value_counts() / len(bear['Age'])
df = pd.merge(hist, age_marg, on='Age').rename(columns={'count_x': 'Real_Age', 'count_y': 'Synth_Age'})
# pct_error is element-wise, so it can take the two columns directly
df['% Error'] = pct_error(df['Real_Age'], df['Synth_Age'])
df

Unnamed: 0_level_0,Real_Age,Synth_Age,% Error
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
40,0.048780,0.035128,27.986604
24,0.036585,0.030010,17.972069
19,0.036585,0.039660,8.403882
37,0.036585,0.036831,0.671113
31,0.030488,0.014583,52.169053
...,...,...,...
23,0.006098,0.003908,35.901411
43,0.006098,0.005417,11.166265
8,0.006098,0.019899,226.343425
76,0.006098,0.016446,169.721283


In [454]:
# DP one-way marginal over the 'Type of bear' column
dp_marginal(col='Type of bear', epsilon=1.0)

Type of bear
Brown bear    0.476469
Black bear    0.471444
Polar Bear    0.052087
Name: count, dtype: float64

In [455]:
# DP one-way marginal over the 0/1 'Grizzly' indicator column
dp_marginal(col='Grizzly', epsilon=1.0)

Grizzly
0    0.832646
1    0.167354
Name: count, dtype: float64

* Start off by generating a synthetic representation of the age column
* From the age column, generate a two-way marginal for age and one other column:
    * Type of bear
    * Hikers
    * Hunter
* Compare the accuracies


#### Generate synthetic data from one-way marginal

In [456]:
def gen_samples(n, marginal):
    """Draw `n` rows from `marginal`, with replacement, weighted by 'probability'.

    `marginal` is a DataFrame with a 'probability' column. The returned frame
    holds the sampled rows without that column; each row keeps the index
    label it had in `marginal`, so labels may repeat.
    """
    drawn_rows = marginal.sample(n=n, weights='probability', replace=True)
    return drawn_rows.drop(columns=['probability'])

def dp_synthetic_data(cols, n, epsilon):
    """Generate `n` synthetic rows from independent DP one-way marginals.

    Parameters
    ----------
    cols : non-empty sequence of column labels to synthesise.
    n : number of synthetic rows to draw.
    epsilon : total privacy budget; it is split evenly, so each column's
        marginal is computed with ``epsilon / len(cols)``.

    Returns
    -------
    DataFrame with one column per entry of `cols` and a fresh 0..n-1 index,
    with rows ordered by the first column.

    Raises
    ------
    ValueError if `cols` is empty.
    """
    if len(cols) == 0:
        raise ValueError("cols must contain at least one column name")

    epsilon_i = epsilon / len(cols)
    data_dict = {}

    for col in cols:
        marg = dp_marginal(col, epsilon_i).to_frame(name='probability')
        drawn = marg.sample(n=n, replace=True, weights='probability')
        # The sampled values live in the index of the marginal table.
        # Do NOT sort each column on its own: that would line up the smallest
        # value of every column in the same row and make independently
        # sampled columns look perfectly rank-correlated.
        data_dict[col] = drawn.index.to_series().reset_index(drop=True)

    synth = pd.concat(data_dict, axis=1)
    # Order whole rows by the first column; for a single column this gives
    # the same sorted output as before.
    return synth.sort_values(cols[0]).reset_index(drop=True)

# Example: 20 synthetic ages drawn with a total budget of epsilon = 1.0
dp_synthetic_data(cols=['Age'], n=20, epsilon=1.0)

Unnamed: 0,Age
0,5
1,6
2,7
3,7
4,10
5,12
6,19
7,20
8,37
9,37


#### Generate two-way marginals

In [457]:
# Helpers for a noisy two-way histogram and marginal over any pair of columns (e.g. age and type of bear)
def two_way_bear_hist(col1, col2, epsilon, data=None):
    """Return a noisy, non-negative two-way histogram over (`col1`, `col2`).

    Counts each observed (col1, col2) pair, adds Laplace noise with scale
    ``1 / epsilon`` (sensitivity 1) to every count, and clips negatives to 0.

    Parameters
    ----------
    col1, col2 : column labels forming the pair.
    epsilon : privacy parameter used for the Laplace noise scale.
    data : optional DataFrame to read from; defaults to the notebook-level
        `bear` frame so existing three-argument calls behave as before.

    Returns
    -------
    pandas.Series of noisy counts indexed by (col1, col2).

    NOTE(review): only pairs that occur in the data get a cell, since the
    cells come from ``value_counts()`` — confirm whether the full cross
    product of a public domain should be used instead.
    """
    source = bear if data is None else data
    hist = source[[col1, col2]].value_counts()
    # One vectorised draw instead of a Python-level call per cell.
    noise = np.random.laplace(loc=0, scale=1 / epsilon, size=len(hist))
    return (hist + noise).clip(lower=0)

def two_marginal(col1, col2, epsilon):
    """Build a DP two-way marginal table for (`col1`, `col2`).

    Normalises the noisy histogram from `two_way_bear_hist` so the weights
    sum to 1 and returns it as a DataFrame with columns
    ``[col1, col2, 'probability']``, one row per cell.
    """
    noisy_counts = two_way_bear_hist(col1, col2, epsilon)
    total = noisy_counts.sum()

    if total > 0:
        marginal = noisy_counts / total
    elif len(noisy_counts) > 0:
        # All noisy counts were clipped to zero: 0/0 would fill the table with
        # NaN and make weighted sampling fail, so fall back to uniform.
        marginal = pd.Series(1.0 / len(noisy_counts), index=noisy_counts.index)
    else:
        marginal = noisy_counts

    return marginal.to_frame(name='probability').reset_index()

def gen_samples(n, marginal):
    """Draw `n` rows from `marginal`, with replacement, weighted by 'probability'.

    Note: this repeats the `gen_samples` definition from the one-way-marginal
    section of the notebook; the two are meant to behave identically.

    The returned frame holds the sampled rows without the 'probability'
    column; each row keeps the index label it had in `marginal`.
    """
    drawn_rows = marginal.sample(n=n, weights='probability', replace=True)
    return drawn_rows.drop(columns=['probability'])

In [458]:
# Noisy two-way marginal over (Type of bear, Age) pairs
marginal = two_marginal(col1='Type of bear', col2='Age', epsilon=1.0)
marginal

Unnamed: 0,Type of bear,Age,probability
0,Brown bear,40,0.046925
1,Black bear,24,0.030618
2,Brown bear,19,0.025929
3,Black bear,3,0.024549
4,Brown bear,38,0.023773
...,...,...,...
91,Polar Bear,29,0.008427
92,Polar Bear,31,0.000000
93,Polar Bear,43,0.003662
94,Polar Bear,46,0.005636


In [459]:
# Draw 15 synthetic (Hunter, Age) rows from a noisy two-way marginal
marginal = two_marginal(col1='Hunter', col2='Age', epsilon=1.0)
gen_samples(n=15, marginal=marginal)

Unnamed: 0,Hunter,Age
12,0,51
66,0,76
66,0,76
35,0,44
20,0,64
11,0,11
10,0,5
71,1,32
61,0,71
21,0,32


#### Synthetic data with two-way marginals

In [460]:
def dp_synthetic_data_two_marginal(col1, col2, n, epsilon):
    """Sample `n` synthetic (`col1`, `col2`) rows from a DP two-way marginal.

    Builds the marginal with `two_marginal`, draws rows with `gen_samples`,
    and returns them ordered by the marginal-table row label each draw came
    from (labels repeat when a cell is drawn more than once).
    """
    joint = two_marginal(col1, col2, epsilon)
    drawn = gen_samples(n, joint)
    return drawn.sort_index()

In [461]:
""" Synthetic data if the bear was a grizzly and the month the accident occurred """
# Draw 160 rows so the synthetic counts are on a scale comparable to the real ones
synth = dp_synthetic_data_two_marginal(col1='Grizzly', col2='Month', n=160, epsilon=1.0)

# Rows with Month == 7: synthetic sample vs. real data
synth_month = int((synth['Month'] == 7).sum())
real_month = int((bear['Month'] == 7).sum())

print(synth_month, real_month)
# Loose sanity band; the draw is random, so this can occasionally fail by chance
assert 15 <= synth_month <= 35


29 27


In [462]:
""" Synthetic data on type of bear and if they were in captivity """
# Draw 160 rows so the synthetic counts are on a scale comparable to the real ones
synth = dp_synthetic_data_two_marginal(col1='Type of bear', col2='Captivity', n=160, epsilon=1.0)

# Despite the *_wild names, both variables count the 'Captive' rows
synth_wild = int((synth['Captivity'] == 'Captive').sum())
real_wild = int((bear['Captivity'] == 'Captive').sum())

print(synth_wild, real_wild)
# Loose sanity band; the draw is random, so this can occasionally fail by chance
assert 15 <= synth_wild <= 40

30 28


In [463]:
""" Synthetic data on age and the Hunter flag """
# Draw 160 rows so the synthetic counts are on a scale comparable to the real ones
synth = dp_synthetic_data_two_marginal('Age', 'Hunter', 160, 1.0)

# Rows flagged Hunter == 1: synthetic sample vs. real data
synth_hunt = len(synth[synth['Hunter'] == 1])
real_hunt = len(bear[bear['Hunter'] == 1])

print(synth_hunt, real_hunt)
# Loose sanity band; the draw is random, so this can occasionally fail by chance
assert 5 <= synth_hunt <= 25

16 15
