In [20]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import random
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)
def laplace_mech_vec(vec, sensitivity, epsilon):
    return [v + np.random.laplace(loc=0, scale=sensitivity / epsilon) for v in vec]

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    return [v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)
            for v in vec]

In [21]:
# Load the data and libraries
import pandas as pd
import numpy as np

bear = pd.read_csv('https://raw.githubusercontent.com/jbennett979/Data_Privacy_FP/refs/heads/main/north_america_bear_killings.csv')

In [22]:
#removing space from the column name age\n",
bear.columns = [c.lstrip() for c in bear]
# capitalizing columns name for age and gende
bear= bear.rename(columns={'age': 'Age'})
bear= bear.rename(columns={'gender': 'Gender'})

In [23]:
print(bear.columns)
    # converting the age column who was a string to integers, while also adding error handling
bear['Age'] = pd.to_numeric(bear['Age'], errors = 'coerce')

Index(['Name', 'Age', 'Gender', 'Date', 'Month', 'Year', 'Type', 'Location',
       'Description', 'Type of bear', 'Hunter', 'Grizzly', 'Hikers',
       'Only one killed'],
      dtype='object')


In [24]:
#
def age_sum (epsilon):
   b = 75
   clipped_sum = bear['Age'].clip(upper=b).sum()

   return (laplace_mech(clipped_sum, sensitivity=b, epsilon=epsilon))

age_sum(1.0)

np.float64(5920.964456160357)

In [25]:
def dp_avg_age(epsilon):

    noisy_sum = age_sum(epsilon/2)

    noisy_count = laplace_mech(len(bear), sensitivity=1, epsilon=epsilon/2)

    mean = noisy_sum/noisy_count

    return mean

dp_avg_age(1.0)

np.float64(36.27869710376496)

In [26]:
 # starting with choosing a clip param using clip b param for age

def age_pick_b(epsilon):
   bs =  range(1,200,10)
   last_result = 0
   epsilon_i = epsilon / len(bs)

   for b in bs:
    #try b
    clipped_sum = bear['Age'].clip(upper=b).sum()
    result= (laplace_mech(clipped_sum, sensitivity=b, epsilon=epsilon_i))

    if result < last_result:
        return b
    else:
        last_result = result
    #raise Exception('No good clipping parameter found')

age_pick_b(1.0)

31

In [27]:
# differential private
def monthly_counts (epsilon):

   count= bear['Month'].value_counts()

   return (laplace_mech(count, sensitivity=1, epsilon=epsilon))

monthly_counts(1.0)

Month
8     26.741189
7     25.741189
10    22.741189
9     22.741189
6     18.741189
5     16.741189
11    10.741189
4      2.741189
1      1.741189
12     0.741189
2     -0.258811
3     -0.258811
Name: count, dtype: float64

In [28]:
# range queries
def range_query(df, col, a, b):
    return len(df[(df[col] >= a) & (df[col] < b)]) # this runs the range query

random_lower_bounds = [random.randint(1, 70) for _ in range(100)]
random_workload = [(lb, random.randint(lb, 100)) for lb in random_lower_bounds]
real_answers = [range_query(bear, 'Age', lb, ub) for (lb, ub) in random_workload]


In [29]:
def workload_laplace_vec(workload, epsilon):
    l1_sens = age_pick_b(1.0)
    workload_answer = [range_query(bear, 'Age', lb, ub) for (lb, ub) in random_workload]
    laplace_vec=  laplace_mech_vec(workload_answer, l1_sens, epsilon=epsilon)
    return laplace_vec
print('First 4 answers:', workload_laplace_vec(random_workload, 1.0)[:4])

First 4 answers: [64.42247837254149, -75.95353344228404, 11.13155934791767, -15.57407828983784]
