# CS 3110/5110: Data Privacy
## In-Class Exercise, week of 10/14/2024

In [None]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` perturbed by zero-centred Laplace noise.

    The noise is one draw from ``np.random.laplace`` whose scale is
    ``sensitivity / epsilon``.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` perturbed by zero-mean Gaussian noise.

    The standard deviation of the noise is
    ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``; the noise is one
    draw from ``np.random.normal``.
    """
    log_term = np.log(1.25/delta)
    multiplier = np.sqrt(2*log_term)
    sigma = sensitivity * multiplier / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def pct_error(orig, priv):
    """Return the absolute percent error of `priv` relative to `orig`.

    Computes ``|orig - priv| / |orig| * 100``. The denominator uses the
    magnitude of `orig` so the result is non-negative even when the true
    value is negative (dividing by the signed value would flip the sign of
    the error). For positive `orig` the result is unchanged.

    `orig` equal to zero is not special-cased; the division by zero is left
    to the numeric types involved.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Read the course's adult_with_pii.csv file from GitHub into a pandas
# DataFrame; the cells below query its 'Age' and 'Education' columns.
adult = pd.read_csv('https://github.com/jnear/cs3110-data-privacy/raw/main/homework/adult_with_pii.csv')

In [None]:
def gaussian_mech_RDP(val, sensitivity, alpha, epsilon):
    """Return `val` perturbed by zero-mean Gaussian noise sized for RDP.

    The noise variance is ``sensitivity**2 * alpha / (2 * epsilon)``; one
    sample with the corresponding standard deviation is drawn from
    ``np.random.normal`` and added to `val`.
    """
    variance = (sensitivity**2 * alpha) / (2 * epsilon)
    noise_std = np.sqrt(variance)
    noise = np.random.normal(loc=0, scale=noise_std)
    return val + noise

## Question 1

Write a function to calculate the L1 **local sensitivity** of the following mean query.

In [None]:
def mean_query():
    """Return the mean clipped age of rows whose 'Education' is 'Bachelors'.

    Ages are read from the global `adult` DataFrame and clipped to the
    range [0, 125] before averaging.
    """
    upper_bound = 125
    is_bachelors = adult['Education'] == 'Bachelors'
    bachelors_ages = adult['Age'][is_bachelors]
    clipped_ages = bachelors_ages.clip(lower=0, upper=upper_bound)
    return clipped_ages.mean()

mean_query()

In [None]:
def mean_query_local_sensitivity():
    """Return the L1 local sensitivity of `mean_query` (Question 1).

    Exercise stub: the body is intentionally left unimplemented and raises
    NotImplementedError until a solution replaces it.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Evaluate the function; this raises NotImplementedError until the stub
# above is filled in.
mean_query_local_sensitivity()

In [None]:
# TEST CASE
# Compare the magnitude of the difference: without abs(), any result *below*
# the expected value (even a wildly wrong or negative one) would pass.
assert abs(mean_query_local_sensitivity() - 0.02333831217326363) < 0.0001

## Question 2

Why can't we use the local sensitivity from question 1 directly to achieve differential privacy for `mean_query`?

YOUR ANSWER HERE

## Question 3

Implement an upper bound on $A(f, x, k)$, the local sensitivity of a mean query at distance $k$.

In [None]:
def ls_mean_at_distance(df, b, k):
    """Return an upper bound on A(f, x, k) for a mean query (Question 3).

    A(f, x, k) is the local sensitivity of the mean query at distance `k`.

    Parameters:
        df: the data being averaged; the test cell passes ``adult['Age']``.
        b: presumably the upper clipping bound on the values (the test cell
           passes 125) -- confirm against the course notes.
        k: the distance at which the local sensitivity is bounded.

    Exercise stub: raises NotImplementedError until a solution is written.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# TEST CASE
# Compare floats with a tight relative tolerance rather than ==, so that an
# algebraically equivalent implementation that rounds differently in the
# last bits is not rejected.
assert np.isclose(ls_mean_at_distance(adult['Age'], 125, 500), 0.0038988178784192633, rtol=1e-9, atol=0.0)
assert np.isclose(ls_mean_at_distance(adult['Age'], 125, 5000), 0.004535394216465295, rtol=1e-9, atol=0.0)

## Question 4

Implement an upper bound on $D(f, x, s)$, the distance from $x$ to a database where the local sensitivity of the mean query is greater than or equal to $s$.

In [None]:
def dist_to_high_ls_mean(df, b, s):
    """Return an upper bound on D(f, x, s) for a mean query (Question 4).

    D(f, x, s) is the distance from x to a database where the local
    sensitivity of the mean query is greater than or equal to `s`.

    Parameters:
        df: the data being averaged; the test cell passes ``adult['Age']``.
        b: presumably the upper clipping bound on the values (the test cell
           passes 125) -- confirm against the course notes.
        s: the local-sensitivity threshold.

    Exercise stub: raises NotImplementedError until a solution is written.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# TEST CASE
# Expected distances for the Age column with b = 125: the expected distance
# is smaller for smaller thresholds s, and is 0 for s = 0.001.
assert dist_to_high_ls_mean(adult['Age'], 125, 0.1) == 31311 
assert dist_to_high_ls_mean(adult['Age'], 125, 0.01) == 20061
assert dist_to_high_ls_mean(adult['Age'], 125, 0.001) == 0

## Question 5

Use the Propose-Test-Release (PTR) framework to calculate the mean age in the adult dataset with differential privacy.

In [None]:
def mean_ptr(epsilon, delta):
    """Return a differentially private mean age via Propose-Test-Release.

    Question 5: use the PTR framework to calculate the mean age in the
    adult dataset with differential privacy.

    Parameters:
        epsilon: privacy parameter epsilon.
        delta: privacy parameter delta.

    Exercise stub: raises NotImplementedError until a solution is written.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# NOTE(review): 10e-5 evaluates to 1e-4, not 10**-5 -- confirm the intended delta.
mean_ptr(1.0, 10e-5)

In [None]:
# TEST CASE

# Run mean_ptr 20 times and require the average percent error against the
# true (non-private) mean age to be below 1%.
# NOTE(review): 10e-5 evaluates to 1e-4, not 10**-5 -- confirm the intended delta.
true_answer = adult['Age'].mean()
trials = [mean_ptr(1.0, 10e-5) for _ in range(20)]
errors = [pct_error(true_answer, x) for x in trials]
assert np.mean(errors) < 1.0

## Question 6

Use the sample-and-aggregate framework to calculate the average of ages in the adult dataset with differential privacy.

In [None]:
def saa_avg_age(k, epsilon):
    """Return a differentially private average age via sample-and-aggregate.

    Question 6: use the sample-and-aggregate framework to calculate the
    average of ages in the adult dataset with differential privacy.

    Parameters:
        k: presumably the number of chunks the data is split into -- confirm
           against the course notes (Question 7 asks about its impact).
        epsilon: privacy parameter epsilon.

    Exercise stub: raises NotImplementedError until a solution is written.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Example invocation with k = 600 and epsilon = 1.
saa_avg_age(600, 1)    

## Question 7

What is the impact of the parameter `k`?

YOUR ANSWER HERE

## Question 8

Calculate the empirical *cumulative distribution function* of ages in the adult dataset. Use the RDP Gaussian mechanism.

In [None]:
def calculate_cdf():
    """Return the empirical CDF of ages in the adult dataset (Question 8).

    The question asks for the RDP Gaussian mechanism to be used; `alpha` and
    `epsilon` below are the parameters set up for it. The plotting call after
    this function pairs the result with ``range(20, 80)``, so presumably one
    value per age from 20 through 79 is expected -- confirm.

    Exercise stub: raises NotImplementedError until a solution is written.
    """
    alpha = 5
    epsilon = 0.0001
    
    # YOUR CODE HERE
    raise NotImplementedError()

# Plot the returned CDF values against ages 20..79.
cdf_vals = calculate_cdf()
plt.plot(range(20, 80), cdf_vals);

## Question 9

What is the *total privacy cost* in RDP of your solution above?

YOUR ANSWER HERE

## Question 10

What is the *total privacy cost* in $(\epsilon, \delta)$-differential privacy of your solution above, for $\delta = 10^{-5}$?

YOUR ANSWER HERE

## Question 11

Repeat question 8, but for zero-concentrated differential privacy.

In [None]:
def gaussian_mech_zCDP(val, sensitivity, rho):
    """Gaussian mechanism for zero-concentrated differential privacy.

    Parameters:
        val: the value to be released.
        sensitivity: sensitivity of the query that produced `val`.
        rho: the zCDP privacy parameter.

    Exercise stub: raises NotImplementedError until a solution is written.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

def calculate_cdf_zcdp():
    """Return the empirical CDF of ages under zero-concentrated DP.

    `rho` below is the zCDP parameter set up for the exercise. The plotting
    call after this function pairs the result with ``range(20, 80)``, so
    presumably one value per age from 20 through 79 is expected -- confirm.

    Exercise stub: raises NotImplementedError until a solution is written.
    """
    rho = 0.00001
    
    # YOUR CODE HERE
    raise NotImplementedError()

# Plot the returned CDF values against ages 20..79.
cdf_vals = calculate_cdf_zcdp()
plt.plot(range(20, 80), cdf_vals);