# CS 3110/5110: Data Privacy
## In-Class Exercise, 10/21/2024

In [None]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return ``v`` perturbed with zero-centred Laplace noise.

    Args:
        v: The true query answer to be protected.
        sensitivity: Sensitivity of the query that produced ``v``.
        epsilon: Privacy parameter; the noise scale is
            ``sensitivity / epsilon``, so smaller values mean more noise.

    Returns:
        ``v`` plus one sample from ``np.random.laplace``.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return ``v`` perturbed with zero-mean Gaussian noise.

    The standard deviation is
    ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.

    Args:
        v: The true query answer to be protected.
        sensitivity: Sensitivity of the query that produced ``v``.
        epsilon: Privacy parameter (smaller means more noise).
        delta: Failure-probability parameter (smaller means more noise).

    Returns:
        ``v`` plus one sample from ``np.random.normal``.
    """
    delta_factor = np.sqrt(2*np.log(1.25/delta))
    sigma = sensitivity * delta_factor / epsilon
    return v + np.random.normal(loc=0, scale=sigma)

def pct_error(orig, priv):
    """Return the percent error of ``priv`` relative to ``orig``.

    The error is normalised by the magnitude of ``orig`` so the result is
    non-negative even when the true value is negative. For positive ``orig``
    the result is the same as dividing by ``orig`` directly.

    Args:
        orig: The true (non-private) value, scalar or NumPy array.
        priv: The noisy (private) value, same shape as ``orig``.

    Returns:
        ``|orig - priv| / |orig| * 100``. A zero ``orig`` is a division by
        zero, which NumPy reports as ``inf``/``nan`` with a warning rather
        than raising.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Load `adult_with_pii.csv` from the course's GitHub repository into a
# DataFrame. This is fetched over the network each time the cell runs.
adult = pd.read_csv('https://github.com/jnear/cs3110-data-privacy/raw/main/homework/adult_with_pii.csv')

In [None]:
def gaussian_mech_RDP(val, sensitivity, alpha, epsilon):
    """Return ``val`` with Gaussian noise calibrated from RDP parameters.

    The noise variance is ``sensitivity**2 * alpha / (2 * epsilon)``.

    Args:
        val: The true query answer to be protected.
        sensitivity: Sensitivity of the query that produced ``val``.
        alpha: Order of the Renyi divergence.
        epsilon: RDP privacy parameter at order ``alpha``.

    Returns:
        ``val`` plus one sample from ``np.random.normal``.
    """
    variance = (sensitivity**2 * alpha) / (2 * epsilon)
    noise = np.random.normal(loc=0, scale=np.sqrt(variance))
    return val + noise

## Question 1

Calculate the empirical *cumulative distribution function* of ages in the adult dataset. Use the RDP Gaussian mechanism.

In [None]:
def calculate_cdf():
    """Exercise stub (Question 1): private CDF of ages in the adult dataset.

    To be implemented with the RDP Gaussian mechanism, using the ``alpha``
    and ``epsilon`` values set below. The caller plots the result against
    ``range(20, 80)``, so it is expected to hold one value per age in that
    range.
    """
    alpha = 5
    epsilon = 0.0001
    
    # YOUR CODE HERE
    raise NotImplementedError()

cdf_vals = calculate_cdf()
plt.plot(range(20, 80), cdf_vals);

## Question 2

What is the *total privacy cost* in RDP of your solution above?

YOUR ANSWER HERE

## Question 3

What is the *total privacy cost* in $(\epsilon, \delta)$-differential privacy of your solution above, for $\delta = 10^{-5}$?

YOUR ANSWER HERE

## Question 4

Repeat the above, but for zero-concentrated differential privacy.

In [None]:
def gaussian_mech_zCDP(val, sensitivity, rho):
    """Exercise stub (Question 4): Gaussian mechanism under zCDP.

    Intended to return ``val`` plus Gaussian noise calibrated from
    ``sensitivity`` and the zero-concentrated DP parameter ``rho``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

def calculate_cdf_zcdp():
    """Exercise stub (Question 4): private CDF of ages under zCDP.

    Repeats the Question 1 computation using zero-concentrated differential
    privacy with the ``rho`` set below. The caller plots the result against
    ``range(20, 80)``, so it is expected to hold one value per age in that
    range.
    """
    rho = 0.00001
    
    # YOUR CODE HERE
    raise NotImplementedError()

cdf_vals = calculate_cdf_zcdp()
plt.plot(range(20, 80), cdf_vals);

## Question 5

The code below defines a finite set of options for marital status. Define a *scoring function* that returns high scores for common marital statuses, and low scores for uncommon ones (e.g. the score could be the number of people with that status).

In [None]:
# Finite set of marital-status values that `score` is evaluated on and that
# `report_noisy_max` selects from.
options = ['Never-married', 'Married-civ-spouse', 'Divorced',
           'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
           'Widowed']

def score(option):
    """Exercise stub (Question 5): score one marital-status option.

    Should return a high score for common marital statuses and a low score
    for uncommon ones, e.g. the number of people with that status. The test
    cell expects ``score('Never-married') == 10683``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

score('Never-married')

In [None]:
# TEST CASE
assert score('Never-married') == 10683

## Question 6

Implement `report_noisy_max` using the Laplace mechanism. `report_noisy_max` should return the element of the set `R` that approximately maximizes the score function. It should *not* return the score itself.

In [None]:
def report_noisy_max(R, score, sensitivity, epsilon):
    """Exercise stub (Question 6): report-noisy-max via the Laplace mechanism.

    Should return the element of ``R`` that approximately maximizes
    ``score``. It returns the element itself, not its score.

    Args:
        R: Collection of candidate outputs.
        score: Function mapping a candidate to its score.
        sensitivity: Sensitivity of ``score``.
        epsilon: Privacy parameter.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

report_noisy_max(options, score, 1, 1)

In [None]:
# TEST CASE
assert report_noisy_max(options, score, 1, 1) == 'Married-civ-spouse'

## Question 7

What is the **total privacy cost** of `report_noisy_max` under *sequential composition*?

YOUR ANSWER HERE

## Above Threshold

The following code implements Above Threshold:

In [None]:
# preserves epsilon-differential privacy
def above_threshold(queries, df, T, epsilon):
    """Return the index of the first query whose noisy answer clears ``T``.

    The threshold is perturbed once with Laplace noise of scale
    ``2/epsilon``. Each query answer is then perturbed with fresh Laplace
    noise of scale ``4/epsilon`` and compared against the noisy threshold.

    Args:
        queries: Iterable of callables, each taking ``df`` and returning a
            number.
        df: Dataset handed to every query.
        T: Threshold to compare against.
        epsilon: Privacy parameter controlling both noise scales.

    Returns:
        Index of the first query whose noisy answer is at least the noisy
        threshold, or ``-1`` if none qualifies. Used as a Python index,
        ``-1`` selects the last element.
    """
    # A single noisy threshold is shared by every comparison.
    noisy_threshold = T + np.random.laplace(loc=0, scale=2/epsilon)

    for position, query in enumerate(queries):
        # Draw this query's noise before evaluating the query itself.
        query_noise = np.random.laplace(loc=0, scale=4/epsilon)
        noisy_answer = query(df) + query_noise
        if noisy_answer >= noisy_threshold:
            return position

    # No query cleared the threshold.
    return -1

## Question 8

Use `above_threshold` to find the first `age` for which `len(adult[adult['Age'] == age]) >= 800` (the first age that at least 800 people have).

In [None]:
def make_query(age):
    """Build a counting query for one specific age.

    Args:
        age: The age value to count.

    Returns:
        A function that takes a DataFrame with an ``'Age'`` column and
        returns how many rows have ``Age == age``.
    """
    def count_rows_with_age(df):
        matching_rows = df[df['Age'] == age]
        return len(matching_rows)
    return count_rows_with_age

def find_first_age_above_800(epsilon):
    """Exercise stub (Question 8): first age held by at least 800 people.

    To be implemented with ``above_threshold`` over the ``adult`` dataset,
    using ``epsilon`` as the privacy parameter. The test cell expects the
    result to be 22 for ``epsilon = 1.0``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    
find_first_age_above_800(1.0)

In [None]:
# TEST CASE
assert find_first_age_above_800(1.0) == 22

## Question 9

Use `above_threshold` to implement `pick_b` for the following summation query. `pick_b` should pick a clipping parameter `b`.

In [None]:
def make_query(b):
    """Exercise stub (Question 9): build the query for clipping parameter ``b``.

    The returned function takes a DataFrame and is meant to be one of the
    queries that ``pick_b`` hands to ``above_threshold``.
    """
    def q(df):
        # YOUR CODE HERE
        raise NotImplementedError()
    return q

def pick_b(epsilon):
    """Exercise stub (Question 9): pick a clipping parameter ``b``.

    To be implemented with ``above_threshold``, using ``epsilon`` as the
    privacy parameter. The test cell expects ``80 < b < 100`` for
    ``epsilon = 1.0``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    
pick_b(1.0)

In [None]:
# TEST CASE
b = pick_b(1.0)
assert b > 80
assert b < 100

## Question 10

Implement `above_threshold_val`, which returns the *value* of the first query result above the threshold. Your solution should have a **total privacy cost** of `epsilon`.

In [None]:
def make_query(age):
    """Create a query that counts the rows whose ``'Age'`` equals ``age``.

    This redefines ``make_query`` as an age-counting query after the
    Question 9 cell bound the name to a different query builder.

    Args:
        age: The age value to count.

    Returns:
        A function mapping a DataFrame to the number of rows with that age.
    """
    def age_count(df):
        selected = df[df['Age'] == age]
        return len(selected)
    return age_count

def above_threshold_val(queries, df, T, epsilon):
    """Exercise stub (Question 10): value of the first query above ``T``.

    Unlike ``above_threshold``, which returns an index, this should return
    the *value* of the first query result above the threshold. The whole
    procedure must have a total privacy cost of ``epsilon``.

    Args:
        queries: Sequence of callables, each taking ``df``.
        df: Dataset handed to the queries.
        T: Threshold to compare against.
        epsilon: Total privacy budget for the procedure.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    
queries = [make_query(age) for age in range(0,100)]
above_threshold_val(queries, adult, 800, 1.0)

In [None]:
# TEST CASE
queries = [make_query(age) for age in range(0,100)]

results = [above_threshold_val(queries, adult, 800, 1.0) for _ in range(20)]
assert np.mean(results) > 865
assert np.mean(results) < 890

## Question 11

Argue that your solution in question 10 has a total privacy cost of `epsilon`.

YOUR ANSWER HERE