# CS 3110/5110: Data Privacy
## Homework 7

In [None]:
# Load the data and libraries
import pandas as pd
import numpy as np
import random
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` plus a single draw of zero-centered Laplace noise.

    The noise scale is `sensitivity / epsilon`. The draw comes from NumPy's
    global random state (`np.random`), so results are reproducible only if
    the caller seeds that state.

    Args:
        v: the true (non-private) value to perturb.
        sensitivity: sensitivity of the query that produced `v`.
        epsilon: privacy parameter; a smaller value gives a larger noise scale.

    Returns:
        `v` with the noise sample added.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def laplace_mech_vec(vec, sensitivity, epsilon):
    """Return a list with independent Laplace noise added to each element.

    Every element of `vec` gets its own draw from a zero-centered Laplace
    distribution with scale `sensitivity / epsilon`, taken from NumPy's
    global random state in iteration order.

    Args:
        vec: iterable of true (non-private) values.
        sensitivity: sensitivity of the vector-valued query.
        epsilon: privacy parameter shared by the whole vector.

    Returns:
        A new list of noisy values, in the same order as `vec`.
    """
    def add_noise(value):
        # One fresh noise sample per element.
        return value + np.random.laplace(loc=0, scale=sensitivity / epsilon)

    return list(map(add_noise, vec))

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` plus a single draw of zero-mean Gaussian noise.

    The standard deviation is
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.
    The draw comes from NumPy's global random state (`np.random`).

    Args:
        v: the true (non-private) value to perturb.
        sensitivity: sensitivity of the query that produced `v`.
        epsilon: privacy parameter epsilon.
        delta: privacy parameter delta.

    Returns:
        `v` with the noise sample added.
    """
    log_term = np.log(1.25 / delta)
    sigma = sensitivity * np.sqrt(2 * log_term) / epsilon
    return v + np.random.normal(loc=0, scale=sigma)

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Return a list with independent Gaussian noise added to each element.

    Every element of `vec` gets its own draw from a zero-mean normal
    distribution with standard deviation
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`, taken from NumPy's
    global random state in iteration order.

    Args:
        vec: iterable of true (non-private) values.
        sensitivity: sensitivity of the vector-valued query.
        epsilon: privacy parameter epsilon shared by the whole vector.
        delta: privacy parameter delta shared by the whole vector.

    Returns:
        A new list of noisy values, in the same order as `vec`.
    """
    # The standard deviation does not depend on the element, so compute the
    # log/sqrt once instead of once per element.
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    return [v + np.random.normal(loc=0, scale=sigma) for v in vec]

def pct_error(orig, priv):
    """Return the percent error of `priv` relative to `orig`.

    Computed as `|orig - priv| / |orig| * 100`. Dividing by the magnitude of
    `orig` keeps the result non-negative even when the true value is
    negative; for positive `orig` this is the same as dividing by `orig`.

    Args:
        orig: the true (non-private) value, or a NumPy array of values.
        priv: the noisy value(s) to compare against `orig`.

    Returns:
        The percent error, elementwise when arrays are passed.
        `orig == 0` is not special-cased and results in a division by zero.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Load the adult dataset (the "with PII" CSV) straight from GitHub into a
# DataFrame; running this cell therefore needs network access.
adult = pd.read_csv('https://github.com/jnear/cs3110-data-privacy/raw/main/homework/adult_with_pii.csv')

## Range Queries

A *range query* counts the number of rows in the dataset whose value lies in a given range. For example, "how many participants are between the ages of 21 and 33?" is a range query. A *workload* of range queries is just a list of range queries. The code below generates 100 random range queries over ages in the adult dataset. Note that `range_query` treats the lower bound as inclusive and the upper bound as exclusive.

In [None]:
def range_query(df, col, a, b):
    """Count the rows of `df` whose `col` value lies in the range [a, b).

    Args:
        df: the DataFrame to query.
        col: name of the column to test.
        a: lower bound, inclusive.
        b: upper bound, exclusive.

    Returns:
        The number of matching rows, as an int.
    """
    column = df[col]
    at_least_a = column >= a
    below_b = column < b
    return len(df[at_least_a & below_b])

# Draw 100 lower bounds from 1..70 (random.randint includes both endpoints).
random_lower_bounds = [random.randint(1, 70) for _ in range(100)]
# Pair each lower bound with an upper bound drawn from lb..100. The upper bound
# can equal the lower bound; range_query excludes the upper bound, so such a
# query counts zero rows.
random_workload = [(lb, random.randint(lb, 100)) for lb in random_lower_bounds]
# Exact (non-private) answers to every query, used later to measure error.
real_answers = [range_query(adult, 'Age', lb, ub) for (lb, ub) in random_workload]
# NOTE(review): no random.seed call is visible in this notebook, so the
# workload presumably differs from run to run — confirm this is intended.
print('First 5 queries: ', random_workload[:5])

## Question 1 (10 points)

Write code to answer a workload of range queries using `laplace_mech` and sequential composition. Your solution should have a **total privacy cost of epsilon**.

In [None]:
def workload_laplace(workload, epsilon):
    """Answer a workload of range queries with the scalar Laplace mechanism.

    Exercise placeholder: the body is intentionally unimplemented and raises
    NotImplementedError until a solution is filled in.

    Per the assignment text, the solution should use `laplace_mech` and
    sequential composition, with a total privacy cost of `epsilon` for the
    whole workload.

    Args:
        workload: list of (lower, upper) bound pairs, shaped like
            `random_workload`.
        epsilon: total privacy budget for the whole workload.

    Returns:
        Expected to be one noisy answer per query, in workload order (the
        check cell zips the result with `real_answers`).
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Demo call; raises NotImplementedError until the function above is implemented.
print('First 4 answers:', workload_laplace(random_workload, 1.0)[:4])

In [None]:
# Absolute error of each noisy answer against the true answer, for one run at
# epsilon = 1.0.
errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_answers, workload_laplace(random_workload, 1.0))]
print('Average absolute error:', np.mean(errors))
# The mean error must fall strictly between 50 and 200: too little noise and
# too much noise both fail.
# NOTE(review): the answers are random, so these bounds can only hold with
# high probability; presumably they are calibrated for a noise scale of about
# 100 (100 queries sharing epsilon = 1.0) — confirm.
assert np.mean(errors) > 50
assert np.mean(errors) < 200

## Question 2 (10 points)

Write code to answer a workload using `laplace_mech_vec` - the version of the Laplace mechanism for **vector-valued** queries. Your solution should *not* use sequential composition, and should have a total privacy cost of `epsilon`.

*Hint*: remember to use L1 global sensitivity.

In [None]:
def workload_laplace_vec(workload, epsilon):
    """Answer a workload of range queries with the vector Laplace mechanism.

    Exercise placeholder: the body is intentionally unimplemented and raises
    NotImplementedError until a solution is filled in.

    Per the assignment text, the solution should use `laplace_mech_vec` on
    the whole vector of answers (no sequential composition), calibrated with
    the L1 global sensitivity, for a total privacy cost of `epsilon`.

    Args:
        workload: list of (lower, upper) bound pairs, shaped like
            `random_workload`.
        epsilon: total privacy budget for the whole workload.

    Returns:
        Expected to be one noisy answer per query, in workload order (the
        check cell zips the result with `real_answers`).
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Demo call; raises NotImplementedError until the function above is implemented.
print('First 4 answers:', workload_laplace_vec(random_workload, 1.0)[:4])

In [None]:
# Absolute error of each noisy answer against the true answer, for one run at
# epsilon = 1.0.
errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_answers, workload_laplace_vec(random_workload, 1.0))]
print('Average absolute error:', np.mean(errors))
# Same acceptance window as the scalar Laplace check: the mean error must fall
# strictly between 50 and 200.
# NOTE(review): the answers are random, so these bounds can only hold with
# high probability — confirm the margins are wide enough.
assert np.mean(errors) > 50
assert np.mean(errors) < 200

## Question 2b (10 points)

In 2-5 sentences, answer the following:
- Did the two solutions differ in terms of their accuracy?
- How do they differ in terms of their use of composition properties of differential privacy?

YOUR ANSWER HERE

## Question 3 (10 points)

Write code to answer a workload using `gaussian_mech_vec` - the version of the Gaussian mechanism for vector-valued queries. Your solution should not use sequential composition, should satisfy $(\epsilon, \delta)$-differential privacy, and should have a total privacy cost of (`epsilon`, `delta`).

*Hint*: remember to use L2 sensitivity.

In [None]:
def workload_gaussian_vec(workload, epsilon, delta):
    """Answer a workload of range queries with the vector Gaussian mechanism.

    Exercise placeholder: the body is intentionally unimplemented and raises
    NotImplementedError until a solution is filled in.

    Per the assignment text, the solution should use `gaussian_mech_vec` on
    the whole vector of answers (no sequential composition), calibrated with
    the L2 sensitivity, for a total privacy cost of (`epsilon`, `delta`).

    Args:
        workload: list of (lower, upper) bound pairs, shaped like
            `random_workload`.
        epsilon: total epsilon for the whole workload.
        delta: total delta for the whole workload.

    Returns:
        Expected to be one noisy answer per query, in workload order (the
        check cell zips the result with `real_answers`).
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Demo call; raises NotImplementedError until the function above is implemented.
print('First 4 answers:', workload_gaussian_vec(random_workload, 1.0, 1e-5)[:4])

In [None]:
# Absolute error of each noisy answer against the true answer, for one run at
# epsilon = 1.0, delta = 1e-5.
errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_answers, workload_gaussian_vec(random_workload, 1.0, 1e-5))]
print('Average absolute error:', np.mean(errors))
# The mean error must fall strictly between 10 and 100, a lower window than
# the Laplace checks use.
# NOTE(review): presumably lower because the L2 sensitivity of the workload is
# smaller than its L1 sensitivity; the answers are random, so the bounds can
# only hold with high probability — confirm.
assert np.mean(errors) > 10
assert np.mean(errors) < 100

## Question 4 (10 points)

In 2-5 sentences, answer the following:
- Of your solutions in questions 1-3, which ones rely on *sequential composition*?
- Which solution offers the best accuracy?
- Why does this particular solution yield the best accuracy?

YOUR ANSWER HERE

## Question 5 (10 points)

Re-implement your solution to question 3 using *Rényi differential privacy*. Your solution should satisfy $(\alpha, \bar\epsilon)$-RDP.

*Hint*: see the "variants" chapter in the textbook.

In [None]:
def workload_gaussian_vec_RDP(workload, alpha, epsilon_bar):
    """Answer a workload of range queries under Rényi differential privacy.

    Exercise placeholder: the body is intentionally unimplemented and raises
    NotImplementedError until a solution is filled in.

    Per the assignment text, this is the Rényi-DP counterpart of
    `workload_gaussian_vec` and should satisfy (alpha, epsilon_bar)-RDP.

    Args:
        workload: list of (lower, upper) bound pairs, shaped like
            `random_workload`.
        alpha: the RDP order.
        epsilon_bar: the RDP epsilon for the whole workload.

    Returns:
        Expected to be one noisy answer per query, in workload order (the
        test case zips the result with `real_answers`).
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Demo call for the RDP variant defined above, using the same
# (alpha, epsilon_bar) values as the test case for this question.
print('First 4 answers:', workload_gaussian_vec_RDP(random_workload, 5, 0.1)[:4])

In [None]:
# TEST CASE
# Absolute error of each noisy answer against the true answer, for one run at
# alpha = 5, epsilon_bar = 0.1.
errors = [abs(r_a - l_a) for (r_a, l_a) in zip(real_answers, workload_gaussian_vec_RDP(random_workload, 5, 0.1))]
print('Average absolute error:', np.mean(errors))
# The mean error must fall strictly between 10 and 100.
# NOTE(review): the answers are random, so these bounds can only hold with
# high probability — confirm the margins are wide enough.
assert np.mean(errors) > 10
assert np.mean(errors) < 100

## Question 6 (10 points)

Implement a function `convert_RDP_ED` to convert from the $(\alpha, \bar\epsilon)$ of Rényi differential privacy to the $(\epsilon, \delta)$ of approximate differential privacy. Your function should also take the desired value of $\delta$.

In [None]:
def convert_RDP_ED(alpha, epsilon_bar, delta):
    """Convert an (alpha, epsilon_bar)-RDP guarantee to (epsilon, delta)-DP.

    Exercise placeholder: the body is intentionally unimplemented and raises
    NotImplementedError until a solution is filled in.

    Args:
        alpha: the RDP order.
        epsilon_bar: the RDP epsilon.
        delta: the desired delta of the resulting (epsilon, delta) guarantee.

    Returns:
        Expected to be the resulting epsilon as a single number (the test
        case compares the result against a float).
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Notebook display of one example conversion; raises NotImplementedError until
# the function above is implemented.
convert_RDP_ED(5, 0.1, 1e-5)

In [None]:
# TEST CASE
# Compare with a tight relative tolerance instead of `==`: an otherwise
# correct implementation that orders the floating-point operations differently
# can differ from these reference values in the last bits.
assert np.isclose(convert_RDP_ED(5, 0.1, 1e-5), 2.9782313662425572, rtol=1e-9, atol=0)
assert np.isclose(convert_RDP_ED(40, 0.1, 1e-5), 0.39520321705051864, rtol=1e-9, atol=0)
assert np.isclose(convert_RDP_ED(500, 1.0, 1e-5), 1.02307199491978, rtol=1e-9, atol=0)
assert np.isclose(convert_RDP_ED(40, 1.0, 1e-5), 1.2952032170505188, rtol=1e-9, atol=0)

## Question 7 (10 points)

In 2-5 sentences, answer the following:
- Try various values for `alpha` and `epsilon_bar` in `convert_RDP_ED`. At what values do you observe an $(\epsilon, \delta)$ value around $(1.0, 10^{-5})$?
- Try these values for `alpha` and `epsilon_bar` in `workload_gaussian_vec_RDP`. How does the error compare to using `workload_gaussian_vec`?
- Is it useful to use Rényi differential privacy to answer workloads of range queries? Or is regular $(\epsilon, \delta)$-differential privacy just as good?

YOUR ANSWER HERE