# CS211: Data Privacy
## Homework 6

In [None]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name and fall back
# to the legacy one on older installations.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

def laplace_mech(v, sensitivity, epsilon):
    """Return ``v`` perturbed with Laplace noise.

    The noise is drawn from a Laplace distribution centred on zero whose
    scale is ``sensitivity / epsilon``.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return ``v`` perturbed with Gaussian noise.

    The noise is drawn from a normal distribution centred on zero with
    standard deviation ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.
    """
    delta_factor = np.sqrt(2*np.log(1.25/delta))
    sigma = sensitivity * delta_factor / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def pct_error(orig, priv):
    """Return the error of ``priv`` relative to ``orig``, as a percentage.

    The magnitude of ``orig`` is used as the denominator so the result is
    non-negative even when the true value is negative. For positive ``orig``
    the result is the same as dividing by ``orig`` directly.

    ``orig`` is expected to be non-zero; a zero value leads to a division by
    zero.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Download the homework dataset and keep it in the notebook-wide `adult` DataFrame.
adult = pd.read_csv(
    'https://github.com/jnear/cs211-data-privacy/raw/master/homework/adult_with_pii.csv'
)

## Question 1 (5 points)

Consider the code below, which graphs the total privacy cost of an averaging attack under (1) sequential composition and (2) advanced composition, for values of `k` between 1 and 200.

In [None]:
# Privacy parameters used by the composition comparison in this cell.
epsilon = 0.1    # epsilon spent by each individual noisy query
delta_p = 1e-5   # delta term plugged into the advanced composition bound

def query():
    """Return the number of rows in ``adult`` whose marital status is 'Divorced'.

    The dataset is bound to the module-level name ``adult``; the earlier
    reference to ``adult_data`` pointed at a name that is never defined in
    this notebook and raised ``NameError`` when called.
    """
    # NOTE(review): the column key 'Martial Status' is kept exactly as written;
    # presumably it matches the spelling of the CSV header -- confirm.
    return len(adult[adult['Martial Status'] == 'Divorced'])

def avg_attack(k):
    """Average ``k`` independent Laplace-noised answers to ``query()``.

    Each answer is produced by ``laplace_mech`` with sensitivity 1 and the
    module-level ``epsilon`` (read at call time).

    Parameters:
        k: number of noisy answers to average.

    Returns:
        The mean of the ``k`` noisy answers.
    """
    # The true answer does not change between repetitions, so compute it once
    # instead of re-filtering the DataFrame on every iteration.
    true_answer = query()
    noisy_answers = [laplace_mech(true_answer, 1, epsilon) for _ in range(k)]
    return np.mean(noisy_answers)
    
# Numbers of repetitions k for which both bounds are evaluated.
xs = range(1, 200)
# Sequential composition: the total cost grows linearly, k * epsilon.
privacy_cost_sequential = [k * epsilon for k in xs]
# Advanced composition bound as used here: 2 * epsilon * sqrt(2 k ln(1 / delta_p)).
privacy_cost_advanced = [2 * epsilon * np.sqrt(2 * k * np.log(1/delta_p)) for k in xs]

# Label both curves so the reader can tell which bound is which when
# locating the crossover point.
plt.plot(xs, privacy_cost_sequential, label='Sequential composition')
plt.plot(xs, privacy_cost_advanced, label='Advanced composition')
plt.xlabel('Number of queries (k)')
plt.ylabel('Total privacy cost (epsilon)')
plt.legend();

In 2-5 sentences, answer the following:
- At approximately what point do the lines representing the two approaches cross? What does this mean about **which** composition technique should be used for an iterative algorithm?
- The two lines represent **different** privacy costs for the **same** algorithm. How can both be correct?

YOUR ANSWER HERE

## Question 2 (5 points)

In the cell below, define a function that implements the vector version of the Laplace mechanism. An implementation that works for a single number is defined for reference as `laplace_mech` above.

In [None]:
def laplace_mech_vec(qs, sensitivity, epsilon):
    """Vector version of the Laplace mechanism (homework stub, not implemented).

    Intended as the counterpart of ``laplace_mech`` for a sequence of values
    ``qs``. The accompanying test cell indexes the result per element and
    compares element ``i`` against Laplace samples centred on ``qs[i]`` with
    scale ``sensitivity / epsilon``.

    Raises:
        NotImplementedError: always, until the placeholder is replaced.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Example call; raises NotImplementedError until the stub above is filled in.
laplace_mech_vec([1,2,3,4], 1, 1.0)

In [None]:
# TEST CASE
# Statistical check: each coordinate of the noisy vector should be distributed
# like the scalar Laplace mechanism applied to that coordinate.
vec = [1,2,3,4,5]
# 200 independent noisy releases of the whole vector (sensitivity 15, epsilon 1.0).
dp_results = [laplace_mech_vec(vec, 15, 1.0) for _ in range(200)]
for i, v in enumerate(vec):
    # Reference sample: Laplace noise centred on the true value, scale = 15 / 1.0.
    spec = [np.random.laplace(loc=v, scale=15/1.0) for _ in range(200)]
    # Compare the i-th coordinate across all releases with the reference sample;
    # the threshold is loose because both sides are random samples.
    assert stats.wasserstein_distance([r[i] for r in dp_results], spec) < 10

## Question 3 (5 points)

In the cell below, define a function that implements the vector version of the Gaussian mechanism. An implementation that works for a single number is defined for reference as `gaussian_mech` above.

In [None]:
def gaussian_mech_vec(qs, sensitivity, epsilon, delta):
    """Vector version of the Gaussian mechanism (homework stub, not implemented).

    Intended as the counterpart of ``gaussian_mech`` for a sequence of values
    ``qs``. The accompanying test cell indexes the result per element and
    compares element ``i`` against normal samples centred on ``qs[i]``.

    Raises:
        NotImplementedError: always, until the placeholder is replaced.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# TEST CASE
# Statistical check: each coordinate of the noisy vector should be distributed
# like Gaussian noise centred on the true coordinate.
vec = [1,2,3,4,5]
# 200 independent noisy releases (sensitivity 15, epsilon 1.0, delta 1e-5).
dp_results = [gaussian_mech_vec(vec, 15, 1.0, 1e-5) for _ in range(200)]
for i, v in enumerate(vec):
    # 72.67207893 is the scale gaussian_mech's formula gives for these parameters:
    # 15 * sqrt(2 * ln(1.25 / 1e-5)) / 1.0.
    spec = [np.random.normal(loc=v, scale=72.67207893) for _ in range(200)]
    # Compare the i-th coordinate across all releases with the reference sample;
    # the threshold is loose because both sides are random samples.
    assert stats.wasserstein_distance([r[i] for r in dp_results], spec) < 30

## Question 4 (10 points)

Consider the code below, which defines three queries (two clipped sums and one count) and runs them on the `adult` dataset. We would like to release their results using the Laplace mechanism, providing differential privacy with $\epsilon = 1$ for each query.

In [None]:
# Upper clipping bounds applied before summing in query1 and query3.
b_capgain = 10_000
b_age = 3_000

# Privacy parameter for this question (rebinds the notebook-wide `epsilon`).
epsilon = 1

def query1():
    """Sum the 'Capital Gain' column of ``adult`` after clipping to [0, b_capgain]."""
    clipped_gain = adult['Capital Gain'].clip(lower=0, upper=b_capgain)
    return np.sum(clipped_gain)

def query2():
    """Count the rows of ``adult`` whose 'Education-Num' is below 10."""
    below_ten = adult['Education-Num'] < 10
    return len(adult[below_ten])

def query3():
    """Sum the 'Age' column of ``adult`` after clipping to [0, b_age]."""
    clipped_age = adult['Age'].clip(lower=0, upper=b_age)
    return np.sum(clipped_age)

def my_query():
    """Run query1, query2 and query3 in that order and return their results as a list."""
    component_queries = (query1, query2, query3)
    return [run() for run in component_queries]

# Show the exact (non-private) answers as the cell output.
my_query()

In 2-5 sentences, answer the following:
- What is the $L_1$ global sensitivity of `my_query`, and why?
- What is the $L_2$ global sensitivity of `my_query`, and why?

YOUR ANSWER HERE

## Question 5 (5 points)

In the cell below, write code to release the results of calling `my_query()` with differential privacy, without using sequential composition.

*Hint*: use `laplace_mech_vec`, but be careful of sensitivity!

In [None]:
def my_query_lap(epsilon):
    """Release ``my_query()`` with differential privacy (homework stub, not implemented).

    Per the question text, the release should not rely on sequential
    composition; the hint suggests ``laplace_mech_vec`` with a carefully
    chosen sensitivity.

    Raises:
        NotImplementedError: always, until the placeholder is replaced.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Example call; raises NotImplementedError until the stub above is filled in.
my_query_lap(1.0)

## Question 6 (5 points)

In the cell below, write code to release the results of `my_query()` with differential privacy, but using the Gaussian mechanism.

*Hint*: remember to use $L_2$ sensitivity

In [None]:
def my_query_gauss(epsilon, delta):
    """Release ``my_query()`` via the Gaussian mechanism (homework stub, not implemented).

    Per the question's hint, the noise should be calibrated to the $L_2$
    sensitivity of ``my_query``.

    Raises:
        NotImplementedError: always, until the placeholder is replaced.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Example call; raises NotImplementedError until the stub above is filled in.
my_query_gauss(1.0, 1e-5)