# CS211: Data Privacy
## In-Class Exercise, 10/24/2022

In [1]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def pct_error(orig, priv):
    return np.abs(orig - priv)/orig * 100.0

adult = pd.read_csv('https://github.com/jnear/cs211-data-privacy/raw/master/homework/adult_with_pii.csv')

  plt.style.use('seaborn-whitegrid')


## Question 1

The code below defines a finite set of options for marital status. Define a *scoring function* that returns high scores for common marital statuses, and low scores for uncommon ones (e.g. the score could be the number of people with that status).

In [2]:
# What is the most popular marital stuats?
adult['Marital Status'].value_counts()
# The answers is the highest row of value_counts()!

options = ['Never-married', 'Married-civ-spouse', 'Divorced',
           'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
           'Widowed']

# Sensitivity is important, sens of 1 is easiest case (using a counting query)
def score(option):
    count = len(adult[adult['Marital Status'] == option]) # Adults of the option we are currently scoring
    return count # Don't have to be on any particular scale


score('Never-married')

10683

In [3]:
# TEST CASE
assert score('Never-married') == 10683

## Question 2

Implement `report_noisy_max` using the Laplace mechanism. `report_noisy_max` should return the value in a set that approximately maximizes the value of the score function. It should *not* return the score itself.

In [4]:
# Easiest way to noisify a categorical type question
# 1. Calc all the scores
# 2. Add nosie to all scores
# 3. pick option r in R with the biggest noisy score
# 4. Can use the laplace mech
# Step 2 satifies DP of the laplace + sequential comp
# step 3 satisfies Dp by post processing
def report_noisy_max(R, score, sensitivity, epsilon):
    # 1. Calc scores
    scores = []
    for r in R:
        scores.append(score(r))
    # 2. add noisy scores
    noisy_scores = [laplace_mech(s, 1, epsilon) for s in scores] # sens of 1 depends on the score method! Our score method is just a counting query so it maintinas a sens of 1
    
    #3. pick the option r in R with the biggest noisy score
    max_idx = np.argmax(noisy_scores)
    return R[max_idx]


report_noisy_max(options, score, 1, 1)

'Married-civ-spouse'

In [5]:
scores = []

for r in options:
    scores.append(score(r))
epsilon = .00000001
probs = [np.exp((epsilon * s) / (2 * 1)) for s in scores] # 2 * 1, the 1 part is the sensitivity
np.array(probs) / sum(probs)
probs

# Important tool for reducing privacy cost when given a set of options
# Making epsilon smaller, these probs get closer to 1
# Same property where sometimes you get the wrong answer but the smaller epsilon is, the more likley you are to get the wrong answer

[1.0000534164266066,
 1.0000748828035773,
 1.0000222152467548,
 1.000002090002184,
 1.0000051250131328,
 1.0000001150000066,
 1.0000049650123257]

In [None]:
# TEST CASE
assert report_noisy_max(options, score, 1, 1) == 'Married-civ-spouse'

## Question 3

What is the **total privacy cost** of `report_noisy_max` under *sequential composition*?

- There are 7 options for R
- So we call the laplace mech 7 times
- The sens of scoring function is 1 so we use a sens of 1 for the laplace mech
- Sequential comp says the total privacy cost will then be 7*epsilon
- By post processing, the final answer has total privacy cost 7*epsilon

Because report noisy max is equivalent to the exponential mech, we can conclue the total privacy cost is just epsilon

## Above Threshold

The following code implements Above Threshold:

In [14]:
# preserves epsilon-differential privacy
def above_threshold(queries, df, T, epsilon):
    T_hat = T + np.random.laplace(loc=0, scale = 2/epsilon)
    
    for idx, q in enumerate(queries):
        nu_i = np.random.laplace(loc=0, scale = 4/epsilon)
        if q(df) + nu_i >= T_hat:
            return idx
    return -1 # the index of the last element

## Question 4

Use `above_threshold` to find the first `age` for which `len(adult[adult['Age'] == age]) >= 800` (the first age for which more than 800 people have that age).

In [15]:
def make_query(age):
    def q(df):
        return len(df[df['Age'] == age])
    return q

def find_first_age_above_800(epsilon):
    Q = [make_query(age) for age in range(1, 100)]
    answers = [q(adult) for q in Q]
    noisy_answers = [laplace_mech(a, 1, epsilon) for a in answers]
    idx = 0
    for noisy_answer in noisy_answers:
        if noisy_answer >= 800:
            return idx
        idx += 1
    
find_first_age_above_800(1.0)

22

In [16]:
# TEST CASE
assert find_first_age_above_800(1.0) == 22

## Question 5

Use `above_threshold` to implement `pick_b` for the following summation query. `pick_b` should pick a clipping parameter `b`.

In [17]:
def make_query(b):
    def q(df):
        first = df['Age'].clip(lower=0, upper=b).sum()
        second = df['Age'].clip(lower=0, upper=b+1).sum()
        return first - second

def pick_b(epsilon):
    B = range(1, 1000)
    Q = [make_query(b) for b in B]
    idx = above_threshold(Q, adult, 0, epsilon)
    return idx
    
pick_b(1.0)

TypeError: 'NoneType' object is not callable

In [13]:
# TEST CASE
b = pick_b(1.0)
assert b > 80
assert b < 100

NameError: name 'above_threshold' is not defined

## Question 6

Implement `above_threshold_val`, which returns the *value* of the first query result above the threshold. Your solution should have a **total privacy cost** of `epsilon`.

In [None]:
def make_query(age):
    def q(df):
        return len(df[df['Age'] == age])
    return q

def above_threshold_val(queries, df, T, epsilon):
    idx = above_threshold(queries, df, T, epsilon)
    val = laplace_mech(queries[idx](df), 1, epsilon)
    
queries = [make_query(age) for age in range(0,100)]
above_threshold_val(queries, adult, 800, 1.0)

In [None]:
# TEST CASE
queries = [make_query(age) for age in range(0,100)]

results = [above_threshold_val(queries, adult, 800, 1.0) for _ in range(20)]
assert np.mean(results) > 865
assert np.mean(results) < 890

## Question 7

Argue informally that your solution in question 6 has a total privacy cost of `epsilon`.

Two privacy mechanism
- Above threshold, privacy cost of epsilon/2
- laplace_mech with privacy cost of epsilon/2
sequential composition says the total privacy cost is epsilon


!! Works great for picking clipping parameters!