# CS 3110/5110: Data Privacy
## In-Class Exercise, week of 11/04/2024

In [None]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return ``v`` with zero-centred Laplace noise added.

    Args:
        v: Value to perturb.
        sensitivity: Numerator of the noise scale.
        epsilon: Denominator of the noise scale.

    Returns:
        ``v`` plus one draw from ``np.random.laplace`` whose scale is
        ``sensitivity / epsilon``.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return ``v`` with zero-mean Gaussian noise added.

    Args:
        v: Value to perturb.
        sensitivity: Multiplier applied to the noise scale.
        epsilon: Divisor applied to the noise scale.
        delta: Enters the noise scale through ``log(1.25 / delta)``.

    Returns:
        ``v`` plus one draw from ``np.random.normal`` whose standard
        deviation is ``sensitivity * sqrt(2 * log(1.25 / delta)) / epsilon``.
    """
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Add independent zero-mean Gaussian noise to every element of ``vec``.

    Args:
        vec: Iterable of values to perturb.
        sensitivity: Multiplier applied to the noise scale.
        epsilon: Divisor applied to the noise scale.
        delta: Enters the noise scale through ``log(1.25 / delta)``.

    Returns:
        A list with one noisy value per element of ``vec``; each element
        receives its own draw from ``np.random.normal``.
    """
    # The noise scale is the same for every element, so compute it once
    # instead of re-evaluating the sqrt/log for each entry.
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    return [v + np.random.normal(loc=0, scale=sigma) for v in vec]

def pct_error(orig, priv):
    """Return the percent error of ``priv`` relative to ``orig``.

    Args:
        orig: Reference value; must be non-zero.
        priv: Value being compared against ``orig``.

    Returns:
        ``|orig - priv| / |orig| * 100``, which is never negative.
    """
    # Divide by the magnitude of the reference so that a negative `orig`
    # does not flip the sign of the reported error.
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Download the course's "adult" CSV (the variant that includes PII columns)
# into a DataFrame; this needs network access when the cell runs.
adult = pd.read_csv('https://github.com/jnear/cs3110-data-privacy/raw/main/homework/adult_with_pii.csv')

## Question 1

Implement an *encoding function* for randomized response to a "yes/no" question.

Reference [Chapter 13](https://uvm-plaid.github.io/programming-dp/notebooks/ch13.html#randomized-response).

In [None]:
def encode_rand_resp_yes_no(true_response):
    """Exercise stub (Question 1): randomized-response encoder for one yes/no answer.

    Raises ``NotImplementedError`` until the exercise is completed.

    Args:
        true_response: The respondent's real answer; the cells in this
            notebook pass booleans.

    Returns:
        The randomized answer to report. The test cell expects a batch of
        100 calls with ``True`` to sum to more than 60, and another such
        batch to sum to less than 90.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Print a handful of randomized outputs for a respondent whose real answer is "yes"
for _ in range(5):
    print('Randomized response:', encode_rand_resp_yes_no(True))

In [None]:
# TEST CASE
# Two independent batches of 100 encodings of a true "yes":
# the first batch must contain more than 60 yesses ...
assert np.sum([encode_rand_resp_yes_no(True) for _ in range(100)]) > 60
# ... and the second batch must contain fewer than 90.
assert np.sum([encode_rand_resp_yes_no(True) for _ in range(100)]) < 90

## Question 2

Implement a *decoding function* for randomized response to a "yes/no" question.

In [None]:
def decode_rand_resp_yes_no(responses):
    """Exercise stub (Question 2): decode a batch of randomized yes/no responses.

    Raises ``NotImplementedError`` until the exercise is completed.

    Args:
        responses: Randomized responses; the cells in this notebook pass a
            list produced by ``encode_rand_resp_yes_no``.

    Returns:
        An estimate of the number of true "yes" answers. The test cell
        expects a value strictly between 900 and 1100 for 1000 true
        yesses and 500 true nos.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Example population: 1000 "yes" answers followed by 500 "no" answers
true_responses = [True] * 1000 + [False] * 500
print('Number of "True" yesses:', np.sum(true_responses))

# Randomize each answer independently
# Each response satisfies 1.09-differential privacy
rand_responses = list(map(encode_rand_resp_yes_no, true_responses))

# Estimate the real count by removing the "fake" yesses
print('Decoded randomized response yesses:', decode_rand_resp_yes_no(rand_responses))

In [None]:
# TEST CASE
# Ground truth: 1000 "yes" answers followed by 500 "no" answers
true_responses = [True] * 1000 + [False] * 500

# Randomize each answer independently
# Each response satisfies 1.09-differential privacy
rand_responses = list(map(encode_rand_resp_yes_no, true_responses))

# The decoded estimate must land within 100 of the true 1000 yesses
assert decode_rand_resp_yes_no(rand_responses) < 1100
assert decode_rand_resp_yes_no(rand_responses) > 900

## Question 3

Use the definition of randomized response above to answer the question:

*How many individuals in the `adult` dataset have `Occupation` = `Sales`?*

In [None]:
# Take a single occupation from the adult dataset, and return a single response
def encode_response_sales(response):
    """Exercise stub (Question 3): encode one ``Occupation`` value as a single response.

    Raises ``NotImplementedError`` until the exercise is completed.

    Args:
        response: One entry of ``adult['Occupation']``.

    Returns:
        The single response to report for the question
        "is this person's occupation Sales?".
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    
def decode_responses_sales(responses):
    """Exercise stub (Question 3): decode the batch of encoded Sales responses.

    Raises ``NotImplementedError`` until the exercise is completed.

    Args:
        responses: List of values returned by ``encode_response_sales``,
            one per row of ``adult['Occupation']``.

    Returns:
        An estimate of how many individuals have ``Occupation`` = ``Sales``.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Encode every occupation in the dataset, then decode the whole batch
responses = list(map(encode_response_sales, adult['Occupation']))
decode_responses_sales(responses)

In [None]:
# How accurate is the answer above?
# Count the rows whose occupation is exactly 'Sales' for comparison
true_sales = (adult['Occupation'] == 'Sales').sum()
print('True number of salespeople:', true_sales)

## Question 4

Implement the *encode* and *perturb* steps for Optimized Unary Encoding.

In [None]:
# Distinct non-missing occupations, in order of first appearance in the dataset
domain = adult['Occupation'].dropna().unique()
domain

def encode(response):
    """Exercise stub (Question 4): the *encode* step.

    Raises ``NotImplementedError`` until the exercise is completed.

    Args:
        response: One occupation value, e.g. ``'Sales'``.

    Returns:
        The encoded response. Other cells sum these with ``axis=0`` and zip
        the totals with ``domain``, so presumably this is a vector with one
        entry per element of ``domain`` -- confirm against the chapter.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Try the encoder on a single known occupation
encode('Sales')

def perturb(encoded_response):
    """Exercise stub (Question 4): the *perturb* step.

    Raises ``NotImplementedError`` until the exercise is completed.

    Args:
        encoded_response: The value returned by ``encode`` for one response.

    Returns:
        The perturbed encoding. Other cells sum these with ``axis=0`` and
        zip the totals with ``domain``, so presumably it has the same
        layout as the input -- confirm against the chapter.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Try the full encode-then-perturb pipeline on a single known occupation
perturb(encode('Sales'))

In [None]:
# As in randomized response, the flipped bits distort the raw totals.
# These are the perturbed counts, summed per position (axis=0) across all
# rows, without any decoding applied.
counts = np.sum([perturb(encode(r)) for r in adult['Occupation']], axis=0)
# Pair each total with the corresponding entry of `domain`
list(zip(domain, counts))

In [None]:
# This is the real answer: the same per-position sum, but over the
# unperturbed encodings.
counts = np.sum([encode(r) for r in adult['Occupation']], axis=0)
# Pair each total with the corresponding entry of `domain`
list(zip(domain, counts))

## Question 5

Implement the *aggregate* step for Optimized Unary Encoding.

In [None]:
def aggregate(responses):
    """Exercise stub (Question 5): the *aggregate* step.

    Raises ``NotImplementedError`` until the exercise is completed.

    Args:
        responses: List of perturbed encodings, one per respondent, as
            produced by ``perturb(encode(...))``.

    Returns:
        The aggregated counts. The calling cell zips the result with
        ``domain``, so presumably there is one estimate per element of
        ``domain`` -- confirm against the chapter.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Encode and perturb each occupation in turn, then aggregate the batch
responses = [perturb(encoded) for encoded in map(encode, adult['Occupation'])]
counts = aggregate(responses)
# Pair each aggregated value with the corresponding entry of `domain`
list(zip(domain, counts))