# CS211: Data Privacy
## In-Class Exercise, week of 11/07/2022

In [2]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    return [v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)
            for v in vec]

def pct_error(orig, priv):
    return np.abs(orig - priv)/orig * 100.0

adult = pd.read_csv('https://github.com/jnear/cs211-data-privacy/raw/master/homework/adult_with_pii.csv')

  plt.style.use('seaborn-whitegrid')


## Class Notes

Local DP and local sensitivity are not the same thing
- Up to this point we have been looking at a central model of DP... there is a single server holding the data and other parties communicate with it to get the information they need
Randomized Response uses more randomness to calculate local differential privacy 

## Question 1

Implement an *encoding function* for randomized response to a "yes/no" question.

Reference [Chapter 13](https://uvm-plaid.github.io/programming-dp/notebooks/ch13.html#randomized-response).

In [3]:
# Implementation of encoding random response algorithm for local differential privacy
def encode_rand_resp_yes_no(true_response):
    # Flip a coin 
    c1 = np.random.randint(0, 2)
    
    # If heads answers truthfully
    if c1 == 0:
        return true_response
    else:
        # if tails flip another coin
        c2 = np.random.randint(0,2)
        if c2 == 0:
            return True
        else:
            return False
        
for _ in range(5):
    print('Randomized response:', encode_rand_resp_yes_no(True))

Randomized response: True
Randomized response: True
Randomized response: True
Randomized response: True
Randomized response: True


In [4]:
# TEST CASE
assert np.sum([encode_rand_resp_yes_no(True) for _ in range(100)]) > 60
assert np.sum([encode_rand_resp_yes_no(True) for _ in range(100)]) < 90

## Question 2

Implement a *decoding function* for randomized response to a "yes/no" question.

In [5]:
def decode_rand_resp_yes_no(responses):
    # Decode the results of randomized responses
    all_yesses = np.sum(responses)
    
    # Subtract the number of fake yesses
    fake_yesses = (1/4)*len(responses)
    true_yesses = all_yesses - fake_yesses
    
    # Multiply by 2
    return 2*true_yesses

# Example: 1000 "yesses" and 500 "nos"
true_responses = [True for _ in range(1000)] + [False for _ in range(500)]
print('Number of "True" yesses:', np.sum(true_responses))

# Randomized responses
# Each response satisfies 1.09-differential privacy
rand_responses = [encode_rand_resp_yes_no(r) for r in true_responses]

# Decode the responses by subtracting "fake" yesses
print('Decoded randomized response yesses:', decode_rand_resp_yes_no(rand_responses))

Number of "True" yesses: 1000
Decoded randomized response yesses: 1050.0


In [6]:
# TEST CASE
true_responses = [True for _ in range(1000)] + [False for _ in range(500)]

# Randomized responses
# Each response satisfies 1.09-differential privacy
rand_responses = [encode_rand_resp_yes_no(r) for r in true_responses]

# Decode the responses by subtracting "fake" yesses
assert decode_rand_resp_yes_no(rand_responses) < 1100
assert decode_rand_resp_yes_no(rand_responses) > 900

## Question 3

Use the definition of randomized response above to answer the question:

*How many individuals in the `adult` dataset have `Occupation` = `Sales`?*

In [11]:
# take a single occupation from the adult dataset, and return a single response
def encode_response_sales(response):
    true_response = response == 'Sales'
    randomized_response = encode_rand_resp_yes_no(true_response)
    return randomized_response
    
def decode_responses_sales(responses):
    approximate_yesses = decode_rand_resp_yes_no(responses)
    return approximate_yesses

responses = [encode_response_sales(r) for r in adult['Occupation']]
decode_responses_sales(responses)

3723.5

In [None]:
# How accurate is the answer above?
true_sales = np.sum(adult['Occupation'] == 'Sales')
print('True number of salespeople:', true_sales)

## Question 4

Implement the *encode* and *perturb* steps for Optimized Unary Hashing.

In [15]:
domain = adult['Occupation'].dropna().unique()
domain

def perturb(encoded_response):
    return [perturb_bit(b) for b in encoded_response]

def perturb_bit(bit):
    p = .75
    q = .25

    sample = np.random.random()
    if bit == 1:
        if sample <= p:
            return 1
        else:
            return 0
    elif bit == 0:
        if sample <= q:
            return 1
        else: 
            return 0


def encode(response):
    # want to build a one-hot encoding of the response
    # based on the domain domain
    #return [1 if response == v else 0 for v in domain]
    one_hot_vector = []
    for v in domain:
        if v==response:
            one_hot_vector.append(1)
        else:
            one_hot_vector.append(0)
    return one_hot_vector
    
    

perturb(encode('Sales'))

[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1]

In [16]:
# As in randomized response, flipping of bits causes issues in the answers
# This is the perturbed answer (without decoding)
counts = np.sum([perturb(encode(r)) for r in adult['Occupation']], axis=0)
list(zip(domain, counts))

[('Adm-clerical', 10075),
 ('Exec-managerial', 10142),
 ('Handlers-cleaners', 8758),
 ('Prof-specialty', 10276),
 ('Other-service', 9731),
 ('Sales', 9844),
 ('Craft-repair', 10041),
 ('Transport-moving', 8963),
 ('Farming-fishing', 8598),
 ('Machine-op-inspct', 9079),
 ('Tech-support', 8555),
 ('Protective-serv', 8444),
 ('Armed-Forces', 8208),
 ('Priv-house-serv', 8273)]

In [17]:
# This is the real answer
counts = np.sum([encode(r) for r in adult['Occupation']], axis=0)
list(zip(domain, counts))

[('Adm-clerical', 3770),
 ('Exec-managerial', 4066),
 ('Handlers-cleaners', 1370),
 ('Prof-specialty', 4140),
 ('Other-service', 3295),
 ('Sales', 3650),
 ('Craft-repair', 4099),
 ('Transport-moving', 1597),
 ('Farming-fishing', 994),
 ('Machine-op-inspct', 2002),
 ('Tech-support', 928),
 ('Protective-serv', 649),
 ('Armed-Forces', 9),
 ('Priv-house-serv', 149)]

## Question 5

Implement the *aggregate* step for Optimized Unary Hashing.

In [24]:
def aggregate(responses):
    p = .75
    q = .25
    
    sums = np.sum(responses, axis=0)
    n= len(responses)
    
    answer = [su-n*q/(p-q) for su in sums]
    return answer

responses = [perturb(encode(r)) for r in adult['Occupation']]
counts = aggregate(responses)
list(zip(domain, counts))

[('Adm-clerical', -6162.5),
 ('Exec-managerial', -6173.5),
 ('Handlers-cleaners', -7351.5),
 ('Prof-specialty', -5979.5),
 ('Other-service', -6332.5),
 ('Sales', -6303.5),
 ('Craft-repair', -6191.5),
 ('Transport-moving', -7278.5),
 ('Farming-fishing', -7600.5),
 ('Machine-op-inspct', -7012.5),
 ('Tech-support', -7659.5),
 ('Protective-serv', -7786.5),
 ('Armed-Forces', -8144.5),
 ('Priv-house-serv', -8020.5)]

In [25]:
p = .6 # How likley to keep 1 as 1
q = .9 # How likley to flip 0 to 1
# large p -> better accuracy, worse privacy
# small q -> better accuracy, worse privacy

np.log((p*(1-q)) / ((1-p)*q))

-1.7917594692280554