# CS211: Data Privacy
## In-Class Exercise, 11/18/2020

In [None]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so prefer the new name and fall back to the
# legacy one on older installs.
plt.style.use('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` plus one draw of Laplace noise.

    The noise is centred at 0 with scale `sensitivity / epsilon` and is drawn
    from NumPy's global random state.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` plus one draw of zero-mean Gaussian noise.

    The standard deviation is
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`; the sample comes
    from NumPy's global random state.
    """
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Return a list with independent Gaussian noise added to each element of `vec`.

    Every element gets its own draw from a zero-mean normal distribution with
    standard deviation `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`,
    taken from NumPy's global random state in iteration order.

    The result is always a plain Python list, one entry per element of `vec`.
    """
    # The noise scale depends only on the privacy parameters, so compute it
    # once instead of once per element.
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    return [v + np.random.normal(loc=0, scale=sigma) for v in vec]

def pct_error(orig, priv):
    """Return the absolute error of `priv` relative to `orig`, as a percentage.

    Computes `|orig - priv| / |orig| * 100`. The denominator uses the
    magnitude of `orig` so the result is non-negative even when the true
    value is negative.

    A zero `orig` is not guarded against: the division is left to NumPy.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Course dataset, downloaded from GitHub each time this cell runs; the cells
# below read it through the module-level name `adult`.
adult = pd.read_csv(
    filepath_or_buffer='https://github.com/jnear/cs211-data-privacy/raw/master/homework/adult_with_pii.csv',
)

In [None]:
def range_query_synth(syn_rep, a, b):
    """Answer a range query by adding up `syn_rep[i]` for `a <= i < b`.

    `syn_rep` is indexed once per integer in the half-open range, in
    ascending order. An empty range yields 0.
    """
    selected = (syn_rep[idx] for idx in range(a, b))
    running_total = 0
    for value in selected:
        running_total += value
    return running_total

def age_hist(epsilon):
    """Return a list of 100 noisy counts, one per age 0..99.

    Entry `i` is the number of rows in `adult` whose 'Age' equals `i`,
    passed through `laplace_mech` with sensitivity 1 and the given `epsilon`.
    """
    age_column = adult['Age']
    true_counts = []
    for age in range(0, 100):
        true_counts.append(len(adult[age_column == age]))
    # All exact counts are gathered first; noise is then drawn in age order.
    return [laplace_mech(count, 1, epsilon) for count in true_counts]

## Accuracy of Synthetic Representation

In [None]:
epsilon = 1.0
# Build the noisy per-age histogram once; range queries are answered from it.
syn_rep = age_hist(epsilon)

# Query 1: count with 40 <= age < 43
q1_mask = (adult['Age'] >= 40) & (adult['Age'] < 43)
true_answer = len(adult[q1_mask])
q1_synth_answer = range_query_synth(syn_rep, 40, 43)
print('Q1 percent error, synthetic representation:',
      pct_error(true_answer, q1_synth_answer))

# Baseline: answer the same query directly with a single Laplace draw.
lap_answer = laplace_mech(true_answer, 1, epsilon)
print('Q1 percent error, Laplace mechanism:',
      pct_error(true_answer, lap_answer))

In [None]:
# Query 2: count with 25 <= age < 53
q2_mask = (adult['Age'] >= 25) & (adult['Age'] < 53)
true_answer = len(adult[q2_mask])
q2_synth_answer = range_query_synth(syn_rep, 25, 53)
print('Q2 percent error, synthetic representation:',
      pct_error(true_answer, q2_synth_answer))

# Baseline: answer the same query directly with a single Laplace draw.
lap_answer = laplace_mech(true_answer, 1, epsilon)
print('Q2 percent error, Laplace mechanism:',
      pct_error(true_answer, lap_answer))

## Question 1

Which approach gives more accurate answers: the Laplace mechanism or the synthetic representation? Under what conditions is the synthetic representation good, and under what conditions is it not as good?

YOUR ANSWER HERE

## Question 2

Implement a synthetic representation for *two dimensions*: `Age` and `Occupation`.

In [None]:
# The two histogram dimensions: every integer age 0..99, and every distinct
# non-missing value found in the 'Occupation' column.
ages = range(100)
occupations = (
    adult['Occupation']
    .dropna()
    .unique()
)

def age_occupation_hist(epsilon):
    """Exercise placeholder for the two-dimensional (Age, Occupation) synthetic representation.

    Not implemented yet: every call raises NotImplementedError.
    NOTE(review): the return shape is not specified in this file, and
    `epsilon` is presumably the privacy parameter as in `age_hist` — confirm
    before relying on either.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Trial run with epsilon = 1.0; raises NotImplementedError for as long as the
# function body is still the placeholder.
age_occupation_hist(1.0)

## Question 3

Using the synthetic representation from Question 2, implement functions to calculate the *two-way marginal distribution* and generate synthetic data samples.

In [None]:
def age_occupation_marginal(epsilon):
    """Exercise placeholder for the two-way (Age, Occupation) marginal distribution.

    Not implemented yet: every call raises NotImplementedError.
    NOTE(review): the return type is not specified in this file; whatever it
    is gets passed as the `marginal` argument of `gen_samples` — confirm the
    intended shape.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

def gen_samples(n, marginal):
    """Exercise placeholder for generating synthetic data samples from a marginal.

    Not implemented yet: every call raises NotImplementedError.
    NOTE(review): presumably `n` is the number of samples to generate and
    `marginal` is the result of `age_occupation_marginal` — confirm.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Trial run: build the marginal with epsilon = 1.0, then request 15 samples.
# Raises NotImplementedError while the functions are still placeholders.
marginal = age_occupation_marginal(1.0)
gen_samples(15, marginal)