# CS211: Data Privacy
## Homework 10

In [None]:
# Load the data and libraries
import pandas as pd
import numpy as np
import random
from scipy import stats
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer ship the old names, so prefer the new name when
# this installation offers it and fall back to the legacy name otherwise.
plt.style.use('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` plus one draw of zero-centered Laplace noise.

    The noise scale is `sensitivity / epsilon`. A single draw is made, so if
    `v` is an array the same noise value is added to every element.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def laplace_mech_vec(vec, sensitivity, epsilon):
    """Return a list with independent Laplace noise added to each entry of `vec`.

    Every entry gets its own zero-centered draw with scale
    `sensitivity / epsilon`. The result is always a plain Python list.
    """
    noisy_entries = []
    for entry in vec:
        # One fresh draw per entry, in input order.
        draw = np.random.laplace(loc=0, scale=sensitivity / epsilon)
        noisy_entries.append(entry + draw)
    return noisy_entries

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` plus one draw of zero-mean Gaussian noise.

    The standard deviation of the noise is
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.
    """
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Return a list with independent Gaussian noise added to each entry of `vec`.

    Every entry gets its own zero-mean draw whose standard deviation is
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`. The result is
    always a plain Python list.
    """
    noisy_entries = []
    for entry in vec:
        # One fresh draw per entry, in input order.
        sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
        noisy_entries.append(entry + np.random.normal(loc=0, scale=sigma))
    return noisy_entries

def pct_error(orig, priv):
    """Return the percent error of `priv` relative to the true value `orig`.

    Computed as `|orig - priv| / |orig| * 100`. Works on scalars and,
    element-wise, on NumPy arrays.

    The denominator is the magnitude of `orig`, so the result is
    non-negative even when the true value is negative. `orig == 0` is not
    special-cased: the division by zero is left to NumPy/Python.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Read the course's `adult_with_pii.csv` file from GitHub into the `adult`
# DataFrame that the cells below query (requires network access).
adult = pd.read_csv('https://github.com/jnear/cs211-data-privacy/raw/master/homework/adult_with_pii.csv')

## Question 1 (20 points)

Implement a function to generate synthetic data for a single column `col` of the Adult dataset. You should use a private one-way marginal. Your function should take four arguments:

- *col*: the name of the column to generate synthetic data for
- *domain*: a list of possible values occurring in the column (the column's domain)
- *n*: the number of synthetic data samples to generate
- *epsilon*: the privacy parameter to use

Your function should satisfy $\epsilon$-differential privacy.

*Hint*: follow the exercise from 11/15/2021.

In [None]:
# Value sets ("domains") handed to the synthetic-data generators and tests below.
age_domain = range(1,100)  # the integers 1 through 99
occupation_domain = adult['Occupation'].dropna().unique()  # distinct non-missing values present in the data
education_num_domain = adult['Education-Num'].dropna().unique()  # distinct non-missing values present in the data
capital_gain_domain = range(0, 100000, 1000)  # 0, 1000, 2000, ..., 99000

def gen_data_one_column(col, domain, n, epsilon):
    """Generate synthetic data for one column of the Adult dataset.

    Assignment stub: the body is intentionally left for the student. The
    intended contract, per the assignment text, is:

    - col: name of the column to generate synthetic data for.
    - domain: list of possible values occurring in the column.
    - n: number of synthetic samples to generate.
    - epsilon: privacy parameter; the function should satisfy
      epsilon-differential privacy using a private one-way marginal.

    The cells that call this function pass the result to `plt.hist` and wrap
    it in `pd.Series`, so it should return a sequence of `n` values drawn
    from `domain`.

    Raises:
        NotImplementedError: always, until the stub is filled in.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Smoke test: draw 30000 synthetic ages with epsilon = 1.0 and plot them as a
# 40-bin histogram.
data = gen_data_one_column('Age', age_domain, 30000, 1.0)
# NOTE: the trailing semicolon presumably keeps the notebook from echoing
# plt.hist's return value under the plot.
plt.hist(data, bins=40);

In [None]:
# TEST CASE

def run_one_test(col, domain):
    """Check synthetic data for `col` against the real per-value counts.

    Generates `len(adult)` synthetic values with epsilon = 1.0, counts how
    often each value of `domain` occurs in the real and in the synthetic
    column, prints the L1 distance between the two count vectors, and asserts
    that the distance is strictly between 0 and 5000.
    """
    synthetic = pd.Series(gen_data_one_column(col, domain, len(adult), 1.0))

    # Per-value occurrence counts, in domain order, for both datasets.
    real_column = adult[col]
    real_counts = np.array([int((real_column == value).sum()) for value in domain])
    synthetic_counts = np.array([int((synthetic == value).sum()) for value in domain])

    dist = np.linalg.norm(real_counts - synthetic_counts, ord=1)
    print('Column:', col, '-- L1 distance:', dist)

    # A distance of exactly 0 would mean no noise was added; 5000 or more
    # means the synthetic histogram is too far from the real one.
    assert 0 < dist < 5000

# Run the single-column check on each column/domain pair, in this order.
for column_name, column_domain in (
    ('Age', age_domain),
    ('Occupation', occupation_domain),
    ('Education-Num', education_num_domain),
    ('Capital Gain', capital_gain_domain),
):
    run_one_test(column_name, column_domain)

## Question 2 (20 points)

Implement a function to generate synthetic data for two columns `col1` and `col2` of the Adult dataset. You should use *two private one-way marginals*.

Your function should satisfy $\epsilon$-differential privacy.

*Hint*: you can call `gen_data_one_column` to generate data for a single column

*Hint*: you can use `pd.DataFrame(zip(data1, data2), columns=[col1, col2])` to create a dataframe using two lists called `data1` and `data2`

In [None]:
def gen_data_two_column(col1, domain1, col2, domain2, n, epsilon):
    """Generate synthetic data for two columns of the Adult dataset.

    Assignment stub: the body is intentionally left for the student. Per the
    assignment text, the implementation should use two private one-way
    marginals and satisfy epsilon-differential privacy overall.

    The assignment does not spell out the parameters; presumably
    `col1`/`domain1` and `col2`/`domain2` are each column's name and value
    domain, and `n`/`epsilon` mean the same as in `gen_data_one_column`
    (confirm against the course material).

    The test cell calls `.corr()` on the result and indexes it by column
    name, so the function should return a DataFrame with columns
    `[col1, col2]`, as the assignment's hint suggests.

    Raises:
        NotImplementedError: always, until the stub is filled in.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Quick preview: generate 10 synthetic (Age, Occupation) rows with epsilon = 1.0.
# The value is not stored; as the cell's last expression it is presumably
# displayed by the notebook.
gen_data_two_column('Age', age_domain, 'Occupation', occupation_domain, 10, 1.0)

In [None]:
# TEST CASE

# Generate 30000 synthetic (Education-Num, Capital Gain) rows and compare the
# correlation between the two columns with the correlation in the real data.
data = gen_data_two_column('Education-Num', education_num_domain, 'Capital Gain', capital_gain_domain, 30000, 1.0)
# `.corr()['Education-Num']` is a Series labelled by column name. Pick its
# second entry (the cross-column correlation) with `.iloc[1]`: a bare `[1]`
# on a string-labelled Series relies on a positional fallback that pandas
# has deprecated.
correlation = data.corr()['Education-Num'].iloc[1]
print('Synthetic data, correlation coefficient:', correlation)
print('Real data, correlation coefficient:', adult[['Education-Num', 'Capital Gain']].corr()['Education-Num'].iloc[1])
# The two columns are expected to be (nearly) uncorrelated in the synthetic data.
assert correlation < 0.01

## Question 3 (10 points)

What is the problem with the "two one-way marginals" approach you implemented in `gen_data_two_column`? How does the synthetic data you generated fail to model the true underlying data, and why does this happen?

YOUR ANSWER HERE

## Question 4 (10 points)

Consider the use of a *two-way marginal* for generating two-column synthetic data.

- What are the advantages of this approach?
- What are the disadvantages of this approach?

YOUR ANSWER HERE

## Question 5 (10 points)

In 3-5 sentences each, answer the following:

- Describe a method for constructing synthetic data with $m$ columns
- What are the advantages of your approach?
- What are the disadvantages of your approach?

YOUR ANSWER HERE