# CS 3110/5110: Data Privacy
## In-Class Exercise, week of 10/28/2024

In [None]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return ``v`` perturbed by zero-centred Laplace noise.

    The noise scale is ``sensitivity / epsilon``, so a smaller ``epsilon``
    produces a noisier answer.

    Args:
        v: Value to perturb.
        sensitivity: Numerator of the noise scale.
        epsilon: Denominator of the noise scale.

    Returns:
        ``v`` plus one sample drawn with ``np.random.laplace``.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return ``v`` perturbed by zero-mean Gaussian noise.

    The standard deviation is
    ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.

    Args:
        v: Value to perturb.
        sensitivity: Multiplier of the noise standard deviation.
        epsilon: Divisor of the noise standard deviation.
        delta: Enters the standard deviation through ``ln(1.25 / delta)``.

    Returns:
        ``v`` plus one sample drawn with ``np.random.normal``.
    """
    log_term = np.log(1.25 / delta)
    sigma = sensitivity * np.sqrt(2 * log_term) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Add independent Gaussian noise to every element of ``vec``.

    Each element receives its own draw from ``np.random.normal`` with
    standard deviation ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.

    Args:
        vec: Iterable of values to perturb.
        sensitivity: Multiplier of the noise standard deviation.
        epsilon: Divisor of the noise standard deviation.
        delta: Enters the standard deviation through ``ln(1.25 / delta)``.

    Returns:
        A Python list of noisy values, in the same order as ``vec``.
    """
    noisy_values = []
    for element in vec:
        # The scale is evaluated per element, so an empty ``vec`` never
        # evaluates it at all.
        sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
        noise = np.random.normal(loc=0, scale=sigma)
        noisy_values.append(element + noise)
    return noisy_values

def pct_error(orig, priv):
    """Return the percent error of ``priv`` relative to ``orig``.

    Computed as ``|orig - priv| / |orig| * 100``. The magnitude of ``orig``
    is used in the denominator so the result is non-negative even when the
    reference value is negative.

    Args:
        orig: Reference (true) value; scalar or NumPy array.
        priv: Value being compared against ``orig``.

    Returns:
        The percent error. When ``orig`` is zero the division follows
        NumPy's floating-point rules (``inf`` or ``nan``).
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# adult = pd.read_csv('https://github.com/jnear/cs211-data-privacy/raw/master/homework/adult_with_pii.csv')

In [None]:
# Load data files
import numpy as np
import urllib.request
import io

# Remote .npy files; presumably the feature matrix (x) and the labels (y) --
# confirm against the dataset's documentation.
url_x = 'https://github.com/jnear/cs211-data-privacy/raw/master/slides/adult_processed_x.npy'
url_y = 'https://github.com/jnear/cs211-data-privacy/raw/master/slides/adult_processed_y.npy'

# Read the whole HTTP response into an in-memory buffer, then parse the
# buffer with np.load after the response has been closed.
with urllib.request.urlopen(url_x) as url:
    f = io.BytesIO(url.read())
X = np.load(f)

# Same procedure for the second file; the names `url` and `f` are reused.
with urllib.request.urlopen(url_y) as url:
    f = io.BytesIO(url.read())
y = np.load(f)

In [None]:
# Use the first 80% of the rows for training and the remaining rows for
# testing; the rows are taken in their stored order (no shuffling).
training_size = int(X.shape[0] * 0.8)

X_train, X_test = X[:training_size], X[training_size:]
y_train, y_test = y[:training_size], y[training_size:]

print('Train and test set sizes:', len(y_train), len(y_test))

## Question 1

Using scikit-learn, train a logistic regression model on the training data loaded above.

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
def train_model():
    """Exercise stub (Question 1): train a logistic regression model.

    Per the exercise text this should use scikit-learn to fit a logistic
    regression model on the training data. The code that follows reads
    ``model.coef_`` and calls ``model.predict`` on the returned object.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Build the model; this raises NotImplementedError until train_model is filled in.
model = train_model()
# coef_[0] selects the first row of the fitted coefficient array.
print('Model coefficients:', model.coef_[0])
# Accuracy: number of test rows whose prediction equals y_test, divided by
# the number of test rows.
print('Model accuracy:', np.sum(model.predict(X_test) == y_test)/X_test.shape[0])

## Question 2

Implement the *average gradient* of the loss below.

In [None]:
# The loss function measures how good our model is. The training goal is to minimize the loss.
# This is the logistic loss function.
def loss(theta, xi, yi):
    """Return the logistic loss ``log(1 + exp(-yi * (xi . theta)))``.

    The value is computed with ``np.logaddexp(0, z)``, which equals
    ``log(exp(0) + exp(z))`` but does not overflow to ``inf`` when ``z`` is
    large (i.e. for a badly misclassified example).

    Args:
        theta: Model parameter vector.
        xi: Example (or array of examples) supporting ``xi.dot(theta)``.
        yi: Label(s) multiplied with the score; the ``yi * score`` form
            suggests labels in {-1, +1} -- confirm against the dataset.

    Returns:
        The loss, with the same shape as ``yi * xi.dot(theta)``.
    """
    exponent = -yi * (xi.dot(theta))
    # log(1 + exp(z)) == logaddexp(0, z), evaluated without forming exp(z).
    return np.logaddexp(0, exponent)

# This is the gradient of the logistic loss
# The gradient is a vector that indicates the rate of change of the loss in each direction
def gradient(theta, xi, yi):
    """Return ``-(yi * xi) / (1 + exp(yi * (xi . theta)))``.

    This is the gradient of the logistic loss with respect to ``theta``.

    Args:
        theta: Model parameter vector.
        xi: Example supporting ``xi.dot(theta)``.
        yi: Label multiplied with both the example and its score.

    Returns:
        The gradient, with the shape of ``yi * xi``.
    """
    margin = yi * (xi.dot(theta))
    numerator = -(yi * xi)
    denominator = 1 + np.exp(margin)
    return numerator / denominator

In [None]:
def avg_grad(theta, X, y):
    """Exercise stub (Question 2): average gradient of the loss.

    Per the exercise text this should return the average gradient of the
    loss; presumably the mean of ``gradient(theta, xi, yi)`` over the rows
    of ``X`` paired with the entries of ``y`` -- confirm with the exercise.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

## Question 3

Use the average gradient from above to implement a gradient descent algorithm.

In [None]:
def gradient_descent(iterations):
    """Exercise stub (Question 3): gradient descent using the average gradient.

    The caller stores the return value as ``theta`` and passes it to
    ``accuracy``, so the result is expected to be a parameter vector.

    Args:
        iterations: Presumably the number of update steps to run -- confirm.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Run with iterations=10 and keep the result; a later cell passes `theta`
# to accuracy().
theta = gradient_descent(10)
# Bare expression so the notebook displays the value as the cell output.
theta

In [None]:
# Prediction: take a model (theta) and a single example (xi) and return its predicted label
def predict(xi, theta, bias=0):
    """Return the sign of the linear score ``xi @ theta + bias``.

    Because the score is computed with ``@``, ``xi`` may be a single example
    or a 2-D array of examples (``accuracy`` passes the whole test matrix).

    Args:
        xi: Example vector, or matrix with one example per row.
        theta: Model parameter vector.
        bias: Constant added to the score before taking the sign.

    Returns:
        ``np.sign`` of the score: -1, 0 or +1 per example (0 when the score
        is exactly zero).
    """
    score = xi @ theta + bias
    return np.sign(score)

def accuracy(theta):
    """Return the fraction of test examples that ``theta`` labels correctly.

    Reads the notebook globals ``X_test`` and ``y_test`` and uses
    ``predict`` to label every test row at once.

    Args:
        theta: Model parameter vector passed to ``predict``.

    Returns:
        Number of predictions equal to ``y_test`` divided by the number of
        rows in ``X_test``.
    """
    predictions = predict(X_test, theta)
    n_correct = np.sum(predictions == y_test)
    return n_correct / X_test.shape[0]

# Bare expression so the notebook displays the test-set accuracy of `theta`.
accuracy(theta)

## Question 4

Implement a *noisy gradient descent* algorithm.

1. Calculate gradients for each example
2. Clip the gradients to have bounded $L_2$ norm
3. Sum the clipped gradients
4. Use the Gaussian mechanism to add noise to the sum of gradients

In [None]:
def L2_clip(v, b):
    """Scale ``v`` down so that its L2 norm does not exceed ``b``.

    Args:
        v: Vector to clip.
        b: Upper bound on the norm.

    Returns:
        ``v`` itself (the same object) when its norm is not greater than
        ``b``; otherwise ``b * (v / norm)``, which points in the same
        direction as ``v``.
    """
    magnitude = np.linalg.norm(v, ord=2)
    return b * (v / magnitude) if magnitude > b else v

def noisy_gradient_descent(iterations, epsilon, delta):
    """Exercise stub (Question 4): noisy gradient descent.

    The exercise text lists the intended steps: compute a gradient for each
    example, clip each gradient to bounded L2 norm, sum the clipped
    gradients, and add noise to the sum with the Gaussian mechanism.
    Callers pass the return value to ``accuracy``, so the result is expected
    to be a parameter vector.

    Args:
        iterations: Presumably the number of update steps -- confirm.
        epsilon: Privacy parameter for the Gaussian mechanism.
        delta: Privacy parameter for the Gaussian mechanism.

    NOTE(review): the exercise text does not say whether ``epsilon`` and
    ``delta`` apply per iteration or in total; Question 7 asks separately
    for a variant with a total budget.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Run with iterations=10, epsilon=1.0, delta=1e-5; this rebinds the
# notebook-level `theta`.
theta = noisy_gradient_descent(10, 1.0, 1e-5)
print('Final accuracy:', accuracy(theta))

In [None]:
# TEST CASE

# With a very small epsilon (0.001) the accuracy must stay below 0.76.
# NOTE(review): presumably the result is randomized, so both bounds are
# statistical rather than exact -- confirm.
assert accuracy(noisy_gradient_descent(5, 0.001, 1e-5)) < 0.76
# With epsilon = 1.0 the accuracy must exceed 0.70.
assert accuracy(noisy_gradient_descent(5, 1.0, 1e-5)) > 0.70

## Question 5

What is the *total privacy cost* of the noisy gradient descent algorithm above, and why? Argue informally that the algorithm satisfies this privacy cost. Use sequential composition.

YOUR ANSWER HERE

## Question 6

Repeat the above, but using advanced composition.

YOUR ANSWER HERE

## Question 7

Implement a version of noisy gradient descent that satisfies a *total* of $(\epsilon, \delta)$-differential privacy. Use sequential composition.

In [None]:
# Exercise placeholder (Question 7): a noisy gradient descent variant whose
# total privacy cost is (epsilon, delta), using sequential composition.
# The cell raises until it is replaced with an implementation.
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# TEST CASE

# Same thresholds as the Question 4 test.
# NOTE(review): presumably meant to exercise the Question 7 implementation,
# which would need to rebind `noisy_gradient_descent` -- confirm.
assert accuracy(noisy_gradient_descent(5, 0.001, 1e-5)) < 0.76
assert accuracy(noisy_gradient_descent(5, 1.0, 1e-5)) > 0.70