In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
from utils import *

NORMAL DIST -- SYNTHETIC DATASET

Generating the synthetic data, privacy specification

In [2]:
from collections import Counter

from sklearn.preprocessing import normalize, minmax_scale
from sklearn.model_selection import train_test_split

# Fix the seed of NumPy's legacy global RNG so that `theta` (and anything else
# drawing from np.random.*) is identical after Restart Kernel -> Run All.
# NOTE(review): generate_linear_data and the private-noise helpers live in utils;
# they presumably draw from the same global RNG -- confirm, otherwise seed them too.
SEED = 21
np.random.seed(SEED)

N = 1000 # synthetic dataset size
d = 10 # dimensionality

theta = np.random.uniform(0, 10, size=d) # ground-truth coefficients
lamb = 0.5 # norm penalizer parameter for ridge
X, y = generate_linear_data(n = N, theta = theta, sigma=0.1)

# Our theory is for all features within the L2 unit ball, and y's in [0,1]
X = normalize(X, norm='l2') # each row is L2 normalized
y = minmax_scale(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = SEED)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
N_train, N_test = len(X_train), len(X_test)

# Privacy level (epsilon) of each training agent: three groups split at the
# one-third and two-thirds marks of the training set.
first_cut = N_train // 3
second_cut = 2 * N_train // 3
epsilons = np.zeros(N_train)
epsilons[:first_cut] = 0.1
epsilons[first_cut:second_cut] = 0.5
epsilons[second_cut:] = 1.0
print(Counter(epsilons)) # how many agents ended up at each privacy level

(750, 10) (250, 10) (750,) (250,)
Counter({1.0: 500, 0.1: 250})


In [3]:
# Sanity check: epsilons must have one entry per training row.
(epsilons.shape, X_train.shape, N_train, N_test)

((750,), (750, 10), 750, 250)

### PERSONALIZED PRIVACY (our reweighting)

In [4]:
# Personalized privacy: each training point is weighted by its share of the
# total epsilon budget, so the weights sum to 1.
tot_epsilon = epsilons.sum()
weights_pp = epsilons / tot_epsilon

# Exact (non-private) weighted ridge solution on the training set.
sol_exact_ridge_pp = weighted_rls_solution(weights_pp, X_train, y_train, lamb)
beta_pp = compute_beta(lamb, tot_epsilon)
print("beta for pp", beta_pp)

# Repeat the private release `runs` times to average over the randomness of the
# DP noise; each run is scored with the *unweighted* objective on train and test.
runs = 100
uniform_weight_train = np.full(N_train, 1.0 / N_train)
uniform_weight_test = np.full(N_test, 1.0 / N_test)

losses = []
for _ in range(runs):
    # exact weighted solution + fresh noise
    theta_hat_pp = compute_private_estimator(sol_exact_ridge_pp, beta_pp)
    losses.append((
        evaluate_weighted_rls_objective(theta_hat_pp, uniform_weight_train, X_train, y_train, lamb),
        evaluate_weighted_rls_objective(theta_hat_pp, uniform_weight_test, X_test, y_test, lamb),
    ))
unweighted_train = [train_loss for train_loss, _ in losses]
unweighted_test = [test_loss for _, test_loss in losses]

print("unweighted_train_usingour_pp", np.mean(unweighted_train), np.std(unweighted_train))
print("unweighted_test_usingour_pp", np.mean(unweighted_test), np.std(unweighted_test))

beta for pp 54.36553006146873
unweighted_train_usingour_pp 0.14335377539478197 0.015552564697300625
unweighted_test_usingour_pp 0.14304419657907377 0.015541359198146325


### *Not* PERSONALIZED PRIVACY, using our framework

  Epsilon for all agents set to min of epsilons

In [5]:
# Non-personalized baseline: every agent is treated as if it had the most
# restrictive (smallest) epsilon, so the total budget is min(epsilon) * N_train
# and the ridge problem uses uniform weights.
tot_epsilon = epsilons.min() * N_train
uniform_weight_train = np.ones(N_train) / N_train
uniform_weight_test = np.ones(N_test) / N_test

# Exact (non-private) unweighted ridge solution on the training set.
soln_ridge = weighted_rls_solution(uniform_weight_train, X_train, y_train, lamb)
beta = compute_beta(lamb, tot_epsilon)
print("beta for not", beta)

# Repeat the private release `runs` times to average over the noise draw.
runs = 100
unweighted_train = []
unweighted_test = []
for _ in range(runs):
    theta_hat = compute_private_estimator(soln_ridge, beta)
    unweighted_train.append(evaluate_weighted_rls_objective(theta_hat, uniform_weight_train, X_train, y_train, lamb))
    unweighted_test.append(evaluate_weighted_rls_objective(theta_hat, uniform_weight_test, X_test, y_test, lamb))

# Labels say "not_pp": these are the baseline's numbers, not the personalized
# estimator's (the previous labels were copied from the personalized cell).
print("unweighted_train_not_pp", np.mean(unweighted_train), np.std(unweighted_train))
print("unweighted_test_not_pp", np.mean(unweighted_test), np.std(unweighted_test))

beta for not 7.766504294495533
unweighted_train_usingour_pp 1.1799538033701387 0.7125864035430834
unweighted_test_usingour_pp 1.1791270047252367 0.7117552043429053


# Personalized Privacy using sampling mechanism from Jorgensen

- First, subsample the dataset points based on the epsilon provided by each individual (and a global threshold t),
- then run DP with privacy level t on the sampled data.



In [6]:
# Confirm the three arrays line up row-for-row, then draw one sampling mask
# using the largest epsilon as the threshold.
print(*(arr.shape for arr in (X_train, y_train, epsilons)))
mask = dataset_mask_jorgensen(epsilons, epsilons.max())

(750, 10) (750,) (750,)


In [9]:
# Jorgensen-style baseline: subsample the training set according to each agent's
# epsilon and a global threshold, then release a private ridge estimate computed
# on the sample with every sampled point treated as having privacy level `thresh`.
thresh = epsilons.max() # global threshold used in jorgensen sampling
runs = 100
uniform_weight_train = np.ones(N_train) / N_train
uniform_weight_test = np.ones(N_test) / N_test

unweighted_train = []
unweighted_test = []
for _ in range(runs):
    # Fresh sample every run; mask has one entry per training row.
    mask = dataset_mask_jorgensen(epsilons, thresh)
    keep = mask.astype(bool) # convert once, reuse for X and y
    X_samp = X_train[keep]
    y_samp = y_train[keep]
    N_samp = len(y_samp)
    unif_weight_samp = np.ones(N_samp) / N_samp
    tot_epsilon = thresh * N_samp # global threshold is the privacy level of each sampled point
    # DP with the global threshold on the sampled data, using our sensitivity calculations
    theta_bar = weighted_rls_solution(unif_weight_samp, X_samp, y_samp, lamb) # unweighted soln on the sample
    beta = compute_beta(lamb, tot_epsilon)
    theta_hat = compute_private_estimator(theta_bar, beta)
    # Score on the full (unsampled) train and test sets with uniform weights.
    unweighted_train.append(evaluate_weighted_rls_objective(theta_hat, uniform_weight_train, X_train, y_train, lamb))
    unweighted_test.append(evaluate_weighted_rls_objective(theta_hat, uniform_weight_test, X_test, y_test, lamb))

# Both numbers are unweighted losses (train, then test); we care about low values here.
print("unweighted_train_jorgensen", np.mean(unweighted_train), np.std(unweighted_train))
print("unweighted_test_jorgensen", np.mean(unweighted_test), np.std(unweighted_test))


unweighted_erm_using_privateestimator 0.1444019479448102 0.014186907704786984
weighted_erm_using_privateestimator 0.1439455257338352 0.01429883656338262


In [None]:
# NOTE(review): unexecuted scratch cell. `lamda` and `n` are not assigned in any
# visible cell (the notebook uses `lamb` and `N`); unless `from utils import *`
# provides them this raises NameError -- confirm, rename, or delete the cell.
lamda, n, d