In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
from utils import *

# INSURANCE DATASET

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize, minmax_scale

def one_hot(df, cols):
    """
    Append one-hot indicator columns for the requested categorical columns.

    df: pandas DataFrame
    param: cols an iterable of column names to encode
    return: a DataFrame containing every column of df (the encoded source
            columns are kept; dropping them is left to the caller) followed
            by the indicator columns produced by pd.get_dummies, named with
            the source column as prefix. All levels are kept
            (drop_first=False). When cols is empty, df itself is returned.
    """
    encoded = df
    for column in cols:
        # Indicators are built from the running frame and appended on the right.
        indicators = pd.get_dummies(encoded[column], prefix=column, drop_first=False)
        encoded = pd.concat([encoded, indicators], axis=1)
    return encoded
def numeric_scaler(df, cols):
    '''
    Min-max scale the given numeric columns on a copy of the frame.

    df: pandas dataframe (left unmodified)
    cols: (list of strings) column names of the numeric variables to scale

    return: a new dataframe in which the columns listed in cols have been
            replaced by the output of sklearn's MinMaxScaler.fit_transform
            (fitted on this same data); all other columns are copied as-is.
            If cols is empty, an unscaled copy of df is returned.
    '''
    df_new = df.copy()
    # Nothing to scale: skip the scaler (it cannot be fitted on an empty
    # column selection) and hand back the untouched copy.
    if len(cols) == 0:
        return df_new
    mmscaler = MinMaxScaler()
    df_new[cols] = mmscaler.fit_transform(df_new[cols])
    return df_new

In [4]:
# Load the raw insurance records; the relative path is resolved against the
# notebook's current working directory.
df_medical = pd.read_csv('insurance.csv')

In [5]:
# Preview the first rows to check the column names and value types.
df_medical.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
# Column roles: numeric columns (including the target 'charges') are min-max
# scaled, categorical columns are one-hot encoded.
numeric_all = ['age', 'bmi', 'children', 'charges']
cat_all = ['sex', 'smoker', 'region']

df_medical_mm = numeric_scaler(df_medical, numeric_all)

# Encode the categoricals, discard the raw categorical columns that were just
# encoded, and multiply by 1.0 so boolean indicators become 1.0 / 0.0 floats.
df_medical_mm_oh = one_hot(df_medical_mm, cat_all).drop(columns=cat_all) * 1.0

In [7]:
# Target: the 'charges' column. It needs no further minmax_scale here because
# 'charges' is listed in numeric_all and was already scaled by numeric_scaler.
y = df_medical_mm_oh['charges'].to_numpy()

# Features: every remaining column, with each row rescaled to unit L2 norm.
X = normalize(df_medical_mm_oh.drop(columns='charges').to_numpy(), norm='l2')

In [8]:
# n = number of records (rows of X), d = number of features (columns of X).
n, d = X.shape
lamb = 1.0  # regularisation strength handed to the ridge helpers

# Per-record privacy budgets: the first n//2 records get epsilon = 0.1 and all
# remaining records get epsilon = 1. Using n - n//2 for the second group keeps
# len(epsilons) == n when n is odd (the extra record lands in the epsilon = 1
# group); for even n the array is identical to the original half/half split.
epsilons = np.array([0.1] * (n // 2) + [1] * (n - n // 2))

### Personalized privacy

In [9]:
# Each record is weighted by its share of the total privacy budget.
tot_epsilon = epsilons.sum()
weights_pp = epsilons / tot_epsilon  # weights used in the ridge regression for personalized privacy

# Exact (non-private) weighted ridge solution and the objective value it attains.
sol_exact_ridge_pp = weighted_rls_solution(weights_pp, X, y, lamb)
print("pluggin exact soln back into weighted ridge", evaluate_weighted_rls_objective(sol_exact_ridge_pp, weights_pp, X, y, lamb))

beta_pp = compute_beta(lamb, tot_epsilon)
print("beta for pp", beta_pp)

# Draw the private estimator `runs` times and score every draw under both the
# uniform (unweighted) objective and the budget-weighted objective.
runs = 1000
weights_uniform = np.ones(n) / n
unweighted_erm = []
weighted_erm = []
for _ in range(runs):
    theta_hat_pp = compute_private_estimator(sol_exact_ridge_pp, beta_pp, d)
    unweighted_erm.append(evaluate_weighted_rls_objective(theta_hat_pp, weights_uniform, X, y, lamb))
    weighted_erm.append(evaluate_weighted_rls_objective(theta_hat_pp, weights_pp, X, y, lamb))

# Mean and standard deviation across the repeated draws.
print("unweighted_erm_using_privateestimator", np.mean(unweighted_erm), np.std(unweighted_erm))  # WE care about low values here!
print("weighted_erm_using_privateestimator", np.mean(weighted_erm), np.std(weighted_erm))

pluggin exact soln back into weighted ridge 0.06143817955856497
beta for pp 183.975
unweighted_erm_using_privateestimator 0.06655792595723549 0.002727000097420593
weighted_erm_using_privateestimator 0.06573730981382604 0.0027242980407818527


### Non-personalized privacy

In [10]:
# Baseline without personalization: every record is treated as if it had the
# smallest budget, and all records are weighted equally.
tot_epsilon = epsilons.min() * n
weights_npp = np.ones(n) / n

# Exact (non-private) unweighted ridge solution and the objective value it attains.
sol_exact_ridge_npp = weighted_rls_solution(weights_npp, X, y, lamb)
print("pluggin exact soln back into unweighted ridge", evaluate_weighted_rls_objective(sol_exact_ridge_npp, weights_npp, X, y, lamb))

beta_npp = compute_beta(lamb, tot_epsilon)
print("beta for not", beta_npp)

# Draw the private estimator `runs` times and score every draw. Both lists use
# uniform weights here (weights_npp is itself uniform), so they hold the same values.
runs = 1000
weights_uniform = np.ones(n) / n
unweighted_erm = []
weighted_erm = []
for _ in range(runs):
    theta_hat_npp = compute_private_estimator(sol_exact_ridge_npp, beta_npp, d)
    unweighted_erm.append(evaluate_weighted_rls_objective(theta_hat_npp, weights_uniform, X, y, lamb))
    weighted_erm.append(evaluate_weighted_rls_objective(theta_hat_npp, weights_npp, X, y, lamb))

# Mean and standard deviation across the repeated draws.
print("unweighted_erm_using_privateestimator", np.mean(unweighted_erm), np.std(unweighted_erm))  # WE care about low values here!
print("weighted_erm_using_privateestimator", np.mean(weighted_erm), np.std(weighted_erm))

pluggin exact soln back into unweighted ridge 0.06224927666537697
beta for not 33.45
unweighted_erm_using_privateestimator 0.18849493792526198 0.07568791464765202
weighted_erm_using_privateestimator 0.18849493792526198 0.07568791464765202
