# CS211: Data Privacy Final Project
## By, Isabelle Gagnon

In [21]:
# Load the data and libraries
import pandas as pd
import numpy as np
import random
from scipy import stats
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were removed in a later release, so use whichever name this
# installation actually provides (and keep the default style if neither exists).
_WHITEGRID_STYLE = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    None,
)
if _WHITEGRID_STYLE is not None:
    plt.style.use(_WHITEGRID_STYLE)

#Columns of the dataset
# NOTE(review): this constant is not referenced anywhere in the visible code,
# and the DataFrame preview in this notebook shows far fewer than 581 columns.
# Confirm what 581 is meant to count.
NUMBER_COLUMNS = 581

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` plus one draw of zero-centred Laplace noise.

    The noise scale is `sensitivity / epsilon`.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` plus one draw of zero-mean Gaussian noise.

    The standard deviation is
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.
    """
    log_term = np.log(1.25 / delta)
    sigma = sensitivity * np.sqrt(2 * log_term) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Return a list with independent Gaussian noise added to each entry of `vec`.

    Every entry receives its own draw with standard deviation
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.
    """
    def draw_noise():
        # The scale is computed per draw, so an empty `vec` never evaluates it.
        sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
        return np.random.normal(loc=0, scale=sigma)

    noisy_entries = []
    for entry in vec:
        noisy_entries.append(entry + draw_noise())
    return noisy_entries

def pct_error(orig, priv):
    """Return the percent error of `priv` relative to the reference `orig`.

    The result is `|orig - priv| / |orig| * 100` and is therefore never
    negative. A reference value of zero is not handled specially.
    """
    # Divide by the magnitude of the reference: dividing by a signed `orig`
    # would report a negative error whenever the true value is negative.
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Location of the loan payments CSV; pandas reads it directly from the URL.
LOAN_DATA_URL = 'https://github.com/isabellegagnon/CS211-Final-Project/raw/main/LoanPayments.csv'
loan_data = pd.read_csv(LOAN_DATA_URL)

In [22]:
# Bare expression so the notebook renders a preview of the loaded DataFrame
loan_data

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female
...,...,...,...,...,...,...,...,...,...,...,...
495,xqd20160496,COLLECTION_PAIDOFF,1000,30,9/12/2016,10/11/2016,10/14/2016 19:08,3.0,28,High School or Below,male
496,xqd20160497,COLLECTION_PAIDOFF,1000,15,9/12/2016,9/26/2016,10/10/2016 20:02,14.0,26,High School or Below,male
497,xqd20160498,COLLECTION_PAIDOFF,800,15,9/12/2016,9/26/2016,9/29/2016 11:49,3.0,30,college,male
498,xqd20160499,COLLECTION_PAIDOFF,1000,30,9/12/2016,11/10/2016,11/11/2016 22:40,1.0,38,college,female


## Epsilon 

In [23]:
# Overall privacy budget; it is divided by QUERIES_TOTAL before any query runs
epsilon = 2.0

## Strategy

In [24]:
#Above threshold from textbook
def above_threshold(queries, df, T, epsilon):
    """Return the index of the first query whose noisy answer reaches a noisy threshold.

    The threshold `T` is perturbed once with Laplace noise of scale
    `2 / epsilon`; each query answer `q(df)` gets fresh Laplace noise of scale
    `4 / epsilon`. Queries are tried in order and the scan stops at the first
    one whose noisy answer is at least the noisy threshold.

    Returns -1 when no query qualifies (including when `queries` is empty).
    """
    threshold_scale = 2 / epsilon
    answer_scale = 4 / epsilon

    noisy_threshold = T + np.random.laplace(loc=0, scale=threshold_scale)

    for position, query in enumerate(queries):
        # Draw the per-query noise before evaluating the query itself.
        answer_noise = np.random.laplace(loc=0, scale=answer_scale)
        noisy_answer = query(df) + answer_noise
        if noisy_answer >= noisy_threshold:
            return position

    return -1

#Use Sparse Vector Technique from textbook
def calc_average(df, epsilon):
    """Return a noisy average of `df` as a noisy clipped sum over a noisy count.

    `epsilon` is split into three equal shares:

    1. `above_threshold` picks an upper clipping bound from the candidates
       1, 6, 11, ... below 150000. The query for bound `b` is the sum clipped
       to [0, b] minus the sum clipped to [0, b + 1], compared against a
       threshold of 0; this difference is zero once `b` is at least as large
       as every value. If `above_threshold` returns -1, the last candidate
       is used.
    2. `len(df)` is released through `laplace_mech` with sensitivity 1.
    3. The sum clipped to [0, bound] is released through `laplace_mech` with
       sensitivity equal to the chosen bound.
    """
    share = epsilon / 3

    candidate_bounds = range(1, 150000, 5)

    def clipped_total(values, upper):
        return values.clip(lower=0, upper=upper).sum()

    def growth_query(upper):
        # Binds `upper` per query: how much the clipped sum changes from upper to upper + 1.
        def query(values):
            return clipped_total(values, upper) - clipped_total(values, upper + 1)
        return query

    bound_queries = [growth_query(upper) for upper in candidate_bounds]
    chosen_position = above_threshold(bound_queries, df, 0, share)
    clip_bound = candidate_bounds[chosen_position]

    noisy_count = laplace_mech(len(df), 1, share)
    noisy_sum = laplace_mech(clipped_total(df, clip_bound), clip_bound, share)

    return noisy_sum / noisy_count

## Epsilon Differentially Private Averages

In [25]:
def averages(df, epsilon):
    """Return `(dp_avg, avg, error)` for one numeric column.

    `dp_avg` is the result of `calc_average(df, epsilon)`, `avg` is the
    non-private average, and `error` is `pct_error(avg, dp_avg)`.
    """
    # calc_average sums with pandas (missing values are skipped) and divides
    # by len(df). Use the same convention here: the builtin sum() propagates
    # NaN, which made the "actual" average NaN for any column with missing
    # entries and the reported error meaningless.
    avg = df.sum() / len(df)
    dp_avg = calc_average(df, epsilon)
    error = pct_error(avg, dp_avg)
    return (dp_avg, avg, error)

## Majority Count

In [26]:
#from textbook
def score(option, data, parameter):
    """Return how many rows of `data` have `option` in column `parameter`."""
    matching_rows = data.loc[data[parameter] == option]
    return matching_rows.shape[0]

def report_noisy_max(data, parameter, R, score, sensitivity, epsilon):
    """Return the element of `R` whose Laplace-noised score is largest.

    Each candidate `r` in `R` is scored with `score(r, data, parameter)`, every
    score is perturbed independently via `laplace_mech(..., sensitivity,
    epsilon)`, and the candidate at the first position holding the maximum
    noisy score is returned.
    """
    true_scores = [score(candidate, data, parameter) for candidate in R]

    perturbed = []
    for value in true_scores:
        perturbed.append(laplace_mech(value, sensitivity, epsilon))

    best = np.max(perturbed)
    winner_position = perturbed.index(best)
    return R[winner_position]

def max_count_calc(data, parameter, epsilon, option):
    """Return `(dp_choice, true_mode)` for the column `parameter`.

    `true_mode` is the first entry of the column's `mode()`; `dp_choice` is the
    value picked from `option` by `report_noisy_max`, using `score` with
    sensitivity 1 and the given `epsilon`.
    """
    column_modes = data[parameter].mode()
    true_mode = column_modes[0]
    dp_choice = report_noisy_max(data, parameter, option, score, 1, epsilon)
    return (dp_choice, true_mode)

## Unique Values

In [27]:
def uniqueness(df, parameter):
    """Return the distinct values found in column `parameter` of `df`."""
    column = df[parameter]
    distinct_values = column.unique()
    return distinct_values

# Candidate answer sets for the noisy-max queries: the distinct values present
# in each categorical column of the loaded data.
gender_options = uniqueness(loan_data, 'Gender')
education_options = uniqueness(loan_data, 'education')
status = uniqueness(loan_data, 'loan_status')
# Education labels grouped as college-level; 'Bechalor' is spelled the way it
# appears in the dataset, so it must stay as-is to match rows.
# NOTE(review): college_educated is not used in the visible code.
college_educated = ['college', 'Bechalor', 'Master or Above']
# loan_status values used to select past-due loans with isin()
past_due = ['COLLECTION', 'COLLECTION_PAIDOFF']

## Implementation

In [33]:
# Number of equal shares the overall privacy budget is split into.
# NOTE(review): ten mechanism calls are made below. The two age-split averages
# run on disjoint row subsets (age <= 25 vs. age > 25), so the total stays at
# epsilon only if they are counted as one share via parallel composition.
# Confirm that this is the intended accounting.
QUERIES_TOTAL = 9

# Per-query budget handed to every call below
epsilon_frac = epsilon/QUERIES_TOTAL

#average statistics
# Each averages() call returns a (dp_avg, avg, error) tuple.
dp_avg_age = averages(loan_data['age'], epsilon_frac)
# NOTE(review): dp_avg_pastDue is computed but is not included in the `average`
# list printed below; the preview shows missing past_due_days values.
dp_avg_pastDue = averages(loan_data['past_due_days'], epsilon_frac)
dp_avg_principal = averages(loan_data['Principal'], epsilon_frac)

#max statistics
# Each max_count_calc() call returns (DP choice, non-private mode).
dp_gender = max_count_calc(loan_data, 'Gender', epsilon_frac, gender_options) 
dp_education = max_count_calc(loan_data, 'education', epsilon_frac, education_options)
dp_loan_status = max_count_calc(loan_data, 'loan_status', epsilon_frac, status) 


#most common education & gender for loan statuses
# Restricted to rows whose loan_status is one of the values in past_due.
dp_education_loan_status = max_count_calc(loan_data.loc[loan_data['loan_status'].isin(past_due)], 
                                          'education', epsilon_frac, education_options) 
dp_gender_loan_status = max_count_calc(loan_data.loc[loan_data['loan_status'].isin(past_due)], 
                                          'Gender', epsilon_frac, gender_options) 


#average principals by age
dp_age_principal_under = averages(loan_data[loan_data['age'] <= 25]['Principal'], epsilon_frac)
dp_age_principal_over = averages(loan_data[loan_data['age'] > 25]['Principal'], epsilon_frac)


#Print results and dp results
counts = [dp_gender, dp_education, dp_loan_status, dp_education_loan_status, dp_gender_loan_status]
average = [dp_avg_age, dp_avg_principal, dp_age_principal_over, dp_age_principal_under]

# Index 1 of each tuple is the non-private mode value, index 0 the DP selection.
for count in counts:
    print(f'Actual Max Count: {count[1]}, DP Max Count: {count[0]}')

# Tuple layout is (dp_avg, avg, error), so index 1 is the non-private average.
for avg in average:
    print(f'Actual Average: {avg[1]}, DP Average: {avg[0]}, Percent Error: {avg[2]}')   

Actual Max Count: male, DP Max Count: male
Actual Max Count: college, DP Max Count: college
Actual Max Count: PAIDOFF, DP Max Count: PAIDOFF
Actual Max Count: college, DP Max Count: High School or Below
Actual Max Count: male, DP Max Count: male
Actual Average: 31.116, DP Average: 33.245821160015154, Percent Error: 6.844778120629755
Actual Average: 943.2, DP Average: 953.6602510344601, Percent Error: 1.1090172852480962
Actual Average: 943.2941176470588, DP Average: 941.68522822113, Percent Error: 0.1705607398402877
Actual Average: 942.6666666666666, DP Average: 12.530149146153542, Percent Error: 98.67077625748017
