In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import defaultdict

# Some useful utilities

def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity / epsilon)

def gaussian_mech(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon)

def gaussian_mech_vec(v, sensitivity, epsilon, delta):
    return v + np.random.normal(loc=0, scale=sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon, size=len(v))

def pct_error(orig, priv):
    return np.abs(orig - priv)/orig * 100.0

def z_clip(xs, b):
    return [min(x, b) for x in xs]

def g_clip(v):
    n = np.linalg.norm(v, ord=2)
    if n > 1:
        return v / n
    else:
        return v

# Setup
Here we want to load our dataset, preprocessing, and split into train and test for our model

## Step 1: load the data

In [2]:
default_of_credit_clients = pd.read_csv("default_of_credit_card_clients.csv")

cols = default_of_credit_clients.iloc[0].tolist()
cols[-1] = "default"

df = default_of_credit_clients[1:].copy()
df.columns = cols

df = df.apply(pd.to_numeric, errors="coerce")

## Step 2: Split train/test data

In [3]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["default"]).reset_index(drop=True)
y = df["default"].astype(int)

assert isinstance(X, pd.DataFrame)
assert isinstance(y, pd.Series)

X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

training_size = int(X.shape[0] * 0.8)

X_train = X.iloc[:training_size]
X_test = X.iloc[training_size:]

y_train = y.iloc[:training_size]
y_test = y.iloc[training_size:]

## Step 3: Convert X_train/X_test to StandardScaler

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

## Step 4: Convert to numpy

In [5]:
X_train = X_train_scaled.astype(float)
X_test = X_test_scaled.astype(float)

y_train = y_train.to_numpy().astype(float)
y_test = y_test.to_numpy().astype(float)

#fix train labels
y_train = 2 * y_train - 1
y_test = 2 * y_test - 1

# Using Scikit-Learn
This is going to be our baseline model that we want to compare against a 
differentially private gradient descent model

In [6]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=20000).fit(X_train, y_train)
model.predict(X_test)

array([ 1., -1., -1., ...,  1., -1., -1.], shape=(6000,))

In [7]:
np.sum(model.predict(X_test) == y_test)/X_test.shape[0]

np.float64(0.821)

# Model Prediction

In [8]:
theta = np.zeros(X_train.shape[1])

def predict(xi, theta, bias=0):
    label = np.sign(xi @ theta + bias)
    return label

def accuracy(theta):
    return np.sum(predict(X_test, theta) == y_test)/X_test.shape[0]

# Gradient Descent Model

In [9]:
def loss(theta, xi, yi):
    exponent = - yi * (xi.dot(theta))
    return np.log(1 + np.exp(exponent))

In [10]:
np.mean([loss(theta, x_i, y_i) for x_i, y_i in zip(X_test, y_test)])

np.float64(0.6931471805599454)

In [14]:
def logistic(x):
    return 1 / (1 + np.exp(-x))


def gradient(theta, xi, yi):
    z = yi * np.dot(xi, theta)

    if z >= 0:
        exp_neg_z = np.exp(-z)
        sigma = 1 / (1 + exp_neg_z)
    else:
        exp_z = np.exp(z)
        sigma = exp_z / (1 + exp_z)

    return -yi * xi * (1 - sigma)

def avg_grad(theta, X, y):
    grads = [gradient(theta, xi, yi) for xi, yi in zip(X, y)]
    return np.mean(grads, axis=0)

def gradient_descent(iterations):
    theta = np.zeros(X_train.shape[1])

    for _ in range(iterations):
        theta = theta - avg_grad(theta, X_train, y_train)
        print(f'Training loss: {np.mean(loss(theta, X_train, y_train))}')
        print(f'Testing loss: {np.mean(loss(theta, X_test, y_test))}\n')
    return theta

theta = gradient_descent(10)
accuracy(theta)

Training loss: 0.6583993849702446
Testing loss: 0.6567817457080667

Training loss: 0.655152203542232
Testing loss: 0.6534175479529754

Training loss: 0.6531685321073202
Testing loss: 0.6513885900781361

Training loss: 0.6518058565795691
Testing loss: 0.6499905636534513

Training loss: 0.6508191961271063
Testing loss: 0.6489932882115115

Training loss: 0.6500873136521533
Testing loss: 0.6482481660445689

Training loss: 0.6495359785460764
Testing loss: 0.6476933153126854

Training loss: 0.6491157849005481
Testing loss: 0.6472706358573627

Training loss: 0.6487923316131161
Testing loss: 0.6469489119182625

Training loss: 0.6485410940842373
Testing loss: 0.6467011637909764

Training loss: 0.648344301400021
Testing loss: 0.6465101152796263

Training loss: 0.6481889167491821
Testing loss: 0.6463619651079584

Training loss: 0.6480652762264731
Testing loss: 0.6462469625753956

Training loss: 0.6479661510255315
Testing loss: 0.6461575501060008

Training loss: 0.6478860896568733
Testing loss: 0.

np.float64(0.5786666666666667)