In [None]:
## Acknowledgements

# This project uses the following libraries:
# - diffprivlib (https://github.com/IBM/differential-privacy-library) by IBM Corporation — licensed under MIT License.

# pip install diffprivlib

In [None]:
# Load libraries
import pandas as pd
import numpy as np

# Load datset
df = pd.read_csv("../data/adult.csv")

# Find and replace null values
df[df == '?'] = np.nan

for col in ['workclass', 'occupation', 'native.country']:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [11]:
# Setting target and feature vector variables
X = df.drop(['income'], axis=1)
y = df['income']

In [12]:
# Split into training and test data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [13]:
# Encode categorical variables
from sklearn import preprocessing

categorical = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
for feature in categorical:
        le = preprocessing.LabelEncoder()
        X_train[feature] = le.fit_transform(X_train[feature])
        
        X_test[feature] = le.transform(X_test[feature])

In [14]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)


In [None]:
from diffprivlib.models import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize DP model
dp_lr = LogisticRegression(
    epsilon=0.5,  # Privacy budget (lower = more privacy, less accuracy)
    data_norm=5.0  # Norm bound for each data point (required for DP)
)

# Fit the model
dp_lr.fit(X_train, y_train)

# Predict and evaluate
y_pred = dp_lr.predict(X_test)
print("DP Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))


DP Logistic Regression Accuracy: 0.8165625959668339


In [17]:
# Compare with non-private logistic regression
from sklearn.linear_model import LogisticRegression as SkLogReg

clf = SkLogReg(max_iter=1000)
clf.fit(X_train, y_train)
print("Non-private Accuracy:", accuracy_score(y_test, clf.predict(X_test)))


Non-private Accuracy: 0.8258777766403931


In [19]:
# Compare different epsilon values
# Find the best privacy level to use
for eps in [0.1, 0.2, 0.3, 0.5, 1.0, 5.0, 10.0]:
    dp_lr = LogisticRegression(epsilon=eps, data_norm=5.0)
    dp_lr.fit(X_train, y_train)
    acc = accuracy_score(y_test, dp_lr.predict(X_test))
    print(f"Epsilon: {eps} => Accuracy: {acc:.4f}")

Epsilon: 0.1 => Accuracy: 0.7652
Epsilon: 0.2 => Accuracy: 0.8145
Epsilon: 0.3 => Accuracy: 0.8121
Epsilon: 0.5 => Accuracy: 0.8239
Epsilon: 1.0 => Accuracy: 0.8272
Epsilon: 5.0 => Accuracy: 0.8270
Epsilon: 10.0 => Accuracy: 0.8265
