# Importations

In [155]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.algorithms.preprocessing import Reweighing
from aif360.algorithms.inprocessing import AdversarialDebiasing
import tensorflow as tf

# 1 - Classification

### 1.1 Load and Preprocess Data

In [156]:
# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
           "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", 
           "hours-per-week", "native-country", "income"]
data = pd.read_csv(url, header=None, names=columns, na_values=" ?", skipinitialspace=True)

# Drop rows with missing values
data.dropna(inplace=True)

# Need to copy the data as we don't want a binarized part for testing the privacy
data_copy = data.copy()

# Binarize the 'age' attribute
binarizer = Binarizer(threshold=30)
data['age'] = binarizer.fit_transform(data[['age']])

# Convert categorical variables to dummy variables
data = pd.get_dummies(data, drop_first=True)

data

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income_>50K
0,1,77516,13,2174,0,40,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,1,83311,13,0,0,13,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,1,215646,9,0,0,40,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
3,1,234721,7,0,0,40,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
4,0,338409,13,0,0,40,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,257302,12,0,0,38,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
32557,1,154374,9,0,0,40,False,False,False,True,...,False,False,False,False,False,False,True,False,False,True
32558,1,151910,9,0,0,40,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
32559,0,201490,9,0,0,20,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False


### 1.2 Train a logistic regression classifier and measure its performance

In [157]:
# Split the data into features and target variable
X = data.drop('income_>50K', axis=1)
y = data['income_>50K']

# Split the data into train, validation, and test sets (70% train, 15% validation, 15% test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Train a logistic regression classifier
classifier = LogisticRegression(max_iter=2000)
classifier.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test_scaled)

# Measure performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

performance_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1
}

print("Performance Metrics:", performance_metrics)


Performance Metrics: {'Accuracy': 0.8552712384851586, 'Precision': 0.7271750805585392, 'Recall': 0.5991150442477876, 'F1 Score': 0.6569626394953906}


# 2 - Fairness


# 3 - Privacy


### 3.1 First cross tabulation

In [158]:
# Cross-tabulation on age and sex
sensitive_crosstab = pd.crosstab(data_copy['age'], data_copy['sex'])

print("the distribution is the following: \n", sensitive_crosstab)

the distribution is the following: 
 sex  Female  Male
age              
17      186   209
18      268   282
19      356   356
20      363   390
21      329   391
..      ...   ...
85        1     2
86        1     0
87        0     1
88        1     2
90       14    29

[73 rows x 2 columns]


### 3.2 Local differential privacy

In [None]:
# making a private dataset
private_data = data_copy.copy()

#We can change the values of the epsilon to make tests on them here, the current ones are the ones
#that seemed the more coherent after testing multiple ones
epsilon_age = 5
epsilon_sex = 0.2 #so 20% of the sex values are changed

#Apply Laplace noise and round it(for age)
def laplace_noise(value, epsilon):
    return round(value + np.random.laplace(0, 1 / epsilon))

#Apply randomized response (for sex)
def randomized_response(value, epsilon):
    if np.random.rand() < epsilon_sex: #this way, the valu of epsilo_sex is the proportion of changed sex
        return 'Male' if (value == 'Female') else 'Female'
    return value

#remark: we can't take a epsilon too small for sex, else we risk blurring information on sex inequalities

#Apply local differential privacy to age and sex on dataPrivate
private_data['age'] = data_copy['age'].apply(lambda x: laplace_noise(x, epsilon_age))
private_data['sex'] = data_copy['sex'].apply(lambda x: randomized_response(x, epsilon_sex))

# Concatenate the 'age' and 'sex' columns from both original (dataCopy) and private (dataPrivate) data
comparison_df = pd.concat([data_copy[['age', 'sex']], private_data[['age', 'sex']]], axis=1)
comparison_df.columns = ['Original Age', 'Original Sex', 'Private Age', 'Private Sex']
comparison_df = comparison_df[['Original Age', 'Private Age', 'Original Sex', 'Private Sex']]

# Show the comparison to see how the private dataset differs from the original one
print(comparison_df)

       Original Age  Private Age Original Sex Private Sex
0                39          245         Male        Male
1                50         -290         Male        Male
2                38           24         Male        Male
3                53           77         Male        Male
4                28          128       Female      Female
...             ...          ...          ...         ...
32556            27           72       Female      Female
32557            40          142         Male        Male
32558            58           10       Female      Female
32559            22          -38         Male        Male
32560            52         -152       Female      Female

[32561 rows x 4 columns]


### 3.3 Cross tabulation on the private dataset

In [160]:
#Cross-tabulation for the private data
private_crosstab = pd.crosstab(private_data['age'], private_data['sex'])

print("the distribution on the private data is the following:\n", private_crosstab)

# Align both crosstabs, necessary, else it returns an empty dataset
sensitive_crosstab, private_crosstab = sensitive_crosstab.align(private_crosstab, join='outer', axis=0, fill_value=0)

#Calculate the estimation errors
#this shows the differences in distribution between the private and original dataset
comparison_crosstab = sensitive_crosstab.subtract(private_crosstab, fill_value=0)
print("Estimation error:\n", comparison_crosstab)

print("note: negative values mean there are more in the private dataset than in the original one")

the distribution on the private data is the following:
 sex    Female  Male
age                
-805        0     1
-764        0     1
-723        0     1
-705        0     1
-701        1     0
...       ...   ...
 892        0     1
 928        0     1
 1010       0     1
 1012       1     0
 1031       0     1

[1146 rows x 2 columns]
Estimation error:
 sex    Female  Male
age                
-805        0    -1
-764        0    -1
-723        0    -1
-705        0    -1
-701       -1     0
...       ...   ...
 892        0    -1
 928        0    -1
 1010       0    -1
 1012      -1     0
 1031       0    -1

[1146 rows x 2 columns]
note: negative values mean there are more in the private dataset than in the original one


### 3.4 Data splitting and classification

In [161]:
# we binarize the ages again
private_data['age'] = binarizer.fit_transform(data[['age']])

# Convert categorical variables to dummy variables
private_data = pd.get_dummies(data, drop_first=True)

#now we make a process similar to the one from (1):

# Split the data into features and target variable
X_p = private_data.drop('income_>50K', axis=1)
Y_p = private_data['income_>50K']

# Split the data into train, validation, and test sets (70% train, 15% validation, 15% test)
X_train_p, X_temp_p, Y_train_p, Y_temp_p = train_test_split(X_p, Y_p, test_size=0.3, random_state=42)
X_val_p, X_test_p, Y_val_p, Y_test_p = train_test_split(X_temp_p, Y_temp_p, test_size=0.5, random_state=42)

# Scale the data
#scaler was already defined in (1)
X_train_scaled_p = scaler.fit_transform(X_train_p)
X_val_scaled_p = scaler.transform(X_val_p)
X_test_scaled_p = scaler.transform(X_test_p)

# Train a logistic regression classifier
private_classifier = LogisticRegression(max_iter=2000)
private_classifier.fit(X_train_scaled_p, Y_train_p)


# Predict on the test set
Y_pred_p = private_classifier.predict(X_test_scaled_p)

### 3.5 Performances measuring

In [162]:
# Measure performance on the private classifier
private_accuracy = accuracy_score(Y_test_p, Y_pred_p)
private_precision = precision_score(Y_test_p, Y_pred_p)
private_recall = recall_score(Y_test_p, Y_pred_p)
private_f1 = f1_score(Y_test_p, Y_pred_p)

private_performance_metrics = {
    'Accuracy': private_accuracy,
    'Precision': private_precision,
    'Recall': private_recall,
    'F1 Score': private_f1
}

print("Performance Metrics of the original dataset:", performance_metrics)
print("Performance Metrics of the private dataset: ", private_performance_metrics)

Performance Metrics of the original dataset: {'Accuracy': 0.8552712384851586, 'Precision': 0.7271750805585392, 'Recall': 0.5991150442477876, 'F1 Score': 0.6569626394953906}
Performance Metrics of the private dataset:  {'Accuracy': 0.8552712384851586, 'Precision': 0.7271750805585392, 'Recall': 0.5991150442477876, 'F1 Score': 0.6569626394953906}


# 4 - Privacy and Fairness


# 5 - Explainability


#  6 - Explainability and LLMs 

# 7 - Free Exploration 