In [None]:
import numpy as np
import pandas as pd
import io
import os
import requests
import seaborn as sns
from matplotlib import pyplot as plt
import pickle
import math
import random
import copy 
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from Component.duplicator import Duplicator
from Component.reweighing import Reweighing
from Component.preprocessing import Preprocessing

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Load dataset and duplicate data

In [None]:
dataset = "AdultCensus"
feature = "@sex"
duplicator = Duplicator(category=dataset)

# (p_group, up_group) values of the protected attribute for each supported
# dataset -- presumably the privileged / unprivileged groups; confirm.
GROUPS_BY_DATASET = {
    "AdultCensus": ("Male", "Female"),
    "GermanBank": ("Male", "Female"),
    "Compas": ("Female", "Male"),
}
if dataset not in GROUPS_BY_DATASET:
    # Fail here with a clear message instead of a NameError on p_group /
    # up_group in a later cell.
    raise ValueError(
        "Unsupported dataset %r; expected one of %s"
        % (dataset, sorted(GROUPS_BY_DATASET))
    )
p_group, up_group = GROUPS_BY_DATASET[dataset]

# Read the splits before and after add_raw_duplicate(4).
# NOTE(review): if add_raw_duplicate mutates duplicator.train_data in place,
# raw_train_data and duplicate_train_data refer to the same object -- confirm.
raw_train_data, raw_test_data = duplicator.train_data, duplicator.test_data
duplicator.add_raw_duplicate(4)
duplicate_train_data, test_data = duplicator.train_data, duplicator.test_data

# Data Preprocessing

In [None]:
# A single Preprocessing instance transforms both the raw splits and the
# splits read back after duplication.
data_preprocessing = Preprocessing(dataset=dataset)

(raw_train_x, raw_train_y,
 raw_test_x, raw_test_y) = data_preprocessing.transform(
    train_data=raw_train_data,
    test_data=raw_test_data,
)

(duplicate_train_x, duplicate_train_y,
 duplicate_test_x, duplicate_test_y) = data_preprocessing.transform(
    train_data=duplicate_train_data,
    test_data=test_data,
)

In [None]:
# Preview the training data read back from the duplicator after add_raw_duplicate(4).
duplicate_train_data.head()

In [None]:
# Preview the test data read back from the duplicator.
test_data.head()

In [None]:
# Baseline accuracy of an L1-penalised logistic regression, first on the raw
# split and then on the duplicated split.
# solver='liblinear' is spelled out because the L1 penalty is not supported by
# scikit-learn's current default solver (lbfgs); liblinear was the default in
# older releases, so results there are unchanged.
model = LogisticRegression(penalty='l1', solver='liblinear')
model.fit(raw_train_x, raw_train_y)
print("Training acc : %0.3f" % model.score(raw_train_x, raw_train_y))
print("Test acc : %0.3f" % model.score(raw_test_x, raw_test_y))

# `model` is rebound here; later cells see the model fitted on the duplicated data.
model = LogisticRegression(penalty='l1', solver='liblinear')
model.fit(duplicate_train_x, duplicate_train_y)
print("Training acc : %0.3f" % model.score(duplicate_train_x, duplicate_train_y))
print("Test acc : %0.3f" % model.score(duplicate_test_x, duplicate_test_y))

# Data Poisoning

In [None]:
# NOTE(review): `Poisoning` is not imported in the import cell at the top of
# this notebook (only Duplicator, Reweighing and Preprocessing are), so this
# cell raises NameError on a fresh kernel until the import is added.
# Wrap the duplicated training features/labels in the poisoning helper.
poisoned = Poisoning(data=duplicate_train_x, label=duplicate_train_y)
# presumably injects adversarial samples amounting to 10% of the data -- confirm units.
poisoned.add_adversarial(percentage=10)
# Read back the (data, label) pair used by the "Case N" cells below.
poisoned_data, poisoned_label = poisoned.return_data()

# Measure accuracy and demographic parity on original data

In [None]:
# Reference run on the (un-poisoned) duplicated data: accuracy first, then the
# fairness measure reported by the Reweighing helper.
# solver='liblinear' is spelled out because the L1 penalty is not supported by
# scikit-learn's current default solver (lbfgs).
model = LogisticRegression(penalty='l1', solver='liblinear')
model.fit(duplicate_train_x, duplicate_train_y)
print("Training acc : %0.3f" % model.score(duplicate_train_x, duplicate_train_y))
print("Test acc : %0.3f" % model.score(duplicate_test_x, duplicate_test_y))

# `reweighing` is reused (via change_data) by show_result in the cells below.
reweighing = Reweighing(duplicate_train_x, duplicate_train_y, feature, p_group, up_group)
original_dp = reweighing.fairness_measure(model)
print("Demographic parity : %0.3f" % original_dp)

# Measure accuracy and demographic parity on poisoned data

In [None]:
def show_result(x_train, y_train, x_test, y_test, fair=False, weight=None):
    """Fit a logistic regression on the given training set and print its metrics.

    Prints training accuracy, test accuracy and the value returned by
    ``reweighing.fairness_measure`` for the model fitted inside this function.

    Parameters
    ----------
    x_train, y_train : training features and labels the model is fitted on.
    x_test, y_test : held-out features and labels used for the test accuracy.
    fair : when true, sample weights come from ``reweighing.calculate_weight()``.
    weight : explicit per-sample weights, used only when ``fair`` is false.
        When neither is given every sample gets weight 1.

    Notes
    -----
    Relies on the notebook-level ``reweighing`` object and repoints it at
    ``(x_train, y_train)`` through ``change_data``. A later cell in this
    notebook defines a function that is also named ``reweighing``; once that
    cell has run, this function no longer works until the object is rebuilt.
    """
    # liblinear is named explicitly: the L1 penalty is not supported by
    # scikit-learn's current default solver (lbfgs).
    mitigated_model = LogisticRegression(penalty='l1', solver='liblinear')
    reweighing.change_data(x_train, y_train)

    if fair:
        sample_weight = reweighing.calculate_weight()
    elif weight is not None:
        sample_weight = weight
    else:
        sample_weight = np.ones(len(x_train))
    mitigated_model.fit(x_train, y_train, sample_weight=sample_weight)

    # Report the model that was just fitted, not the notebook-level `model`.
    print("Training acc : %0.3f" % mitigated_model.score(x_train, y_train))
    print("Test acc : %0.3f" % mitigated_model.score(x_test, y_test))

    demo_parity = reweighing.fairness_measure(mitigated_model)
    print("Demographic parity : %0.3f" % demo_parity)

Case 1 : Data Cleaning

In [None]:
# NOTE(review): `cleaning` is only defined in a later cell of this notebook,
# so that cell has to be executed before this one.
cleaned_data, cleaned_label = cleaning(poisoned_data, poisoned_label)
show_result(
    x_train=cleaned_data,
    y_train=cleaned_label,
    x_test=duplicate_test_x,
    y_test=duplicate_test_y,
)

Case 2 : Data Sanitization

In [None]:
# NOTE(review): `sanitization` is only defined in a later cell of this
# notebook, so that cell has to be executed before this one.
sanitized_data, sanitized_label = sanitization(poisoned_data, poisoned_label)
show_result(
    x_train=sanitized_data,
    y_train=sanitized_label,
    x_test=duplicate_test_x,
    y_test=duplicate_test_y,
)

Case 3 : Unfairness Mitigation

In [None]:
# Train directly on the poisoned data, with reweighing-based sample weights.
show_result(poisoned_data, poisoned_label, duplicate_test_x, duplicate_test_y, fair=True)

Case 4 : Data Cleaning -> Data Sanitization -> Unfairness Mitigation

In [None]:
# Pipeline order: cleaning, then sanitization, then reweighing (fair=True).
# NOTE(review): `cleaning` and `sanitization` are only defined in a later cell
# of this notebook, so that cell has to be executed before this one.
cleaned_data, cleaned_label = cleaning(poisoned_data, poisoned_label)
sanitized_data, sanitized_label = sanitization(cleaned_data, cleaned_label)
show_result(
    x_train=sanitized_data,
    y_train=sanitized_label,
    x_test=duplicate_test_x,
    y_test=duplicate_test_y,
    fair=True,
)

Case 5 : Unfairness Mitigation -> Data Sanitization -> Data Cleaning

In [None]:
# Pipeline order: compute fairness weights on the poisoned data first, then
# clean and sanitize, then train with the precomputed weights.
reweighing.change_data(poisoned_data, poisoned_label)
sample_weight = reweighing.calculate_weight()

cleaned_data, cleaned_label = cleaning(poisoned_data, poisoned_label)
sanitized_data, sanitized_label = sanitization(cleaned_data, cleaned_label)

# NOTE(review): the weights were computed on the poisoned rows; they only line
# up with sanitized_data while cleaning/sanitization keep the rows unchanged.
# Evaluate on the same held-out split as the other cases. `x_test` / `y_test`
# are only defined in a later cell and come from a different preprocessing path.
show_result(sanitized_data, sanitized_label, duplicate_test_x, duplicate_test_y, weight=sample_weight)

Case 6 : MLClean

In [None]:
def data_preprocessing(data, train):
    """Split an Adult Census frame into a model-ready feature matrix and labels.

    The "Target" column becomes the label: 0 for '<=50K' / '<=50K.', 1 for
    anything else. Integer columns are standardised with the module-level
    ``scalar`` (fitted when ``train == 1``, otherwise only applied) and object
    columns are one-hot encoded with ``pd.get_dummies``.

    Note: defining this function rebinds the name ``data_preprocessing``, which
    an earlier cell assigned to a ``Preprocessing`` instance.

    Returns
    -------
    (x_data, y_data) : the feature DataFrame and the 0/1 label Series.
    """
    low_income_labels = ('<=50K', '<=50K.')
    y_data = data["Target"].apply(
        lambda value: 0 if value in low_income_labels else 1
    )
    features = data.drop(columns='Target')

    numeric_part = features.select_dtypes(include="int")
    categorical_part = features.select_dtypes(include='object')

    # Fit the scaler on the training split only; reuse its statistics elsewhere.
    scale = scalar.fit_transform if train == 1 else scalar.transform
    numeric_scaled = pd.DataFrame(scale(numeric_part), columns=numeric_part.columns)
    categorical_encoded = pd.get_dummies(categorical_part)

    x_data = pd.concat([numeric_scaled, categorical_encoded], axis=1)
    return x_data, y_data

names = [
        "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"]

# The separator is a regular expression (a comma with optional surrounding
# spaces). Regex separators are only handled by pandas' python parser, so the
# engine is named explicitly instead of relying on the silent fallback.
train_data = pd.read_csv('Dataset/AdultCensus/train.data', names=names,
             sep=' *, *', na_values='?', engine='python')
# skiprows=1 skips the first line of the test file.
test_data  = pd.read_csv('Dataset/AdultCensus/test.data', names=names,
             sep=' *, *', skiprows=1, na_values='?', engine='python')

# Drop two unused columns and every row with a missing value, then renumber
# the rows from 0. Chained so no frame is mutated in place.
train_data = (
    train_data
    .drop(['fnlwgt', 'Education'], axis=1)
    .dropna()
    .reset_index(drop=True)
)
test_data = (
    test_data
    .drop(['fnlwgt', 'Education'], axis=1)
    .dropna()
    .reset_index(drop=True)
)

# `scalar` is read as a global by data_preprocessing: fitted on the training
# split (train=1) and only applied to the test split (train=0).
scalar = StandardScaler()
x_train, y_train = data_preprocessing(train_data, 1)
x_test, y_test = data_preprocessing(test_data, 0)

# One-hot columns present in train but absent in test are added as all-zero
# columns, then the test columns are put in the training order.
missing_cols = set(x_train.columns) - set(x_test.columns)
for column in missing_cols:
    print(column)
    x_test[column] = 0
x_test = x_test[x_train.columns]

In [None]:
# Only the last expression of a cell is rendered, so the preview has to be
# displayed explicitly (`display` is provided by the IPython kernel).
display(train_data.head())
train_data.columns

In [None]:
# Baseline model on the locally preprocessed Adult Census data.
# solver='liblinear' is spelled out because the L1 penalty is not supported by
# scikit-learn's current default solver (lbfgs).
model = LogisticRegression(random_state=0, penalty='l1', solver='liblinear')
model.fit(x_train, y_train)
print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

In [None]:
# Index labels of the positive (1) and negative (0) rows in each split,
# with the size of each group printed as it is computed.
test_index1 = y_train[y_train == 1].index.tolist()
print(len(test_index1))
test_index2 = y_train[y_train == 0].index.tolist()
print(len(test_index2))

test_index3 = y_test[y_test == 1].index.tolist()
print(len(test_index3))
test_index4 = y_test[y_test == 0].index.tolist()
print(len(test_index4))

In [None]:
def data_poisoing(data):
    """Placeholder poisoning step: hands `data` back untouched."""
    unchanged = data
    return unchanged

def data_duplicate(data):
    """Placeholder duplication step: hands `data` back untouched."""
    unchanged = data
    return unchanged

def cleaning(data, label=None):
    """Placeholder cleaning step: returns its input unchanged.

    The "Case N" cells call ``cleaning(data, label)`` and unpack two values,
    so an optional ``label`` is accepted.

    Parameters
    ----------
    data : the samples to clean.
    label : optional labels that belong to ``data``.

    Returns
    -------
    ``data`` when ``label`` is omitted (the original single-argument
    behaviour), otherwise the tuple ``(data, label)``.
    """
    if label is None:
        return data
    return data, label

def sanitization(data, label=None):
    """Placeholder sanitization step: returns its input unchanged.

    The "Case N" cells call ``sanitization(data, label)`` and unpack two
    values, so an optional ``label`` is accepted.

    Parameters
    ----------
    data : the samples to sanitize.
    label : optional labels that belong to ``data``.

    Returns
    -------
    ``data`` when ``label`` is omitted (the original single-argument
    behaviour), otherwise the tuple ``(data, label)``.
    """
    if label is None:
        return data
    return data, label

def reweighing(data, label, feature, p_group, up_group):
    """Compute per-sample weights from group and label frequencies.

    Every sample falls into one (group, label) cell, where the group comes from
    the one-hot columns ``feature + "_" + p_group`` and
    ``feature + "_" + up_group`` and the label is 1 or 0. All samples in a cell
    get the weight::

        count(label) * count(group) / (n_samples * count(cell))

    Membership is resolved by row position, so ``data`` and ``label`` must be
    row-aligned, but their index does not have to be 0..n-1. Empty cells are
    skipped instead of raising ZeroDivisionError. Rows that belong to neither
    group, or whose label is neither 0 nor 1, keep weight 0.

    Note: defining this function rebinds the name ``reweighing``, which an
    earlier cell assigned to a ``Reweighing`` instance.

    Parameters
    ----------
    data : DataFrame containing the two one-hot group columns.
    label : Series or array of 0/1 labels, row-aligned with ``data``.
    feature, p_group, up_group : strings used to build the column names.

    Returns
    -------
    numpy.ndarray of length ``len(data)`` with one weight per row.
    """
    size = len(data)
    p_mask = (data[feature + "_" + p_group] == 1).values
    up_mask = (data[feature + "_" + up_group] == 1).values

    label_values = np.asarray(label)
    f_mask = label_values == 1
    uf_mask = label_values == 0

    weight = np.zeros(size)
    for group_mask in (up_mask, p_mask):
        for outcome_mask in (f_mask, uf_mask):
            cell_mask = group_mask & outcome_mask
            cell_count = int(cell_mask.sum())
            if cell_count == 0:
                # No rows to weight; also avoids dividing by zero.
                continue
            weight[cell_mask] = (
                int(outcome_mask.sum()) * int(group_mask.sum()) / (size * cell_count)
            )
    return weight

def fairness_measure(data, label, model, feature, p_group, up_group):
    """Print and return the ratio of positive-prediction rates between two groups.

    The groups are the rows where the one-hot columns
    ``feature + "_" + p_group`` and ``feature + "_" + up_group`` equal 1.
    For each group the positive rate is ``sum(predictions) / group size``.
    Rows are selected by position in the prediction array, so ``data`` does not
    need a 0..n-1 index.

    Parameters
    ----------
    data : DataFrame passed to ``model.predict``; must contain both group columns.
    label : unused; kept so existing calls keep working.
    model : object with a ``predict(data)`` method returning 0/1 predictions.
    feature, p_group, up_group : strings used to build the column names.

    Returns
    -------
    ``p_ratio / up_ratio``, the same value that is printed last. It was
    previously only printed; the return value lets callers reuse it.
    """
    prediction = np.asarray(model.predict(data))

    p_mask = (data[feature + "_" + p_group] == 1).values
    up_mask = (data[feature + "_" + up_group] == 1).values

    p_pred = prediction[p_mask]
    up_pred = prediction[up_mask]
    p_ratio = np.sum(p_pred) / len(p_pred)
    up_ratio = np.sum(up_pred) / len(up_pred)
    parity_ratio = p_ratio / up_ratio
    print(p_ratio, up_ratio, parity_ratio)
    return parity_ratio

In [None]:
# Fairness of the unweighted baseline model on the training data.
fairness_measure(x_train, y_train, model, "Sex", "Female", "Male" )
sample_weight = reweighing(x_train, y_train, "Sex", "Female", "Male")

# Refit with the reweighing sample weights and compare accuracy and fairness.
# solver='liblinear' is spelled out because the L1 penalty is not supported by
# scikit-learn's current default solver (lbfgs).
mitigated_model = LogisticRegression(random_state=0, penalty='l1', solver='liblinear')
mitigated_model.fit(x_train, y_train, sample_weight=sample_weight)
print(mitigated_model.score(x_train, y_train))
print(mitigated_model.score(x_test, y_test))

fairness_measure(x_train, y_train, mitigated_model, "Sex", "Female", "Male" )
