# [Adult] Baseline -- Exponentiated Gradient Reduction


In [None]:
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")
cwd = '../../../core'
sys.path.append(cwd)

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import AdultDataset, GermanDataset, CompasDataset, StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.metrics import ClassificationMetric
from aif360.metrics.utils import compute_boolean_conditioning_vector

from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult, load_preproc_data_compas, load_preproc_data_german

from aif360.algorithms.inprocessing.exponentiated_gradient_reduction import ExponentiatedGradientReduction

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer

from IPython.display import Markdown, display
import matplotlib.pyplot as plt


from load_adult import * 
from missing_module import * 

# Lift every pandas display limit so DataFrames are shown in full.
for _display_option in ('display.max_rows', 'display.max_columns',
                        'display.max_colwidth', 'display.width'):
    pd.set_option(_display_option, None)

# Likewise print NumPy arrays in full instead of summarising them.
np.set_printoptions(threshold=np.inf)







### Generate Missing Data

In [None]:
## Load the train / test frames (only the training frame is used in this cell) ##
df_train, df_test = load_adult()

## Balance the training data: first on 'income', then on 'gender' ##
# NOTE(review): the meaning of the third argument (0 / 1) is defined by
# balance_data in the star-imported project modules -- confirm there.
df = balance_data(balance_data(df_train, 'income', 0), 'gender', 1)

sens_attr = 'gender'
s = 42   # random seed shared by every generate_missing call below

## Inject missing values, one column at a time ##
# Each entry is (column, p_ms0, p_ms1). Presumably p_ms0 / p_ms1 are the
# missingness probabilities for sens_attr == 0 / 1 -- verify against
# generate_missing.
_missing_specs = (
    ('marital-status', 0, 0.4),
    ('hours-per-week', 0, 0.3),
    ('race', 0.2, 0.2),
)
df_ms = df
for _column, _p0, _p1 in _missing_specs:
    df_ms = generate_missing(df_ms, sens_attr, ms_label=_column, p_ms0=_p0, p_ms1=_p1, seed=s)

# Last expression of the cell: the notebook renders the summary table.
df_ms.describe()



In [None]:

# Groups in the form the AIF360 metrics take: a list of dicts mapping a
# protected-attribute name to the value that identifies the group.
privileged_groups = [{'gender': 1}]
unprivileged_groups = [{'gender': 0}]

# Values swept for the `eps` argument of ExponentiatedGradientReduction.
eps_list = [0.001, 0.005, 0.01, 0.02, 0.05, 0.1]

# One entry per epsilon: mean / standard deviation over the ten random splits.
fr_mean, acc_mean, fr_std, acc_std = [], [], [], []


for eps in eps_list:
    fr_list = []
    acc_list = []
    display(Markdown("# Epsilon ="+str(eps)))
    for seed in range(1, 11):

        # A different 70/30 train/test split for every seed.
        dataset_orig_train, dataset_orig_test = train_test_split(df_ms, test_size=0.3, random_state=seed)

        # Only the columns other than 'gender' and 'income' go through the imputer.
        dataset_orig_train_no_sens = dataset_orig_train.drop(columns=['gender', 'income'])
        dataset_orig_test_no_sens = dataset_orig_test.drop(columns=['gender', 'income'])

        ## Change the following two lines to get mean or k-nn results ##
        imputer = SimpleImputer()
#         imputer = KNNImputer()

        # Fit the imputer on the training split only, then apply the fitted
        # imputer to the test split. Index and column labels are restored
        # because the imputer returns a bare array.
        dataset_orig_train_no_sens = pd.DataFrame(imputer.fit_transform(dataset_orig_train_no_sens),
                                                  columns=dataset_orig_train_no_sens.columns,
                                                  index=dataset_orig_train_no_sens.index)
        dataset_orig_test_no_sens = pd.DataFrame(imputer.transform(dataset_orig_test_no_sens),
                                                 columns=dataset_orig_test_no_sens.columns,
                                                 index=dataset_orig_test_no_sens.index)
        # Re-attach the untouched columns; this places 'gender' and 'income'
        # AFTER all imputed columns.
        dataset_orig_train = pd.concat([dataset_orig_train_no_sens, dataset_orig_train[['gender', 'income']]], axis=1)
        dataset_orig_test = pd.concat([dataset_orig_test_no_sens, dataset_orig_test[['gender', 'income']]], axis=1)

        ### Converting to AIF360 StandardDataset objects ###
        dataset_orig_train = StandardDataset(dataset_orig_train, label_name='income', favorable_classes=[1],
                                             protected_attribute_names=['gender'], privileged_classes=[[1]])
        dataset_orig_test = StandardDataset(dataset_orig_test, label_name='income', favorable_classes=[1],
                                            protected_attribute_names=['gender'], privileged_classes=[[1]])

        # Positions of every non-protected feature, looked up by name. The
        # concat above moves 'gender' to the end of the frame, so a hard-coded
        # position/width would not reliably exclude it.
        protected_names = set(dataset_orig_train.protected_attribute_names)
        idx_wo_protected = [i for i, name in enumerate(dataset_orig_train.feature_names)
                            if name not in protected_names]

        # ---- Baseline: plain decision tree that never sees the protected attribute ----
        X_train = dataset_orig_train.features[:, idx_wo_protected]
        y_train = dataset_orig_train.labels.ravel()

        lmod = DecisionTreeClassifier(max_depth=3)
        lmod.fit(X_train, y_train, sample_weight=dataset_orig_train.instance_weights)

        X_test = dataset_orig_test.features[:, idx_wo_protected]
        y_test = dataset_orig_test.labels.ravel()

        y_pred = lmod.predict(X_test)

        # Copy of the test set whose labels are replaced by the predictions,
        # which is the pair of datasets ClassificationMetric compares.
        dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
        dataset_orig_test_pred.labels = y_pred

        cm_pred_test = ClassificationMetric(dataset_orig_test, dataset_orig_test_pred,
                                            unprivileged_groups=unprivileged_groups,
                                            privileged_groups=privileged_groups)
        acc_orig = cm_pred_test.accuracy()
        # Note: the baseline gap is reported signed, while the mitigated gap
        # further down is reported as an absolute value.
        fr_orig = cm_pred_test.difference(cm_pred_test.false_positive_rate)

        display(Markdown("#### Original-Predicted testing dataset"))
        print("Difference in FPR between unprivileged and privileged groups")
        print(fr_orig)

        print("Overall Test Accuracy ")
        print(acc_orig)

        # ---- Mitigated model: exponentiated gradient reduction around the same tree ----
        estimator = DecisionTreeClassifier(max_depth=3)

        # Reset NumPy's global RNG before every reduction fit.
        np.random.seed(0)
        exp_grad_red = ExponentiatedGradientReduction(estimator=estimator,
                                                      constraints="EqualizedOdds",
                                                      drop_prot_attr=False, eps=eps)
        exp_grad_red.fit(dataset_orig_train)
        exp_grad_red_pred = exp_grad_red.predict(dataset_orig_test)

        cm_transf_test = ClassificationMetric(dataset_orig_test, exp_grad_red_pred,
                                              unprivileged_groups=unprivileged_groups,
                                              privileged_groups=privileged_groups)

        acc = cm_transf_test.accuracy()
        acc_list.append(acc)
        fr = np.abs(cm_transf_test.difference(cm_transf_test.false_positive_rate))
        fr_list.append(fr)

        display(Markdown("#### Original-Transformed testing dataset"))
        print("Difference in FPR between unprivileged and privileged groups")
        print(fr)

        print("Overall Test Accuracy ")
        print(acc)

    # Summarise the ten splits for this epsilon.
    fr_mean.append(np.mean(fr_list))
    fr_std.append(np.std(fr_list))
    acc_mean.append(np.mean(acc_list))
    acc_std.append(np.std(acc_list))
    
    
    
    
    

In [None]:
# Write the per-epsilon summary statistics to disk as a pickled dict.
# `os` and `pickle` are imported here because the visible notebook never
# imports them explicitly.
import os
import pickle

# open() does not create missing directories, so make sure the target exists.
os.makedirs('results', exist_ok=True)

# NOTE(review): the "mean_" prefix presumably refers to the mean imputer used
# in the experiment cell; rename the file when switching to KNNImputer.
with open('results/mean_exp_grad_result.pkl', 'wb') as f:
    pickle.dump({'fr_mean': fr_mean, 'fr_std': fr_std, 'acc_mean': acc_mean, 'acc_std': acc_std}, f)