# [Adult] Baseline -- Disparate Mistreatment DCCP (Zafar et al.) 

In [None]:
import os,sys

# Extend the module search path so the project-local modules imported below
# (utils, funcs_disp_mist, load_adult, missing_module) can be resolved.
# These are relative paths, so they resolve against the directory the
# notebook/kernel is started from.
sys.path.append('../../../fair-classification_python3/fair_classification') # the code for fair classification is in this directory
sys.path.append("../")
# NOTE: despite its name, `cwd` is not the current working directory; it holds
# the relative path of the `core` directory that is added to the search path.
cwd = '../../../core'
sys.path.append(cwd)


import numpy as np
import pandas as pd
import utils as ut
import funcs_disp_mist as fdm

from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve
from sklearn.impute import KNNImputer, SimpleImputer

from IPython.display import Markdown, display
import matplotlib.pyplot as plt


from load_adult import * 
from missing_module import * 


In [None]:
## Loading Data ##
# `load_adult` returns a pair of frames; only the first one is used in this cell.
df_train, df_test = load_adult()

## Balancing the Data ##
# NOTE(review): presumably balances the frame with respect to the given
# column/value pair -- confirm against `balance_data`.
df = balance_data(df_train, 'income', 0)
df = balance_data(df, 'gender', 1)

sens_attr = 'gender'
s = 42   # random seed

## Generate Missing Data in Training Set ##
# Missing values are injected one column at a time, each call working on the
# output of the previous one and reusing the same seed.
# NOTE(review): p_ms0 / p_ms1 look like per-group missing probabilities keyed
# by the sensitive attribute -- confirm in `generate_missing`.
df_ms = generate_missing(df, sens_attr, ms_label='marital-status', p_ms0=0, p_ms1=0.4, seed=s)
df_ms = generate_missing(df_ms, sens_attr, ms_label='hours-per-week', p_ms0=0, p_ms1=0.3, seed=s)
df_ms = generate_missing(df_ms, sens_attr, ms_label='race', p_ms0=0.2, p_ms1=0.2, seed=s)

## Changing label 0 to -1 to use Zafar's method ##
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form operates on an intermediate Series, which
# pandas warns about and which does not update `df_ms` under Copy-on-Write.
df_ms['income'] = df_ms['income'].replace({0: -1})

# Last expression of the cell: the notebook renders the summary statistics.
df_ms.describe()

In [None]:

# Fairness notion under study. It selects the constraint type passed to
# funcs_disp_mist and is also the key used to read the per-group rate from the
# test statistics when computing the fairness gap.
fair = 'fpr'
fr_mean, acc_mean, fr_std, acc_std = [], [], [], []
# Values swept for the "tau" entry of the constraint parameters.
tau_list = [0.01, 5, 50, 500, 5000]

# Fairness notion -> `cons_type` code handed to funcs_disp_mist.
CONS_TYPE_BY_FAIRNESS = {'acc': 0, 'fpr': 1, 'fnr': 2, 'eqodds': 4}
if fair not in CONS_TYPE_BY_FAIRNESS:
    # Fail before any training starts instead of running with an undefined or
    # stale `cons_type` left over from an earlier execution of this cell.
    raise ValueError(
        "Unknown fairness notion %r; expected one of %s"
        % (fair, sorted(CONS_TYPE_BY_FAIRNESS))
    )
cons_type = CONS_TYPE_BY_FAIRNESS[fair]
# NOTE(review): the fairness gap below indexes the test statistics with `fair`;
# confirm that key exists for 'acc' and 'eqodds' before switching to them.

# Settings that are identical for every (tau, seed) run.
sensitive_attrs = [sens_attr]
mu = 1.2
loss_function = "logreg" # perform the experiments with logistic regression
EPS = 1e-6


def _split_features_labels(frame, sens_col):
    """Split an imputed frame into the arrays passed to funcs_disp_mist.

    The last column of `frame` is taken as the label vector and every other
    column as a feature. A constant `intercept` column of ones is appended, and
    the sensitive column is removed from the feature matrix and returned
    separately, cast to int.

    Returns:
        (X, x_control, y): feature matrix, ``{sens_col: int array}``, labels.
    """
    # Copy the slice so adding the intercept column never writes into (or
    # warns about writing into) the frame it was taken from.
    features = frame.iloc[:, :-1].copy()
    features['intercept'] = np.ones(len(features))
    x_control = {sens_col: features[sens_col].astype(int).to_numpy()}
    X = np.array(features.drop(columns=[sens_col]))
    y = np.array(frame.iloc[:, -1])
    return X, x_control, y


for tau in tau_list:
    fr_list = []
    acc_list = []

    # Ten repetitions per tau; the seed only controls the train/test split.
    for seed in range(1, 11):

        ################## Train-Test Split ###################
        dataset_orig_train, dataset_orig_test = train_test_split(df_ms, test_size=0.3, random_state=seed)

        ########################################################

        ##################### Imputation ######################

        ## Change the following two lines to get mean or k-nn results ##
#         imputer = SimpleImputer()
        imputer = KNNImputer()
        # The imputer is fit on the training split only and then applied to both splits.
        # NOTE(review): the frames passed here still contain the label column
        # (the last column, used as y below), so it takes part in the
        # imputation of both splits -- confirm this is intended.
        imputer.fit(dataset_orig_train)
        df_train = pd.DataFrame(imputer.transform(dataset_orig_train), columns=dataset_orig_train.columns,
                                                  index=dataset_orig_train.index)

        df_test = pd.DataFrame(imputer.transform(dataset_orig_test), columns=dataset_orig_test.columns,
                                                  index=dataset_orig_test.index)

        ########################################################
        X_train, x_control_train, y_train = _split_features_labels(df_train, sens_attr)
        X_test, x_control_test, y_test = _split_features_labels(df_test, sens_attr)

        ##########################################################

        ################ Disparate Mistreatment ##################
        print(tau, seed)
        # Built fresh for every run so each training call receives its own dicts.
        sensitive_attrs_to_cov_thresh = {sens_attr: {0:{0:0, 1:0}, 1:{0:0, 1:0}, 2:{0:0, 1:0}} } # zero covariance threshold, means try to get the fairest solution
        cons_params = {"cons_type": cons_type,
                       "tau": tau,
                       "mu": mu,
                       "sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}

        w = fdm.train_model_disp_mist(X_train, y_train, x_control_train, loss_function, EPS, cons_params)
        train_score, test_score, cov_all_train, cov_all_test, s_attr_to_fp_fn_train, s_attr_to_fp_fn_test = fdm.get_clf_stats(w, X_train, y_train, x_control_train, X_test, y_test, x_control_test, sensitive_attrs)

        # Fairness gap: absolute difference of the chosen rate between the two
        # sensitive-attribute groups (0 and 1) on the test split.
        fr = np.abs(s_attr_to_fp_fn_test[sens_attr][0][fair] - s_attr_to_fp_fn_test[sens_attr][1][fair])
        fr_list.append(fr)
        acc_list.append(test_score)

    # Aggregate the repetitions for this tau.
    fr_mean.append(np.mean(fr_list))
    fr_std.append(np.std(fr_list))
    acc_mean.append(np.mean(acc_list))
    acc_std.append(np.std(acc_list))
    

In [None]:
# Persist the per-tau mean/std of the fairness gap and the test accuracy.
# `pickle` is imported here because no visible cell imports it explicitly.
import pickle

# Make sure the output directory exists so open() does not fail on a fresh checkout.
os.makedirs('results', exist_ok=True)

# Switch the two lines below to match the imputer selected in the experiment cell.
# with open('results/mean_disp_mistrtment_result.pkl', 'wb') as f:
with open('results/knn_disp_mistrtment_result.pkl', 'wb') as f:
    pickle.dump({'fr_mean': fr_mean, 'fr_std': fr_std, 'acc_mean': acc_mean, 'acc_std': acc_std}, f)