# [HSLS] Baseline -- Disparate Mistreatment

In [None]:
import os,sys

sys.path.append('../../../fair-classification_python3/fair_classification') # the code for fair classification is in this directory
sys.path.append("../")
cwd = '../../../core'
sys.path.append(cwd)


import numpy as np
import pandas as pd
import utils as ut
import funcs_disp_mist as fdm

from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve
from sklearn.impute import KNNImputer, SimpleImputer

from IPython.display import Markdown, display
import matplotlib.pyplot as plt


from missing_module import * 


### Open data file

In [None]:
# Load the pickled HSLS DataFrame (path is relative to the notebook's working directory).
df_ms = pd.read_pickle('pkl_data/hsls_orig.pkl')

# Name of the sensitive attribute column and the group definitions built on it.
# NOTE(review): these three names are not used by the cells visible in this
# notebook; presumably kept for parity with sibling notebooks -- confirm.
sens_attr = 'racebin'
privileged_groups = [{'racebin': 1}]
unprivileged_groups = [{'racebin': 0}]

# Recode the label column from 0 to -1 (other values are left unchanged).
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form operates on a temporary object
# and does not update df_ms when pandas Copy-on-Write is active.
df_ms['gradebin'] = df_ms['gradebin'].replace({0: -1})



## Mean Imputation Results

In [None]:

# Mean-imputation baseline: for every tau, train the disparate-mistreatment
# classifier on 10 seeded 70/30 splits and record the test accuracy and the
# absolute between-group gap of the `fair` rate on the test split.
fair = 'fnr'
tau_list = [500]
# tau_list = [0.1]

# `cons_type` code passed to fdm.train_model_disp_mist for each fairness notion.
cons_type_by_fair = {'acc': 0, 'fpr': 1, 'fnr': 2, 'eqodds': 4}
if fair not in cons_type_by_fair:
    # Fail early with a clear message instead of a NameError deep in the loop.
    raise ValueError("unsupported fairness notion %r; expected one of %s"
                     % (fair, sorted(cons_type_by_fair)))
cons_type = cons_type_by_fair[fair]

# Settings that do not change between runs.
sensitive_attrs = ['racebin']
mu = 1.2
loss_function = "logreg" # perform the experiments with logistic regression
EPS = 1e-6

# One entry per tau in tau_list (NaN when every run for that tau failed).
fr_mean, acc_mean, fr_std, acc_std = [], [], [], []

for tau in tau_list:
    fr_list = []
    acc_list = []

    for seed in range(1, 11):
        print(tau, seed)

        ################## Train-Test Split ###################
        dataset_orig_train, dataset_orig_test = train_test_split(df_ms, test_size=0.3, random_state=seed)

        ##################### Imputation ######################
        # Only the feature columns are imputed; the sensitive attribute and the
        # label are set aside and re-attached afterwards. The imputer is fitted
        # on the training split only and then applied to the test split.
        imputer = SimpleImputer()

        dataset_orig_train_no_sens = dataset_orig_train.drop(columns=['racebin', 'gradebin'])
        dataset_orig_test_no_sens = dataset_orig_test.drop(columns=['racebin', 'gradebin'])

        dataset_orig_train_no_sens = pd.DataFrame(imputer.fit_transform(dataset_orig_train_no_sens),
                                                  columns=dataset_orig_train_no_sens.columns,
                                                  index=dataset_orig_train_no_sens.index)
        dataset_orig_test_no_sens = pd.DataFrame(imputer.transform(dataset_orig_test_no_sens),
                                                 columns=dataset_orig_test_no_sens.columns,
                                                 index=dataset_orig_test_no_sens.index)

        # Column order after concat: imputed features, 'racebin', 'gradebin'.
        dataset_orig_train = pd.concat([dataset_orig_train_no_sens, dataset_orig_train[['racebin', 'gradebin']]], axis=1)
        dataset_orig_test = pd.concat([dataset_orig_test_no_sens, dataset_orig_test[['racebin', 'gradebin']]], axis=1)

        ################### Design matrices ###################
        # drop() returns a new frame, so adding the intercept column below does
        # not write into a slice of dataset_orig_train / dataset_orig_test.
        X_train = dataset_orig_train.drop(columns=['gradebin'])
        X_train['intercept'] = np.ones(len(X_train))
        x_control_train = {'racebin': X_train['racebin'].astype(int).to_numpy()}
        X_train = np.array(X_train.drop(columns=['racebin']))
        y_train = np.array(dataset_orig_train['gradebin'])

        X_test = dataset_orig_test.drop(columns=['gradebin'])
        X_test['intercept'] = np.ones(len(X_test))
        x_control_test = {'racebin': X_test['racebin'].astype(int).to_numpy()}
        X_test = np.array(X_test.drop(columns=['racebin']))
        y_test = np.array(dataset_orig_test['gradebin'])

        ################ Disparate Mistreatment ##################
        sensitive_attrs_to_cov_thresh = {"racebin": {0:{0:0, 1:0}, 1:{0:0, 1:0}, 2:{0:0, 1:0}} } # zero covariance threshold, means try to get the fairest solution
        cons_params = {"cons_type": cons_type,
                       "tau": tau,
                       "mu": mu,
                       "sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}
        try:
            w = fdm.train_model_disp_mist(X_train, y_train, x_control_train, loss_function, EPS, cons_params)
        except Exception as err:
            # Training can fail for some (tau, seed) pairs. Skip the run rather
            # than evaluating with an undefined `w` or with weights left over
            # from a previous split. `Exception` (not a bare except) keeps
            # Ctrl-C / kernel interrupts working.
            print('training failed for tau', tau, 'seed', seed, '- skipping run:', repr(err))
            continue

        train_score, test_score, cov_all_train, cov_all_test, s_attr_to_fp_fn_train, s_attr_to_fp_fn_test = fdm.get_clf_stats(
            w, X_train, y_train, x_control_train, X_test, y_test, x_control_test, sensitive_attrs)

        # Absolute difference of the chosen rate between the two 'racebin' groups (test split).
        fr = np.abs(s_attr_to_fp_fn_test['racebin'][0][fair] - s_attr_to_fp_fn_test['racebin'][1][fair])
        fr_list.append(fr)
        acc_list.append(test_score)

    if fr_list:
        fr_mean.append(np.mean(fr_list))
        fr_std.append(np.std(fr_list))
        acc_mean.append(np.mean(acc_list))
        acc_std.append(np.std(acc_list))
    else:
        # Every run failed: keep the result lists aligned with tau_list.
        fr_mean.append(np.nan)
        fr_std.append(np.nan)
        acc_mean.append(np.nan)
        acc_std.append(np.nan)
    

In [None]:
# Show the per-tau mean of the absolute between-group rate gap collected above.
fr_mean

In [None]:
# Show the per-tau mean test accuracy collected above.
acc_mean

In [None]:
# `pickle` is used here but is not among the notebook's visible imports.
import pickle

# Persist the per-tau summary statistics of the mean-imputation experiment.
# Plain write mode is enough; the file is never read back through this handle.
with open('disp_mistrtment_result.pkl', 'wb') as f:
    pickle.dump({'fr_mean': fr_mean, 'fr_std': fr_std, 'acc_mean': acc_mean, 'acc_std': acc_std}, f)

<br/> <br/>

## KNN Imputation Results

In [None]:

# KNN-imputation baseline: for every tau, train the disparate-mistreatment
# classifier on 10 seeded 70/30 splits and record the test accuracy and the
# absolute between-group gap of the `fair` rate on the test split.
fair = 'fnr'
tau_list = [0.001, 0.01, 0.1, 1, 10, 100]

# `cons_type` code passed to fdm.train_model_disp_mist for each fairness notion.
cons_type_by_fair = {'acc': 0, 'fpr': 1, 'fnr': 2, 'eqodds': 4}
if fair not in cons_type_by_fair:
    # Fail early with a clear message instead of a NameError deep in the loop.
    raise ValueError("unsupported fairness notion %r; expected one of %s"
                     % (fair, sorted(cons_type_by_fair)))
cons_type = cons_type_by_fair[fair]

# Settings that do not change between runs.
sensitive_attrs = ['racebin']
mu = 1.2
loss_function = "logreg" # perform the experiments with logistic regression
EPS = 1e-6

# One entry per tau in tau_list (NaN when every run for that tau failed).
fr_mean, acc_mean, fr_std, acc_std = [], [], [], []

for tau in tau_list:
    fr_list = []
    acc_list = []

    for seed in range(1, 11):
        print(tau, seed)

        ################## Train-Test Split ###################
        dataset_orig_train, dataset_orig_test = train_test_split(df_ms, test_size=0.3, random_state=seed)

        ##################### Imputation ######################
        # Only the feature columns are imputed; the sensitive attribute and the
        # label are set aside and re-attached afterwards. The imputer is fitted
        # on the training split only and then applied to the test split.
        imputer = KNNImputer()

        dataset_orig_train_no_sens = dataset_orig_train.drop(columns=['racebin', 'gradebin'])
        dataset_orig_test_no_sens = dataset_orig_test.drop(columns=['racebin', 'gradebin'])

        dataset_orig_train_no_sens = pd.DataFrame(imputer.fit_transform(dataset_orig_train_no_sens),
                                                  columns=dataset_orig_train_no_sens.columns,
                                                  index=dataset_orig_train_no_sens.index)
        dataset_orig_test_no_sens = pd.DataFrame(imputer.transform(dataset_orig_test_no_sens),
                                                 columns=dataset_orig_test_no_sens.columns,
                                                 index=dataset_orig_test_no_sens.index)

        # Column order after concat: imputed features, 'racebin', 'gradebin'.
        dataset_orig_train = pd.concat([dataset_orig_train_no_sens, dataset_orig_train[['racebin', 'gradebin']]], axis=1)
        dataset_orig_test = pd.concat([dataset_orig_test_no_sens, dataset_orig_test[['racebin', 'gradebin']]], axis=1)

        ################### Design matrices ###################
        # drop() returns a new frame, so adding the intercept column below does
        # not write into a slice of dataset_orig_train / dataset_orig_test.
        X_train = dataset_orig_train.drop(columns=['gradebin'])
        X_train['intercept'] = np.ones(len(X_train))
        x_control_train = {'racebin': X_train['racebin'].astype(int).to_numpy()}
        X_train = np.array(X_train.drop(columns=['racebin']))
        y_train = np.array(dataset_orig_train['gradebin'])

        print(X_train.shape)

        X_test = dataset_orig_test.drop(columns=['gradebin'])
        X_test['intercept'] = np.ones(len(X_test))
        x_control_test = {'racebin': X_test['racebin'].astype(int).to_numpy()}
        X_test = np.array(X_test.drop(columns=['racebin']))
        y_test = np.array(dataset_orig_test['gradebin'])

        ################ Disparate Mistreatment ##################
        sensitive_attrs_to_cov_thresh = {"racebin": {0:{0:0, 1:0}, 1:{0:0, 1:0}, 2:{0:0, 1:0}} } # zero covariance threshold, means try to get the fairest solution
        cons_params = {"cons_type": cons_type,
                       "tau": tau,
                       "mu": mu,
                       "sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}
        try:
            w = fdm.train_model_disp_mist(X_train, y_train, x_control_train, loss_function, EPS, cons_params)
        except Exception as err:
            # Training can fail for some (tau, seed) pairs. Skip the run rather
            # than evaluating with an undefined `w` or with weights left over
            # from a previous split. `Exception` (not a bare except) keeps
            # Ctrl-C / kernel interrupts working.
            print('training failed for tau', tau, 'seed', seed, '- skipping run:', repr(err))
            continue

        train_score, test_score, cov_all_train, cov_all_test, s_attr_to_fp_fn_train, s_attr_to_fp_fn_test = fdm.get_clf_stats(
            w, X_train, y_train, x_control_train, X_test, y_test, x_control_test, sensitive_attrs)

        # Absolute difference of the chosen rate between the two 'racebin' groups (test split).
        fr = np.abs(s_attr_to_fp_fn_test['racebin'][0][fair] - s_attr_to_fp_fn_test['racebin'][1][fair])
        fr_list.append(fr)
        acc_list.append(test_score)

    if fr_list:
        fr_mean.append(np.mean(fr_list))
        fr_std.append(np.std(fr_list))
        acc_mean.append(np.mean(acc_list))
        acc_std.append(np.std(acc_list))
    else:
        # Every run failed: keep the result lists aligned with tau_list.
        fr_mean.append(np.nan)
        fr_std.append(np.nan)
        acc_mean.append(np.nan)
        acc_std.append(np.nan)
    

In [None]:
# `pickle` is used here but is not among the notebook's visible imports.
import pickle

# Persist the per-tau summary statistics of the KNN-imputation experiment.
# Plain write mode is enough; the file is never read back through this handle.
with open('knn_disp_mistrtment_result.pkl', 'wb') as f:
    pickle.dump({'fr_mean': fr_mean, 'fr_std': fr_std, 'acc_mean': acc_mean, 'acc_std': acc_std}, f)