#  [COMPAS] Exponentiated Gradient Reduction

Exponentiated gradient reduction is an in-processing technique that reduces fair classification to a sequence of cost-sensitive classification problems, returning a randomized classifier with the lowest empirical error subject to 
fair classification constraints. The code for exponentiated gradient reduction wraps the source class 
`fairlearn.reductions.ExponentiatedGradient` available in the https://github.com/fairlearn/fairlearn library,
licensed under the MIT License, Copyright Microsoft Corporation.

This version of exponentiated gradient reduction (implemented in `aif360.algorithms`) wraps the sklearn compatible version of exponentiated gradient reduction implemented in `aif360.sklearn`. For a detailed tutorial on sklearn compatible exponentiated gradient reduction see [examples/sklearn/demo_exponentiated_gradient_reduction_sklearn.ipynb](sklearn/demo_exponentiated_gradient_reduction_sklearn.ipynb). 

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")
cwd = '../../../core'
sys.path.append(cwd)

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import AdultDataset, GermanDataset, CompasDataset, StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.metrics import ClassificationMetric
from aif360.metrics.utils import compute_boolean_conditioning_vector

from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult, load_preproc_data_compas, load_preproc_data_german

from aif360.algorithms.inprocessing.exponentiated_gradient_reduction import ExponentiatedGradientReduction

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer

from IPython.display import Markdown, display
import matplotlib.pyplot as plt


from load_compas import * 
from missing_module import * 

# Notebook display settings: None lifts pandas' limits on the number of rows,
# the number of columns and the per-cell text width, so frames are shown in
# full; display.width=None leaves the overall output width to pandas'
# auto-detection. (The original cell set display.max_colwidth twice; the
# duplicate call was redundant and has been dropped.)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
# An infinite threshold stops numpy from summarising large arrays with "...".
np.set_printoptions(threshold=np.inf)







### Generate Missing Data

In [None]:
# load_compas_data, balance_data and generate_missing are not defined in this
# notebook; they come from the star-imported helper modules.
X, y, x_control = load_compas_data()

# Column names given to the raw feature matrix X, in order.
feature_columns = [
    'age_cat_25 - 45',
    'age_cat_Greater than 45',
    'age_cat_Less than 25',
    'race',
    'sex',
    'priors_count',
    'c_charge_degree',
]
df = pd.DataFrame(X, columns=feature_columns)

# Quick look at the first row labels where race == 0.
idxx = df.loc[df['race'] == 0].index
print(idxx[:10])

# Re-encode the label from {-1, 1} to {0, 1} under the name used below.
y = pd.Series(y, name="two_year_recid")
negative_label = (y == -1)
y[negative_label] = 0

# Attach the label as the last column, then balance with respect to 'race'
# (exact balancing rule lives in balance_data -- see the helper module).
df = pd.concat([df, y], axis=1)
df_bal = balance_data(df, 'race', 0)
df_train, df_test = train_test_split(df_bal, test_size=0.3, random_state=0)

# Inject missing values into 'sex' and then 'priors_count', conditioned on
# 'race', using the same seed for both passes.
# NOTE(review): presumably p_ms0 / p_ms1 are the missingness probabilities for
# race == 0 / race == 1 -- confirm against generate_missing.
s = 777
df_ms = df_bal
for ms_label, p_ms0, p_ms1 in (('sex', 0.4, 0.1), ('priors_count', 0.6, 0.2)):
    df_ms = generate_missing(df_ms, c_label='race', ms_label=ms_label,
                             p_ms0=p_ms0, p_ms1=p_ms1, seed=s)


# Group definitions consumed by the AIF360 metrics in later cells.
privileged_groups = [{'race': 1}]
unprivileged_groups = [{'race': 0}]

# Per-race column means (NaNs are skipped by mean()), shown as the cell output.
df_ms.groupby(df_ms['race']).mean()

In [None]:
# Sweep the fairness tolerance eps of ExponentiatedGradientReduction. For each
# eps, 10 seeded train/test splits are evaluated and the mean / std of the
# absolute FNR difference and of the test accuracy are appended to
# fr_mean / fr_std / acc_mean / acc_std (one entry per eps, in eps_list order).
eps_list = [0.001, 0.005, 0.01, 0.02, 0.05, 0.1]
seeds = range(1, 11)

fr_mean, acc_mean, fr_std, acc_std = [], [], [], []

# The split, the imputation and the AIF360 conversion depend only on the seed,
# never on eps, so they are built once per seed and reused for every eps value
# instead of repeating the (slow) k-nn imputation for each (eps, seed) pair.
# All steps here are seeded or deterministic, so the results are unchanged.
prepared_splits = {}
for seed in seeds:
    dataset_orig_train, dataset_orig_test = train_test_split(df_ms, test_size=0.3, random_state=seed)

    # Impute only the ordinary features; the protected attribute and the label
    # are set aside and re-attached afterwards.
    dataset_orig_train_no_sens = dataset_orig_train.drop(columns=['race', 'two_year_recid'])
    dataset_orig_test_no_sens = dataset_orig_test.drop(columns=['race', 'two_year_recid'])

    ## Change the following line to get mean or k-nn results ##
    # SimpleImputer() -> mean imputation, KNNImputer() -> k-nn imputation.
    imputer = KNNImputer()

    # Fit the imputer on the training split only and apply it to the test
    # split, keeping column names and row index so the concat below aligns.
    dataset_orig_train_no_sens = pd.DataFrame(imputer.fit_transform(dataset_orig_train_no_sens),
                                              columns=dataset_orig_train_no_sens.columns,
                                              index=dataset_orig_train_no_sens.index)
    dataset_orig_test_no_sens = pd.DataFrame(imputer.transform(dataset_orig_test_no_sens),
                                             columns=dataset_orig_test_no_sens.columns,
                                             index=dataset_orig_test_no_sens.index)
    dataset_orig_train = pd.concat([dataset_orig_train_no_sens, dataset_orig_train[['race', 'two_year_recid']]], axis=1)
    dataset_orig_test = pd.concat([dataset_orig_test_no_sens, dataset_orig_test[['race', 'two_year_recid']]], axis=1)

    ### Converting to AIF360 StandardDataset objects ###
    dataset_orig_train = StandardDataset(dataset_orig_train, label_name='two_year_recid', favorable_classes=[1],
                                         protected_attribute_names=['race'], privileged_classes=[[1]])
    dataset_orig_test = StandardDataset(dataset_orig_test, label_name='two_year_recid', favorable_classes=[1],
                                        protected_attribute_names=['race'], privileged_classes=[[1]])

    prepared_splits[seed] = (dataset_orig_train, dataset_orig_test)


for eps in eps_list:
    fr_list = []
    acc_list = []

    for seed in seeds:
        dataset_orig_train, dataset_orig_test = prepared_splits[seed]

        # --- Baseline: plain decision tree, printed for reference only ---
        # [:, :-1] drops the last feature column. 'race' was concatenated as
        # the last non-label column, so this presumably removes the protected
        # attribute -- assumes StandardDataset keeps the column order.
        X_train = dataset_orig_train.features[:, :-1]
        y_train = dataset_orig_train.labels.ravel()

        lmod = DecisionTreeClassifier(max_depth=3)
        lmod.fit(X_train, y_train, sample_weight=dataset_orig_train.instance_weights)

        X_test = dataset_orig_test.features[:, :-1]
        y_test = dataset_orig_test.labels.ravel()

        y_pred = lmod.predict(X_test)

        # Copy of the test set whose labels are replaced by the predictions,
        # as required by ClassificationMetric(true_dataset, predicted_dataset).
        dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
        dataset_orig_test_pred.labels = y_pred

        cm_pred_test = ClassificationMetric(dataset_orig_test, dataset_orig_test_pred,
                                            unprivileged_groups=unprivileged_groups,
                                            privileged_groups=privileged_groups)

        display(Markdown("#### Original-Predicted testing dataset"))
        print("Difference in FNR between unprivileged and privileged groups")
        print(cm_pred_test.difference(cm_pred_test.false_negative_rate))

        print("Overall Test Accuracy ")
        print(cm_pred_test.accuracy())

        # --- Fair model: exponentiated gradient reduction around the same tree ---
        estimator = DecisionTreeClassifier(max_depth=3)

        # Reset the global numpy RNG right before fitting so every
        # (eps, seed) run of the reduction starts from the same random state.
        np.random.seed(0)
        # The constraint is on the TPR difference; since FNR = 1 - TPR, the
        # absolute FNR difference recorded below has the same magnitude.
        # drop_prot_attr=False: unlike the baseline, 'race' stays in the features.
        exp_grad_red = ExponentiatedGradientReduction(estimator=estimator, constraints="TruePositiveRateDifference",
                                                      drop_prot_attr=False, eps=eps)
        exp_grad_red.fit(dataset_orig_train)
        exp_grad_red_pred = exp_grad_red.predict(dataset_orig_test)

        cm_transf_test = ClassificationMetric(dataset_orig_test, exp_grad_red_pred,
                                              unprivileged_groups=unprivileged_groups,
                                              privileged_groups=privileged_groups)
        display(Markdown("#### Original-Transformed testing dataset"))
        print("Difference in FNR between unprivileged and privileged groups")
        fr = np.abs(cm_transf_test.difference(cm_transf_test.false_negative_rate))
        fr_list.append(fr)
        print(fr)

        print("Overall Test Accuracy ")
        acc = cm_transf_test.accuracy()
        acc_list.append(acc)
        print(acc)

    # Aggregate the 10 seeded runs for this eps (np.std is the population std).
    fr_mean.append(np.mean(fr_list))
    fr_std.append(np.std(fr_list))
    acc_mean.append(np.mean(acc_list))
    acc_std.append(np.std(acc_list))
    
    
    
    
    

In [None]:
# Fairness/accuracy trade-off: one green marker per eps value, placed at
# (mean |FNR difference|, mean accuracy), with error bars showing the standard
# deviation over the seeded runs along each axis.
plt.errorbar(
    fr_mean,
    acc_mean,
    yerr=acc_std,
    xerr=fr_std,
    fmt="o",
    color='green',
)

In [None]:
# pickle is not imported explicitly by the import cell of this notebook, so it
# is imported here to keep this cell from failing with a NameError.
import pickle

# Persist the per-eps summary statistics (one list entry per eps value).
# The "knn_" prefix matches the KNNImputer used in the experiment cell; rename
# the file when switching to mean imputation so results are not overwritten.
results = {'fr_mean': fr_mean, 'fr_std': fr_std, 'acc_mean': acc_mean, 'acc_std': acc_std}
# Plain 'wb' is enough: the file is only written, never read back here.
with open('knn_exp_grad_result.pkl', 'wb') as f:
    pickle.dump(results, f)