# [HSLS] Exponentiated Gradient Reduction

In [None]:
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")
cwd = '../../../core'
sys.path.append(cwd)

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import AdultDataset, GermanDataset, CompasDataset, StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.metrics import ClassificationMetric
from aif360.metrics.utils import compute_boolean_conditioning_vector

from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult, load_preproc_data_compas, load_preproc_data_german

from aif360.algorithms.inprocessing.exponentiated_gradient_reduction import ExponentiatedGradientReduction

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import KNNImputer, SimpleImputer

from IPython.display import Markdown, display
import matplotlib.pyplot as plt


from missing_module import * 

# Turn off every pandas display limit (rows, columns, cell width, total width)
# so DataFrames are rendered in full in the notebook output.
for _option in ('display.max_rows', 'display.max_columns',
                'display.max_colwidth', 'display.width'):
    pd.set_option(_option, None)

# Likewise print NumPy arrays in full instead of summarising them.
np.set_printoptions(threshold=np.inf)







### Open data file

In [None]:
# Load the pickled HSLS DataFrame that the experiment cells below operate on.
df_ms = pd.read_pickle('pkl_data/hsls_orig.pkl')

# 'racebin' is the protected attribute; the group definitions below mark
# value 1 as the privileged group and value 0 as the unprivileged group.
sens_attr = 'racebin'
unprivileged_groups = [{sens_attr: 0}]
privileged_groups = [{sens_attr: 1}]

# Last expression of the cell: the notebook renders the summary statistics.
df_ms.describe()

In [None]:
# Values passed as `eps` to ExponentiatedGradientReduction, one sweep each.
eps_list = [0.001, 0.005, 0.01, 0.02, 0.05, 0.1]

# One entry per epsilon, in eps_list order: mean / std over the seeds of the
# post-reduction |FNR difference| and of the test accuracy.
fr_mean, fr_std = [], []
acc_mean, acc_std = [], []

for eps in eps_list:
    fr_list, acc_list = [], []
    display(Markdown("# Epsilon =" + str(eps)))

    # NOTE(review): the split, imputation and baseline tree below do not use
    # `eps`, so they are recomputed identically for every epsilon.
    for seed in range(1, 11):
        # New 70/30 split for each repetition, made reproducible by the seed.
        dataset_orig_train, dataset_orig_test = train_test_split(
            df_ms, test_size=0.3, random_state=seed
        )

        # The protected attribute and the label are kept out of imputation.
        dataset_orig_train_no_sens = dataset_orig_train.drop(
            columns=['racebin', 'gradebin']
        )
        dataset_orig_test_no_sens = dataset_orig_test.drop(
            columns=['racebin', 'gradebin']
        )

        # Use SimpleImputer() here instead to obtain the mean-imputation results.
        imputer = KNNImputer()

        # Fit the imputer on the training split only, then apply it to test.
        dataset_orig_train_no_sens = pd.DataFrame(
            imputer.fit_transform(dataset_orig_train_no_sens),
            index=dataset_orig_train_no_sens.index,
            columns=dataset_orig_train_no_sens.columns,
        )
        dataset_orig_test_no_sens = pd.DataFrame(
            imputer.transform(dataset_orig_test_no_sens),
            index=dataset_orig_test_no_sens.index,
            columns=dataset_orig_test_no_sens.columns,
        )

        # Put the untouched protected-attribute and label columns back.
        dataset_orig_train = pd.concat(
            [dataset_orig_train_no_sens, dataset_orig_train[['racebin', 'gradebin']]],
            axis=1,
        )
        dataset_orig_test = pd.concat(
            [dataset_orig_test_no_sens, dataset_orig_test[['racebin', 'gradebin']]],
            axis=1,
        )

        # Wrap both splits as AIF360 StandardDataset objects.
        dataset_orig_train = StandardDataset(
            dataset_orig_train,
            label_name='gradebin',
            favorable_classes=[1],
            protected_attribute_names=['racebin'],
            privileged_classes=[[1]],
        )
        dataset_orig_test = StandardDataset(
            dataset_orig_test,
            label_name='gradebin',
            favorable_classes=[1],
            protected_attribute_names=['racebin'],
            privileged_classes=[[1]],
        )

        # --- Baseline: plain decision tree on a fixed subset of features ---
        # NOTE(review): hard-coded width; presumably the first 9 feature columns
        # are the non-protected ones. Confirm against the columns of df_ms.
        idx_wo_protected = list(range(9))
        X_train = dataset_orig_train.features[:, idx_wo_protected]
        y_train = dataset_orig_train.labels.ravel()
        X_test = dataset_orig_test.features[:, idx_wo_protected]
        y_test = dataset_orig_test.labels.ravel()

        lmod = DecisionTreeClassifier(random_state=42, max_depth=4)
        lmod.fit(X_train, y_train,
                 sample_weight=dataset_orig_train.instance_weights)
        y_pred = lmod.predict(X_test)

        # Copy of the test set whose labels are the baseline predictions.
        dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
        dataset_orig_test_pred.labels = y_pred

        cm_pred_test = ClassificationMetric(
            dataset_orig_test,
            dataset_orig_test_pred,
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups,
        )

        display(Markdown("#### Original-Predicted testing dataset"))
        print("Difference in FNR between unprivileged and privileged groups",
              cm_pred_test.difference(cm_pred_test.false_negative_rate),
              sep="\n")
        print("Overall Test Accuracy ", cm_pred_test.accuracy(), sep="\n")

        # --- Fair model: exponentiated-gradient reduction over a depth-3 tree ---
        estimator = DecisionTreeClassifier(max_depth=3)

        # Reset NumPy's global RNG immediately before building/fitting the
        # reduction, as the original cell does.
        np.random.seed(0)
        exp_grad_red = ExponentiatedGradientReduction(
            estimator=estimator,
            constraints="TruePositiveRateDifference",
            drop_prot_attr=False,
            eps=eps,
        )
        exp_grad_red.fit(dataset_orig_train)
        exp_grad_red_pred = exp_grad_red.predict(dataset_orig_test)

        cm_transf_test = ClassificationMetric(
            dataset_orig_test,
            exp_grad_red_pred,
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups,
        )

        display(Markdown("#### Original-Transformed testing dataset"))
        print("Difference in FNR between unprivileged and privileged groups")
        # Only the magnitude of the gap is recorded (and printed) here.
        fr = np.abs(cm_transf_test.difference(cm_transf_test.false_negative_rate))
        fr_list.append(fr)
        print(fr)

        print("Overall Test Accuracy ")
        acc = cm_transf_test.accuracy()
        acc_list.append(acc)
        print(acc)

    # Summarise the repetitions for this epsilon.
    fr_mean.append(np.mean(fr_list))
    fr_std.append(np.std(fr_list))
    acc_mean.append(np.mean(acc_list))
    acc_std.append(np.std(acc_list))
    
    
    
    
    

In [None]:
# Mean accuracy (y) against mean fairness gap (x), one point per epsilon;
# the error bars are the corresponding standard deviations.
plt.errorbar(
    fr_mean,
    acc_mean,
    yerr=acc_std,
    xerr=fr_std,
    color='green',
    fmt="o",
)

In [None]:
# Mean accuracy (y) against mean fairness gap (x), one point per epsilon;
# the error bars are the corresponding standard deviations.
plt.errorbar(
    fr_mean,
    acc_mean,
    yerr=acc_std,
    xerr=fr_std,
    color='green',
    fmt="o",
)

In [None]:
# Mean accuracy (y) against mean fairness gap (x), one point per epsilon;
# the error bars are the corresponding standard deviations.
plt.errorbar(
    fr_mean,
    acc_mean,
    yerr=acc_std,
    xerr=fr_std,
    color='green',
    fmt="o",
)

In [None]:
# `pickle` is not imported explicitly in the import cell (it could only arrive
# through the wildcard import), so import it here to keep this cell runnable.
import pickle

# Persist the per-epsilon summary lists so they can be reloaded without
# re-running the sweep. Plain 'wb' is enough: the file is only written.
with open('knn_exp_grad_result.pkl', 'wb') as f:
    pickle.dump(
        {
            'fr_mean': fr_mean,
            'fr_std': fr_std,
            'acc_mean': acc_mean,
            'acc_std': acc_std,
        },
        f,
    )