In [2]:
from src.TestProcedure import *
from src.BaselineModel import *
from src.AdultData import build_adult_data, normalize

import pandas as pd
import cvxpy as cp

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics.scorer import make_scorer



In [3]:
# Fetch the SearchFair repository and switch the working directory into it;
# presumably this is what makes the relative 'data/adult/adult.csv' path used by
# later cells resolve — confirm.
# NOTE(review): not idempotent. The recorded output shows the clone failing once
# the directory already exists, and `%cd SearchFair` is relative to the current
# directory, so running this cell twice may not land in the same place.
!git clone https://github.com/mlohaus/SearchFair.git
%cd SearchFair

fatal: destination path 'SearchFair' already exists and is not an empty directory.
/Users/Chris/Documents/School/Current/MIE424/MIE424-FinalProject/SearchFair


#### Load Unbalanced (Raw) Dataset

In [4]:
# Read the raw Adult CSV and, in the same expression, remove the four columns
# this analysis does not use: fnlwgt, education, capital-gain and capital-loss.
# 'education-num' is NOT dropped here. The original note attributes this column
# selection to Lohaus et al.
unbalanced_dataset = (
    pd.read_csv('data/adult/adult.csv')
    .drop(columns=['fnlwgt', 'education', 'capital-gain', 'capital-loss'])
)

#### Load Balanced (Fixed) Dataset

In [5]:
# Build a class-balanced copy of the Adult data: take at most 11687 rows from
# each income class, stack the two slices, then shuffle the rows.
# NOTE(review): 11687 is presumably the size of the smaller (">50K") class in
# this CSV — confirm against the data.
dataset = pd.read_csv('data/adult/adult.csv')
data50minus = dataset[dataset["income"].str.contains("<=50K")].iloc[:11687]
data50plus = dataset[dataset["income"].str.contains(">50K")].iloc[:11687]
databalanced = pd.concat([data50minus, data50plus])

# sample(frac=1) returns every row in random order; no random_state is passed,
# so the row order differs from run to run.
# The result is bound to `balanced_dataset`, the spelling every later cell reads
# (it was previously bound to `balanced_datset`, leaving `balanced_dataset` undefined).
balanced_dataset = databalanced.sample(frac=1).reset_index(drop=True)

# Drop fnlwgt, education, capital-gain and capital-loss ('education-num' is kept),
# mirroring the column selection applied to the unbalanced dataset.
balanced_dataset = balanced_dataset.drop(columns=['fnlwgt', 'education', 'capital-gain', 'capital-loss'])

### Baseline Results on Unbalanced Dataset

In [6]:
# Two hinge-loss baselines for the unbalanced data, differing only in kernel.
# The generator is consumed left to right, so the linear model is built first.
baseline_linear_hinge_unbalanced, baseline_rbf_hinge_unbalanced = (
    BaselineModel(kernel=kernel_name, loss_name='hinge')
    for kernel_name in ('linear', 'rbf')
)

#### Sensitive Attribute = 'Sex' 

In [7]:
# Baseline 1: linear-kernel hinge model, unbalanced data, sensitive attribute 'sex'.
# Whatever RunTest returns is kept in baseline_1_test_results.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so no
# results were recorded for this run.
baseline_1_tester = TestProcedure(baseline_linear_hinge_unbalanced)
baseline_1_test_results = baseline_1_tester.RunTest(dataset=unbalanced_dataset,sens_attribute='sex')

KeyboardInterrupt: 

In [None]:
# Baseline 2: RBF-kernel hinge model, unbalanced data, sensitive attribute 'sex'.
# Whatever RunTest returns is kept in baseline_2_test_results.
baseline_2_tester = TestProcedure(baseline_rbf_hinge_unbalanced)
baseline_2_test_results = baseline_2_tester.RunTest(dataset=unbalanced_dataset,sens_attribute='sex')

#### Sensitive Attribute = 'Race' 

In [None]:
# Baseline 3: linear-kernel hinge model, unbalanced data, sensitive attribute 'race'.
# NOTE(review): this passes the same model instance that the 'sex' run used;
# whether RunTest refits it from scratch is not visible here — confirm.
baseline_3_tester = TestProcedure(baseline_linear_hinge_unbalanced)
baseline_3_test_results = baseline_3_tester.RunTest(dataset=unbalanced_dataset,sens_attribute='race')

In [None]:
# Baseline 4: RBF-kernel hinge model, unbalanced data, sensitive attribute 'race'.
# NOTE(review): this passes the same model instance that the 'sex' run used;
# whether RunTest refits it from scratch is not visible here — confirm.
baseline_4_tester = TestProcedure(baseline_rbf_hinge_unbalanced)
baseline_4_test_results = baseline_4_tester.RunTest(dataset=unbalanced_dataset,sens_attribute='race')

### Baseline Results on Balanced Dataset

In [None]:
# Fresh hinge-loss baselines for the balanced data, one per kernel.
# The generator is consumed left to right, so the linear model is built first.
baseline_linear_hinge_balanced, baseline_rbf_hinge_balanced = (
    BaselineModel(kernel=kernel_name, loss_name='hinge')
    for kernel_name in ('linear', 'rbf')
)

#### Sensitive Attribute = 'Sex'

In [None]:
# Baseline 5: linear-kernel hinge model, balanced data, sensitive attribute 'sex'.
# Whatever RunTest returns is kept in baseline_5_test_results.
baseline_5_tester = TestProcedure(baseline_linear_hinge_balanced)
baseline_5_test_results = baseline_5_tester.RunTest(dataset=balanced_dataset,sens_attribute='sex')

In [None]:
# Baseline 6: RBF-kernel hinge model, balanced data, sensitive attribute 'sex'.
# Whatever RunTest returns is kept in baseline_6_test_results.
baseline_6_tester = TestProcedure(baseline_rbf_hinge_balanced)
baseline_6_test_results = baseline_6_tester.RunTest(dataset=balanced_dataset,sens_attribute='sex')

#### Sensitive Attribute = 'Race' 

In [None]:
# Baseline 7: linear-kernel hinge model, balanced data, sensitive attribute 'race'.
# NOTE(review): this passes the same model instance that the balanced 'sex' run
# used; whether RunTest refits it from scratch is not visible here — confirm.
baseline_7_tester = TestProcedure(baseline_linear_hinge_balanced)
baseline_7_test_results = baseline_7_tester.RunTest(dataset=balanced_dataset,sens_attribute='race')

In [None]:
# Baseline 8: RBF-kernel hinge model, balanced data, sensitive attribute 'race'.
# NOTE(review): this passes the same model instance that the balanced 'sex' run
# used; whether RunTest refits it from scratch is not visible here — confirm.
baseline_8_tester = TestProcedure(baseline_rbf_hinge_balanced)
baseline_8_test_results = baseline_8_tester.RunTest(dataset=balanced_dataset,sens_attribute='race')

## Baseline Hyperparameter Grid Search **for Accuracy**

### Unbalanced Dataset 

In [None]:
# Sensitive attribute for this search. The later grid-search cells read this
# same variable without redefining it.
sens_attribute = 'sex'

grid_search_1_model = BaselineModel()

# Candidate values; GridSearchCV evaluates every combination of the three lists.
beta_params = [0.0001, 0.001, 0.01] # For Linear Kernel
gamma_params = [0.01, 0.1, 1] # For RBF Kernel
kernel_params = ['linear','rbf']
cv_params = dict(l2_beta=beta_params, gamma=gamma_params, kernel=kernel_params)

# Encode the whole unbalanced dataset, then keep a shuffled 1200-sample training
# subset (no random_state is passed, so the subset changes between runs).
x_data, y_data, s_data = build_adult_data(
    unbalanced_dataset,
    sens_attribute,
    load_data_size=None,
)
(x_train, x_test,
 y_train, y_test,
 s_train, s_test) = train_test_split(
    x_data, y_data, s_data,
    train_size=1200,
    shuffle=True,
)

# 4-fold search scored on accuracy; s_train is forwarded to the estimator's fit.
grid_accuracy_unbalanced = GridSearchCV(
    grid_search_1_model,
    cv_params,
    cv=4,
    n_jobs=1,
    scoring='accuracy',
)
grid_accuracy_unbalanced.fit(x_train, y_train, s_train=s_train)

In [None]:
# Display the raw cross-validation results of the accuracy search on the unbalanced data.
grid_accuracy_unbalanced.cv_results_

### Balanced Dataset 

In [None]:
grid_search_2_model = BaselineModel()

# Candidate values; GridSearchCV evaluates every combination of the three lists.
beta_params = [0.0001, 0.001, 0.01] # For Linear Kernel
gamma_params = [0.01, 0.1, 1] # For RBF Kernel
kernel_params = ['linear','rbf']
cv_params = dict(l2_beta=beta_params, gamma=gamma_params, kernel=kernel_params)

# Encode the whole balanced dataset, then keep a shuffled 1200-sample training
# subset. `sens_attribute` is the value set by an earlier cell.
x_data, y_data, s_data = build_adult_data(
    balanced_dataset,
    sens_attribute,
    load_data_size=None,
)
(x_train, x_test,
 y_train, y_test,
 s_train, s_test) = train_test_split(
    x_data, y_data, s_data,
    train_size=1200,
    shuffle=True,
)

# 4-fold search scored on accuracy; s_train is forwarded to the estimator's fit.
grid_accuracy_balanced = GridSearchCV(
    grid_search_2_model,
    cv_params,
    cv=4,
    n_jobs=1,
    scoring='accuracy',
)
grid_accuracy_balanced.fit(x_train, y_train, s_train=s_train)

In [None]:
# Display the raw cross-validation results of the accuracy search on the balanced data.
grid_accuracy_balanced.cv_results_

## Baseline Hyperparameter Grid Search **for DDP**

In [10]:
# 1-based index of the cross-validation fold the scorer expects to see next.
# DDP_Grid_Scoring advances it on every call and wraps back to 1 after `size` calls.
grid_split_counter = 1

def get_positive_rate(y_predicted, y_true):
    """Return the share of samples that fall in the predicted-positive cells.

    The value is (tp + fp) / (tp + fp + tn + fn), read from the confusion
    matrix of `y_true` against `y_predicted`.

    Raises:
        ValueError: if the flattened confusion matrix does not have exactly
            four entries (for example when only one label occurs).

    NOTE(review): `confusion_matrix` is not imported explicitly by this
    notebook's import cell; presumably a star import provides it — confirm.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_predicted).ravel()
    pr = (tp+fp) / (tp+fp+tn+fn)
    return pr

def DDP_Grid_Scoring(y_true,y_predicted,sens_attr,size):
    """Return the absolute difference in positive rate between the two groups.

    Args:
        y_true: true labels of the fold being scored.
        y_predicted: predicted labels of the same fold.
        sens_attr: sensitive attribute of the WHOLE training set; entries equal
            to -1 form one group and entries equal to 1 the other.
        size: number of cross-validation folds.

    The function slices `sens_attr` into `size` consecutive equal-width chunks
    and uses the chunk selected by the module-level `grid_split_counter`, which
    it then advances (wrapping to 1 after `size` calls).

    NOTE(review): that slice lines up with the fold being scored only if the
    splitter yields contiguous, unshuffled test folds in order and the scorer is
    called exactly once per fold — confirm for the splitter GridSearchCV uses here.
    """
    global grid_split_counter

    # Derive the fold width from the attribute array itself instead of assuming
    # a 1200-sample training set.
    chunk_size = len(sens_attr)/size
    fold_start = int((grid_split_counter - 1)*chunk_size)
    fold_stop = int(grid_split_counter*chunk_size)
    sens_attribute = sens_attr[fold_start:fold_stop]

    # Advance to the next fold, wrapping around after the last one.
    if grid_split_counter == size:
        grid_split_counter = 1
    else:
        grid_split_counter += 1

    positive_rate_prot = get_positive_rate(y_predicted[sens_attribute==-1], y_true[sens_attribute==-1])
    positive_rate_unprot = get_positive_rate(y_predicted[sens_attribute==1], y_true[sens_attribute==1])

    DDP = abs(positive_rate_unprot - positive_rate_prot)
    return DDP

def DDP_scorer(estimator, X, y_true):
    """Scorer callable for GridSearchCV: the negated DDP of `estimator` on `X`.

    The sign is flipped so that a smaller DDP ranks higher, as
    make_scorer(..., greater_is_better=False) would do.

    The notebook-level names `s_train` and `size` are read when the scorer is
    CALLED, not when this cell runs. The cell therefore no longer needs them to
    exist beforehand, and each search scores against the `s_train` of the split
    it is actually fitting rather than one captured earlier.
    """
    return -DDP_Grid_Scoring(y_true, estimator.predict(X), sens_attr=s_train, size=size)

### Unbalanced Dataset 

In [None]:
# Number of cross-validation folds; also read by the DDP scorer.
size = 4

# Restart the scorer's fold counter. DDP_Grid_Scoring tracks the current fold in
# the notebook-level `grid_split_counter`, so a previous search that stopped
# part-way (e.g. an interrupted cell) would otherwise leave it mid-cycle and
# shift every sensitive-attribute slice in this search.
grid_split_counter = 1

grid_search_3_model = BaselineModel()

# Candidate values; GridSearchCV evaluates every combination of the three lists.
beta_params = [0.0001, 0.001, 0.01] # For Linear Kernel
gamma_params = [0.01, 0.1, 1] # For RBF Kernel
kernel_params = ['linear','rbf']
cv_params = {'l2_beta': beta_params,'gamma': gamma_params,'kernel':kernel_params}

# Encode the whole unbalanced dataset, then keep a shuffled 1200-sample training
# subset. `sens_attribute` is the value set by an earlier cell.
x_data, y_data, s_data = build_adult_data(unbalanced_dataset,sens_attribute,load_data_size=None)
x_train, x_test, y_train, y_test, s_train, s_test = train_test_split(x_data, y_data, s_data, train_size=1200, shuffle=True)

# NOTE(review): DDP_Grid_Scoring cuts the sensitive attribute into `size`
# consecutive equal chunks, which matches the CV test folds only if the splitter
# yields contiguous, unshuffled folds — confirm for the splitter chosen here.
grid_ddp_unbalanced = GridSearchCV(grid_search_3_model,cv_params, cv=size, n_jobs=1, scoring=DDP_scorer)
grid_ddp_unbalanced.fit(x_train, y_train, s_train = s_train)

In [None]:
# Display the raw cross-validation results of the DDP search on the unbalanced data.
grid_ddp_unbalanced.cv_results_

### Balanced Dataset

In [None]:
# Restart the scorer's fold counter. DDP_Grid_Scoring tracks the current fold in
# the notebook-level `grid_split_counter`, so a previous search that stopped
# part-way (e.g. an interrupted cell) would otherwise leave it mid-cycle and
# shift every sensitive-attribute slice in this search.
grid_split_counter = 1

grid_search_4_model = BaselineModel()

# Candidate values; GridSearchCV evaluates every combination of the three lists.
beta_params = [0.0001, 0.001, 0.01] # For Linear Kernel
gamma_params = [0.01, 0.1, 1] # For RBF Kernel
kernel_params = ['linear','rbf']
cv_params = {'l2_beta': beta_params,'gamma': gamma_params,'kernel':kernel_params}

# Encode the whole balanced dataset, then keep a shuffled 1200-sample training
# subset. `sens_attribute` and `size` are the values set by earlier cells.
x_data, y_data, s_data = build_adult_data(balanced_dataset,sens_attribute,load_data_size=None)
x_train, x_test, y_train, y_test, s_train, s_test = train_test_split(x_data, y_data, s_data, train_size=1200, shuffle=True)

# NOTE(review): DDP_Grid_Scoring cuts the sensitive attribute into `size`
# consecutive equal chunks, which matches the CV test folds only if the splitter
# yields contiguous, unshuffled folds — confirm for the splitter chosen here.
grid_ddp_balanced = GridSearchCV(grid_search_4_model,cv_params, cv=size, n_jobs=1, scoring=DDP_scorer)
grid_ddp_balanced.fit(x_train, y_train, s_train = s_train)

In [None]:
# Display the raw cross-validation results of the DDP search on the balanced data.
grid_ddp_balanced.cv_results_