# MIE424 Project

## 1. Load the Adult dataset

In [1]:
!git clone https://github.com/mlohaus/SearchFair.git
%cd SearchFair

Cloning into 'SearchFair'...
remote: Enumerating objects: 86, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 86 (delta 32), reused 74 (delta 24), pack-reused 0[K
Unpacking objects: 100% (86/86), done.
/content/SearchFair


In [2]:
import pandas as pd
import time

In [3]:
# Load data into pandas DataFrame
dataset = pd.read_csv('data/adult/adult.csv')

# Drop fnlwgt, education, education-num, capital-gain, capital-loss as Lohaus et al do
dataset = dataset.drop(columns=['fnlwgt', 'education', 'capital-gain', 'capital-loss'])
dataset

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,13,Divorced,Prof-specialty,Not-in-family,White,Female,36,United-States,<=50K
48838,64,?,9,Widowed,?,Other-relative,Black,Male,40,United-States,<=50K
48839,38,Private,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,50,United-States,<=50K
48840,44,Private,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,40,United-States,<=50K


In [None]:
def replaceWithOneHot(df, col_name):
    # Takes in a pandas dataframe and replaces column with name col_name
    # with multiple columns of its one-hot encoding
    one_hots = pd.get_dummies(dataset[col_name], prefix=col_name)
    df = df.drop(columns =[col_name])
    df = df.join(one_hots)
    return df 

## Onehot categorical variables
#for feature in ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']:
#    dataset = replaceWithOneHot(dataset, feature);

#Format below features in dataset to be binary based on Lohaus' get_real_data.py
dataset.loc[dataset['age'] > 37, 'age'] = 1
dataset.loc[dataset['age'] != 1, 'age'] = 0

dataset.loc[dataset['workclass'] == 'Private', 'workclass'] = 1
dataset.loc[dataset['workclass'] != 1, 'workclass'] = 0

dataset.loc[dataset['education-num'] == 9, 'education-num'] = 1
dataset.loc[dataset['education-num'] != 1, 'education-num'] = 0

dataset.loc[dataset['marital-status'] == "Married-civ-spouse", 'marital-status'] = 1
dataset.loc[dataset['marital-status'] != 1, 'marital-status'] = 0

dataset.loc[dataset['occupation'] == "Craft-repair", 'occupation'] = 1
dataset.loc[dataset['occupation'] != 1, 'occupation'] = 0

dataset.loc[dataset['relationship'] == "Not-in-family", 'relationship'] = 1
dataset.loc[dataset['relationship'] != 1, 'relationship'] = 0

dataset.loc[dataset['race'] == "White", 'race'] = 1
dataset.loc[dataset['race'] != 1, 'race'] = 0

dataset.loc[dataset['hours-per-week'] > 40, 'hours-per-week'] = 1
dataset.loc[dataset['hours-per-week'] != 1, 'hours-per-week'] = 0

dataset.loc[dataset['native-country'] == "United-States", 'native-country'] = 1
dataset.loc[dataset['native-country'] != 1, 'native-country'] = 0

#Replace 'Male' with 1 and 'Female' with 0 in sex column
dataset.loc[dataset['sex'] == 'Male', 'sex'] = 1
dataset.loc[dataset['sex'] == 'Female', 'sex'] = -1

#replace '>50K' with 1 and '<=50K' with 0 in income column
dataset.loc[dataset['income'] == '>50K', 'income'] = 1
dataset.loc[dataset['income'] == '<=50K', 'income'] = -1

dataset

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,1,0,0,0,0,1,1,1,0,1,-1
1,1,0,0,1,0,0,1,1,0,1,-1
2,1,1,1,0,0,1,1,1,0,1,-1
3,1,1,0,1,0,0,0,1,0,1,-1
4,0,1,0,1,0,0,0,-1,0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...
48837,1,1,0,0,0,1,1,-1,0,1,-1
48838,1,0,1,0,0,0,0,1,0,1,-1
48839,1,1,0,1,0,0,1,1,1,1,-1
48840,1,1,0,0,0,0,0,1,0,1,-1


In [None]:
# Lohaus uses a random 10000 points for training, validation, and the rest for testing
# Since adult.csv is already shuffled, use the first 10000 rows as training and rest as testing

train_dataset = dataset.head(10000)
test_dataset = dataset.tail(len(dataset) - 10000)

In [36]:
from sklearn.model_selection import train_test_split
import os
from random import shuffle
import pandas as pd
def get_adult_data(sens_attribute = 'sex',load_data_size=None):
  """Load the Adult dataset.
  Source: UCI Machine Learning Repository.

  Parameters
  ----------
  load_data_size: int
      The number of points to be loaded. If None, returns all data points unshuffled.

  Returns
  ---------
  X: numpy array
      The features of the datapoints with shape=(number_points, number_features).
  y: numpy array
      The class labels of the datapoints with shape=(number_points,).
  s: numpy array
      The binary sensitive attribute of the datapoints with shape=(number_points,).
  """
  

  def mapping(tuple):
    # age, 37
    tuple['age'] = 1 if tuple['age'] > 37 else 0
    # workclass
    tuple['workclass'] = 'NonPrivate' if tuple['workclass'] != 'Private' else 'Private'
    # edunum
    tuple['education-num'] = 1 if tuple['education-num'] > 9 else 0
    # maritial statue
    tuple['marital-status'] = "Marriedcivspouse" if tuple['marital-status'] == "Married-civ-spouse" else "nonMarriedcivspouse"
    # occupation
    tuple['occupation'] = "Craftrepair" if tuple['occupation'] == "Craft-repair" else "NonCraftrepair"
    # relationship
    tuple['relationship'] = "NotInFamily" if tuple['relationship'] == "Not-in-family" else "InFamily"
    # race
    tuple['race'] = 'NonWhite' if tuple['race'] != "White" else "White"
    # sex
    tuple['sex'] = 'Female' if tuple['sex'] != "Male" else 'Male'
    # hours per week
    tuple['hours-per-week'] = 1 if tuple['hours-per-week'] > 40 else 0
    # native country
    tuple['native-country'] = "US" if tuple['native-country'] == "United-States" else "NonUS"
    return tuple


  df = dataset
  df = df.apply(mapping, axis=1)

  if sens_attribute == 'sex':
    sensitive_attr_map = {'Male': 1, 'Female': -1}
  elif sens_attribute == 'race':
    sensitive_attr_map = {'White': 1, 'NonWhite': -1}
  label_map = {'>50K': 1, '<=50K': -1}

  if sens_attribute == 'sex':
    x_vars = ['age','workclass','education-num','marital-status','occupation','relationship','race','hours-per-week','native-country']
  elif sens_attribute == 'race':
    x_vars = ['age','workclass','education-num','marital-status','occupation','relationship','sex','hours-per-week','native-country']

  s = df[sens_attribute].map(sensitive_attr_map).astype(int)
  y = df['income'].map(label_map).astype(int)


  x = pd.DataFrame(data=None)
  for x_var in x_vars:
    x = pd.concat([x, pd.get_dummies(df[x_var],prefix=x_var, drop_first=False)], axis=1)

  X = x.to_numpy()
  s = s.to_numpy()
  y = y.to_numpy()

  if load_data_size is not None: # Don't shuffle if all data is requested
      # shuffle the data
      perm = list(range(0, len(y)))
      shuffle(perm)
      X = X[perm]
      y = y[perm]
      s = s[perm]

      print("Loading only %d examples from the data" % load_data_size)
      X = X[:load_data_size]
      y = y[:load_data_size]
      s = s[:load_data_size]

  X = X[:, (X != 0).any(axis=0)]

  return X, y, s

def normalize(x):
	# scale to [-1, 1]
	x_ = (x - x.min()) / (x.max() - x.min()) * 2 - 1
	return x_

## 2. Implement baseline models

In [49]:
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator
import sklearn.metrics.pairwise as kernels
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import numpy as np
import cvxpy as cp
import random

In [53]:
class BaselineModel(BaseEstimator):
    def __init__(self, l2_beta=0.001, kernel='linear', gamma=0.1, loss_name='hinge', lambda_max=1, max_iter=3000, solver='SCS', verbose=False,reason_points=0.5):

        self.l2_beta = l2_beta # Regularization parameter beta for the l2 regularization
        self.kernel = kernel # The SVM kernel to be used.. Options:['linear','rbf','poly']
        self.gamma = gamma # If kernel='rbf', gamma to be kernel width, If kernel='poly', gamma to be degree.
        self.loss_name = loss_name # Loss function to be used. Options:['hinge','logistic','squared','exponential']
        self.lambda_max = lambda_max # The max lambda value for the start of the binary search.
        self.max_iter = max_iter # The number of iterations.
        self.solver = solver # The solver to be used by cvxpy. Options:['SCS','ECOS'].
        self.verbose = verbose # If true, Overrides the default of hiding solver output.
        self.reason_points = reason_points # The ratio of points used as reasonable points for the similarity-based approach of SearchFair.

    def fit(self, x_train, y_train,s_train):
        """Fits a baseline SVM model on the given training data.
        Parameters
        ----------
        x_train: numpy array
            The features of the training data with shape=(number_points,number_features).
        y_train: numpy array
            The class labels of the training data with shape=(number_points,).
      
        Returns
        ----------
        self: object
        """
        self.x_train = x_train
        self.y_train = y_train
        self.s_train = s_train
        
        self._preprocess()

        lambda_min, lambda_max = 0, self.lambda_max

        self._construct_problem()
                
        self._optimize()
            
        criterion = False
        
        return self

    def predict(self, x_test):
        """Predict the label of test data.
        Parameters
        ----------
        x_test: numpy array
            The features of the test data with shape=(number_points,number_features).
        Returns
        ----------
        y_hat: numpy array
            The predicted class labels with shape=(number_points,).
        """
        kernel_matr = self.kernel_function(x_test, self.x_train[self.reason_pts_index])
        y_hat = np.dot(self.coef_, np.transpose(kernel_matr))
        return np.sign(y_hat)

    def _preprocess(self):
        """Setting the attributes loss_func and kernel_function.
        """
        self.coef_ = None
        if self.loss_name == 'logistic':
            self.loss_func = lambda z: cp.logistic(-z)
        elif self.loss_name == 'hinge':
            self.loss_func = lambda z: cp.pos(1.0 - z)
        elif self.loss_name == 'squared':
            self.loss_func = lambda z: cp.square(-z)
        elif self.loss_name == 'exponential':
            self.loss_func = lambda z: cp.exp(-z)
        else:
            print('Using default loss: hinge loss.')
            self.loss_func = lambda z: cp.pos(1.0 - z)

        if self.kernel == 'rbf':
            self.kernel_function = lambda X, Y: kernels.rbf_kernel(X, Y, self.gamma)
        elif self.kernel == 'poly':
            self.kernel_function = lambda X, Y: kernels.polynomial_kernel(X, Y, degree=self.gamma)
        elif self.kernel == 'linear':
            self.kernel_function = lambda X, Y: kernels.linear_kernel(X, Y) + 1
        else:
            self.kernel_function = kernel

        # Choose random reasonable points
        self.nmb_pts = self.x_train.shape[0]
        if self.reason_points <= 1:
            self.reason_pts_index = list(range(int(self.nmb_pts * self.reason_points)))
        else:
            self.reason_pts_index = list(range(self.reason_points))
        self.nmb_reason_pts = len(self.reason_pts_index)

    def _construct_problem(self):
        """ Construct the cvxpy minimization problem.
        """

        # Variable to optimize
        self.params = cp.Variable((len(self.reason_pts_index), 1))
        # Parameter for Kernel Matrix
        self.kernel_matrix = cp.Parameter(shape=(self.x_train.shape[0], len(self.reason_pts_index)))
        self.bias = cp.Variable()

         
        # Loss Function to Minimize (with Regularization)
        
        self.loss = self.loss = cp.sum(self.loss_func(cp.multiply(self.y_train.reshape(-1, 1), self.kernel_matrix @ self.params))) + self.l2_beta * self.nmb_pts * cp.square(
                cp.norm(self.params, 2))
        
        # Final Problem Formulization
        self.prob = cp.Problem(cp.Minimize(self.loss))

    def _optimize(self):
        """Conduct the optimization of the created problem by using ECOS or SCS
        with cvxpy. 
        """
        self.K_sim = self.kernel_function(self.x_train, self.x_train[self.reason_pts_index])
        self.kernel_matrix.value = self.K_sim

        if self.solver == 'SCS':
            self.prob.solve(solver=cp.SCS, max_iters=self.max_iter, verbose=self.verbose, warm_start=True)
        elif self.solver == 'ECOS':
            try:
                self.prob.solve(solver=cp.ECOS, max_iters=self.max_iter, verbose=self.verbose, warm_start=True)
            except Exception as e:
                self.prob.solve(solver=cp.SCS, max_iters=self.max_iter, verbose=self.verbose, warm_start=True)
    
        self.coef_ = self.params.value.squeeze()

## 3. Implement Basic Test Procedure

In [44]:
class TestProcedure():
    def __init__(self,model):

        self.model = model

    def BuildDataset(self,sens_attribute,train_size = 1200):
        x_data, y_data, s_data = get_adult_data(sens_attribute,load_data_size=None)
        # Train Test split. Here, we choose a small number to reduce running time.
        train_size = 1200
        x_train, x_test, y_train, y_test, s_train, s_test = train_test_split(x_data, y_data, s_data, train_size=train_size, shuffle=True)
        
        self.X_train = x_train
        self.y_train = y_train

        self.X_test = x_test
        self.y_test = y_test

        self.s_train = s_train
        self.s_test = s_test

                
    def BuildModel(self):
        
        start_time = time.time()
        
        self.model.fit(self.X_train,self.y_train,self.s_train)
        
        end_time = time.time()
        build_time = end_time - start_time
        
        return build_time
        
    def RunBasicTest(self,sens_attribute):
        self.BuildDataset(sens_attribute)
        build_time = self.BuildModel()
        predictions = self.model.predict(self.X_test)
        prediction_accuracy = np.equal(self.y_test, predictions).mean()
        
        ddp,deo = self.compute_fairness_measures(predictions, self.y_test ,self.s_test)
        results = {"BuildTime":build_time,"PredictionAccuracy":prediction_accuracy,"DDP":ddp,"DEO":deo}
        self.PrintResults(results)
        return results
        
    def compute_fairness_measures(self, y_predicted, y_true, sens_attr):
        """Compute value of demographic parity and equality of opportunity for given predictions.
        Parameters
        ----------
        y_predicted: numpy array
            The predicted class labels of shape=(number_points,).
        y_true: numpy array
            The true class labels of shape=(number_points,).
        sens_attr: numpy array
            The sensitive labels of shape=(number_points,).
        Returns
        ----------
        DDP: float
            The difference of demographic parity.
        DEO: float
            The difference of equality of opportunity.
        """
        
        positive_rate_prot = self.get_positive_rate(y_predicted[sens_attr==-1], y_true[sens_attr==-1])
        positive_rate_unprot = self.get_positive_rate(y_predicted[sens_attr==1], y_true[sens_attr==1])
        true_positive_rate_prot = self.get_true_positive_rate(y_predicted[sens_attr==-1], y_true[sens_attr==-1])
        true_positive_rate_unprot = self.get_true_positive_rate(y_predicted[sens_attr==1], y_true[sens_attr==1])
        DDP = positive_rate_unprot - positive_rate_prot
        DEO = true_positive_rate_unprot - true_positive_rate_prot

        return DDP, DEO

    def get_positive_rate(self, y_predicted, y_true):
        """Compute the positive rate for given predictions of the class label.
        Parameters
        ----------
        y_predicted: numpy array
            The predicted class labels of shape=(number_points,).
        y_true: numpy array
            The true class labels of shape=(number_points,).
        Returns
        ---------
        pr: float
            The positive rate.
        """
        tn, fp, fn, tp = confusion_matrix(y_true.astype(int), y_predicted.astype(int)).ravel()
        pr = (tp+fp) / (tp+fp+tn+fn)
        return pr

    def get_true_positive_rate(self, y_predicted, y_true):
        """Compute the true positive rate for given predictions of the class label.
        Parameters
        ----------
        y_predicted: numpy array
            The predicted class labels of shape=(number_points,).
        y_true: numpy array
            The true class labels of shape=(number_points,).
        Returns
        ---------
        tpr: float
            The true positive rate.
        """
        tn, fp, fn, tp = confusion_matrix(y_true.astype(int), y_predicted.astype(int)).ravel()
        tpr = tp / (tp+fn)
        return tpr
        
    def PrintResults(self,results):
      print("Kernel Type:",self.model.kernel)
      print("Loss Func:",self.model.loss_name)
      print("Run Time:",round(results['BuildTime'],4),"seconds")
      print("Prediction Accuracy:",str(round(results['PredictionAccuracy']*100,4)),"%")
      print("DDP Score:",str(round(results['DDP'],4)))
      print("DEO Score:",str(round(results['DEO'],4)))

In [46]:
baseline_linear_hinge = BaselineModel(kernel='linear',loss_name='hinge')
baseline_1_tester = TestProcedure(baseline_linear_hinge)
baseline_1_test_results = baseline_1_tester.RunBasicTest(sens_attribute='sex')

Fit
Preprocess
Construct
Optimize
Predict
Kernel Type: linear
Loss Func: hinge
Run Time: 2.342 seconds
Prediction Accuracy: 80.767 %
DDP Score: 0.0957
DEO Score: 0.0149


In [48]:
baseline_rbf_hinge = BaselineModel(kernel='rbf',loss_name='hinge')
baseline_2_tester = TestProcedure(baseline_rbf_hinge)
baseline_2_test_results = baseline_2_tester.RunBasicTest(sens_attribute='race')

Fit
Preprocess
Construct
Optimize
Predict
Kernel Type: rbf
Loss Func: hinge
Run Time: 2.7336 seconds
Prediction Accuracy: 81.5814 %
DDP Score: 0.0894
DEO Score: 0.0509


## 4. GridSearch

In [59]:
# regularization parameter beta
sens_attribute = 'sex'

grid_search_model = BaselineModel()

beta_params = [0.0001, 0.001, 0.01] # For Linear Kernel
gamma_params = [0.01, 0.1, 1] # For RBF Kernel
kernel_params = ['linear','rbf']
cv_params = {'l2_beta': beta_params,'gamma': gamma_params,'kernel':kernel_params}

x_data, y_data, s_data = get_adult_data(sens_attribute,load_data_size=None)
x_train, x_test, y_train, y_test, s_train, s_test = train_test_split(x_data, y_data, s_data, train_size=1200, shuffle=True)

grid_clf = GridSearchCV(grid_search_model,cv_params, cv=4, n_jobs=1, scoring='accuracy')
grid_clf.fit(x_train, y_train, s_train = s_train)

GridSearchCV(cv=4, error_score=nan,
             estimator=BaselineModel(gamma=0.1, kernel='linear', l2_beta=0.001,
                                     lambda_max=1, loss_name='hinge',
                                     max_iter=3000, reason_points=0.5,
                                     solver='SCS', verbose=False),
             iid='deprecated', n_jobs=1,
             param_grid={'gamma': [0.01, 0.1, 1], 'kernel': ['linear', 'rbf'],
                         'l2_beta': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [57]:
grid_clf.cv_results_

{'mean_fit_time': array([0.94513726, 0.93955191, 0.90660055, 0.83508706, 0.86360677,
        0.84023309, 0.84569343, 0.84799782, 0.8198127 , 1.81358155,
        1.91670354, 1.92958538, 1.25409619, 1.2327923 , 1.26333014,
        1.95298656, 1.96494754, 1.94357387]),
 'mean_score_time': array([0.00536474, 0.00408999, 0.00392254, 0.0038592 , 0.00230948,
        0.0023284 , 0.00236988, 0.00234032, 0.0030729 , 0.01120504,
        0.01096574, 0.0142192 , 0.00729553, 0.01055566, 0.01173043,
        0.00750128, 0.00728623, 0.00729338]),
 'mean_test_score': array([0.805     , 0.805     , 0.805     , 0.805     , 0.805     ,
        0.805     , 0.805     , 0.805     , 0.805     , 0.79666667,
        0.79666667, 0.79666667, 0.805     , 0.805     , 0.805     ,
        0.805     , 0.805     , 0.805     ]),
 'param_kernel': masked_array(data=['linear', 'linear', 'linear', 'linear', 'linear',
                    'linear', 'linear', 'linear', 'linear', 'rbf', 'rbf',
                    'rbf', 'rbf', '