# [COMPAS] Example Training of Fair MIP Forest



In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pickle
import sys
# cwd = '/home/ec2-user/SageMaker/imputation-fairness/data/Adult/code'
cwd = '../../../core'
sys.path.append(cwd)

import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from load_compas import * 
from treeutils import *
from missing_module import * 

# Show pandas objects untruncated: a value of None removes the row / column /
# cell-width limits, and width=None asks pandas to auto-detect the display width.
# (The original set 'display.max_colwidth' twice; once is enough.)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
# Print NumPy arrays in full instead of summarising long arrays with "...".
np.set_printoptions(threshold=np.inf)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.impute import KNNImputer

from IPython.display import display, HTML

from itertools import combinations

## Loading & Balancing Data 

In [None]:
## Loading Data ##
# x_control is returned by the loader but is not used in the cells shown below.
X, y, x_control = load_compas_data()

# Column labels for the feature matrix; assumed to follow the column order of X
# as produced by load_compas_data() -- TODO confirm against the loader.
feature_columns = [
    'age_cat_25 - 45',
    'age_cat_Greater than 45',
    'age_cat_Less than 25',
    'race',
    'sex',
    'priors_count',
    'c_charge_degree',
]
df_features = pd.DataFrame(X, columns=feature_columns)

# Name the label series so that it becomes the "two_year_recid" column.
y = pd.Series(y, name="two_year_recid")

# Features and label side by side in a single frame.
df = pd.concat([df_features, y], axis=1)


## Balancing the Data ##
# Balances the frame with respect to the 'race' column (see balance_data for
# the exact meaning of the third argument).
df = balance_data(df, 'race', 0)

df.describe()

## Generating Missing Entries

In [None]:
sens_attr = 'race'
s = 777  # random seed

## Generate Missing Data in Training Set ##
# One entry per column that receives missing values: (column, p_ms0, p_ms1).
# The same seed is used for every column; each call works on the output of the
# previous one, so the missingness accumulates in df_ms.
missing_specs = [
    ('sex', 0.4, 0.1),
    ('priors_count', 0.6, 0.2),
]
df_ms = df
for ms_column, prob_ms0, prob_ms1 in missing_specs:
    df_ms = generate_missing(df_ms, c_label='race', ms_label=ms_column,
                             p_ms0=prob_ms0, p_ms1=prob_ms1, seed=s)

## Saving the data with missing values to a pickle file
df_ms_to_pickle(df_ms, sens_attr, filename='pkl_data/compas_ms.pkl')


<br/>

## Training a Fair MIP Forest Model

### Setting the parameters

In [None]:
### Example Parameters ###
# All values below are passed to run_ensemble_tree() in the training cell; several
# are also used to rebuild the output file name when the trained model is opened.

# Passed to run_ensemble_tree() and to sol_to_forest(); presumably the tree depth.
# NOTE(review): the model path in the "Open the Trained Model" cell hard-codes
# "d3trees" -- confirm whether it has to change together with D.
D = 3
# Appears as N{} in the model file name; presumably the number of trees in the forest.
num_tree = 30
# Appears as b{} in the model file name; presumably the number of samples per tree.
batch_size = 200 
# Presumably a time limit for the optimisation; units are not visible here -- TODO confirm.
t_limit = 60
# Fairness metric name; also passed to binary_score() in the evaluation cell.
fair = 'fnr'
# Same path that df_ms_to_pickle() writes to in the missing-data cell.
input_file = 'pkl_data/compas_ms.pkl'
# Used for training, in the model path, and as random_state of the evaluation split.
seed = 42
# Appears as L{:.2f} in the model file name; presumably the fairness weight -- TODO confirm.
lambd = 0.5
####################################

### Training

In [None]:

### Train a Fair MIP Forest model ###
## Results are saved to a file
# Arguments are positional: note that lambd comes before t_limit here, which is
# not the order in which the parameters are listed in the parameter cell.
# NOTE(review): the following cell loads the result from
# forests/d3trees_seed{seed}/... -- presumably that is where this call writes; confirm.
run_ensemble_tree(D, num_tree, batch_size, lambd, t_limit, fair, input_file, seed)

## Open the Trained Model

In [None]:
# File name of the pickled solution; it encodes seed, fairness metric, lambda,
# number of trees and batch size.
# NOTE(review): "d3trees" is hard-coded although the depth is held in D --
# presumably the path must be adjusted when D != 3; confirm against run_ensemble_tree.
forest_path_template = 'forests/d3trees_seed{}/{}_L{:.2f}_N{}_b{}.pkl'
output_file = forest_path_template.format(seed, fair, lambd, num_tree, batch_size)

# Unpickling can execute arbitrary code: only load files produced by your own run.
with open(output_file, 'rb') as forest_fh:
    sol_trees = pickle.load(forest_fh)

# Turn the stored solutions into a forest object that offers predict().
forest = sol_to_forest(sol_trees, D)

## Evaluate the Model

In [None]:
## Open the Test Dataset ##

### Loading Test Data ###
# Reuse the path from the parameter cell instead of repeating the literal, so the
# evaluation always reads the same file the model was trained from.
filename = input_file

# Unpickling can execute arbitrary code: only load files produced by your own run.
with open(filename, 'rb') as handle:
    data = pickle.load(handle)

# The pickle is read as a dict with the keys 'X', 'y', 'm' and 'S'.
# Presumably: features, labels, missingness indicators, sensitive attribute -- TODO confirm.
X_orig = data['X']
y_orig = data['y']
m_orig = data['m']
S_orig = data['S']

# Hold out 30% of the rows; all four arrays are split with the same shuffling.
# NOTE(review): presumably run_ensemble_tree() performs the same split with the same
# seed, so that the rows in X_test were not seen during training -- confirm.
X_train, X_test, y_train, y_test, m_train, m_test, S_train, S_test = train_test_split(
    X_orig, y_orig, m_orig, S_orig, test_size=0.3, random_state=seed)

# Replace NaN entries of the test features with the sentinel value -999
# (presumably the placeholder the forest expects for missing entries -- TODO confirm).
X_test = np.nan_to_num(X_test, copy=False, nan=-999)


### Evaluate The Model ###
# Predictions take both the feature matrix and m_test.
y_hat = forest.predict(X_test, m_test)

# Overall score using binary_score's default metric.
forest_acc = binary_score(y_test, y_hat)

# Fairness: absolute gap of the chosen metric between the two groups of S_test.
in_group0 = S_test == 0
in_group1 = S_test == 1
score_group0 = binary_score(y_test[in_group0], y_hat[in_group0], fair)
score_group1 = binary_score(y_test[in_group1], y_hat[in_group1], fair)
forest_fr = np.abs(score_group0 - score_group1)

print(" ======== Model Performance ======== ")
print("Accuracy: {:.2f}".format(forest_acc))
print("Fairness Metric: {:.2f}".format(forest_fr))