In [2]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, LeaveOneOut

import numpy as np
import pandas as pd

In [3]:
# Feature matrix as a plain array; the first CSV column is used as the row index.
X = pd.read_csv("data/X.csv", index_col=0).values
# Target vector: the "class" column of the label file, also as a plain array.
y = pd.read_csv("data/y.csv", index_col=0)["class"].values

In [3]:
# These are the best hyperparameters
# (presumably the outcome of an earlier hyperparameter search - not shown here).
# random_state pins the bootstrap sampling and split selection, so repeated
# runs of the notebook build the same forest and give the same predictions.
classifier = RandomForestClassifier(
    bootstrap=True,
    class_weight='balanced_subsample',
    criterion='entropy',
    max_depth=20,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=100,
    random_state=0,
)

In [13]:
# Fit on the full data set and score the model on that same data. This
# training accuracy is only a reference point for the leave-one-out results.
pred = classifier.fit(X, y).predict(X)
train_accuracy = accuracy_score(y, pred)
print(train_accuracy)

0.996


In [28]:
# Leave-one-out cross-validation: cross_val_predict returns one prediction
# per sample, produced with the LeaveOneOut splitter.
loo = LeaveOneOut()
cv_results = cross_val_predict(classifier, X, y, cv=loo)

# Save the predictions to CSV so they can be reloaded without re-running
# the cross-validation. The default index is written as the first column.
pd.DataFrame(cv_results).to_csv('cross_val_predictions.csv')

In [30]:
# Accuracy of the leave-one-out predictions against the true labels
loo_accuracy = accuracy_score(y, cv_results)
print('Test accuracy:', loo_accuracy, sep='\n')


Test accuracy:
0.77


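This accuracy is not great, but it is satisfactory given the small amount of data. Note that it is well below the training accuracy of 0.996 reported above, which suggests that the model overfits the training data.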

In [6]:
# Reload the features as a DataFrame (instead of a bare array) so that the
# column names are available.
X = pd.read_csv("data/X.csv", index_col=0)

# Reload the saved cross-validation predictions. Column 0 of the file is the
# index written by to_csv, so the predictions themselves are in column 1.
saved_predictions = pd.read_csv('cross_val_predictions.csv')
cv_results = saved_predictions.iloc[:, 1]

# Swap the two labels of the predictions so that, per the original note,
# 1 = credit granted and 0 = no credit granted. The result is a 2-D array
# with a single column.
pred = cv_results.to_frame().replace({0: 1, 1: 0}).to_numpy()

300


In [9]:
# Group fairness measures. The groups assumed to be treated worse are women
# (vs. men), immigrants (vs. non-immigrants) and the young (vs. the old,
# split at 25).

def _grant_rate(outcomes):
    """Return the share of positive predictions, or nan for an empty selection."""
    if outcomes.size == 0:
        return float('nan')
    return outcomes.sum() / outcomes.size


def group_fairness(group_split_condition, predictions=None):
    """Print the credit grant rate of a group and of everyone else.

    Parameters
    ----------
    group_split_condition : boolean Series or array
        True for the rows that belong to the (assumed) discriminated group.
        It is read positionally, so it must hold one entry per prediction,
        in the same row order as the predictions.
    predictions : array-like, optional
        Predicted labels in which 1 counts as "credit granted". When omitted,
        the notebook-level ``pred`` array is used.

    Prints
    ------
    The grant rate of both groups, their difference (other minus
    discriminated) and their ratio (discriminated over other), each rounded
    to 3 decimals and followed by a blank line. A value that cannot be
    computed (empty group, or a zero rate in the ratio's denominator) is
    printed as ``nan``.

    Raises
    ------
    ValueError
        If the condition is not one-dimensional or its length differs from
        the number of predictions.
    """
    outcomes = np.asarray(pred if predictions is None else predictions)
    # Select rows by position with a boolean mask. Looking the rows up by the
    # index labels of the feature table is only right when those labels are
    # exactly 0..n-1 in order.
    in_group = np.asarray(group_split_condition, dtype=bool)
    if in_group.ndim != 1 or len(in_group) != len(outcomes):
        raise ValueError(
            'group_split_condition must be one-dimensional with one entry per prediction'
        )

    # granted credit percentage for the discriminated group and for the rest
    discr_group_rate = _grant_rate(outcomes[in_group])
    other_group_rate = _grant_rate(outcomes[~in_group])

    # A zero rate in the denominator has no meaningful ratio; nan passes through.
    ratio = discr_group_rate / other_group_rate if other_group_rate else float('nan')

    print('Discriminated group credit grant rate: {}'.format(round(discr_group_rate, 3)))
    print('Other group credit grant rate:         {}'.format(round(other_group_rate, 3)))
    print('Difference:                            {}'.format(round(other_group_rate-discr_group_rate, 3)))
    print('Ratio:                                 {}'.format(round(ratio, 3)))
    print()

# Boolean masks over the rows of X, one per group assumed to be disadvantaged.
# NOTE(review): the value encodings (Sex == 1 as female, foreign worker == 1
# as immigrant) are taken from the variable names - confirm against the data.
female_condition = X['Sex'] == 1
immigrant_condition = X['foreign worker'] == 1
young_condition = X['Age'] <= 25

# Each entry pairs the printed heading with the mask of the group it reports
# on: single-attribute groups first, then their intersections.
group_reports = [
    ('Women - men', female_condition),
    ('Immigrant - non immigrant', immigrant_condition),
    ('Young - old', young_condition),
    ('Woman immigrant - other', female_condition & immigrant_condition),
    ('Young immigrant - other', young_condition & immigrant_condition),
    ('Young woman - other', young_condition & female_condition),
    ('Young woman immigrant - other',
     female_condition & young_condition & immigrant_condition),
]

for heading, condition in group_reports:
    print(heading)
    group_fairness(condition)


Women - men
Discriminated group credit grant rate: 0.719
Other group credit grant rate:         0.793
Difference:                            0.073
Ratio:                                 0.907

Immigrant - non immigrant
Discriminated group credit grant rate: 0.764
Other group credit grant rate:         0.919
Difference:                            0.155
Ratio:                                 0.832

Young - old
Discriminated group credit grant rate: 0.647
Other group credit grant rate:         0.799
Difference:                            0.151
Ratio:                                 0.81

Woman immigrant - other
Discriminated group credit grant rate: 0.716
Other group credit grant rate:         0.793
Difference:                            0.077
Ratio:                                 0.903

Young immigrant - other
Discriminated group credit grant rate: 0.642
Other group credit grant rate:         0.8
Difference:                            0.158
Ratio:                                 0.803

