In [178]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, LeaveOneOut

import numpy as np
import pandas as pd
import random
import math
from tqdm.auto import tqdm

In [175]:
# Load the feature table and the label table; both CSVs use their first column as the index.
dfX = pd.read_csv("data/X.csv", index_col=0)
# Fixed: this line used to read `df.values`, but the frame loaded above is named `dfX`
# (no `df` is defined in the notebook), so a fresh kernel raised NameError here.
X = dfX.values
dfY = pd.read_csv("data/y.csv", index_col=0)
y = dfY["class"].values

In [3]:
# These are the best hyperparameters.
# random_state is pinned so that the fitted forest, the class probabilities derived from it
# and the cross-validated predictions are reproducible under Restart & Run All.
classifier = RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
                                    criterion='entropy', max_depth=20, max_features=None,
                                    min_samples_leaf=1, min_samples_split=5, n_estimators=100,
                                    random_state=0)

In [5]:
# Fit on the full data set; binding the result to `_` keeps the estimator repr out of the cell output.
_ = classifier.fit(X=X, y=y)

In [43]:
# credit is the complement of the stored class column (credit == 1 where class == 0).
credit = 1 - dfY["class"]

# Boolean masks for the three groups that are compared below.
young = dfX["Age"] <= 25
woman = dfX["Sex"] == 1
foreign = dfX["foreign worker"] == 1

# First column of predict_proba on the training data.
# NOTE(review): presumably this is P(class == 0), i.e. P(credit == 1) — confirm via classifier.classes_.
probs = pd.Series(data=classifier.predict_proba(X)[:, 0], name="class_prob")

# Side-by-side view of the group masks, the label and the model probability.
table = pd.concat([young, woman, foreign, credit, probs], axis="columns")

In [148]:
def get_group_ratio(labels):
    """Print the label total and the positive-rate ratio for each group split.

    For every notebook-level mask (`young`, `woman`, `foreign`) the share of
    positive labels inside the mask is divided by the share outside it.

    Parameters
    ----------
    labels : pandas.Series
        0/1 labels, indexable by the notebook-level boolean masks.

    Returns
    -------
    None
        Everything is reported via ``print``.
    """
    def share(mask):
        # Fraction of positive labels among the rows selected by `mask`.
        selected = labels[mask]
        return selected.sum() / selected.size

    print(labels.sum())
    print(f"ratio young to old:\t{share(young) / share(~young)}")
    print(f"ratio female to male:\t{share(woman) / share(~woman)}")
    print(f"ratio foreign to else:\t{share(foreign) / share(~foreign)}")

In [154]:
def adjust(criterion, labels, probs, p=1):
    """Move the positive-label rates of a group and its complement towards the overall rate.

    The group with the lower positive rate gets some of its 0 labels flipped to 1
    (highest `probs` first); the group with the higher rate gets some of its 1 labels
    flipped to 0 (lowest `probs` first). The number of flips in each group is the
    count needed to reach the overall rate, scaled by `p`.

    Parameters
    ----------
    criterion : pandas.Series of bool
        Group membership mask, index-aligned with `labels`.
    labels : pandas.Series
        0/1 labels. **Modified in place.**
    probs : pandas.Series
        Scores used to rank candidates for flipping. Used positionally as the sort
        key for `labels`, so it must have the same length and row order.
    p : float, default 1
        Fraction of the full correction to apply (0 = no change, 1 = full correction).

    Returns
    -------
    None
    """
    group_size = labels[criterion].size
    rest_size = labels[~criterion].size
    if group_size == 0 or rest_size == 0:
        # Nothing to balance against; the rates below would be 0/0.
        return

    overall_rate = labels.sum() / labels.size
    group_rate = labels[criterion].sum() / group_size
    rest_rate = labels[~criterion].sum() / rest_size
    if group_rate > rest_rate:
        # Always treat `criterion` as the lower-rate side. Fixed: `p` is now forwarded;
        # previously the swapped call fell back to the default p=1 and applied the
        # full correction regardless of the strength the caller asked for.
        return adjust(~criterion, labels, probs, p=p)

    n_promote = int(round(group_size * (overall_rate - group_rate) * p))
    n_demote = int(round(rest_size * (rest_rate - overall_rate) * p))

    # The key ignores its argument and returns `probs`, i.e. labels are ordered by score.
    by_score_desc = labels.sort_values(key=lambda _: probs, ascending=False)
    by_score_asc = labels.sort_values(key=lambda _: probs, ascending=True)

    # Both selections are computed from the labels as they are *before* either update.
    promoted = by_score_desc[criterion & (labels == False)][:n_promote].replace(0, value=1)
    demoted = by_score_asc[(~criterion) & (labels == True)][:n_demote].replace(1, value=0)
    labels.update(promoted)
    labels.update(demoted)

In [169]:
# Start again from the unadjusted labels so that re-running this cell does not
# stack a second round of adjustments on top of an already adjusted `credit`.
credit = 1 - dfY["class"]
print("== No adjustment ==")
get_group_ratio(credit)
print(f"ratio overall: \t\t{credit.sum() / credit.size}")

# A dedicated, seeded generator makes the order in which the criteria are applied
# (and therefore the adjusted labels saved afterwards) reproducible, without
# touching the global `random` state.
shuffle_rng = random.Random(0)

# Apply the adjustment in n rounds of increasing strength p = 0, 1/n, ..., (n-1)/n,
# visiting the three criteria in a fresh random order each round.
n = 100
for i in range(n):
    criteria = [young, woman, foreign]
    shuffle_rng.shuffle(criteria)
    for criterion in criteria:
        adjust(criterion, credit, probs, p=i/n)
print("== With Adjustment ==")
get_group_ratio(credit)
print(f"ratio overall: \t\t{credit.sum() / credit.size}")

== No adjustment ==
700
ratio young to old:	0.7948260481712757
ratio female to male:	0.8965673282047968
ratio foreign to else:	0.7765820195726738
ratio overall: 		0.7
== With Adjustment ==
699
ratio young to old:	0.97400795486079
ratio female to male:	0.9493745885450954
ratio foreign to else:	0.9945283169582235
ratio overall: 		0.699


In [171]:
# Persist the adjusted labels, converted back from `credit` to the encoding of the class column.
(1 - credit).to_csv(path_or_buf="data/y_adjusted.csv")

In [186]:
# Leave-one-out predictions for the classifier trained on the adjusted labels
# (converted back from `credit` to the original class encoding).
loo = LeaveOneOut()
adjusted_targets = (1 - credit).values
cv_results = cross_val_predict(classifier, X, adjusted_targets, cv=loo)

# Written to disk so a later cell can reload the predictions instead of re-running the CV.
cv_frame = pd.DataFrame(cv_results)
cv_frame.to_csv('cross_val_predictions_adjusted.csv')

In [187]:
# Score the leave-one-out predictions against `y`.
# NOTE(review): `y` is not set in this cell; presumably it is still the unadjusted label
# vector while `cv_results` comes from a model fit on adjusted labels — confirm this is intended.
print('Test accuracy:', accuracy_score(y, cv_results), sep='\n')

Test accuracy:
0.776


In [190]:
# Re-read X as a DataFrame to get column names (this rebinds the name `X`).
X = pd.read_csv("data/X.csv", index_col=0)

# Reload the saved predictions; column 0 of the file is the index that to_csv wrote,
# column 1 holds the predicted class.
saved_predictions = pd.read_csv('cross_val_predictions_adjusted.csv')
cv_results = saved_predictions.iloc[:, 1]

# Flip y indexing: 1 = credit granted, 0 = no credit granted.
# The result is a 2-D array with a single column.
flipped_predictions = pd.DataFrame(cv_results).replace({0: 1, 1: 0})
pred = np.array(flipped_predictions)

In [191]:
# Implement group fairness measures (assume women being inferiorly treated to men,
# immigrant inferiorly treated to nonimmigrant, and young to old (split at 25)).

def group_fairness(group_split_condition, predictions=None):
    """Print credit-grant rates for a group and its complement, plus their difference and ratio.

    Parameters
    ----------
    group_split_condition : boolean pandas.Series or array-like
        True for rows that belong to the (assumed) discriminated group. Must have
        one entry per prediction, in the same row order.
    predictions : array-like of 0/1, optional
        Predictions with 1 = credit granted. Defaults to the notebook-level `pred`.

    Returns
    -------
    None
        The four figures are printed, followed by a blank line.
    """
    if predictions is None:
        predictions = pred
    predictions = np.asarray(predictions)

    # Select rows by boolean position rather than by index label: the prediction array
    # is positional, so label-based lookup is only correct when the feature table
    # happens to carry a 0..n-1 index.
    in_group = np.asarray(group_split_condition, dtype=bool)
    discr_predictions = predictions[in_group]
    other_predictions = predictions[~in_group]

    # granted credit percentage for the discriminated group and for everyone else
    discr_group_rate = discr_predictions.sum() / discr_predictions.size
    other_group_rate = other_predictions.sum() / other_predictions.size
    print('Discriminated group credit grant rate: {}'.format(discr_group_rate))
    print('Other group credit grant rate: {}'.format(other_group_rate))
    print('Difference: {}'.format(other_group_rate-discr_group_rate))
    print('Ratio: {}'.format(discr_group_rate/other_group_rate))
    print()

# Base group masks.
female_condition = X['Sex'] == 1
immigrant_condition = X['foreign worker'] == 1
young_condition = X['Age'] <= 25

# (heading, mask) pairs: the three single-attribute groups first, then their intersections.
fairness_cases = [
    ('Women - men', female_condition),
    ('Immigrant - non immigrant', immigrant_condition),
    ('Young - old', young_condition),
    ('Woman immigrant - other', female_condition & immigrant_condition),
    ('Young immigrant - other', young_condition & immigrant_condition),
    ('Young woman - other', young_condition & female_condition),
    ('Young woman immigrant - other', female_condition & young_condition & immigrant_condition),
]

for heading, condition in fairness_cases:
    print(heading)
    group_fairness(condition)


Women - men
Discriminated group credit grant rate: 0.7387096774193549
Other group credit grant rate: 0.755072463768116
Difference: 0.016362786348761094
Ratio: 0.9783295152002972

Immigrant - non immigrant
Discriminated group credit grant rate: 0.7455867082035307
Other group credit grant rate: 0.8648648648648649
Difference: 0.11927815666133423
Ratio: 0.8620846313603323

Young - old
Discriminated group credit grant rate: 0.7052631578947368
Other group credit grant rate: 0.7604938271604939
Difference: 0.05523066926575704
Ratio: 0.9273752563226246

Woman immigrant - other
Discriminated group credit grant rate: 0.7392739273927392
Other group credit grant rate: 0.7546628407460545
Difference: 0.01538891335331527
Ratio: 0.979608226982394

Young immigrant - other
Discriminated group credit grant rate: 0.7058823529411765
Other group credit grant rate: 0.7601476014760148
Difference: 0.05426524853483827
Ratio: 0.9286122215876642

Young woman - other
Discriminated group credit grant rate: 0.7142857