In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from fairdata import FairData
from fairopt import FairOptimization

In [2]:
df_raw_1 = pd.read_excel('data/Fintech-fairnessJun2020.xlsx', sheet_name='Approved&Default')
df_raw_1

Unnamed: 0,customer_id,loan_request_id,loan_transferred_date,def_flag,age,noofconnections,noofapps,noofsms,noofcontacts,gender
0,144,199.0,2016-04-18 10:34:00,0.0,29.9315,401.0,19.0,472.0,358.0,
1,144,1658.0,2016-06-30 10:04:00,1.0,29.9315,401.0,19.0,472.0,358.0,
2,144,1039.0,2016-06-03 02:18:00,0.0,29.9315,401.0,19.0,472.0,358.0,
3,144,417.0,2016-05-04 10:18:00,0.0,29.9315,401.0,19.0,472.0,358.0,
4,150,218.0,2016-04-19 14:34:00,0.0,32.0740,811.0,10.0,584.0,1820.0,m
...,...,...,...,...,...,...,...,...,...,...
383756,3200000,,NaT,,,,,,,m
383757,3200000,,NaT,,,,,,,m
383758,3200000,,NaT,,,,,,,m
383759,3200000,,NaT,,,,,,,m


In [3]:
df_raw_2 = pd.read_excel('data/Fintech-fairnessJun2020.xlsx', sheet_name='Rejected')
df_raw_2

Unnamed: 0,customer_id,age,noofconnections,noofapps,noofsms,noofcontacts,loan_request_initial_id,gender
0,185,30.6932,0.0,54.0,3646.0,217.0,12116,m
1,237,30.1973,1556.0,97.0,302.0,589.0,145910,m
2,290,47.6575,511.0,4.0,7.0,25996.0,135,m
3,769,47.4247,249.0,18.0,366.0,565.0,193588,m
4,803,31.3452,109.0,39.0,719.0,226.0,1808,m
...,...,...,...,...,...,...,...,...
49979,3200000,26.3123,0.0,21.0,,209.0,711504,m
49980,3200000,25.5945,384.0,30.0,,663.0,711581,m
49981,3200000,24.1973,938.0,,,0.0,711596,m
49982,3200000,20.1370,0.0,98.0,,409.0,711610,m


In [6]:
df_raw_1['reward'] = 2 * (0.5 - df_raw_1['def_flag'])
df_raw_2['reward'] = 0
df_raw_1['approved_dum'] = 1
df_raw_2['approved_dum'] = 0
df_raw = pd.concat([
    df_raw_1.drop(['loan_transferred_date', 'def_flag'], axis=1),
    df_raw_2.rename(columns={'loan_request_initial_id': 'loan_request_id'}),
]).rename(columns={
    'noofconnections': 'connections',
    'noofapps': 'apps',
    'noofsms': 'sms',
    'noofcontacts': 'contacts',
}).astype({
    'customer_id': 'Int64',
    'loan_request_id': 'Int64',
    'age': 'float',
    'connections': 'float',
    'apps': 'float',
    'sms': 'float',
    'contacts': 'float',
    'approved_dum': 'Int64',
    'reward': 'Int64',
})
df_raw


Unnamed: 0,age,approved_dum,customer_id,gender,loan_request_id,apps,connections,contacts,sms,reward
0,29.9315,1,144,,199,19.0,401.0,358.0,472.0,1
1,29.9315,1,144,,1658,19.0,401.0,358.0,472.0,-1
2,29.9315,1,144,,1039,19.0,401.0,358.0,472.0,1
3,29.9315,1,144,,417,19.0,401.0,358.0,472.0,1
4,32.0740,1,150,m,218,10.0,811.0,1820.0,584.0,1
...,...,...,...,...,...,...,...,...,...,...
49979,26.3123,0,3200000,m,711504,21.0,0.0,209.0,,0
49980,25.5945,0,3200000,m,711581,30.0,384.0,663.0,,0
49981,24.1973,0,3200000,m,711596,,938.0,0.0,,0
49982,20.1370,0,3200000,m,711610,98.0,0.0,409.0,,0


In [7]:
df_csv = pd.read_csv('data/Cashe_information.csv').rename(columns={
    'AGE': 'age',
}).astype({
    'customer_id': 'Int64',
    'loan_request_id': 'Int64',
    'age': 'float',
    'connections': 'float',
    'apps': 'float',
    'sms': 'float',
    'contacts': 'float',
    'approved_dum': 'Int64',
})
df_csv

Unnamed: 0,customer_id,loan_request_id,loan_amount,def_flag,age,salary,referal,referer,connections,apps,...,highschool,college,googleplus_status,sales,dating,ios,travel_app,finsavy_app,socialconnect_app,mloan_app
0,320,16,30000.0,0.0,34.0466,85449.0,0.0,0.0,25.0,,...,1.0,0.0,0.0,0.0,0.0,1,1,0,0,0
1,301,18,10000.0,0.0,45.2301,42603.0,0.0,1.0,1050.0,56.0,...,1.0,0.0,0.0,0.0,0.0,0,1,1,1,0
2,302,20,20000.0,0.0,53.7123,44783.0,0.0,0.0,,35.0,...,1.0,0.0,0.0,0.0,0.0,0,1,0,1,0
3,286,21,5000.0,0.0,43.3644,38500.0,0.0,0.0,186.0,22.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
4,302,19,20000.0,0.0,53.7123,44783.0,0.0,0.0,,35.0,...,1.0,0.0,0.0,0.0,0.0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417670,549179,,25000.0,,47.5616,25000.0,,,4290.0,49.0,...,0.0,0.0,,,,0,0,0,0,0
417671,1984639,,11000.0,,40.5452,27000.0,0.0,0.0,,62.0,...,1.0,0.0,1.0,,,0,0,0,0,0
417672,776459,,5000.0,,22.5205,15000.0,,,36.0,,...,0.0,0.0,,,,1,0,0,0,0
417673,1865368,,5000.0,,26.4164,25000.0,0.0,0.0,4976.0,48.0,...,1.0,0.0,0.0,,,0,0,0,0,0


In [8]:
df_all = pd.merge(
    df_raw.dropna(),
    df_csv.loc[:, ['loan_request_id', 'salary', 'loan_amount', 'CIBIL']].dropna(), 
    how='left', on='loan_request_id').dropna()
df_all

Unnamed: 0,age,approved_dum,customer_id,gender,loan_request_id,apps,connections,contacts,sms,reward,salary,loan_amount,CIBIL
0,32.0740,1,150,m,218,10.0,811.0,1820.0,584.0,1,40000.0,5000.0,707.0
1,32.0740,1,150,m,97964,10.0,811.0,1820.0,584.0,-1,40000.0,40000.0,707.0
2,32.0740,1,150,m,66,10.0,811.0,1820.0,584.0,1,40000.0,5000.0,707.0
3,32.0740,1,150,m,2312,10.0,811.0,1820.0,584.0,1,40000.0,5000.0,707.0
4,32.0740,1,150,m,3754,10.0,811.0,1820.0,584.0,1,40000.0,5000.0,707.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
261924,18.3178,0,1600000,m,282451,24.0,1.0,171.0,121.0,0,115000.0,200000.0,747.0
261958,30.0904,0,1600000,m,283249,57.0,736.0,862.0,1638.0,0,50251.0,101000.0,663.0
261970,18.6904,0,1600000,m,283473,37.0,196.0,203.0,1328.0,0,68328.0,137000.0,745.0
261987,27.9096,0,1600000,m,283850,48.0,9.0,878.0,237.0,0,32012.0,64000.0,716.0


In [9]:
del df_raw_1, df_raw_2, df_raw, df_csv

In [9]:
df_gender = df_all[~df_all.gender.isna()]
df_gender['gender_dum'] = 1
df_gender.loc[df_gender.gender == 'f', 'gender_dum'] = 0
df_gender = df_gender.drop('gender', axis=1).astype({'approved_dum': 'int64'})
df_gender

Unnamed: 0,customer_id,loan_request_id,age,connections,apps,sms,contacts,approved_dum,salary,loan_amount,CIBIL,gender_dum
0,150,218,32.0740,811.0,10.0,584.0,1820.0,1,40000.0,5000.0,707.0,1
1,150,97964,32.0740,811.0,10.0,584.0,1820.0,1,40000.0,40000.0,707.0,1
2,150,66,32.0740,811.0,10.0,584.0,1820.0,1,40000.0,5000.0,707.0,1
3,150,2312,32.0740,811.0,10.0,584.0,1820.0,1,40000.0,5000.0,707.0,1
4,150,3754,32.0740,811.0,10.0,584.0,1820.0,1,40000.0,5000.0,707.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
261924,1600000,282451,18.3178,1.0,24.0,121.0,171.0,0,115000.0,200000.0,747.0,1
261958,1600000,283249,30.0904,736.0,57.0,1638.0,862.0,0,50251.0,101000.0,663.0,1
261970,1600000,283473,18.6904,196.0,37.0,1328.0,203.0,0,68328.0,137000.0,745.0,1
261987,1600000,283850,27.9096,9.0,48.0,237.0,878.0,0,32012.0,64000.0,716.0,1


In [10]:
df_g = df_gender.drop(['customer_id', 'loan_request_id'], axis=1)
log_vars = ['loan_amount', 'salary', 'connections',  'apps', 'sms', 'contacts']
for c in log_vars:
    df_g[c] = np.log(df_g[c] + 1)
norm_vars = log_vars + ['age', 'CIBIL']
scaler = StandardScaler().fit(df_g[norm_vars])
df_g[norm_vars] = scaler.transform(df_g[norm_vars])
df_g

Unnamed: 0,age,connections,apps,sms,contacts,approved_dum,salary,loan_amount,CIBIL,gender_dum
0,0.022181,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,-1.302074,0.469424,1
1,0.022181,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,1.156955,0.469424,1
2,0.022181,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,-1.302074,0.469424,1
3,0.022181,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,-1.302074,0.469424,1
4,0.022181,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,-1.302074,0.469424,1
...,...,...,...,...,...,...,...,...,...,...
261924,-2.445320,-2.702040,-1.024403,-0.808190,-0.729581,0,2.584841,3.060321,0.726930,1
261958,-0.333625,0.686491,0.358826,0.455738,0.377513,0,0.897138,2.252349,0.186168,1
261970,-2.378486,-0.070057,-0.336196,0.353732,-0.612465,0,1.523554,2.612887,0.714055,1
261987,-0.724803,-1.779172,0.081672,-0.483064,0.390122,0,-0.022064,1.712790,0.527363,1


In [11]:
# y = df_g.approved_dum.values.reshape(-1, 1)
# s = df_g.gender_dum.values.reshape(-1, 1)
# a = df_g.drop(['approved_dum', 'gender_dum'], axis=1).values

# n_test = 10000

# shuffle_idx = np.arange(y.shape[0])
# np.random.shuffle(shuffle_idx)
# y_train, y_test = y[shuffle_idx[n_test:]], y[shuffle_idx[:n_test]]
# s_train, s_test, s_eval = s[shuffle_idx[n_test:]], s[shuffle_idx[:n_test]], np.array([0, 1] * n_test).reshape(-1, 1)
# a_train, a_test, a_eval = a[shuffle_idx[n_test:]], a[shuffle_idx[:n_test]], np.repeat(a[shuffle_idx[:n_test]], 2, axis=0)

In [12]:
# df_g_fair_o = FairData(s_train, a_train, y_train, preprocess_method='o')
# decision = df_g.iloc[np.repeat(shuffle_idx[:n_test], 2)].copy()
# decision.gender = s_eval
# decision['y_ml'] = df_g_fair_o.f_ml(s_eval, a_eval)
# decision['y_eo'] = df_g_fair_o.f_eo(a_eval)
# decision['y_aa'] = df_g_fair_o.f_aa(s_eval, a_eval)
# decision['y_1'] = df_g_fair_o.f_1(s_eval, a_eval)
# decision['y_2'] = df_g_fair_o.f_2(s_eval, a_eval)
# decision

In [13]:
# eval_o = df_g_fair_o.evaluate(s_test, a_test, y_test, ['eo', 'cf', 'mae'])
# pd.DataFrame(eval_o, index=['eo', 'cf', 'mae'], columns=['y_ml', 'y_ftu', 'y_eo', 'y_aa', 'y_1', 'y_2'])

In [14]:
# df_g_fair_o.cit_parametric(summary=True)

In [15]:
df_age = df_gender.copy()
df_age['age_dum'] = 1
df_age.loc[df_age.age < 28, 'age_dum'] = 0
df_age

Unnamed: 0,customer_id,loan_request_id,age,connections,apps,sms,contacts,approved_dum,salary,loan_amount,CIBIL,gender_dum,age_dum
0,150,218,32.0740,811.0,10.0,584.0,1820.0,1,40000.0,5000.0,707.0,1,1
1,150,97964,32.0740,811.0,10.0,584.0,1820.0,1,40000.0,40000.0,707.0,1,1
2,150,66,32.0740,811.0,10.0,584.0,1820.0,1,40000.0,5000.0,707.0,1,1
3,150,2312,32.0740,811.0,10.0,584.0,1820.0,1,40000.0,5000.0,707.0,1,1
4,150,3754,32.0740,811.0,10.0,584.0,1820.0,1,40000.0,5000.0,707.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
261924,1600000,282451,18.3178,1.0,24.0,121.0,171.0,0,115000.0,200000.0,747.0,1,0
261958,1600000,283249,30.0904,736.0,57.0,1638.0,862.0,0,50251.0,101000.0,663.0,1,1
261970,1600000,283473,18.6904,196.0,37.0,1328.0,203.0,0,68328.0,137000.0,745.0,1,0
261987,1600000,283850,27.9096,9.0,48.0,237.0,878.0,0,32012.0,64000.0,716.0,1,0


In [16]:
df_a = df_age.drop(['customer_id', 'loan_request_id', 'age', 'CIBIL', 'loan_amount'], axis=1)
log_vars = ['salary', 'connections',  'apps', 'sms', 'contacts']
for c in log_vars:
    df_a[c] = np.log(df_a[c] + 1)
norm_vars = log_vars
scaler = StandardScaler().fit(df_a[norm_vars])
df_a[norm_vars] = scaler.transform(df_a[norm_vars])
df_a

Unnamed: 0,connections,apps,sms,contacts,approved_dum,salary,gender_dum,age_dum
0,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,1,1
1,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,1,1
2,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,1,1
3,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,1,1
4,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,1,1
...,...,...,...,...,...,...,...,...
261924,-2.702040,-1.024403,-0.808190,-0.729581,0,2.584841,1,0
261958,0.686491,0.358826,0.455738,0.377513,0,0.897138,1,1
261970,-0.070057,-0.336196,0.353732,-0.612465,0,1.523554,1,0
261987,-1.779172,0.081672,-0.483064,0.390122,0,-0.022064,1,0


In [17]:
np.random.seed(1)

y = df_a.approved_dum.values.reshape(-1, 1)
s = df_a.age_dum.values.reshape(-1, 1)
a = df_a.drop(['approved_dum', 'age_dum'], axis=1).values

n_test = 10000

shuffle_idx = np.arange(y.shape[0])
np.random.shuffle(shuffle_idx)
y_train, y_test = y[shuffle_idx[n_test:]], y[shuffle_idx[:n_test]]
s_train, s_test, s_eval = s[shuffle_idx[n_test:]], s[shuffle_idx[:n_test]], np.array([0, 1] * n_test).reshape(-1, 1)
a_train, a_test, a_eval = a[shuffle_idx[n_test:]], a[shuffle_idx[:n_test]], np.repeat(a[shuffle_idx[:n_test]], 2, axis=0)

In [18]:
df_a_fair_o = FairData(s_train, a_train, y_train, preprocess_method='o')
eval_o = df_a_fair_o.evaluate(s_test, a_test, y_test, ['eo', 'cf', 'mae'])
pd.DataFrame(eval_o, index=['eo', 'cf', 'mae'], columns=['y_ml', 'y_ftu', 'y_eo', 'y_aa', 'y_1', 'y_2'])

Unnamed: 0,y_ml,y_ftu,y_eo,y_aa,y_1,y_2
eo,0.047609,0.0,0.0,0.007883,0.007818,0.007778
cf,0.036733,0.005647,0.008429,0.000969,0.000965,0.000959
mae,0.124969,0.126105,0.126727,0.126556,0.125827,0.125827


In [19]:
df_a_fair_m = FairData(s_train, a_train, y_train, preprocess_method='m')
eval_m = df_a_fair_m.evaluate(s_test, a_test, y_test, ['eo', 'cf', 'mae'])
pd.DataFrame(eval_m, index=['eo', 'cf', 'mae'], columns=['y_ml', 'y_ftu', 'y_eo', 'y_aa', 'y_1', 'y_2'])

Unnamed: 0,y_ml,y_ftu,y_eo,y_aa,y_1,y_2
eo,0.047609,0.0,0.0,0.007883,0.009826,0.010484
cf,0.036733,0.005647,0.008429,0.000969,7.2e-05,0.000111
mae,0.124969,0.126105,0.126727,0.126556,0.125743,0.12579


In [20]:
df_a_fair_r = FairData(s_train, a_train, y_train, preprocess_method='r')
eval_r = df_a_fair_r.evaluate(s_test, a_test, y_test, ['eo', 'cf', 'mae'])
pd.DataFrame(eval_r, index=['eo', 'cf', 'mae'], columns=['y_ml', 'y_ftu', 'y_eo', 'y_aa', 'y_1', 'y_2'])

Unnamed: 0,y_ml,y_ftu,y_eo,y_aa,y_1,y_2
eo,0.047609,0.0,0.0,0.007883,0.009888,0.010507
cf,0.036733,0.005647,0.008429,0.000969,4.8e-05,0.000149
mae,0.124969,0.126105,0.126727,0.126556,0.125724,0.125799


In [21]:
df_a_fair_r.a_iscategory

[False, True, False, False, False, True]

In [26]:
df_b = df_a.copy()
df_b['dum'] = 0
df_b.loc[(df_b.age_dum == 0) & (df_b.gender_dum == 1), 'dum'] = 1
df_b.loc[(df_b.age_dum == 1) & (df_b.gender_dum == 0), 'dum'] = 2
df_b.loc[(df_b.age_dum == 1) & (df_b.gender_dum == 1), 'dum'] = 3
df_b

Unnamed: 0,connections,apps,sms,contacts,approved_dum,salary,gender_dum,age_dum,dum
0,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,1,1,3
1,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,1,1,3
2,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,1,1,3
3,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,1,1,3
4,0.742061,-2.373794,-0.045504,0.890059,1,0.432050,1,1,3
...,...,...,...,...,...,...,...,...,...
261924,-2.702040,-1.024403,-0.808190,-0.729581,0,2.584841,1,0,1
261958,0.686491,0.358826,0.455738,0.377513,0,0.897138,1,1,3
261970,-0.070057,-0.336196,0.353732,-0.612465,0,1.523554,1,0,1
261987,-1.779172,0.081672,-0.483064,0.390122,0,-0.022064,1,0,1


In [27]:
np.random.seed(1)

y = df_b.approved_dum.values.reshape(-1, 1)
s = df_b.dum.values.reshape(-1, 1)
a = df_b.drop(['approved_dum', 'age_dum', 'gender_dum', 'dum'], axis=1).values

n_test = 10000

shuffle_idx = np.arange(y.shape[0])
np.random.shuffle(shuffle_idx)
y_train, y_test = y[shuffle_idx[n_test:]], y[shuffle_idx[:n_test]]
s_train, s_test, s_eval = s[shuffle_idx[n_test:]], s[shuffle_idx[:n_test]], np.array([0, 1] * n_test).reshape(-1, 1)
a_train, a_test, a_eval = a[shuffle_idx[n_test:]], a[shuffle_idx[:n_test]], np.repeat(a[shuffle_idx[:n_test]], 2, axis=0)

In [28]:
df_b_fair_o = FairData(s_train, a_train, y_train, preprocess_method='o')
eval_o = df_b_fair_o.evaluate(s_test, a_test, y_test, ['eo', 'cf', 'mae'])
pd.DataFrame(eval_o, index=['eo', 'cf', 'mae'], columns=['y_ml', 'y_ftu', 'y_eo', 'y_aa', 'y_1', 'y_2'])

Unnamed: 0,y_ml,y_ftu,y_eo,y_aa,y_1,y_2
eo,0.051645,0.0,0.0,0.014663,0.014539,0.014438
cf,0.039234,0.013032,0.015422,0.001123,0.001117,0.001115
mae,0.124886,0.126101,0.126719,0.126567,0.125835,0.125834


In [74]:
df_b_fair_o.ml.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,193656.0
Model:,Logit,Df Residuals:,193647.0
Method:,MLE,Df Model:,8.0
Date:,"Wed, 23 Sep 2020",Pseudo R-squ.:,0.03138
Time:,15:40:43,Log-Likelihood:,-46848.0
converged:,True,LL-Null:,-48365.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,2.5047,0.049,51.098,0.000,2.409,2.601
x2,2.1805,0.017,130.204,0.000,2.148,2.213
x3,2.8940,0.040,72.137,0.000,2.815,2.973
x4,2.8714,0.012,233.602,0.000,2.847,2.895
x5,0.1789,0.009,20.375,0.000,0.162,0.196
x6,0.2703,0.009,29.968,0.000,0.253,0.288
x7,0.1728,0.009,19.139,0.000,0.155,0.190
x8,-0.1164,0.010,-11.406,0.000,-0.136,-0.096
x9,-0.1312,0.009,-14.294,0.000,-0.149,-0.113


In [29]:
df_b_fair_m = FairData(s_train, a_train, y_train, preprocess_method='m')
eval_m = df_b_fair_m.evaluate(s_test, a_test, y_test, ['eo', 'cf', 'mae'])
pd.DataFrame(eval_m, index=['eo', 'cf', 'mae'], columns=['y_ml', 'y_ftu', 'y_eo', 'y_aa', 'y_1', 'y_2'])

Unnamed: 0,y_ml,y_ftu,y_eo,y_aa,y_1,y_2
eo,0.051645,0.0,0.0,0.014663,0.016576,0.016719
cf,0.039234,0.013032,0.015422,0.001123,0.000766,0.000745
mae,0.124886,0.126101,0.126719,0.126567,0.125755,0.125772


In [30]:
df_b_fair_m.cit_parametric(summary=True)

<F test: F=array([[25127.24149695]]), p=0.0, df_denom=1.94e+05, df_num=3>


0.0

In [59]:
p = df_b_fair_m.f_2(s_train, a_train, 'm')
y_hat = np.random.binomial(1, 0.95, len(y_train)).reshape(-1, 1)
df_b_processed = FairData(s_train, a_train, y_hat, preprocess_method='m')
df_b_processed.cit_parametric(summary=True)

<F test: F=array([[25759.31690959]]), p=0.0, df_denom=1.94e+05, df_num=3>


0.0

In [66]:
df_downsamp_0 = df_b[df_b.approved_dum == 0]
df_downsamp_1 = df_b[df_b.approved_dum == 1].sample(len(df_downsamp_0))
df_downsamp = pd.concat([df_downsamp_0, df_downsamp_1])
df_downsamp

Unnamed: 0,connections,apps,sms,contacts,approved_dum,salary,gender_dum,age_dum,dum
240043,-3.099497,0.271532,0.844877,-0.566906,0,0.753932,1,1,3
240044,1.115360,1.220952,-0.365584,0.116484,0,-0.446098,1,1,3
240045,0.477619,-3.669729,-2.133790,2.714895,0,-0.295031,1,1,3
240046,0.066562,-1.475476,-0.272350,0.087979,0,0.432050,1,1,3
240048,0.488710,1.286706,0.201531,0.925681,0,-1.369973,1,0,1
...,...,...,...,...,...,...,...,...,...
24265,0.849892,0.179342,-0.495486,-4.262774,1,-0.154390,1,0,1
187497,-0.270022,-0.133020,0.039026,-0.425172,1,0.658516,1,1,3
41203,-2.702040,-2.703623,-1.641612,-0.524183,1,-1.126989,1,0,1
13730,0.294454,1.647014,-0.106566,-0.215022,1,1.572837,1,1,3


In [68]:
np.random.seed(1)

y = df_downsamp.approved_dum.values.reshape(-1, 1)
s = df_downsamp.dum.values.reshape(-1, 1)
a = df_downsamp.drop(['approved_dum', 'age_dum', 'gender_dum', 'dum'], axis=1).values

n_test = 10000

shuffle_idx = np.arange(y.shape[0])
np.random.shuffle(shuffle_idx)
y_train, y_test = y[shuffle_idx[n_test:]], y[shuffle_idx[:n_test]]
s_train, s_test, s_eval = s[shuffle_idx[n_test:]], s[shuffle_idx[:n_test]], np.array([0, 1] * n_test).reshape(-1, 1)
a_train, a_test, a_eval = a[shuffle_idx[n_test:]], a[shuffle_idx[:n_test]], np.repeat(a[shuffle_idx[:n_test]], 2, axis=0)

In [69]:
df_downsamp_fair_o = FairData(s_train, a_train, y_train, preprocess_method='o')
eval_o = df_downsamp_fair_o.evaluate(s_test, a_test, y_test, ['eo', 'cf', 'mae'])
pd.DataFrame(eval_o, index=['eo', 'cf', 'mae'], columns=['y_ml', 'y_ftu', 'y_eo', 'y_aa', 'y_1', 'y_2'])

Unnamed: 0,y_ml,y_ftu,y_eo,y_aa,y_1,y_2
eo,0.153452,0.0,0.0,0.038834,0.038915,0.038944
cf,0.147116,0.03473,0.036162,0.002751,0.002758,0.002767
mae,0.471849,0.480594,0.480206,0.479685,0.479642,0.479629


In [70]:
df_downsamp_fair_m = FairData(s_train, a_train, y_train, preprocess_method='m')
eval_m = df_downsamp_fair_m.evaluate(s_test, a_test, y_test, ['eo', 'cf', 'mae'])
pd.DataFrame(eval_m, index=['eo', 'cf', 'mae'], columns=['y_ml', 'y_ftu', 'y_eo', 'y_aa', 'y_1', 'y_2'])

Unnamed: 0,y_ml,y_ftu,y_eo,y_aa,y_1,y_2
eo,0.153452,0.0,0.0,0.038834,0.037291,0.037029
cf,0.147116,0.03473,0.036162,0.002751,0.003745,0.003693
mae,0.471849,0.480594,0.480206,0.479685,0.47992,0.480044


In [71]:
df_downsamp_fair_m.cit_parametric(summary=True)

<F test: F=array([[110.36026334]]), p=8.385252753101234e-71, df_denom=1.79e+04, df_num=3>


8.385252753101234e-71

In [73]:
p = df_downsamp_fair_m.f_2(s_train, a_train, 'o')
y_hat = np.random.binomial(1, p).reshape(-1, 1)
df_b_processed = FairData(s_train, a_train, y_hat, preprocess_method='o')
df_b_processed.cit_parametric(summary=True)

<F test: F=array([[10.53193649]]), p=6.451279021429456e-07, df_denom=1.79e+04, df_num=3>


6.451279021429456e-07