In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [17]:
from data import germanDataset
import transport
import numpy as np
import pandas as pd
from sklearn import preprocessing
from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt
pd.set_option("display.max_rows", None, "display.max_columns", None)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from graph import Map
import plotly.express as px
import plotly.graph_objects as go
from responsibly.dataset import GermanDataset

In [2]:
def adj_to_sankey(M):
    """Convert a bipartite flow (adjacency) matrix into Sankey link lists.

    Row ``i`` of ``M`` is source node ``i``; column ``j`` is target node
    ``len(M) + j``, i.e. target ids start right after the last source id so
    the two sides never share a node id, even when ``M`` is not square.

    Parameters
    ----------
    M : sequence of sequences (or 2-D array) of numbers
        ``M[i][j]`` is the flow from source ``i`` to target ``j``.

    Returns
    -------
    tuple of (list, list, list)
        ``(source, target, value)`` parallel lists holding one entry per
        strictly positive cell of ``M``, in row-major order.
    """
    n_sources = len(M)  # offset for target ids: number of source nodes
    source, target, value = [], [], []
    for i, row in enumerate(M):
        for j, flow in enumerate(row):
            if flow > 0:
                source.append(i)
                target.append(n_sources + j)
                value.append(flow)

    return source, target, value

In [3]:
def unfairness_metric(T, X0, X1):
    """Per-row transport cost, normalised by each row's largest distance.

    Parameters
    ----------
    T : array-like, shape (len(X0), len(X1))
        Transport weights between the rows of ``X0`` and the rows of ``X1``.
    X0, X1 : array-like, 2-D
        Point sets passed to ``scipy.spatial.distance_matrix``.

    Returns
    -------
    numpy.ndarray, shape (len(X0),)
        ``sum_j T[i, j] * d[i, j] / max_j d[i, j] * len(X0)`` for every row
        ``i``. Rows whose maximum distance is 0 (every point of ``X1``
        coincides with that row) yield 0 instead of NaN.
    """
    d = distance_matrix(X0, X1)
    max_distance = np.amax(d, axis=1)  # worst possible distance for each row
    uf = np.sum(np.multiply(T, d), axis=1)  # element-wise mult + row sums

    # Guarded division: where the worst case is 0 the cost is 0 as well, so
    # report 0 rather than the NaN that 0/0 would produce.
    ratio = np.divide(
        uf,
        max_distance,
        out=np.zeros_like(uf, dtype=float),
        where=max_distance > 0,
    )

    # Scale by the number of rows; presumably each row of T carries mass
    # 1/len(X0), which this undoes -- TODO confirm against transport.compute_map.
    return ratio * d.shape[0]

In [4]:
def parallel_plot(M, groups):
    """Show a two-column Sankey diagram of the flows between groups.

    The same group labels are used for the left (source) and right (target)
    columns, so ``M`` is expected to have one row and one column per group.

    Parameters
    ----------
    M : 2-D array-like
        ``M[i][j]`` is the flow from group ``i`` (left) to group ``j`` (right).
        Only strictly positive cells become links (see ``adj_to_sankey``).
    groups : sequence of str
        Group labels, in the same order as the rows/columns of ``M``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    src, tar, val = adj_to_sankey(M)
    color_swatch = px.colors.qualitative.Pastel
    labels = list(groups)

    # One colour per group, wrapping around the palette when there are more
    # groups than colours. Sized from the actual number of groups so that the
    # left and right copies of a group always share the same colour.
    group_colors = [
        color_swatch[i % len(color_swatch)] for i in range(len(labels))
    ]
    # Each link takes the colour of the group it leaves from.
    link_colors = [color_swatch[i % len(color_swatch)] for i in src]

    fig = go.Figure(data=[go.Sankey(
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(color = "gray", width = 0.5),
          label = labels * 2,
          color = group_colors * 2,
        ),
        link = dict(
          source = src,
          target = tar,
          value = val,
          color = link_colors,
      ))])

    fig.update_layout(font_size=10)
    fig.show()

In [81]:
# Original (unprocessed) German credit data frame, kept for quick lookups.
german_ds = GermanDataset().df
german_ds.head()


Columnar iteration over characters will be deprecated in future releases.



Unnamed: 0,status,duration,credit_history,purpose,credit_amount,savings,present_employment,installment_rate,status.1,sex,other_debtors,present_residence_since,property,age,age_factor,installment_plans,housing,number_of_existing_credits,job,number_of_people_liable_for,telephone,foreign_worker,credit
0,male,6,critical account/ other credits existing (not ...,radio/television,1169,unknown/ no savings account,.. >= 7 years,4,male,single,none,4,real estate,67,"[25, 76)",none,own,2,skilled employee/ official,1,True,True,good
1,female,48,existing credits paid back duly till now,radio/television,5951,... < 100 DM,1 <= ... < 4 years,2,female,divorced/separated/married,none,2,real estate,22,"[19, 25)",none,own,1,skilled employee/ official,1,False,True,bad
2,male,12,critical account/ other credits existing (not ...,education,2096,... < 100 DM,4 <= ... < 7 years,2,male,single,none,3,real estate,49,"[25, 76)",none,own,1,unskilled - resident,2,False,True,good
3,male,42,existing credits paid back duly till now,furniture/equipment,7882,... < 100 DM,4 <= ... < 7 years,2,male,single,guarantor,4,building society savings agreement / life insu...,45,"[25, 76)",none,for free,1,skilled employee/ official,2,False,True,good
4,male,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male,single,none,4,unknown / no property,53,"[25, 76)",none,for free,2,skilled employee/ official,2,False,True,bad


In [5]:
# cdf holds the processed data (including the `credit` classification).
cdf = germanDataset()

# Ground-truth classification and the matching row subsets of cdf.
Y_truth = cdf["credit"]
cdf0 = cdf[Y_truth == 0]
cdf1 = cdf[Y_truth == 1]

# Feature matrix: drop the sensitive attributes and the classification,
# then standardise the remaining columns.
X = preprocessing.scale(
    cdf.drop(columns=["age_factor", "age", "sex", "credit"]).to_numpy()
)

# Features split by ground truth.
X0 = X[Y_truth == 0]  # bad credit
X1 = X[Y_truth == 1]  # good credit



In [6]:
# Lookup table: column name of cdf -> positional index of that column, used
# to address fields of the rows returned by cdf.to_numpy().
f_columns = cdf.columns
column_ix = np.arange(len(f_columns))
f = dict(zip(f_columns, column_ix))
f

{'sex': 0,
 'duration': 1,
 'credit_amount': 2,
 'installment_rate': 3,
 'present_residence_since': 4,
 'age': 5,
 'age_factor': 6,
 'number_of_existing_credits': 7,
 'number_of_people_liable_for': 8,
 'telephone': 9,
 'foreign_worker': 10,
 'credit': 11,
 'critical account/ other credits existing (not at this bank)': 12,
 'existing credits paid back duly till now': 13,
 'delay in paying off in the past': 14,
 'no credits taken/ all credits paid back duly': 15,
 'all credits at this bank paid back duly': 16,
 'radio/television': 17,
 'education': 18,
 'furniture/equipment': 19,
 'car (new)': 20,
 'car (used)': 21,
 'business': 22,
 'domestic appliances': 23,
 'repairs': 24,
 'others': 25,
 'retraining': 26,
 'unknown/ no savings account': 27,
 '... < 100 DM': 28,
 '500 <= ... < 1000 DM': 29,
 '.. >= 1000 DM': 30,
 '100 <= ... < 500 DM': 31,
 '.. >= 7 years': 32,
 '1 <= ... < 4 years': 33,
 '4 <= ... < 7 years': 34,
 'unemployed': 35,
 '... < 1 year': 36,
 'single': 37,
 'divorced/separ

In [7]:
# Fit a logistic-regression classifier on the ground-truth credit labels.
clf = LogisticRegression(random_state=0)
clf.fit(X, Y_truth)
print(clf.score(X, Y_truth))  # mean accuracy on the data the model was fit on

# Predicted class probabilities, separated by ground-truth label
# (Y0: rows with credit == 0, Y1: rows with credit == 1).
truth_outcomes = clf.predict_proba(X)
Y0, Y1 = (truth_outcomes[Y_truth == label] for label in (0, 1))

0.77


In [8]:
# compute the transport map between people with good and bad credit
# (Y0 / Y1 are the classifier's predicted probabilities for the rows whose
# ground-truth credit is 0 / 1 respectively)
T = transport.compute_map(Y0, Y1)

In [41]:
# gender: sex == 0 is labelled "Male", sex > 0 is labelled "Female"
g = Map(T, cdf0.to_numpy(), cdf1.to_numpy(), symmetric=False)
for group_label, is_member in (
    ("Male", lambda x: x[f["sex"]] == 0),
    ("Female", lambda x: x[f["sex"]] > 0),
):
    g.add_group(is_member, group_label)
parallel_plot(g.group_adj, [gr[0] for gr in g.groups])

In [44]:
# age: two buckets split at 25. Each label now names the bucket its predicate
# actually selects (they were swapped before).
g = Map(T, cdf0.to_numpy(), cdf1.to_numpy(), symmetric = False)
g.add_group(lambda x: x[f["age"]] < 25, "[19, 25)")
g.add_group(lambda x: x[f["age"]] >= 25, "[25, 76)")
parallel_plot(g.group_adj,[gr[0] for gr in g.groups])

In [42]:
# gender and age: cross of sex (0 = Male, > 0 = Female) with the two age
# buckets. Each label names the bucket its predicate selects: age < 25 is
# "[19, 25)" and age >= 25 is "[25, 76)".
g = Map(T, cdf0.to_numpy(), cdf1.to_numpy(), symmetric = False)
g.add_group(lambda x: x[f["age"]] < 25 and x[f["sex"]] == 0, "Male [19, 25)")
g.add_group(lambda x: x[f["age"]] < 25 and x[f["sex"]] > 0, "Female [19, 25)")
g.add_group(lambda x: x[f["age"]] >= 25 and x[f["sex"]] == 0, "Male [25, 76)")
g.add_group(lambda x: x[f["age"]] >= 25 and x[f["sex"]] > 0, "Female [25, 76)")
parallel_plot(g.group_adj,[gr[0] for gr in g.groups])

In [37]:
# employment detailed: one group per employment-duration indicator column.
# Start from a fresh Map so the groups are not appended to whatever `g`
# was left over from a previously executed cell.
g = Map(T, cdf0.to_numpy(), cdf1.to_numpy(), symmetric = False)
g.add_group(lambda x: x[f["unemployed"]], "Unemployed")
g.add_group(lambda x: x[f['... < 1 year']], "Employed <1 year")
g.add_group(lambda x: x[f['1 <= ... < 4 years']], "Employed 1-4 years")
g.add_group(lambda x: x[f['4 <= ... < 7 years']], "Employed 4-7 years")
g.add_group(lambda x: x[f['.. >= 7 years']], "Employed >7 years")
parallel_plot(g.group_adj,[gr[0] for gr in g.groups])

In [43]:
# employment: unemployed vs. everyone else.
# "Employed" is the complement of the `unemployed` indicator. Testing the
# '... < 1 year' column instead would count the unemployed as employed and
# drop people employed for less than a year.
g = Map(T, cdf0.to_numpy(), cdf1.to_numpy(), symmetric = False)
g.add_group(lambda x: x[f["unemployed"]], "Unemployed")
g.add_group(lambda x: not x[f["unemployed"]], "Employed ")
parallel_plot(g.group_adj,[gr[0] for gr in g.groups])

In [46]:
# employment x gender (sex == 0 is Male, sex > 0 is Female).
# "Employed" is the complement of the `unemployed` indicator, so the four
# groups do not overlap and nobody employed for under a year is dropped.
g = Map(T, cdf0.to_numpy(), cdf1.to_numpy(), symmetric = False)
g.add_group(lambda x: x[f["unemployed"]] and x[f["sex"]] == 0, "Unemployed Male")
g.add_group(lambda x: x[f["unemployed"]] and x[f["sex"]] > 0, "Unemployed Female")
g.add_group(lambda x: not x[f["unemployed"]] and x[f["sex"]] == 0, "Employed Male")
g.add_group(lambda x: not x[f["unemployed"]] and x[f["sex"]] > 0, "Employed Female")
parallel_plot(g.group_adj,[gr[0] for gr in g.groups])

In [57]:
# credit history: one group per credit-history indicator column
g = Map(T, cdf0.to_numpy(), cdf1.to_numpy(), symmetric=False)
for history_column, group_label in (
    ('critical account/ other credits existing (not at this bank)', "Critical account"),
    ('existing credits paid back duly till now', "Credit paid til now"),
    ('delay in paying off in the past', "Delayed credit"),
    ('no credits taken/ all credits paid back duly', "No credits"),
    ('all credits at this bank paid back duly', "Credit paid"),
):
    # Bind the column name as a default so each predicate keeps its own column.
    g.add_group(lambda x, history_column=history_column: x[f[history_column]], group_label)
parallel_plot(g.group_adj, [gr[0] for gr in g.groups])

In [60]:
# fix gender and age and examine credit history
# Only rows with sex == 0 and age >= 25 are grouped. The bound is inclusive
# so that 25-year-olds are kept, matching the "[25, 76)" age bucket.
g = Map(T, cdf0.to_numpy(), cdf1.to_numpy(), symmetric = False)
g.add_group(lambda x: x[f["age"]] >= 25 and x[f["sex"]] == 0 and x[f['critical account/ other credits existing (not at this bank)']], "Critical account")
g.add_group(lambda x: x[f["age"]] >= 25 and x[f["sex"]] == 0 and x[f['existing credits paid back duly till now']], "Credit paid til now")
g.add_group(lambda x: x[f["age"]] >= 25 and x[f["sex"]] == 0 and x[f['delay in paying off in the past']], "Delayed credit")
g.add_group(lambda x: x[f["age"]] >= 25 and x[f["sex"]] == 0 and x[f['no credits taken/ all credits paid back duly']], "No credits")
g.add_group(lambda x: x[f["age"]] >= 25 and x[f["sex"]] == 0 and x[f['all credits at this bank paid back duly']], "Credit paid")
parallel_plot(g.group_adj,[gr[0] for gr in g.groups])

In [84]:
# Subgroup sharing the same immutable features: sex == 0, `single` set,
# age >= 25 and `car (new)` set.
group1 = cdf.loc[
    (cdf["sex"] == 0)
    & cdf["single"]
    & (cdf["age"] >= 25)
    & cdf["car (new)"]
]

In [85]:
# Split the subgroup by ground-truth credit (0 vs. 1).
group1_bad = group1.loc[group1["credit"] == 0]
group1_good = group1.loc[group1["credit"] == 1]

In [86]:
# Rows of the subgroup, split by ground-truth credit.
g0 = cdf.loc[group1_bad.index]
g1 = cdf.loc[group1_good.index]

# truth_outcomes is a plain NumPy array whose rows follow the row order of
# cdf, so it must be indexed by position. Translate the index labels into
# positions instead of assuming cdf has a default 0..n-1 index.
# NOTE: this rebinds Y0, Y1 and T, which earlier cells defined for the whole
# dataset; re-run those cells before re-running the whole-dataset plots.
Y0 = truth_outcomes[cdf.index.get_indexer(group1_bad.index)]
Y1 = truth_outcomes[cdf.index.get_indexer(group1_good.index)]

# compute the transport map between people with good and bad credit
T = transport.compute_map(Y0, Y1)

In [87]:
# Credit-history groups within the subgroup (g0 / g1 rows only).
g = Map(T, g0.to_numpy(), g1.to_numpy(), symmetric=False)
for history_column, group_label in (
    ('critical account/ other credits existing (not at this bank)', "Critical account"),
    ('existing credits paid back duly till now', "Credit paid til now"),
    ('delay in paying off in the past', "Delayed credit"),
    ('no credits taken/ all credits paid back duly', "No credits"),
    ('all credits at this bank paid back duly', "Credit paid"),
):
    # Bind the column name as a default so each predicate keeps its own column.
    g.add_group(lambda x, history_column=history_column: x[f[history_column]], group_label)
parallel_plot(g.group_adj, [gr[0] for gr in g.groups])