## Preliminaries

In [1]:
import json
from DiscriminationMitigation import *
from sklearn.model_selection import train_test_split
# Notebook-wide display limits: at most 20 rows and 100 columns per frame.
# NOTE(review): `pd` is not imported explicitly in this cell -- presumably it is
# re-exported by the star import above; confirm, or add `import pandas as pd`.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 100

In [3]:
def simple_synth(n=10000, class_probab=0.5, gamma0=4, gamma1=6, alpha0=2, alpha1=1, beta0=1, beta1=1, seed=123):
    """Generate a synthetic regression data set with a binary protected class.

    Data-generating process (both shocks are normal with mean 0 and standard
    deviation 0.5):

        c1 ~ Bernoulli(class_probab),   c0 = 1 - c1
        w  = gamma0*c0 + gamma1*c1 + shock
        y  = alpha0*c0 + alpha1*c1 + beta0*c0*w + beta1*c1*w + shock

    Parameters
    ----------
    n : int
        Number of rows to generate.
    class_probab : float
        Probability that a row belongs to group 1 (``c1 == 1``).
    gamma0, gamma1 : float
        Group-specific means of the covariate ``w``.
    alpha0, alpha1 : float
        Group-specific intercepts of the outcome ``y``.
    beta0, beta1 : float
        Group-specific slopes of ``y`` on ``w``.
    seed : int or None
        Seed passed to ``np.random.seed`` before drawing. The default (123)
        reproduces the previously hard-coded behaviour. Pass ``None`` to leave
        NumPy's global random state untouched.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with float columns ``y``, ``c0``, ``c1`` and ``w``.

    Notes
    -----
    Seeding acts on NumPy's *global* generator, so any ``np.random`` call made
    after this function is also affected by the seed chosen here.
    """
    if seed is not None:
        np.random.seed(seed)

    # Protected class variable (one-hot pair; the draw order below is kept
    # fixed so a given seed always yields the same data).
    c1 = np.random.binomial(1, p=class_probab, size=n) # group 1
    c0 = 1-c1 # group 0

    # Other covariate
    w = gamma0*c0 + gamma1*c1 + np.random.normal(0, 0.5, size=n) # linear function of class & shock

    # Outcome variable
    y = alpha0*c0 + alpha1*c1 + beta0*c0*w + beta1*c1*w + np.random.normal(0, 0.5, size=n)

    # Stack the vectors as columns of one float array. This avoids building a
    # 4 x n frame (n columns) and transposing it, and yields the same values.
    return pd.DataFrame(np.column_stack([y, c0, c1, w]), columns=['y', 'c0', 'c1', 'w'])

In [19]:
# Get example configuration files.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system and can
# mis-decode non-ASCII characters in the JSON text.
with open('example_config.json', encoding='utf-8') as config_file:
    config = json.load(config_file)  # protected-class feature names and target feature

with open('example_weights.json', encoding='utf-8') as weights_file:
    weights = json.load(weights_file)  # per-feature weights keyed by category value

print(config)
print(weights)

{'protected_class_features': ['c0', 'c1', 'z'], 'target_feature': ['y']}
{'c0': {'0': 0.2, '1': 0.8}, 'c1': {'0': 0.8, '1': 0.2}, 'z': {'1': 0.1, '2': 0.2, '3': 0.4, '4': 0.3}}


### Instantiate some synthetic data

In [10]:
# Draw the base synthetic sample, then append `z`: a higher-dimensional
# protected class with integer codes 1-4 (randint's upper bound is exclusive).
synth = simple_synth().assign(
    z=lambda frame: np.random.randint(1, 5, size=len(frame))
)
print(synth.head())

          y   c0   c1         w  z
0  7.383773  0.0  1.0  6.479200  2
1  6.255114  1.0  0.0  4.230080  3
2  5.614841  1.0  0.0  3.773609  4
3  7.692184  0.0  1.0  6.553467  4
4  7.440835  0.0  1.0  6.139432  1


### Split the data into train, validation, and test sets

In [12]:
# Separate the target column(s) named in the config from the feature columns.
target_columns = config['target_feature']
features = synth.drop(columns=target_columns)
targets = synth[target_columns]

# Hold out 500 rows as the test set; the remainder feeds the next split.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=500, random_state=123
)

# Carve 20% of the remaining rows out as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=123
)

for split in (X_train, X_val, X_test):
    print(split.shape)

(7600, 4)
(1900, 4)
(500, 4)


### Build and train a deep learning model with the TF Keras `Model` class

In [16]:
# Tensorflow Keras Model class (functional API): a small regression network.
tf.keras.backend.clear_session()

# Seed TensorFlow so that weight initialisation and dropout are repeatable when
# the notebook is re-run from a fresh kernel (same seed value used elsewhere).
tf.random.set_seed(123)

# Input width follows the feature matrix instead of a hard-coded 4, and is
# passed as a tuple: the original `shape=4,` supplied the bare integer 4
# (the trailing comma belonged to the call, not to a tuple).
n_features = X_train.shape[1]
inputs = tf.keras.layers.Input(shape=(n_features,))

# Neither hidden Dense layer is given an `activation`, so both use Keras'
# default (linear) activation; dropout follows each of them.
dense = tf.keras.layers.Dense(8)(inputs)
dropout = tf.keras.layers.Dropout(0.3)(dense)
dense = tf.keras.layers.Dense(16)(dropout)
dropout = tf.keras.layers.Dropout(0.1)(dense)
output = tf.keras.layers.Dense(1, activation='linear', name='output')(dropout)
model = tf.keras.Model(inputs=inputs, outputs=output)

# Mean squared error loss for the continuous target.
model.compile(optimizer='adam', loss='mse')

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
dense (Dense)                (None, 8)                 40        
_________________________________________________________________
dropout (Dropout)            (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                144       
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
output (Dense)               (None, 1)                 17        
Total params: 201
Trainable params: 201
Non-trainable params: 0
_______________________________________________________________

In [17]:
# Train for 30 epochs in mini-batches of 64 rows; Keras evaluates the loss on
# (X_val, y_val) at the end of every epoch. The call is the cell's last
# expression, so the returned History object is what the notebook displays.
model.fit(X_train, y_train, epochs=30, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1f3c81a1bc8>

### Discrimination mitigation tool

In [29]:
# Set up the mitigation tool on the held-out test features, passing the fitted
# model, the config, the training features and the custom weights.
# NOTE(review): presumably `train` drives the population weights and `weights`
# the custom ones -- confirm against the DiscriminationMitigation docs.
mitigator = DiscriminationMitigator(
    df=X_test,
    model=model,
    config=config,
    train=X_train,
    weights=weights,
)

# One row per test observation, indexed like X_test.
pred = mitigator.predictions()

If no category is omitted, users must ensure custom marginal weights for one-hot vectors align correctly.
  "If no category is omitted, users must ensure custom marginal weights for one-hot vectors align correctly.".format(' '.join(extreme_corr)))


In [28]:
# First rows of the prediction frame.
pred_preview = pred.head()
print("Dataframe of predictions \n", pred_preview)

# Pairwise correlations between the prediction columns.
pred_correlations = pred.corr()
print("\nCorrelation matrix between predictions \n", pred_correlations)

Dataframe of predictions 
       unadj_pred  unif_wts   pop_wts  cust_wts
2656    7.018159  7.017337  7.033709  7.049608
445     6.378776  6.429476  6.428673  6.444573
9505    6.237130  6.186430  6.186176  6.202075
332     6.783767  6.817292  6.822215  6.838114
4168    7.006484  7.005661  7.022033  7.037933

Correlation matrix between predictions 
             unadj_pred  unif_wts   pop_wts  cust_wts
unadj_pred    1.000000  0.999065  0.999168  0.999168
unif_wts      0.999065  1.000000  0.999914  0.999914
pop_wts       0.999168  0.999914  1.000000  1.000000
cust_wts      0.999168  0.999914  1.000000  1.000000
