In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, precision_recall_curve
from DiscriminationMitigation import DiscriminationMitigator

# Let pandas render up to 50 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 50)

In [2]:
def binary_metrics(y_true, y_pred):
    '''
    Calculates binary classification performance metrics for a given model.

    Every ratio is derived from the raw confusion-matrix counts and rounded to
    4 decimals only when it is stored, so F-1 is not distorted by precision and
    recall values that were already rounded. A ratio whose denominator is zero
    (e.g. precision when nothing was predicted positive) is reported as NaN
    rather than going through a division-by-zero warning.

    :param y_true: array_like, truth values as int
    :param y_pred: array_like, predicted values as int
    :returns: dict, with keys for each metric:
        accuracy - proportion of correct predictions out of total predictions
        sensitivity/recall - of all truly positive cases, how many were predicted positive
        specificity - (aka selectivity/TNR), of all truly negative cases, how many were predicted negative
        precision - of all predicted positive cases, how many were actually positive
        f1 - harmonic mean of precision and recall, computed as 2*tp / (2*tp + fp + fn)
        roc_auc - value returned by roc_auc_score(y_true, y_pred)
    '''
    # Plain Python ints so that a zero denominator can be tested for explicitly.
    tn, fp, fn, tp = (int(count) for count in confusion_matrix(y_true, y_pred).ravel())

    def _ratio(numerator, denominator):
        # NaN marks a ratio that is undefined because there is nothing to divide by.
        return numerator / denominator if denominator else float('nan')

    metrics = {}
    metrics['accuracy'] = round((tp + tn) / len(y_true), 4)
    metrics['sensitivity/recall'] = round(_ratio(tp, fn + tp), 4)  # aka recall
    metrics['specificity'] = round(_ratio(tn, tn + fp), 4)  # aka TNR
    metrics['precision'] = round(_ratio(tp, tp + fp), 4)
    # Count-based form of 2*P*R / (P + R): uses unrounded inputs and equals 0
    # (not 0/0) when there are no true positives but some errors exist.
    metrics['f1'] = round(_ratio(2 * tp, 2 * tp + fp + fn), 4)
    metrics['roc_auc'] = round(float(roc_auc_score(y_true, y_pred)), 4)

    return metrics

In [3]:
def continuous_metrics(y_true, y_pred):
    '''
    Calculates performance metrics for a continuous outcome from a model.

    Inputs are compared position by position (any pandas index is ignored), so
    lists, numpy arrays and Series with differing indexes are all accepted.
    Each metric is computed from unrounded quantities and rounded to 4 decimals
    only when stored.

    :param y_true: array_like, truth values as float
    :param y_pred: array_like, predicted values as float
    :returns: dict, with keys for each metric:
        mae - Mean Absolute Error
        mse - Mean Squared Error
        rmse - Root Mean Squared Error
        r2 - Coefficient of Determination (r-squared); NaN when y_true has zero variance
    :raises ValueError: if y_true is empty or the two inputs differ in length
    '''
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size == 0:
        raise ValueError('y_true must contain at least one value')
    if actual.shape != predicted.shape:
        raise ValueError(
            f'y_true and y_pred must have the same length, got {actual.size} and {predicted.size}'
        )

    residuals = actual - predicted
    ss_res = float(np.sum(residuals ** 2))
    ss_tot = float(np.sum((actual - actual.mean()) ** 2))
    mse = ss_res / actual.size

    metrics = {}
    metrics['mae'] = round(float(np.mean(np.abs(residuals))), 4)
    metrics['mse'] = round(mse, 4)
    # Root of the unrounded MSE; rounding first would distort small errors.
    metrics['rmse'] = round(float(np.sqrt(mse)), 4)
    # R-squared is undefined when the truth values do not vary.
    metrics['r2'] = round(1 - ss_res / ss_tot, 4) if ss_tot else float('nan')

    return metrics

In [4]:
def combine_prediction(X_test, y_test, pred, outcome='50k'):
    '''
    Combines X_test dataframe with the true outcome and the model predictions.

    All three inputs are joined by position: the index of X_test (and of y_test,
    when it is a pandas object) is dropped, so the result has a fresh 0..n-1 index.

    :param X_test: pd.DataFrame, X_test dataset
    :param y_test: array_like (pd.Series, single-column pd.DataFrame, np.array or list),
        true y values in the same row order as X_test
    :param pred: array_like, predicted y values in the same row order as X_test
    :param outcome: str, column name given to the true y values
    :returns:
        pd.DataFrame with the columns of X_test followed by `outcome` and 'pred'
    :raises ValueError: if y_test or pred does not have one value per row of X_test
    '''
    def _positional_column(values, name):
        # Turn any supported input into a Series named `name` with a fresh index.
        if isinstance(values, pd.DataFrame):
            if values.shape[1] != 1:
                raise ValueError(f"'{name}' must be one-dimensional, got {values.shape[1]} columns")
            values = values.iloc[:, 0]
        if isinstance(values, pd.Series):
            # Keeps the Series dtype while discarding its index and original name.
            return values.reset_index(drop=True).rename(name)
        return pd.Series(np.asarray(values).ravel(), name=name)

    features = X_test.reset_index(drop=True)
    columns = [_positional_column(y_test, outcome), _positional_column(pred, 'pred')]
    for column in columns:
        if len(column) != len(features):
            raise ValueError(
                f"'{column.name}' has {len(column)} values but X_test has {len(features)} rows"
            )
    return pd.concat([features] + columns, axis=1)

# Data preparation

### Read in [2019 ASEC (March CPS)](https://cps.ipums.org/cps/)

In [5]:
# Load the 2019 ASEC extract; the path is relative to the notebook's working directory
df = pd.read_csv('./data/asec_2019.csv')

In [6]:
# (rows, columns) of the extract as loaded
print(df.shape)

(180101, 18)


In [7]:
# Preview the first five rows of the extract
print(df.head())

   YEAR  MONTH  ASECFLAG   ASECWT  RELATE  AGE  SEX  RACE  MARST  POPSTAT  \
0  2019      3         1  2031.67     101   21    1   100      6        1   
1  2019      3         1  1232.04     101   85    2   100      5        1   
2  2019      3         1  1209.17     101   61    2   100      6        1   
3  2019      3         1  1146.23     101   73    2   100      4        1   
4  2019      3         1  1480.79     301   37    1   100      6        1   

   HISPAN  LABFORCE  UHRSWORK1  EDUC  SCHLCOLL  WKSWORK1  UHRSWORKLY  INCWAGE  
0       0         2       30.0    60       5.0        52          30  18000.0  
1       0         1      999.0    73       0.0         0         999      0.0  
2       0         2       44.0    73       0.0        52          44  12000.0  
3       0         1      999.0    73       0.0         0         999      0.0  
4       0         2       20.0    73       5.0        52          20  12000.0  


### Restrict dataset

In [8]:
# Keep only individuals with employment AND earnings last year.
# Each mask describes the rows that are KEPT.
has_wage = df['INCWAGE'] > 0                                  # positive wage/salary income
worked_weeks = (df['WKSWORK1'] > 0) & (df['WKSWORK1'] <= 52)  # between 1 and 52 weeks worked
# 999 is excluded; in the preview it appears on rows with zero weeks worked
worked_hours = (df['UHRSWORKLY'] > 0) & (df['UHRSWORKLY'] < 999)
df = df.loc[has_wage & worked_weeks & worked_hours]

In [9]:
# (rows, columns) after the employment/earnings restriction
print(df.shape)

(85644, 18)


In [10]:
# Keep working-age individuals: 18 <= AGE <= 64 (both bounds inclusive)
df = df.loc[df['AGE'].between(18, 64)]

In [11]:
# (rows, columns) after the age restriction
print(df.shape)

(78644, 18)


In [12]:
# Keep non-Hispanics only, i.e. rows whose HISPAN code equals 0
df = df.loc[df['HISPAN'].eq(0)]

In [13]:
# (rows, columns) after dropping Hispanics
print(df.shape)

(63125, 18)


In [14]:
# Restrict to non-mixed-race Blacks and Whites (RACE codes 100 and 200).
# .copy() gives an independent frame, so adding the column below does not raise
# pandas' SettingWithCopyWarning about writing to a slice of the previous df.
df = df.loc[df['RACE'].isin([100, 200])].copy()
# Race dummy: 1 when RACE == 200, 0 otherwise
df['blk'] = np.where(df['RACE'] == 200, 1, 0)

In [15]:
# (rows, columns) after the race restriction; one column more because `blk` was added
print(df.shape)

(55508, 19)


In [16]:
# Keep adult civilians only, i.e. rows whose POPSTAT code equals 1
df = df.query('POPSTAT == 1')

In [17]:
# (rows, columns) after keeping adult civilians
print(df.shape)

(55069, 19)


### Engineer features

In [18]:
# Flag for part-time usual work: 1 when usual weekly hours last year are below 35.
# .assign returns a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(pt=np.where(df['UHRSWORKLY'] < 35, 1, 0))

In [19]:
# Non-linearities for age: squared and cubed terms.
# .assign builds a new frame rather than mutating a filtered slice in place
# (no SettingWithCopyWarning, idempotent on re-run).
df = df.assign(AGE2=df['AGE'] ** 2, AGE3=df['AGE'] ** 3)

In [20]:
# Ensure education is coded correctly: keep EDUC codes up to 125 (999 is missing)
df = df.loc[df['EDUC'].le(125)]

In [21]:
# Hourly wage = annual wage income / (weeks worked * usual hours per week).
# .assign avoids writing a new column into a filtered slice (SettingWithCopyWarning).
df = df.assign(hrwage=df['INCWAGE'] / (df['WKSWORK1'] * df['UHRSWORKLY']))

In [22]:
# Distribution of the derived hourly wage before trimming extreme values
df['hrwage'].describe()

count    55069.000000
mean        31.269112
std        184.416973
min          0.000962
25%         14.375000
50%         21.634615
75%         34.188034
max      25481.250000
Name: hrwage, dtype: float64

In [23]:
# Trim implausible hourly wages: keep 1 < hrwage < 100 (both bounds exclusive)
above_floor = df['hrwage'].gt(1)
below_cap = df['hrwage'].lt(100)
df = df.loc[above_floor & below_cap]

In [24]:
# Distribution of hourly wage after the 1 < hrwage < 100 restriction
print(df['hrwage'].describe())

count    53671.000000
mean        25.907302
std         17.049376
min          1.016667
25%         14.202864
50%         21.634615
75%         33.333333
max         99.759615
Name: hrwage, dtype: float64


In [25]:
# Natural log of the hourly wage.
# .assign avoids writing a new column into a filtered slice (SettingWithCopyWarning).
df = df.assign(lnwage=np.log(df['hrwage']))

In [26]:
# Distribution of annual wage income in the restricted sample
df['INCWAGE'].describe()

count     53671.000000
mean      52682.839876
std       41242.872191
min          25.000000
25%       24617.500000
50%       43680.000000
75%       70000.000000
max      420000.000000
Name: INCWAGE, dtype: float64

In [27]:
# Flag for whether annual wage income exceeds 50,000 (1) or not (0).
# '50k' is not a valid Python identifier, hence the dict unpacking into .assign;
# .assign avoids writing into a filtered slice (SettingWithCopyWarning).
df = df.assign(**{'50k': np.where(df['INCWAGE'] > 50000, 1, 0)})

In [28]:
# Keep only the outcome columns followed by the model inputs, in this order
outcome_cols = ['lnwage', 'hrwage', '50k']
input_cols = ['pt', 'INCWAGE', 'WKSWORK1', 'UHRSWORKLY',
              'AGE', 'AGE2', 'AGE3', 'SEX', 'blk', 'MARST', 'SCHLCOLL', 'EDUC']
df = df[outcome_cols + input_cols]

In [29]:
# Preview the analysis frame after feature engineering and column selection
print(df.head())

     lnwage     hrwage  50k  pt  INCWAGE  WKSWORK1  UHRSWORKLY  AGE  AGE2  \
0  2.445686  11.538462    0   1  18000.0        52          30   21   441   
2  1.657229   5.244755    0   0  12000.0        52          44   61  3721   
4  2.445686  11.538462    0   1  12000.0        52          20   37  1369   
6  3.179655  24.038462    1   0  55000.0        52          44   53  2809   
8  3.442059  31.251250    1   0  50002.0        40          40   62  3844   

     AGE3  SEX  blk  MARST  SCHLCOLL  EDUC  
0    9261    1    0      6       5.0    60  
2  226981    2    0      6       0.0    73  
4   50653    1    0      6       5.0    73  
6  148877    2    0      4       5.0    73  
8  238328    1    0      1       0.0   111  


In [30]:
# Columns to store with pandas' category dtype
categorical_features = ['SEX', 'MARST', 'SCHLCOLL', 'EDUC', 'pt', 'blk']
# One astype call returning a new frame: avoids column-by-column writes into the
# column-selected slice (SettingWithCopyWarning) and is safe to re-run.
df = df.astype({col: 'category' for col in categorical_features})

In [31]:
# Confirm the dtype of every column after the categorical conversion
df.dtypes

lnwage         float64
hrwage         float64
50k              int32
pt            category
INCWAGE        float64
WKSWORK1         int64
UHRSWORKLY       int64
AGE              int64
AGE2             int64
AGE3             int64
SEX           category
blk           category
MARST         category
SCHLCOLL      category
EDUC          category
dtype: object

# Descriptive differences in earnings by race 

### Association between race and likelihood of annual earnings surpassing 50K (i.e. "high-income")

#### Table 1: Linear probability model regressing high/low earnings dummy on race dummy

In [32]:
# Linear probability model: high-income dummy on the race dummy plus an intercept.
# Both sides are re-indexed 0..n-1 so they line up row by row.
y = df['50k'].reset_index(drop=True)
predictors = sm.add_constant(df['blk'].astype('int').reset_index(drop=True))  # add constant
# HC3 heteroskedasticity-robust covariance with t-based inference
lpm_race_only = sm.OLS(y, predictors, hasconst=True).fit(cov_type='HC3', use_t=True)
lpm_race_only.summary()

0,1,2,3
Dep. Variable:,50k,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,738.0
Date:,"Tue, 13 Apr 2021",Prob (F-statistic):,2.0400000000000002e-161
Time:,17:01:48,Log-Likelihood:,-37646.0
No. Observations:,53671,AIC:,75300.0
Df Residuals:,53669,BIC:,75310.0
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4276,0.002,184.321,0.000,0.423,0.432
blk,-0.1486,0.005,-27.166,0.000,-0.159,-0.138

0,1,2,3
Omnibus:,208719.027,Durbin-Watson:,1.819
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8611.43
Skew:,0.375,Prob(JB):,0.0
Kurtosis:,1.187,Cond. No.,2.85


Compared to Whites, Blacks are 14.86 percentage points less likely to earn >$50k per year (coefficient of -0.1486 in the linear probability model).

#### Table 2: Linear probability model regressing high/low earnings dummy on race dummy and control variables

In [33]:
# Linear probability model: high-income dummy on race plus controls
y = df['50k'].reset_index(drop=True)
predictors = (
    df[['blk', 'AGE', 'AGE2', 'AGE3', 'pt']]
    .reset_index(drop=True)
    .astype({'blk': int, 'pt': int})
)
dummy_blocks = []
for col in ['SEX', 'MARST', 'SCHLCOLL', 'EDUC']:
    # One-hot encode, then drop the first level so it acts as the reference category
    encoded = OneHotEncoder().fit_transform(df[[col]]).toarray()
    onehot_vector = pd.DataFrame(encoded[:, 1:])
    # Names are "<position after dropping the first level>_<first 3 letters of the variable>"
    onehot_vector.columns = [f'{i}_{col[:3]}' for i in onehot_vector.columns]
    dummy_blocks.append(onehot_vector)
predictors = sm.add_constant(pd.concat([predictors] + dummy_blocks, axis=1)) # add constant

sm.OLS(y, predictors, hasconst=True).fit(cov_type='HC3', use_t=True).summary()

0,1,2,3
Dep. Variable:,50k,R-squared:,0.285
Model:,OLS,Adj. R-squared:,0.285
Method:,Least Squares,F-statistic:,1259.0
Date:,"Tue, 13 Apr 2021",Prob (F-statistic):,0.0
Time:,17:01:48,Log-Likelihood:,-28954.0
No. Observations:,53671,AIC:,57970.0
Df Residuals:,53639,BIC:,58260.0
Df Model:,31,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3332,0.080,4.148,0.000,0.176,0.491
blk,-0.0733,0.005,-14.478,0.000,-0.083,-0.063
AGE,-0.0193,0.005,-3.989,0.000,-0.029,-0.010
AGE2,0.0009,0.000,6.780,0.000,0.001,0.001
AGE3,-8.663e-06,1.08e-06,-8.042,0.000,-1.08e-05,-6.55e-06
pt,-0.2688,0.004,-66.484,0.000,-0.277,-0.261
0_SEX,-0.1804,0.004,-49.260,0.000,-0.188,-0.173
0_MAR,-0.0522,0.017,-2.995,0.003,-0.086,-0.018
1_MAR,-0.0931,0.013,-6.923,0.000,-0.119,-0.067

0,1,2,3
Omnibus:,19550.706,Durbin-Watson:,1.892
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2717.034
Skew:,0.112,Prob(JB):,0.0
Kurtosis:,1.921,Cond. No.,18800000.0


Holding other factors in the model constant, Blacks have about a 7.3 percentage point lower probability of earning >$50k per year.

### Association between race and log hourly wage

#### Table 3: OLS regression of log hourly wages including only race dummy

In [34]:
# OLS of log hourly wage on the race dummy plus an intercept.
# Both sides are re-indexed 0..n-1 so they line up row by row.
y = df['lnwage'].reset_index(drop=True)
predictors = sm.add_constant(df['blk'].astype('int').reset_index(drop=True))  # add constant
# HC3 heteroskedasticity-robust covariance with t-based inference
ols_race_only = sm.OLS(y, predictors, hasconst=True).fit(cov_type='HC3', use_t=True)
ols_race_only.summary()

0,1,2,3
Dep. Variable:,lnwage,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.014
Method:,Least Squares,F-statistic:,801.4
Date:,"Tue, 13 Apr 2021",Prob (F-statistic):,5.29e-175
Time:,17:01:48,Log-Likelihood:,-54084.0
No. Observations:,53671,AIC:,108200.0
Df Residuals:,53669,BIC:,108200.0
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.0816,0.003,986.494,0.000,3.075,3.088
blk,-0.2200,0.008,-28.309,0.000,-0.235,-0.205

0,1,2,3
Omnibus:,2062.62,Durbin-Watson:,1.743
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2671.806
Skew:,-0.417,Prob(JB):,0.0
Kurtosis:,3.707,Cond. No.,2.85


Compared to Whites, Blacks earn on average 19.75% less per hour worked (0.1975 = 1-exp(-0.22)).

#### Table 4: OLS regression of log hourly wages using race dummy and control variables

In [35]:
# OLS of log hourly wage on race plus controls
y = df['lnwage'].reset_index(drop=True)
predictors = (
    df[['blk', 'AGE', 'AGE2', 'AGE3', 'pt']]
    .reset_index(drop=True)
    .astype({'blk': int, 'pt': int})
)
dummy_blocks = []
for col in ['SEX', 'MARST', 'SCHLCOLL', 'EDUC']:
    # One-hot encode, then drop the first level so it acts as the reference category
    encoded = OneHotEncoder().fit_transform(df[[col]]).toarray()
    onehot_vector = pd.DataFrame(encoded[:, 1:])
    # Names are "<position after dropping the first level>_<first 3 letters of the variable>"
    onehot_vector.columns = [f'{i}_{col[:3]}' for i in onehot_vector.columns]
    dummy_blocks.append(onehot_vector)
predictors = sm.add_constant(pd.concat([predictors] + dummy_blocks, axis=1)) # add constant
sm.OLS(y, predictors, hasconst=True).fit(cov_type='HC3', use_t=True).summary()

0,1,2,3
Dep. Variable:,lnwage,R-squared:,0.296
Model:,OLS,Adj. R-squared:,0.296
Method:,Least Squares,F-statistic:,665.6
Date:,"Tue, 13 Apr 2021",Prob (F-statistic):,0.0
Time:,17:01:49,Log-Likelihood:,-45032.0
No. Observations:,53671,AIC:,90130.0
Df Residuals:,53639,BIC:,90410.0
Df Model:,31,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.7981,0.142,12.662,0.000,1.520,2.076
blk,-0.1094,0.007,-15.589,0.000,-0.123,-0.096
AGE,0.0582,0.008,7.478,0.000,0.043,0.073
AGE2,-0.0008,0.000,-3.974,0.000,-0.001,-0.000
AGE3,2.664e-06,1.62e-06,1.640,0.101,-5.19e-07,5.85e-06
pt,-0.1937,0.009,-22.032,0.000,-0.211,-0.176
0_SEX,-0.2118,0.005,-42.615,0.000,-0.222,-0.202
0_MAR,-0.1029,0.024,-4.304,0.000,-0.150,-0.056
1_MAR,-0.1771,0.018,-9.582,0.000,-0.213,-0.141

0,1,2,3
Omnibus:,4391.268,Durbin-Watson:,1.852
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10292.104
Skew:,-0.509,Prob(JB):,0.0
Kurtosis:,4.889,Cond. No.,18800000.0


Holding other factors in the model constant, Blacks earn about 10.4% lower hourly wages (0.104 = 1-exp(-0.1094))

# Discrimination in predicting high/low earners

### Train-test split

In [36]:
# Outcome: high-income flag. Features include the race dummy `blk`.
feature_cols = ['blk', 'AGE', 'AGE2', 'AGE3', 'SEX', 'EDUC', 'SCHLCOLL', 'MARST', 'pt']
y = df['50k']
X = df[feature_cols]
# Hold out 10% as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=999
)
# ... then 20% of the remainder as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=999
)

In [37]:
# Sizes of the train / validation / test feature matrices, in that order
for split_features in (X_train, X_val, X_test):
    print(split_features.shape)

(38642, 9)
(9661, 9)
(5368, 9)


In [38]:
# Observed share of high earners by race in the test set
foo = pd.concat([X_test, y_test], axis=1)
black_rate = foo[foo.blk==1]['50k'].mean()
white_rate = foo[foo.blk==0]['50k'].mean()
print(f"Percent high-income Blacks in test set: {round(black_rate*100, 2)}%")
print(f"Percent high-income Whites in test set: {round(white_rate*100, 2)}%")
print(f"Difference: {round((black_rate - white_rate)*100, 2)}%")

Percent high-income Blacks in test set: 26.91%
Percent high-income Whites in test set: 42.34%
Difference: -15.43%


## Naive model: Excluding race

In [39]:
# Binary LightGBM classifier for the naive model (fit without the race column)
model1 = lgb.LGBMClassifier(
    objective='binary',
    metric='logloss',
    random_state=999,
)

In [40]:
# Feature matrices without the race dummy
X_train_mod = X_train.drop(columns='blk')
X_val_mod = X_val.drop(columns='blk')
X_test_mod = X_test.drop(columns='blk')
# Early stopping monitors logloss on the validation split.
# NOTE(review): `early_stopping_rounds` and `verbose` are passed to fit() directly;
# newer LightGBM releases expect callbacks instead — confirm against the installed version.
model1.fit(
    X_train_mod,
    y_train,
    eval_set=[(X_val_mod, y_val)],
    eval_metric='logloss',
    early_stopping_rounds=10,
    verbose=False,
)

LGBMClassifier(metric='logloss', objective='binary', random_state=999)

In [41]:
# Naive-model predictions for the test set (race column excluded from the inputs)
naive_pred = model1.predict(X_test_mod)

In [42]:
# Test-set classification metrics of the naive model
binary_metrics(y_test, naive_pred)

{'accuracy': 0.7493,
 'sensitivity/recall': 0.6884,
 'specificity': 0.7897,
 'precision': 0.6852,
 'f1': 0.6868,
 'roc_auc': 0.7391}

In [43]:
# Test features (including `blk`), true outcome and naive predictions in one frame
naive = combine_prediction(X_test, y_test, naive_pred)

#### Table 5: Linear probability model regressing naive model predicted values on race dummy

In [44]:
# Regress the naive predictions on the race dummy (with intercept)
naive_reg = sm.add_constant(naive).astype({'blk': int})  # add constant; blk as int
exog = naive_reg[['const', 'blk']]
endog = naive_reg['pred']
sm.OLS(endog, exog).fit(cov_type='HC3', use_t=True).summary()

0,1,2,3
Dep. Variable:,pred,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,27.24
Date:,"Tue, 13 Apr 2021",Prob (F-statistic):,1.87e-07
Time:,17:01:50,Log-Likelihood:,-3776.6
No. Observations:,5368,AIC:,7557.0
Df Residuals:,5366,BIC:,7570.0
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4157,0.007,56.772,0.000,0.401,0.430
blk,-0.0927,0.018,-5.219,0.000,-0.128,-0.058

0,1,2,3
Omnibus:,21127.127,Durbin-Watson:,2.029
Prob(Omnibus):,0.0,Jarque-Bera (JB):,884.818
Skew:,0.399,Prob(JB):,7.31e-193
Kurtosis:,1.178,Cond. No.,2.83


Compared to an observed differential of -0.1543 in the test set, the naive model predicts a smaller differential at -0.093

## Discriminatory model: Including race

In [45]:
# Binary LightGBM classifier for the discriminatory model (fit with the race column)
classifier_params = {'objective': 'binary', 'random_state': 999, 'metric': 'logloss'}
model2 = lgb.LGBMClassifier(**classifier_params)

In [46]:
# Fit on all features, race dummy included; early stopping monitors validation logloss.
# NOTE(review): `early_stopping_rounds` and `verbose` are passed to fit() directly;
# newer LightGBM releases expect callbacks instead — confirm against the installed version.
model2.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='logloss',
    early_stopping_rounds=10,
    verbose=False,
)

LGBMClassifier(metric='logloss', objective='binary', random_state=999)

In [47]:
# Discriminatory-model predictions for the test set (race column included in the inputs)
discrim_pred = model2.predict(X_test)

In [48]:
# Test-set classification metrics of the discriminatory model
binary_metrics(y_test, discrim_pred)

{'accuracy': 0.7547,
 'sensitivity/recall': 0.7206,
 'specificity': 0.7773,
 'precision': 0.6827,
 'f1': 0.7011,
 'roc_auc': 0.749}

In [49]:
# Test features, true outcome and discriminatory-model predictions in one frame
discrim = combine_prediction(X_test, y_test, discrim_pred)

#### Table 6: Linear probability model regressing discriminatory model predicted values on race dummy

In [50]:
# Regress the discriminatory-model predictions on the race dummy (with intercept)
discrim_reg = sm.add_constant(discrim).astype({'blk': int})  # add constant; blk as int
exog = discrim_reg[['const', 'blk']]
endog = discrim_reg['pred']
sm.OLS(endog, exog).fit(cov_type='HC3', use_t=True).summary()

0,1,2,3
Dep. Variable:,pred,R-squared:,0.027
Model:,OLS,Adj. R-squared:,0.027
Method:,Least Squares,F-statistic:,184.8
Date:,"Tue, 13 Apr 2021",Prob (F-statistic):,2.07e-41
Time:,17:01:51,Log-Likelihood:,-3756.2
No. Observations:,5368,AIC:,7516.0
Df Residuals:,5366,BIC:,7530.0
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4563,0.007,61.660,0.000,0.442,0.471
blk,-0.2231,0.016,-13.595,0.000,-0.255,-0.191

0,1,2,3
Omnibus:,21106.787,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,814.198
Skew:,0.287,Prob(JB):,1.5800000000000001e-177
Kurtosis:,1.18,Cond. No.,2.83


Compared to an observed differential of -0.1543 in the test set, the discriminatory model predicts a *larger* differential at -0.2231

## `DiscriminationMitigator`

In [51]:
# Configuration passed to DiscriminationMitigator: `blk` is the protected-class feature
config = {'protected_class_features': ['blk']}

In [52]:
# Run DiscriminationMitigator on the test features with the race-aware classifier.
# The frame shown below has the columns `unadj_pred` and `unif_wts`.
mitigated = DiscriminationMitigator(df=X_test, model=model2, config=config).predictions()

In [53]:
# Preview the unadjusted and mitigated predictions
print(mitigated.head())

        unadj_pred  unif_wts
94428            1       1.0
145967           1       0.5
18557            0       0.0
175691           1       0.5
132926           1       1.0


#### Evaluating performance

In [54]:
# Turn the mitigated score into a class label; a score of exactly 0.5 counts as positive (>=)
mitigated['thresh_0.5'] = np.where(mitigated['unif_wts'] >= 0.5, 1, 0) # naive threshold of 0.5

In [55]:
# Summary statistics of the unadjusted, mitigated and thresholded predictions
mitigated.describe()

Unnamed: 0,unadj_pred,unif_wts,thresh_0.5
count,5368.0,5368.0,5368.0
mean,0.421572,0.377981,0.440574
std,0.493857,0.4515,0.496502
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,1.0,1.0,1.0
max,1.0,1.0,1.0


In [56]:
# Test-set classification metrics of the thresholded mitigated predictions
binary_metrics(y_test, mitigated['thresh_0.5'])

{'accuracy': 0.7513,
 'sensitivity/recall': 0.7402,
 'specificity': 0.7587,
 'precision': 0.671,
 'f1': 0.7039,
 'roc_auc': 0.7494}

#### Table 7: Linear probability model regressing discriminatory model predicted values on race dummy
#### Adjusted using `DiscriminationMitigator`

In [57]:
# Regress the thresholded mitigated predictions on the race dummy (with intercept)
race_flag = X_test['blk']
mitigated_reg = sm.add_constant(mitigated) # add constant
# Attach the race dummy (assignment aligns on the row index), then cast it to int
mitigated_reg['blk'] = race_flag
mitigated_reg['blk'] = mitigated_reg['blk'].astype(int)
exog = mitigated_reg[['const', 'blk']]
sm.OLS(mitigated_reg['thresh_0.5'], exog).fit(cov_type='HC3', use_t=True).summary()

0,1,2,3
Dep. Variable:,thresh_0.5,R-squared:,0.006
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,31.91
Date:,"Tue, 13 Apr 2021",Prob (F-statistic):,1.7e-08
Time:,17:01:52,Log-Likelihood:,-3842.8
No. Observations:,5368,AIC:,7690.0
Df Residuals:,5366,BIC:,7703.0
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.4565,0.007,61.687,0.000,0.442,0.471
blk,-0.1025,0.018,-5.649,0.000,-0.138,-0.067

0,1,2,3
Omnibus:,19574.254,Durbin-Watson:,2.006
Prob(Omnibus):,0.0,Jarque-Bera (JB):,876.238
Skew:,0.236,Prob(JB):,5.34e-191
Kurtosis:,1.078,Cond. No.,2.83


# Discrimination in predicting high-wage workers

#### Train-test split

In [58]:
# Outcome: log hourly wage. Features include the race dummy `blk`.
# Note: this overwrites the X_*/y_* splits created for the binary outcome.
feature_cols = ['blk', 'AGE', 'AGE2', 'AGE3', 'SEX', 'EDUC', 'SCHLCOLL', 'MARST', 'pt']
y = df['lnwage']
X = df[feature_cols]
# Hold out 10% as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=999
)
# ... then 20% of the remainder as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=999
)

In [59]:
# Sizes of the train / validation / test feature matrices, in that order
for split_features in (X_train, X_val, X_test):
    print(split_features.shape)

(38642, 9)
(9661, 9)
(5368, 9)


In [60]:
# Observed hourly wages by race in the test set (levels and logs)
foo = pd.concat([X_test, y_test], axis=1)
black_lnwage = foo[foo.blk==1]['lnwage']
white_lnwage = foo[foo.blk==0]['lnwage']
# Mean of the exponentiated log wages, i.e. the mean wage in dollars
black_wage = np.exp(black_lnwage).mean()
white_wage = np.exp(white_lnwage).mean()
print(f"Mean hourly wage of Blacks in test set: ${round(black_wage, 2)}")
print(f"Mean hourly wage of Whites in test set: ${round(white_wage, 2)}")
print(f"Difference: ${round(black_wage - white_wage, 2)}")
print(f"Log difference: {round(black_lnwage.mean() - white_lnwage.mean(), 2)}")

Mean hourly wage of Blacks in test set: $21.26
Mean hourly wage of Whites in test set: $26.56
Difference: $-5.3
Log difference: -0.22


## Naive model: Excluding race

In [61]:
# LightGBM regressor for the naive wage model (fit without the race column)
model3 = lgb.LGBMRegressor(
    metric='mean_squared_error',
    random_state=999,
)

In [62]:
# Feature matrices without the race dummy
X_train_mod = X_train.drop(columns='blk')
X_val_mod = X_val.drop(columns='blk')
X_test_mod = X_test.drop(columns='blk')
# Early stopping monitors mean squared error on the validation split.
# NOTE(review): `early_stopping_rounds` and `verbose` are passed to fit() directly;
# newer LightGBM releases expect callbacks instead — confirm against the installed version.
model3.fit(
    X_train_mod,
    y_train,
    eval_set=[(X_val_mod, y_val)],
    eval_metric='mean_squared_error',
    early_stopping_rounds=10,
    verbose=False,
)

LGBMRegressor(metric='mean_squared_error', random_state=999)

In [63]:
# Naive-model predictions of log hourly wage for the test set (race column excluded)
naive_pred_cont = model3.predict(X_test_mod)

In [64]:
# Test-set regression metrics of the naive model, one "name value" pair per line
naive_cont_scores = continuous_metrics(y_test, naive_pred_cont)
for key in naive_cont_scores:
    val = naive_cont_scores[key]
    print(key, val)

mae 0.4183
mse 0.306
rmse 0.5532
r2 0.2979


In [65]:
# Test features, true log wage and naive predictions in one frame
naive_cont = combine_prediction(X_test, y_test, naive_pred_cont, outcome='lnwage')

#### Table 8: OLS regression of naive model predicted values on race dummy

In [66]:
# Regress the naive log-wage predictions on the race dummy (with intercept)
naive_reg_cont = sm.add_constant(naive_cont).astype({'blk': int})  # add constant; blk as int
exog = naive_reg_cont[['const', 'blk']]
endog = naive_reg_cont['pred']
sm.OLS(endog, exog).fit(cov_type='HC3', use_t=True).summary()

0,1,2,3
Dep. Variable:,pred,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,64.78
Date:,"Tue, 13 Apr 2021",Prob (F-statistic):,1.02e-15
Time:,17:01:52,Log-Likelihood:,-2089.7
No. Observations:,5368,AIC:,4183.0
Df Residuals:,5366,BIC:,4197.0
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.0644,0.005,575.472,0.000,3.054,3.075
blk,-0.1066,0.013,-8.048,0.000,-0.133,-0.081

0,1,2,3
Omnibus:,155.295,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,90.652
Skew:,-0.162,Prob(JB):,2.07e-20
Kurtosis:,2.452,Cond. No.,2.83


Compared to an observed difference of -0.22 log points between Blacks and Whites in the test set, the naive model predicts a difference of -0.1066 log points.

## Discriminatory model: Including race

In [67]:
# LightGBM regressor for the discriminatory wage model (fit with the race column)
regressor_params = {'random_state': 999, 'metric': 'mean_squared_error'}
model4 = lgb.LGBMRegressor(**regressor_params)

In [68]:
# Fit on all features, race dummy included; early stopping monitors validation MSE.
# eval_metric is spelled 'mean_squared_error' to match the regressor's `metric`
# setting and the naive-model fit (previously the 'MSE' spelling was used here).
# NOTE(review): `early_stopping_rounds` and `verbose` are passed to fit() directly;
# newer LightGBM releases expect callbacks instead — confirm against the installed version.
model4.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='mean_squared_error', early_stopping_rounds=10, verbose=False)

LGBMRegressor(metric='mean_squared_error', random_state=999)

In [69]:
# Discriminatory-model predictions of log hourly wage for the test set (race column included)
discrim_pred_cont = model4.predict(X_test)

In [70]:
# Test-set regression metrics of the discriminatory model, one "name value" pair per line
discrim_cont_scores = continuous_metrics(y_test, discrim_pred_cont)
for key in discrim_cont_scores:
    val = discrim_cont_scores[key]
    print(key, val)

mae 0.4166
mse 0.3032
rmse 0.5506
r2 0.3043


In [71]:
# Test features, true log wage and discriminatory-model predictions in one frame
discrim_cont = combine_prediction(X_test, y_test, discrim_pred_cont, outcome='lnwage')

#### Table 9: OLS regression of discriminatory model predicted values on race dummy

In [72]:
# Regress the discriminatory-model log-wage predictions on the race dummy (with intercept)
discrim_reg_cont = sm.add_constant(discrim_cont).astype({'blk': int})  # add constant; blk as int
exog = discrim_reg_cont[['const', 'blk']]
endog = discrim_reg_cont['pred']
sm.OLS(endog, exog).fit(cov_type='HC3', use_t=True).summary()

0,1,2,3
Dep. Variable:,pred,R-squared:,0.04
Model:,OLS,Adj. R-squared:,0.04
Method:,Least Squares,F-statistic:,264.8
Date:,"Tue, 13 Apr 2021",Prob (F-statistic):,3.7e-58
Time:,17:01:53,Log-Likelihood:,-1992.2
No. Observations:,5368,AIC:,3988.0
Df Residuals:,5366,BIC:,4002.0
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.0775,0.005,580.198,0.000,3.067,3.088
blk,-0.1971,0.012,-16.273,0.000,-0.221,-0.173

0,1,2,3
Omnibus:,149.378,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,95.898
Skew:,-0.198,Prob(JB):,1.5000000000000001e-21
Kurtosis:,2.478,Cond. No.,2.83


Compared to an observed difference of -0.22 log points between Blacks and Whites in the test set, the discriminatory model predicts a difference of -0.1971, which is considerably larger than the naive model's prediction of -0.1066.

## `DiscriminationMitigator`

In [73]:
# Run DiscriminationMitigator on the test features with the race-aware regressor.
# Relies on `config` from the classification section (protected feature: `blk`).
mitigated_cont = DiscriminationMitigator(df=X_test, model=model4, config=config).predictions()

In [74]:
# First rows, a blank line, then summary statistics of the unadjusted and mitigated predictions
print(mitigated_cont.head(), '\n')
print(mitigated_cont.describe())

        unadj_pred  unif_wts
94428     3.301243  3.289145
145967    3.146659  3.047996
18557     2.801680  2.762560
175691    3.143289  3.040616
132926    3.291120  3.220549 

        unadj_pred     unif_wts
count  5368.000000  5368.000000
mean      3.046788     3.009338
std       0.357940     0.338295
min       2.047689     2.093242
25%       2.808305     2.794492
50%       3.046140     3.018227
75%       3.301243     3.270785
max       3.823983     3.704577


In [75]:
# Test-set regression metrics of the mitigated predictions, one "name value" pair per line
mitigated_cont_scores = continuous_metrics(y_test, mitigated_cont['unif_wts'])
for key in mitigated_cont_scores:
    val = mitigated_cont_scores[key]
    print(key, val)

mae 0.4213
mse 0.308
rmse 0.555
r2 0.2933


#### Table 10: OLS regression of discriminatory model predicted values on race dummy
#### Adjusted using `DiscriminationMitigator`

In [76]:
# Regress the mitigated log-wage predictions on the race dummy (with intercept)
race_flag = X_test['blk']
mitigated_reg_cont = sm.add_constant(mitigated_cont) # add constant
# Attach the race dummy (assignment aligns on the row index), then cast it to int
mitigated_reg_cont['blk'] = race_flag
mitigated_reg_cont['blk'] = mitigated_reg_cont['blk'].astype(int)
exog = mitigated_reg_cont[['const', 'blk']]
sm.OLS(mitigated_reg_cont['unif_wts'], exog).fit(cov_type='HC3', use_t=True).summary()

0,1,2,3
Dep. Variable:,unif_wts,R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,58.0
Date:,"Tue, 13 Apr 2021",Prob (F-statistic):,3.08e-14
Time:,17:01:53,Log-Likelihood:,-1770.3
No. Observations:,5368,AIC:,3545.0
Df Residuals:,5366,BIC:,3558.0
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.0241,0.005,602.816,0.000,3.014,3.034
blk,-0.0951,0.012,-7.616,0.000,-0.120,-0.071

0,1,2,3
Omnibus:,151.197,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,105.46
Skew:,-0.234,Prob(JB):,1.26e-23
Kurtosis:,2.498,Cond. No.,2.83


Compared to a predicted differential of -0.1066 according to the naive model, the mitigated model yields a smaller predicted differential of -0.0951.