In [None]:
import pandas as pd
import time
import warnings
import random, json
import pandas as pd
import datetime
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from scipy import stats

In [None]:
# Measuring the overall fairness w.r.t. a demographic attribute (Section 4.2.1) using:
# - OSA (Overall Score Accuracy)
# - OSD (Overall Score Difference)
# - CSD (Conditional Score Difference)
# Reference: Loukina, Anastassia, Nitin Madnani, and Klaus Zechner. "The many dimensions of algorithmic fairness in educational applications." Proceedings of the Fourteenth Workshop on Innovative Use of NLP for Building Educational Applications. 2019.
def _linear_r2(features, target):
    """Fit an ordinary least-squares model of `target` on `features` and return its in-sample R^2."""
    model = LinearRegression()
    model.fit(features, target)
    return metrics.r2_score(target, model.predict(features))


def overall_fairness(predicted,actual,demo,measure="OSA"):
    """Measure overall fairness of predictions w.r.t. a demographic attribute.

    Each measure is the R^2 of a linear regression that explains the
    prediction error from the demographic attribute:

    - "OSA": regress the squared error (predicted - actual)^2 on `demo`.
    - "OSD": regress the signed error (predicted - actual) on `demo`.
    - "CSD": regress the signed error on [`demo`, `actual`] and on `actual`
      alone; the result is the R^2 gained by adding `demo`.

    Parameters
    ----------
    predicted : array-like
        Predicted scores, one per sample (flattened to 1-D).
    actual : array-like
        Ground-truth scores, one per sample (flattened to 1-D).
    demo : array-like, pandas Series or DataFrame
        Demographic attribute(s), one row per sample. A 1-D input is treated
        as a single feature column.
        NOTE(review): values are fed straight into LinearRegression, so they
        are assumed to be numerically encoded -- confirm against the caller.
    measure : {"OSA", "OSD", "CSD"}, default "OSA"
        Which fairness measure to compute.

    Returns
    -------
    float or None
        The requested measure, or None (after printing a message) when
        `measure` is not one of the supported names.
    """
    if measure not in ("OSA", "OSD", "CSD"):
        print("Only OSA, OSD, and CSD are available currently!")
        return None

    # Accept plain arrays/lists as well as pandas objects.
    demo = np.asarray(demo)
    if demo.ndim < 2:
        demo = np.reshape(demo, [-1, 1])

    # Flatten both score vectors so that a column-vector input cannot
    # silently broadcast the subtraction into an (n, n) matrix.
    predicted = np.ravel(np.asarray(predicted))
    actual = np.ravel(np.asarray(actual))

    error = predicted - actual
    if measure == "OSA":
        error = np.square(error)
    error = np.reshape(error, [-1, 1])

    if measure in ("OSA", "OSD"):
        return _linear_r2(demo, error)

    # CSD: how much extra error variance does the demographic attribute
    # explain once the true score is already accounted for?
    actual_col = np.reshape(actual, [-1, 1])
    r2_actual_only = _linear_r2(actual_col, error)
    r2_with_demo = _linear_r2(np.column_stack([demo, actual_col]), error)
    return r2_with_demo - r2_actual_only

In [None]:
# Looking at fairness w.r.t., a specific group (Section 4.2.2) using: 
# - SMD(Standardized Mean Difference): checking the difference between standardized predicted scores and actual scores for each subgroup
# - MAE (Mean Absolute Error): checking the mean absolute error of each subgroup
# Reference: Loukina, Anastassia, Nitin Madnani, and Klaus Zechner. "The many dimensions of algorithmic fairness in educational applications." Proceedings of the Fourteenth Workshop on Innovative Use of NLP for Building Educational Applications. 2019.
def subgroup_fairness(predicted,actual,demo,measure="SMD"):
    """Compute a per-subgroup fairness measure for one demographic attribute.

    - "SMD": predictions and actual scores are z-scored over the whole
      sample; the value for a group is the mean (NaNs ignored) of
      z(predicted) - z(actual) over that group's members.
    - "MAE": the mean absolute error between the raw predicted and actual
      scores of that group's members.

    One line per group is printed with the measure value and the group size.

    Parameters
    ----------
    predicted : array-like
        Predicted scores, one per sample.
    actual : array-like
        Ground-truth scores, one per sample.
    demo : array-like or pandas Series
        Group label of each sample. Samples are matched to `predicted` and
        `actual` by position, so a pandas index on `demo` is ignored.
    measure : {"SMD", "MAE"}, default "SMD"
        Which measure to compute.

    Returns
    -------
    dict or None
        Mapping group label -> measure value, in order of first appearance
        of each label. None (after printing a message) when `measure` is
        not supported.
    """
    if measure not in ("SMD", "MAE"):
        print("Only SMD and MAE are availiable currently!")
        return None

    # Work on plain positional values: indexing a pandas Series with
    # demo[i] is a label lookup and fails on a non-default index.
    demo_values = np.ravel(np.asarray(demo)).tolist()
    index_by_group = {}
    for position, label in enumerate(demo_values):
        index_by_group.setdefault(label, []).append(position)

    predicted = np.array(predicted)
    actual = np.array(actual)
    if measure == "SMD":
        # Standardise over the full sample before slicing into groups.
        scores_pred = np.atleast_1d(np.squeeze(stats.zscore(predicted)))
        scores_actual = np.atleast_1d(np.squeeze(stats.zscore(actual)))
    else:
        scores_pred = predicted
        scores_actual = actual

    fairness_bygroup = {}
    for g, positions in index_by_group.items():
        g_ids = np.array(positions)
        preds_g = scores_pred[g_ids]
        actuals_g = scores_actual[g_ids]
        with warnings.catch_warnings():
            # nanmean warns on all-NaN slices; the NaN result is kept as is.
            warnings.simplefilter("ignore", category=RuntimeWarning)
            if measure == "SMD":
                metric = np.nanmean(preds_g - actuals_g, axis=0)
            else:
                metric = metrics.mean_absolute_error(actuals_g, preds_g)
        print("%s for %s is %f with %d students."%(measure,g,metric,len(g_ids)))
        fairness_bygroup[g] = metric
    return fairness_bygroup

In [None]:
# Measuring fairness w.r.t. the performance gap (Section 4.2.3) using:
# - KS(Kolmogorov-Smirnov distance metric), which has been frequently used to measure the difference between two data distributions. 
# Reference: Chzhen, Evgenii, et al. "Fair regression with wasserstein barycenters." Advances in Neural Information Processing Systems 33 (2020): 7321-7331.
def getKS(data_col,demo):
    """Approximate the Kolmogorov-Smirnov distance between two groups.

    The samples of `data_col` are split into the group where `demo == 0`
    and the group where `demo == 1`; any other value of `demo` is ignored.
    The empirical CDFs of both groups are evaluated on 1000 evenly spaced
    thresholds between the minimum and maximum of `data_col`, and the
    largest absolute gap between them is returned.

    Parameters
    ----------
    data_col : array-like
        Values to compare (flattened to 1-D).
    demo : array-like
        Group indicator per sample, matched to `data_col` by position.

    Returns
    -------
    float
        The largest CDF gap found on the grid, in [0, 1], or `np.inf` when
        either group has no samples (including empty input).
    """
    data_col = np.ravel(np.asarray(data_col))
    group_flags = np.ravel(np.asarray(demo))

    # Sort each group once so every threshold is a binary search rather
    # than a full pass over the data.
    values_g0 = np.sort(data_col[group_flags == 0])
    values_g1 = np.sort(data_col[group_flags == 1])
    if len(values_g0) == 0 or len(values_g1) == 0:
        return np.inf

    thresholds = np.linspace(data_col.min(), data_col.max(), 1000)
    # side="right" counts the elements <= threshold, i.e. the empirical CDF.
    cdf_g0 = np.searchsorted(values_g0, thresholds, side="right") / len(values_g0)
    cdf_g1 = np.searchsorted(values_g1, thresholds, side="right") / len(values_g1)
    return float(np.max(np.abs(cdf_g0 - cdf_g1)))

In [None]:
# Demographic columns of the feature table that fairness is evaluated against.
demo_attrs = ['Gender','Disability','NCCD-Funded','Kinder_Age', 'NumAbvYear9','NumAbvDiploma','NumProf','NumSibling','SiblingOrder']
def evaluate_fairness(y_pred,y_test,feature_test):
    """Report every fairness measure for each attribute in `demo_attrs`.

    For each attribute this computes the overall measures (OSA, OSD, CSD),
    the per-subgroup measures (SMD, MAE) and the KS performance gap of the
    predictions, prints them, and collects them in the returned dictionary.

    NOTE(review): `getKS` only compares the groups coded 0 and 1, so for
    attributes with other codings the reported gap covers just those two
    groups (or is infinite if one is absent) -- confirm this is intended.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores for the test samples.
    y_test : array-like
        Ground-truth scores for the test samples.
    feature_test : pandas DataFrame (or other mapping of column -> values)
        Must provide one column per name in `demo_attrs`.

    Returns
    -------
    dict
        Mapping attribute name -> {"aggregated": {...}, "subgroup": {...},
        "KS": value}.
    """
    results = {}
    for demo_attr in demo_attrs:
        demo_col = feature_test[demo_attr]

        # Fresh dictionaries per attribute so results never leak between
        # iterations.
        aggregated_fairness = {
            name: overall_fairness(y_pred, y_test, demo_col, measure=name)
            for name in ("OSA", "OSD", "CSD")
        }
        fairness_by_group = {
            name: subgroup_fairness(y_pred, y_test, demo_col, measure=name)
            for name in ("SMD", "MAE")
        }
        KS = getKS(y_pred,demo_col)

        print("For attribute ", demo_attr)
        print("aggregated fairness: ", aggregated_fairness)
        print("subgroup fairness: ", fairness_by_group)
        print("performance gap: ", KS)

        results[demo_attr] = {
            "aggregated": aggregated_fairness,
            "subgroup": fairness_by_group,
            "KS": KS,
        }
    return results