# Algorithmic Fairness, Accountability, and Ethics, Spring 2024

## Mandatory Assignment 2

Please use the following code to prepare the dataset.
 

In [None]:
from folktables.acs import adult_filter
from folktables import ACSDataSource, BasicProblem, generate_categories
import numpy as np
import pandas as pd
import random
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
# Silence all warnings so the notebook output stays readable.
# NOTE: this also hides library diagnostics; disable it when debugging.
warnings.filterwarnings("ignore")

# Seed both generators: `random` is used for the row subsampling further down,
# while scikit-learn draws from NumPy's global RNG whenever no explicit
# `random_state` is passed. Seeding only `random` leaves those steps unseeded.
random.seed(12)
np.random.seed(12)

# 2018 one-year ACS person survey, restricted to California.
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["CA"], download=True)


def adult_filter(data):
    """Apply Adult-style record filters to an ACS person-level frame.

    The Adult documentation describes its extraction rule as
    ((AAGE>16) && (AGI>100) && (AFNLWGT>1) && (HRSWK>0)). The corresponding
    conditions are applied here to the ACS columns, plus a race restriction.

    A row is kept only when all of the following hold:
      * AGEP  > 16
      * PINCP > 100
      * WKHP  > 0
      * PWGTP >= 1
      * RAC1P < 3   (keep only Whites and African-Americans)

    Returns the rows of `data` that pass every condition; `data` itself is
    left untouched.
    """
    keep = (
        (data['AGEP'] > 16)
        & (data['PINCP'] > 100)
        & (data['WKHP'] > 0)
        & (data['PWGTP'] >= 1)
        & (data['RAC1P'] < 3)
    )
    return data[keep]


# Income-prediction task: the target is whether PINCP exceeds 25000, the rows
# are pre-filtered with adult_filter, and SEX / RAC1P are the group columns.
ACSIncomeNew = BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'RELP',
        'WKHP',
        'PWGTP',
        'SEX',
        'RAC1P',
    ],
    target='PINCP',
    target_transform=lambda x: x > 25000,
    group=['SEX', 'RAC1P'],
    preprocess=adult_filter,
    # The second positional parameter of np.nan_to_num is `copy`, not the fill
    # value, so `np.nan_to_num(x, -1)` replaces NaN with 0.0. The fill value
    # has to be passed by keyword for NaNs to become -1.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)

# Fetch the ACS definitions and use them to expand the categorical features
# into named dummy columns.
definition_df = data_source.get_definitions(download=True)
categories = generate_categories(
    features=ACSIncomeNew.features,
    definition_df=definition_df,
)
features, labels, groups = ACSIncomeNew.df_to_pandas(
    acs_data,
    categories=categories,
    dummies=True,
)

# Drop the "redundant" columns (one dummy level per categorical feature)
redundant_columns = [
    "RAC1P_White alone",
    "SEX_Male",
    "SCHL_1 or more years of college credit, no degree",
    "MAR_Divorced",
    "RELP_Adopted son or daughter",
    'COW_Working without pay in family business or farm',
]
features = features.drop(columns=redundant_columns)

# List the positions of the remaining protected-attribute columns.
print("Columns with the protected features:")
for i, f in enumerate(features.columns):
    is_protected = any(tag in f for tag in ("RAC1P", "SEX"))
    if is_protected:
        print("Column ID: %s" %i, "(%s)"%f)

features.head()

In [None]:
### Train test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Shared seed for the scikit-learn components below. Without an explicit
# random_state they draw from NumPy's global RNG, which random.seed() does not
# control, so the split (and every reported score) would change between runs.
RANDOM_STATE = 12

# Work on a random subsample of at most 20000 rows; the min() guard keeps
# random.sample from raising when fewer rows are available.
sample_indices = random.sample(range(len(features)), min(20000, len(features)))
features, labels = features.iloc[sample_indices], labels.iloc[sample_indices]

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=RANDOM_STATE
)

# Fit the scaler on the training rows only, then apply it to both splits while
# keeping the original column names and row index.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(columns= X_train.columns, data= scaler.transform(X_train), index = X_train.index)
X_test = pd.DataFrame(columns= X_test.columns, data = scaler.transform(X_test), index= X_test.index)

### Create one classifier to predict income on RAW DATA
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(
    n_estimators=100, learning_rate=1.0, max_depth=1, random_state=RANDOM_STATE
)
gbc.fit(X_train, y_train)

### Report general accuracy
from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_curve
from sklearn.model_selection import cross_val_score

probs = gbc.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, probs)
# NOTE: despite its name, `preds` holds the overall accuracy at a 0.5 threshold.
preds = accuracy_score(y_test, probs > 0.5)

# NOTE: cross_val_score refits clones of `gbc` on folds of the data it is
# given, so the "Cross val score" lines describe models trained on test rows,
# not the `gbc` fitted above.
print('Scores across all groups.')
print('AUC: ', auc(fpr, tpr))
print('Accuracy: ', preds)
print('Cross val score: ', cross_val_score(gbc, X_test, y_test))
print('---------------------------')

### report accuracy for gender groups and races

race_columns = [col for col in X_test.columns if "RAC1P" in col]

# (label used in the printout, indicator column in X_test)
protected_groups = [
    ('SEX', 'SEX_Female'),
    ('RAC1P', 'RAC1P_Black or African American alone'),
]

for group_name, group_column in protected_groups:
    # The indicator columns were standardized above, so `i` is the scaled
    # value of the indicator rather than a raw 0/1 flag.
    for i in sorted(X_test[group_column].unique()):
        # X_test and y_test come from the same split and are row-aligned, so a
        # positional mask selects matching rows from both.
        in_group = (X_test[group_column] == i).to_numpy()
        X_test_temp = X_test[in_group]
        y_test_temp = y_test[in_group]
        temp_preds = gbc.predict(X_test_temp)
        accuracy = accuracy_score(y_test_temp, temp_preds)
        print(f'Accuracy for {group_name} {i}: {accuracy}')
        # Cross-validate on the subgroup itself (the SEX loop used to pass the
        # full test set here, repeating the overall score for every group).
        print('Cross val score: ', cross_val_score(gbc, X_test_temp, y_test_temp))
        print('---------------------------')

In [None]:
# Split the standardized matrices into protected ("_p") and non-protected
# ("_np") column groups. The split point is derived from the dummy-column
# names instead of a hard-coded index, so it stays correct if the set of
# dummy columns changes.
protected_mask = np.array(
    [col.startswith(("SEX_", "RAC1P_")) for col in X_train.columns]
)
n_unprotected = int((~protected_mask).sum())

# Positional slicing is only valid when every protected column sits after
# every non-protected one; fail loudly otherwise.
if protected_mask[:n_unprotected].any():
    raise ValueError("protected columns must be the trailing columns of X_train")

Xs_train_p = X_train.values[:, n_unprotected:]
Xs_test_p = X_test.values[:, n_unprotected:]
Xs_train_np = X_train.values[:, :n_unprotected]
Xs_test_np = X_test.values[:, :n_unprotected]

In [None]:
# ### Create a fairer version of the dataset to protect select groups
# protected_cols = ['RAC1P_Black or African American alone','SEX_Female']
# X_train_unprotected, X_train_protected = X_train.drop(columns = protected_cols), X_train[protected_cols]
# X_test_unprotected, X_test_protected = X_test.drop(columns = protected_cols), X_test[protected_cols]

def debias_features(Xs_np, Xs_p):
    """Remove from the non-protected features their projection onto the protected ones.

    Parameters:
        Xs_np: 2-D array of non-protected features, one row per sample.
        Xs_p: 2-D array of protected features with the same number of rows.

    Returns:
        An array shaped like `Xs_np` holding the residual of each column after
        projecting it onto the column space of `Xs_p`.

    Raises:
        ValueError: if the two inputs do not have the same number of rows.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.linalg` is loaded.
    from scipy.linalg import orth

    if Xs_np.shape[0] != Xs_p.shape[0]:
        raise ValueError(
            "Xs_np and Xs_p must have the same number of rows, got "
            f"{Xs_np.shape[0]} and {Xs_p.shape[0]}"
        )

    # Find orthonormal basis of protected features
    orthbasis = orth(Xs_p)

    # Debias nonprotected features. The parentheses matter: evaluating
    # left-to-right would first build an (n_samples x n_samples) matrix.
    Xs_np_debiased = Xs_np - orthbasis @ (orthbasis.T @ Xs_np)

    # Return debiased nonprotected features
    return Xs_np_debiased

# Debias the non-protected training columns, then put the unchanged protected
# columns back on the right-hand side.
X_train_unprotected_debiased = debias_features(Xs_np=Xs_train_np, Xs_p=Xs_train_p)
X_train_debiased = np.concatenate(
    (X_train_unprotected_debiased, Xs_train_p),
    axis=1,
)


In [None]:
from scipy.stats import pearsonr

n_features = X_train.shape[1]
alpha = 0.05 # Significance level
corrected_alpha = alpha / (n_features**2/2) # Bonferroni correction for multiple testings

# Protected columns start right after the non-protected block produced by the
# protected / non-protected split, so reuse its width instead of a literal.
n_unprotected = Xs_train_np.shape[1]

# Compute correlation matrix. Pearson's r is symmetric, so only the upper
# triangle is evaluated and then mirrored.
X_train_values = X_train.values
corr = np.zeros((n_features, n_features))
p = np.zeros((n_features, n_features))
for i in range(n_features):
    for j in range(i, n_features):
        r_ij, p_ij = pearsonr(X_train_values[:, i], X_train_values[:, j])
        if np.isnan(r_ij):
            # Undefined correlation: treat as not significant so the cell is masked.
            p_ij = 1
        corr[i, j] = corr[j, i] = r_ij
        p[i, j] = p[j, i] = p_ij

# Show only the correlations with the protected columns, hiding the cells
# whose p value does not pass the corrected significance level.
plt.figure(figsize=(2,9))
sns.heatmap(
    corr[:, n_unprotected:],
    cmap="coolwarm",
    xticklabels=features.columns[n_unprotected:],
    yticklabels=features.columns,
    mask=p[:, n_unprotected:] > corrected_alpha,
    vmin=-1,
    vmax=1,
)
plt.title("Pearson's Correlation Coeff between protected features and other variables (Masked by p value)")
plt.show()

In [None]:
# Compute correlation matrix of the debiased training data. Pearson's r is
# symmetric, so only the upper triangle is evaluated and then mirrored.
n_features = X_train_debiased.shape[1]
corr_ = np.zeros((n_features, n_features))
p_ = np.zeros((n_features, n_features))
for i in range(n_features):
    for j in range(i, n_features):
        r_ij, p_ij = pearsonr(X_train_debiased[:, i], X_train_debiased[:, j])
        corr_[i, j] = corr_[j, i] = r_ij
        p_[i, j] = p_[j, i] = p_ij

# Replace undefined correlations with 0 once, after the loop. The fill value
# must be given as `nan=`; the second positional parameter of np.nan_to_num
# is `copy`.
corr_ = np.nan_to_num(corr_, nan=0.0)

# Protected columns are the ones appended after the debiased non-protected block.
n_unprotected = X_train_unprotected_debiased.shape[1]

# Plot correlations with protected features (no p-value mask is applied here)
plt.figure(figsize=(4,15))
sns.heatmap(
    corr_[:, n_unprotected:],
    cmap="coolwarm",
    xticklabels=features.columns[n_unprotected:],
    yticklabels=features.columns,
    vmin=-1,
    vmax=1,
)
plt.title("Pearson's Correlation Coeff between protected features and other variables (after debiasing)")
plt.show()

In [None]:
### Build model using de-correlation effect from https://dl.acm.org/doi/10.1145/3375627.3375864

### Record results and plot differences in accuracies



In [None]:
### Build classifier using reprojected data from Fair PCA

