In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric

In [341]:
X_train = pd.read_csv('../data/preprocessed_X_train.csv')
X_test = pd.read_csv('../data/preprocessed_X_test.csv')
y_train = pd.read_csv('../data/preprocessed_y_train.csv')
y_test = pd.read_csv('../data/preprocessed_y_test.csv')

In [342]:
columns_to_drop = ['interest_rate', 'rate_spread', 'origination_charges', 'total_loan_costs']
X_train = X_train.drop(columns=columns_to_drop, axis=1)
X_test = X_test.drop(columns=columns_to_drop, axis=1)

In [None]:
X_train.info()

In [47]:
feature_names = X_train.columns.tolist()

In [343]:
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy().flatten()
y_test = y_test.to_numpy().flatten()

In [344]:
def eval(model, X, y):
    kfold = KFold(n_splits=10)

    metrics = {
        "Accuracy": [],
        "F1": [],
        "ROC AUC": [],
        "Precision": [],
        "Recall": []
    }

    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_prob = model.predict_proba(X_test)[:, 1]

        metrics['Accuracy'].append(accuracy_score(y_test, y_pred))
        metrics['F1'].append(f1_score(y_test, y_pred))
        metrics['ROC AUC'].append(roc_auc_score(y_test, y_pred_prob))
        metrics['Precision'].append(precision_score(y_test, y_pred))
        metrics['Recall'].append(recall_score(y_test, y_pred))
    
    metrics = {metric: np.mean(value) for metric, value in metrics.items()}

    return metrics

In [335]:
lr = LogisticRegression()

In [None]:
metrics = eval(lr, X_train, y_train)
metrics

In [345]:
rf = RandomForestClassifier()
xgb = XGBClassifier()
mlp = MLPClassifier()

models = [rf, xgb, mlp]

In [346]:
result = {}
for model in models:
    metrics = eval(model, X_train, y_train)
    result[model.__class__.__name__] = metrics

result



In [24]:
smote = SMOTE(random_state=334)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [25]:
def train_evaluate(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return classification_report(y_test, y_pred)

In [None]:
smote_result = {}
models.append(lr)
for model in models:
    report = train_evaluate(model, X_train_smote, y_train_smote, X_test, y_test)
    smote_result[model.__class__.__name__] = report

In [29]:
print(smote_result['RandomForestClassifier'])

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7627
           1       1.00      1.00      1.00     22373

    accuracy                           0.99     30000
   macro avg       0.99      0.99      0.99     30000
weighted avg       0.99      0.99      0.99     30000



In [30]:
print(smote_result['XGBClassifier'])

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7627
           1       1.00      1.00      1.00     22373

    accuracy                           0.99     30000
   macro avg       0.99      0.99      0.99     30000
weighted avg       0.99      0.99      0.99     30000



In [31]:
print(smote_result['MLPClassifier'])

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7627
           1       0.99      0.99      0.99     22373

    accuracy                           0.99     30000
   macro avg       0.99      0.99      0.99     30000
weighted avg       0.99      0.99      0.99     30000



In [32]:
print(smote_result['LogisticRegression'])

              precision    recall  f1-score   support

           0       0.53      0.70      0.60      7627
           1       0.88      0.79      0.83     22373

    accuracy                           0.76     30000
   macro avg       0.71      0.74      0.72     30000
weighted avg       0.79      0.76      0.77     30000



In [35]:
def get_feature_importance(model, X_train, y_train):
    importances = model.feature_importances_
    perm_importances = permutation_importance(model, X_train, y_train).importances_mean

    imp_indices = np.argsort(importances)[::-1]
    top10_imp_indices = imp_indices[:10]

    perm_indices = np.argsort(perm_importances)[::-1]
    top10_perm_indices = perm_indices[:10]

    return importances[top10_imp_indices], perm_importances[top10_perm_indices]

In [54]:
xgb.fit(X_train_smote, y_train_smote)
importances = xgb.feature_importances_
imp_dict = dict(zip(feature_names, importances))
imp_df = pd.DataFrame(sorted(imp_dict.items(), key=lambda x: x[1], reverse=True), columns=['Feature', 'Importance'])

In [56]:
xgb_imp = imp_df.copy()

In [None]:
xgb_imp.head(20)

In [52]:
rf_imp = imp_df.copy()

In [None]:
rf_imp.head(20)

In [338]:
def get_param_grid(model_name):
    if model_name == "rf":
        param_grid = {
            'n_estimators': [10, 50, 100],
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [None, 'sqrt', 'log2']
        }
    elif model_name == "xgb":
        param_grid = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 6, 9],
            'subsample': [0.7, 0.9, 1],
            'colsample_bytree': [0.7, 0.9, 1],
            'min_child_weight': [1, 3, 5]
        }
    elif model_name == "mlp":
        param_grid = {
            
        }
    else:
        param_grid = {}

    return param_grid

In [339]:
def clf_tuning(model, param_grid, X_train, y_train, X_test, y_test):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1')
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    y_pred = best_model.predict(X_test)
    y_pred_prob = best_model.predict_proba(X_test)[:, 1]

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_pred_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred)
    }

    return best_params, metrics

In [None]:
rf_param = get_param_grid('rf')
best_rf_param, rf_metrics = clf_tuning(RandomForestClassifier(), rf_param, X_train, y_train, X_test, y_test)

In [4]:
X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_train = pd.read_csv('../data/y_train.csv')
y_test = pd.read_csv('../data/y_test.csv')

In [5]:
columns_to_drop = ['interest_rate', 'rate_spread', 'origination_charges', 'total_loan_costs']
X_train = X_train.drop(columns=columns_to_drop, axis=1)
X_test = X_test.drop(columns=columns_to_drop, axis=1)

In [6]:
def recode_categorical_cols(df, recode_map):
    for col, mapping in recode_map.items():
        df[col] = df[col].replace(mapping)
    return df

In [7]:
# Privileged: 1, Unprivileged: 0
recode_map = {
    'ethnicity': {'Not Hispanic or Latino': 1, 'Hispanic or Latino': 0},
    'race': {'White': 1, 'Black or African American': 2, 'Asian': 3, 'American Indian or Alaska Native': 4, '2 or more minority races': 5, 'Native Hawaiian or Other Pacific Islander': 6},
    'sex': {'Male': 1, 'Female': 0},
    'age': {'<25': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55-64': 5, '65-74': 6, '>74': 7}
}

X_train = recode_categorical_cols(X_train, recode_map)
X_test = recode_categorical_cols(X_test, recode_map)

In [8]:
def imputing(X_train, X_test):
    # Impute missing values with median for floating point columns and most frequent values (mode) for others
    imputer_mode = SimpleImputer(strategy='most_frequent')
    imputer_median = SimpleImputer(strategy='median')
    for col in X_train.columns:
        if X_train[col].dtype == 'float64':
            X_train[col] = imputer_median.fit_transform(X_train[[col]]).ravel()
            X_test[col] = imputer_median.transform(X_test[[col]]).ravel()
        else:
            X_train[col] = imputer_mode.fit_transform(X_train[[col]]).ravel()
            X_test[col] = imputer_mode.transform(X_test[[col]]).ravel()

    return X_train, X_test

In [9]:
X_train, X_test = imputing(X_train, X_test)

In [12]:
def scaling(X_train, X_test):
    scaler = StandardScaler()
    numeric_cols = ['loan_amount', 'combined_loan_to_value_ratio', 'loan_term', 'property_value', 'total_units', 'income']
    for col in numeric_cols:
        X_train[col] = scaler.fit_transform(X_train[[col]])
        X_test[col] = scaler.transform(X_test[[col]])

    return X_train, X_test

In [13]:
X_train, X_test = scaling(X_train, X_test)

In [14]:
def select_features(X_train, X_test, y_train):
    train_data = pd.concat([X_train, y_train], axis=1)
    corr_matrix = train_data.corr()
    target_corr = corr_matrix.iloc[:-1, -1].abs()
    col_to_drop = set(target_corr[target_corr > 0.8].index.tolist())

    for i in range(len(corr_matrix.columns) - 1):
        for j in range(i+1, len(corr_matrix.columns) - 1):
            if abs(corr_matrix.iloc[i, j]) > 0.9:
                if target_corr[i] > target_corr[j]:
                    col_to_drop.add(corr_matrix.columns[j])
                else:
                    col_to_drop.add(corr_matrix.columns[i])

    X_train = X_train.drop(columns=list(col_to_drop))
    X_test = X_test.drop(columns=list(col_to_drop))

    return X_train, X_test

In [15]:
X_train, X_test = select_features(X_train, X_test, y_train)

In [16]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 22 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   state_code                                70000 non-null  object 
 1   conforming_loan_limit                     70000 non-null  object 
 2   loan_product_type                         70000 non-null  object 
 3   dwelling_category                         70000 non-null  object 
 4   ethnicity                                 70000 non-null  int64  
 5   race                                      70000 non-null  int64  
 6   sex                                       70000 non-null  int64  
 7   preapproval                               70000 non-null  int64  
 8   loan_purpose                              70000 non-null  object 
 9   open_end_line_of_credit                   70000 non-null  object 
 10  loan_amount                       

In [17]:
X_train['manufactured_home_secured_property_type'] = X_train['manufactured_home_secured_property_type'].astype(object)
X_test['manufactured_home_secured_property_type'] = X_test['manufactured_home_secured_property_type'].astype(object)

In [20]:
def encoding(X_train, X_test):
    categorical_cols = list(range(0, 4)) + [8, 9, 12, 15, 17, 20]
    categorical_names = X_train.columns[categorical_cols]

    X_train_encoded = pd.get_dummies(X_train[categorical_names])
    X_test_encoded = pd.get_dummies(X_test[categorical_names])

    X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='outer', axis=1, fill_value=0)
    
    X_train = X_train.drop(columns=categorical_names)
    X_test = X_test.drop(columns=categorical_names)

    X_train = pd.concat([X_train, X_train_encoded], axis=1)
    X_test = pd.concat([X_test, X_test_encoded], axis=1)

    return X_train, X_test

In [21]:
X_train, X_test = encoding(X_train, X_test)

In [22]:
X_train, X_test = select_features(X_train, X_test, y_train)

In [23]:
X_train = X_train.astype({'ethnicity': int, 'race': int, 'sex': int, 'age': int})
X_test = X_test.astype({'ethnicity': int, 'race': int, 'sex': int, 'age': int})

In [24]:
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [25]:
bld_ethnicity_train = BinaryLabelDataset(df=train_df, label_names=['loan_approved'], protected_attribute_names=['ethnicity'], favorable_label=1, unfavorable_label=0)
bld_ethnicity_test = BinaryLabelDataset(df=test_df, label_names=['loan_approved'], protected_attribute_names=['ethnicity'], favorable_label=1, unfavorable_label=0)
privileged_ethnicity = [{'ethnicity': 1}]
unprivileged_ethnicity = [{'ethnicity': 0}]
dataset_metric_ethnicity = BinaryLabelDatasetMetric(bld_ethnicity_train, privileged_groups=privileged_ethnicity, unprivileged_groups=unprivileged_ethnicity)

In [26]:
print("Dataset Fairness Metrics w.r.t. Ethnicity: ")
print("Disparate Impact: ", dataset_metric_ethnicity.disparate_impact())
print("Statistical Parity Difference: ", dataset_metric_ethnicity.statistical_parity_difference())

Dataset Fairness Metrics w.r.t. Ethnicity: 
Disparate Impact:  0.9356058188162372
Statistical Parity Difference:  -0.04842549472956659


In [27]:
bld_race_train = BinaryLabelDataset(df=train_df, label_names=['loan_approved'], protected_attribute_names=['race'], favorable_label=1, unfavorable_label=0)
bld_race_test = BinaryLabelDataset(df=test_df, label_names=['loan_approved'], protected_attribute_names=['race'], favorable_label=1, unfavorable_label=0)
privileged_race = [{'race': 1}]
unprivileged_race = [{'race': 2, 'race': 3, 'race': 4, 'race': 5, 'race': 6}]
dataset_metric_race = BinaryLabelDatasetMetric(bld_race_train, privileged_groups=privileged_race, unprivileged_groups=unprivileged_race)

In [28]:
print("Dataset Fairness Metrics w.r.t. Race: ")
print("Disparate Impact: ", dataset_metric_race.disparate_impact())
print("Statistical Parity Difference: ", dataset_metric_race.statistical_parity_difference())

Dataset Fairness Metrics w.r.t. Race: 
Disparate Impact:  0.8368631316179284
Statistical Parity Difference:  -0.12449855674071242


In [29]:
bld_sex_train = BinaryLabelDataset(df=train_df, label_names=['loan_approved'], protected_attribute_names=['sex'], favorable_label=1, unfavorable_label=0)
bld_sex_test = BinaryLabelDataset(df=test_df, label_names=['loan_approved'], protected_attribute_names=['sex'], favorable_label=1, unfavorable_label=0)
privileged_sex = [{'sex': 1}]
unprivileged_sex = [{'sex': 0}]
dataset_metric_sex = BinaryLabelDatasetMetric(bld_sex_train, privileged_groups=privileged_sex, unprivileged_groups=unprivileged_sex)

In [30]:
print("Dataset Fairness Metrics w.r.t. Sex: ")
print("Disparate Impact: ", dataset_metric_sex.disparate_impact())
print("Statistical Parity Difference: ", dataset_metric_sex.statistical_parity_difference())

Dataset Fairness Metrics w.r.t. Sex: 
Disparate Impact:  0.9767675407948306
Statistical Parity Difference:  -0.017492347950814224


In [31]:
bld_age_train = BinaryLabelDataset(df=train_df, label_names=['loan_approved'], protected_attribute_names=['age'], favorable_label=1, unfavorable_label=0)
bld_age_test = BinaryLabelDataset(df=test_df, label_names=['loan_approved'], protected_attribute_names=['age'], favorable_label=1, unfavorable_label=0)
privileged_age = [{'age': 2, 'age': 3, 'age': 4, 'age': 5, 'age': 6}]
unprivileged_age = [{'age': 1, 'age': 7}]
dataset_metric_age = BinaryLabelDatasetMetric(bld_age_train, privileged_groups=privileged_age, unprivileged_groups=unprivileged_age)

In [32]:
print("Dataset Fairness Metrics w.r.t. Age: ")
print("Disparate Impact: ", dataset_metric_age.disparate_impact())
print("Statistical Parity Difference: ", dataset_metric_age.statistical_parity_difference())

Dataset Fairness Metrics w.r.t. Age: 
Disparate Impact:  0.9457166843702748
Statistical Parity Difference:  -0.03842803554826446


In [33]:
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy().flatten()
y_test = y_test.to_numpy().flatten()

In [34]:
xgb = XGBClassifier(scale_pos_weight=1, min_split_loss=0, max_depth=8, learning_rate=0.5, booster='dart')

xgb.fit(X_train, y_train)

In [35]:
def evaluate_metrics(model, bld_test, privileged, unprivileged):
    pred = model.predict(X_test)
    bld_pred = bld_test.copy()
    bld_pred.labels = pred.reshape(-1, 1)

    metrics = ClassificationMetric(bld_test, bld_pred, unprivileged_groups=unprivileged, privileged_groups=privileged)
    print("Disparate Impact: ", metrics.disparate_impact())
    print("Statistical Parity Difference: ", metrics.statistical_parity_difference())
    print("Average Odds Difference: ", metrics.average_odds_difference())
    print("Equal Opportunity Difference: ", metrics.equal_opportunity_difference())
    print("Theil Index: ", metrics.theil_index())

In [36]:
print("Fairness Metrics w.r.t. Ethnicity")
evaluate_metrics(xgb, bld_ethnicity_test, privileged_ethnicity, unprivileged_ethnicity)

Fairness Metrics w.r.t. Ethnicity
Disparate Impact:  0.9337525432503483
Statistical Parity Difference:  -0.054622836308257705
Average Odds Difference:  -0.033158073830614226
Equal Opportunity Difference:  -0.012441067223648017
Theil Index:  0.07490013945667533


In [37]:
print("Fairness Metrics w.r.t. Race")
evaluate_metrics(xgb, bld_race_test, privileged_race, unprivileged_race)

Fairness Metrics w.r.t. Race
Disparate Impact:  0.7749484427377212
Statistical Parity Difference:  -0.18807402807919227
Average Odds Difference:  -0.15315057654661113
Equal Opportunity Difference:  -0.09656041911476687
Theil Index:  0.07490013945667533


In [38]:
print("Fairness Metrics w.r.t. Sex")
evaluate_metrics(xgb, bld_sex_test, privileged_sex, unprivileged_sex)

Fairness Metrics w.r.t. Sex
Disparate Impact:  0.9600566965175855
Statistical Parity Difference:  -0.03319464707047781
Average Odds Difference:  -0.030225463819611192
Equal Opportunity Difference:  -0.001978385632363855
Theil Index:  0.07490013945667533


In [39]:
print("Fairness Metrics w.r.t. Age")
evaluate_metrics(xgb, bld_age_test, privileged_age, unprivileged_age)

Fairness Metrics w.r.t. Age
Disparate Impact:  0.9426736192791239
Statistical Parity Difference:  -0.04487307359838777
Average Odds Difference:  -0.028804461469031123
Equal Opportunity Difference:  -0.009220161093483115
Theil Index:  0.07490013945667533
