# Experiments with Home Credit

In [2]:
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score, brier_score_loss, roc_curve
from sklearn.inspection import PartialDependenceDisplay
from lime import lime_tabular
import shap
from cfmining.algorithms import MAPOCAM, BruteForce, Greedy
from cfmining.criteria import PercentileCalculator, PercentileCriterion, PercentileChangesCriterion, NonDomCriterion
from cfmining.predictors import MonotoneClassifier
from cfmining.visualization import buildTable, PlotCounterfactuals
from cfmining.mip_builder import RecourseBuilder
from cfmining.action_set import ActionSet
import joblib


import credit_pipeline.data_exploration as dex
from credit_pipeline.training import *


%load_ext autoreload
%autoreload 2

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


## Loading and Cleaning

In [3]:
seed_number = 0

In [4]:
path = "../data/HomeCredit/"
df = dex.read_csv_encoded(path, 'application_train.csv')

In [5]:
columns_to_drop = dex.check_missing(df, 50,  False)
columns_to_drop

['COMMONAREA_MEDI',
 'COMMONAREA_AVG',
 'COMMONAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAPARTMENTS_MEDI',
 'FONDKAPREMONT_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAPARTMENTS_MEDI',
 'FLOORSMIN_AVG',
 'FLOORSMIN_MODE',
 'FLOORSMIN_MEDI',
 'YEARS_BUILD_MEDI',
 'YEARS_BUILD_MODE',
 'YEARS_BUILD_AVG',
 'OWN_CAR_AGE',
 'LANDAREA_MEDI',
 'LANDAREA_MODE',
 'LANDAREA_AVG',
 'BASEMENTAREA_MEDI',
 'BASEMENTAREA_AVG',
 'BASEMENTAREA_MODE',
 'EXT_SOURCE_1',
 'NONLIVINGAREA_MODE',
 'NONLIVINGAREA_AVG',
 'NONLIVINGAREA_MEDI',
 'ELEVATORS_MEDI',
 'ELEVATORS_AVG',
 'ELEVATORS_MODE',
 'WALLSMATERIAL_MODE',
 'APARTMENTS_MEDI',
 'APARTMENTS_AVG',
 'APARTMENTS_MODE',
 'ENTRANCES_MEDI',
 'ENTRANCES_AVG',
 'ENTRANCES_MODE',
 'LIVINGAREA_AVG',
 'LIVINGAREA_MODE',
 'LIVINGAREA_MEDI',
 'HOUSETYPE_MODE']

In [6]:
df = df.drop(columns_to_drop, axis=1)

In [7]:
df_cols = df.columns.to_list()
obj_cols = dex.list_by_type(df, ['O'])

In [8]:
def days_to_years(dataframe, col_name):
        """
        Converts values from string to numeric.
        Uses the map function to convert the information on days employed to years employed
        """
        df_name = dataframe.copy()

        if col_name in df.columns:
            #Converts values from string to numeric.
            df_name[col_name] = pd.to_numeric(df_name[col_name], errors='coerce')

            #drops null values on the column
            df_name = df_name.dropna(subset=[col_name])

            #Use the map function to convert the information on days employed to years employed
            year = df_name.loc[:, col_name].map(lambda x: int(abs(x / 365)), na_action=None)
            df_name['YEARS'+col_name[4:]] = year

            #drops the column
            df_name = df_name.drop(col_name, axis=1)

        return df_name


df = days_to_years(df, "DAYS_EMPLOYED")
df = days_to_years(df, 'DAYS_BIRTH')

## Training Basic Models

In [9]:
X_acp = df.iloc[:, (df.columns != "TARGET") & (df.columns != "SK_ID_CURR")]
y_acp = df["TARGET"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X_acp, 
    y_acp, 
    test_size = 0.2, 
    random_state=seed_number, 
    stratify=y_acp
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size = 0.2, 
    random_state=seed_number, 
    stratify=y_train
)

In [15]:
classifiers = {
    "Logistic Regression": LogisticRegression,
    "Random Forest": RandomForestClassifier,
    "LightGBM": LGBMClassifier,
    "MLPC" : MLPClassifier,
}

In [93]:
param_spaces = {
    "LogisticRegression": {
        'C': {'low': 0.001, 'high': 10, 'log': True, 'type':'float'},
        'max_iter': {'low': 1000, 'high': 1000, 'step':1, 'type':'int'},
        'penalty': {'choices': ['l2'], 'type':'categorical'},
        "class_weight" : {"choices" : ["balanced"], 'type':'categorical'},
    },
    "RandomForestClassifier": {
        'n_estimators': {'low':10, 'high':150, 'step':20, 'type':'int'},
        'max_depth': {'low':2, 'high':10, 'type':'int'},
        'criterion': {'choices':['gini', 'entropy'], 'type':'categorical'},
        'min_samples_leaf' : {"low" : 1, "high" : 51, "step" : 5, 'type':'int'},
        "max_features" : {"low" : 0.1, "high" : 1.0, "type" : "float"},
        "class_weight" : {"choices" : ["balanced"], 'type':'categorical'},
    },
    "LGBMClassifier": {
        'learning_rate': {'low': 0.01, 'high': 1.0, 'type': 'float', 'log': True},
        "num_leaves" : {"low" : 10, "high" : 100, "step" : 5, 'type':'int'},
        'max_depth': {'low': 2, 'high': 10, 'type': 'int'},
        'min_child_samples': {'low': 1, 'high': 51, 'step': 5, 'type': 'int'},
        'colsample_bytree': {'low': 0.1, 'high': 1.0, 'type': 'float'},
        'reg_alpha': {'low': 0.0, 'high': 1.0, 'type': 'float'},
        'reg_lambda': {'low': 0.0, 'high': 1.0, 'type': 'float'},
        'n_estimators': {'low': 10, 'high': 100, 'step': 10, 'type': 'int'},
        "is_unbalance" : {"choices" : [True], 'type':'categorical'},
        "verbose" : {"choices" : [-1], 'type':'categorical'},

    },
    "MLPClassifier": {
        "hidden_layer_sizes" : {"choices" : [
            [128, 64, 32],
            [128, 64, 32, 16],
            [256, 128, 64, 32, 16],
        ], 'type':'categorical'},
        "alpha" : {'low': 0.0001, 'high': 0.01, 'type': 'float', 'log': True},
        "learning_rate" : {'choices': ['constant', 'invscaling', 'adaptive'], 'type':'categorical'},
        "learning_rate_init" : {'low': 0.001, 'high': 0.1, 'type': 'float', 'log': True},
        "early_stopping" : {'choices': [True], 'type':'categorical'},
        "max_iter" : {"choices" : [50], 'type':'categorical'},
    }
}

In [None]:
study_logistic, model_logistic = optimize_model(LogisticRegression, param_spaces["LogisticRegression"], X_train, y_train, X_val , y_val, n_trials=10)
joblib.dump(model_logistic, "models/model_logistic.joblib")

In [79]:
print("Score for Logistic Regression: ", study_logistic.best_value)
print(study_logistic.best_params)

Score for Logistic Regression:  0.7414958768420731
{'C': 7.155682161754871, 'max_iter': 1000, 'penalty': 'l2', 'class_weight': 'balanced'}


In [None]:
study_rf, model_rf = optimize_model(RandomForestClassifier, param_spaces["RandomForestClassifier"], X_train, y_train, X_val , y_val, n_trials=10)
joblib.dump(model_rf, "models/model_rf.joblib")

In [85]:
print("Score for Random Forest: ", study_rf.best_value)
print(study_rf.best_params)

Score for Random Forest:  0.7387247989964685
{'n_estimators': 70, 'max_depth': 10, 'criterion': 'gini', 'min_samples_leaf': 41, 'max_features': 0.5760054277776141, 'class_weight': 'balanced'}


In [None]:
study_lgbm, model_lgbm = optimize_model(LGBMClassifier, param_spaces["LGBMClassifier"], X_train, y_train, X_val , y_val, n_trials=10)
joblib.dump(model_lgbm, "models/model_lgbm.joblib")

In [96]:
print("Score for LGBM: ", study_lgbm.best_value)
print(study_lgbm.best_params)

Score for LGBM:  0.7471246993379925
{'learning_rate': 0.12520653814999466, 'num_leaves': 75, 'max_depth': 7, 'min_child_samples': 26, 'colsample_bytree': 0.4812893194050143, 'reg_alpha': 0.6458941130666561, 'reg_lambda': 0.4375872112626925, 'n_estimators': 90, 'is_unbalance': True, 'verbose': -1}


In [None]:
study_mlp, model_mlp = optimize_model(MLPClassifier, param_spaces["MLPClassifier"], X_train, y_train, X_val , y_val, n_trials=10)
joblib.dump(model_mlp, "models/model_mlp.joblib")

In [None]:
print("Score for MLP: ", study_mlp.best_value)
print(study_mlp.best_params)

Score for MLP:  0.7433528703800805
{'hidden_layer_sizes': [128, 64, 32], 'alpha': 0.0003527051808306031, 'learning_rate': 'invscaling', 'learning_rate_init': 0.010035211915818264, 'early_stopping': True, 'max_iter': 50}


## Model evaluation

In [86]:
model_logistic = joblib.load("models/model_logistic.joblib")
model_rf = joblib.load("models/model_rf.joblib")
model_lgbm = joblib.load("models/model_lgbm.joblib")
model_mlp = joblib.load("models/model_mlp.joblib")

In [87]:
def ks_threshold(y_true, y_score):
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    opt_threshold = thresholds[np.argmax(tpr - fpr)]
    return opt_threshold

def get_metrics(y_true, y_score, y_pred):
    metrics_dict = {
        "roc_auc" : roc_auc_score(y_true, y_score),
        "balanced_accuracy" : balanced_accuracy_score(y_true, y_pred),
        "accuracy" : accuracy_score(y_true, y_pred),
        "precision" : precision_score(y_true, y_pred, zero_division=0),
        "recall" : recall_score(y_true, y_pred),
        "f1" : f1_score(y_true, y_pred),
        "brier_score" : brier_score_loss(y_true, y_score),
    }
    return metrics_dict

def get_fairness_metrics(y_true, y_score, y_pred, sensitive_attr):
    
    # statistical disparity
    p_y1_z1 = np.mean(y_pred[sensitive_attr == 1])
    p_y1_z0 = np.mean(y_pred[sensitive_attr == 0])
    spd = p_y1_z1 - p_y1_z0

    # equalized odds
    tpr_z1 = recall_score(y_true[sensitive_attr == 1], y_pred[sensitive_attr == 1])
    tpr_z0 = recall_score(y_true[sensitive_attr == 0], y_pred[sensitive_attr == 0])
    eod = tpr_z1 - tpr_z0

    # geometric mean of accuracies
    accuracy_z1 = accuracy_score(y_true[sensitive_attr == 1], y_pred[sensitive_attr == 1])
    accuracy_z0 = accuracy_score(y_true[sensitive_attr == 0], y_pred[sensitive_attr == 0])
    gma = np.sqrt(accuracy_z1 * accuracy_z0)

    metrics_dict = {
        "statistical_disparity" : spd,
        "equalized_odds" : eod,
        "geometric_mean_accuracy" : gma,
    }
    return metrics_dict


In [98]:
sensitive = X_test.CODE_GENDER == "F"

results = []
metrics_dict = {"model" : "LogisticRegression"}
y_score = model_logistic.predict_proba(X_test)[:, 1]
opt_thresh = ks_threshold(y_test, y_score)
y_pred = y_score > opt_thresh
metrics_dict["opt_threshold"] = opt_thresh
metrics_dict.update(get_metrics(y_test, y_score, y_pred))
metrics_dict.update(get_fairness_metrics(y_test, y_score, y_pred, sensitive))
results.append(metrics_dict)

metrics_dict = {"model" : "RandomForest"}
y_score = model_rf.predict_proba(X_test)[:, 1]
opt_thresh = ks_threshold(y_test, y_score)
y_pred = y_score > opt_thresh
metrics_dict["opt_threshold"] = opt_thresh
metrics_dict.update(get_metrics(y_test, y_score, y_pred))
metrics_dict.update(get_fairness_metrics(y_test, y_score, y_pred, sensitive))
results.append(metrics_dict)

metrics_dict = {"model" : "LGBM"}
y_score = model_lgbm.predict_proba(X_test)[:, 1]
opt_thresh = ks_threshold(y_test, y_score)
y_pred = y_score > opt_thresh
metrics_dict["opt_threshold"] = opt_thresh
metrics_dict.update(get_metrics(y_test, y_score, y_pred))
metrics_dict.update(get_fairness_metrics(y_test, y_score, y_pred, sensitive))
results.append(metrics_dict)

metrics_dict = {"model" : "MLP"}
y_score = model_mlp.predict_proba(X_test)[:, 1]
opt_thresh = ks_threshold(y_test, y_score)
y_pred = y_score > opt_thresh
metrics_dict["opt_threshold"] = opt_thresh
metrics_dict.update(get_metrics(y_test, y_score, y_pred))
metrics_dict.update(get_fairness_metrics(y_test, y_score, y_pred, sensitive))
results.append(metrics_dict)

pd.DataFrame(results)

Unnamed: 0,model,opt_threshold,roc_auc,balanced_accuracy,accuracy,precision,recall,f1,brier_score,statistical_disparity,equalized_odds,geometric_mean_accuracy
0,LogisticRegression,0.495933,0.749376,0.686794,0.68408,0.160717,0.69003,0.260711,0.203285,-0.178294,-0.163359,0.65866
1,RandomForest,0.459848,0.741841,0.678349,0.682064,0.157229,0.673917,0.254972,0.181141,-0.156568,-0.145677,0.660296
2,LGBM,0.474274,0.752688,0.690951,0.694763,0.165244,0.686405,0.266364,0.183641,-0.167742,-0.151423,0.671115
3,MLP,0.080415,0.738606,0.677339,0.656732,0.150755,0.701913,0.248202,0.069195,-0.156358,-0.136252,0.634871


## Explainability

**TODO**

*   Add easy way to include default parameters in optuna
*   Warnings on pipeline given numeric unique number of values
