# Experiments with Home Credit

In [16]:
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.inspection import PartialDependenceDisplay
from lime import lime_tabular
import shap
from cfmining.algorithms import MAPOCAM, BruteForce, Greedy
from cfmining.criteria import PercentileCalculator, PercentileCriterion, PercentileChangesCriterion, NonDomCriterion
from cfmining.predictors import MonotoneClassifier
from cfmining.visualization import buildTable, PlotCounterfactuals
from cfmining.mip_builder import RecourseBuilder
from cfmining.action_set import ActionSet
import joblib


import credit_pipeline.data_exploration as dex
from credit_pipeline.training import *


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading and Cleaning

In [17]:
# Seed passed as `random_state` to both train_test_split calls in this notebook.
seed_number = 0

In [18]:
# Directory holding the Home Credit CSV files (relative path).
path = "../data/HomeCredit/"
# Read application_train.csv through the project's CSV helper.
df = dex.read_csv_encoded(path, 'application_train.csv')

In [19]:
# Columns flagged by the project's missing-value check; they are removed from
# `df` in the next cell.
# NOTE(review): 50 is presumably a missing-percentage threshold and False a
# display/verbosity flag -- confirm against dex.check_missing.
columns_to_drop = dex.check_missing(df, 50,  False)
columns_to_drop

['COMMONAREA_MEDI',
 'COMMONAREA_AVG',
 'COMMONAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAPARTMENTS_MEDI',
 'FONDKAPREMONT_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAPARTMENTS_MEDI',
 'FLOORSMIN_AVG',
 'FLOORSMIN_MODE',
 'FLOORSMIN_MEDI',
 'YEARS_BUILD_MEDI',
 'YEARS_BUILD_MODE',
 'YEARS_BUILD_AVG',
 'OWN_CAR_AGE',
 'LANDAREA_MEDI',
 'LANDAREA_MODE',
 'LANDAREA_AVG',
 'BASEMENTAREA_MEDI',
 'BASEMENTAREA_AVG',
 'BASEMENTAREA_MODE',
 'EXT_SOURCE_1',
 'NONLIVINGAREA_MODE',
 'NONLIVINGAREA_AVG',
 'NONLIVINGAREA_MEDI',
 'ELEVATORS_MEDI',
 'ELEVATORS_AVG',
 'ELEVATORS_MODE',
 'WALLSMATERIAL_MODE',
 'APARTMENTS_MEDI',
 'APARTMENTS_AVG',
 'APARTMENTS_MODE',
 'ENTRANCES_MEDI',
 'ENTRANCES_AVG',
 'ENTRANCES_MODE',
 'LIVINGAREA_AVG',
 'LIVINGAREA_MODE',
 'LIVINGAREA_MEDI',
 'HOUSETYPE_MODE']

In [20]:
# Remove the columns flagged by the missing-value check.
df = df.drop(columns=columns_to_drop)

In [21]:
# Keep the list of remaining column names and the columns selected by type.
# NOTE(review): ['O'] presumably selects object-dtype columns -- confirm against
# dex.list_by_type. Neither variable is used in the cells visible here.
df_cols = df.columns.to_list()
obj_cols = dex.list_by_type(df, ['O'])

In [22]:
def days_to_years(dataframe, col_name):
    """
    Replace a day-count column with its equivalent in whole years.

    The column is coerced to numeric (entries that cannot be parsed become
    NaN), rows where the column is missing are dropped, and each remaining
    value is converted with ``int(abs(days / 365))`` (sign removed, fraction
    truncated). The result is stored in a new column named
    ``'YEARS' + col_name[4:]`` (e.g. ``DAYS_BIRTH`` -> ``YEARS_BIRTH``) and the
    original column is removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is never modified; the function works on a copy.
    col_name : str
        Name of the column holding the day counts.

    Returns
    -------
    pandas.DataFrame
        The transformed copy, or an unchanged copy when ``col_name`` is not a
        column of ``dataframe``.
    """
    df_name = dataframe.copy()

    # Look the column up in the frame that was passed in, not in a
    # notebook-level variable, so the function works on any DataFrame.
    if col_name not in dataframe.columns:
        return df_name

    # Convert values from string to numeric; unparseable entries become NaN.
    df_name[col_name] = pd.to_numeric(df_name[col_name], errors='coerce')

    # Drop the rows with no usable day count (this removes rows, not just values).
    df_name = df_name.dropna(subset=[col_name])

    # Day counts may be negative (days before a reference date), hence abs().
    years = df_name[col_name].map(lambda days: int(abs(days / 365)))
    df_name['YEARS' + col_name[4:]] = years

    # The day-count column is superseded by the new YEARS* column.
    return df_name.drop(col_name, axis=1)


# Replace DAYS_EMPLOYED and DAYS_BIRTH with YEARS_EMPLOYED and YEARS_BIRTH.
# days_to_years also drops rows whose day count is missing or non-numeric.
df = days_to_years(df, "DAYS_EMPLOYED")
df = days_to_years(df, 'DAYS_BIRTH')

## Training Basic Models

In [23]:
# Features: every column except the label and the applicant identifier.
X_acp = df.loc[:, ~df.columns.isin(["TARGET", "SK_ID_CURR"])]
# Label column.
y_acp = df["TARGET"]

In [24]:
# Two stratified splits with the same seed: first hold out 20% of the rows as
# the test set, then hold out 20% of the remainder as the validation set
# (roughly 64% train / 16% validation / 20% test overall).
X_train, X_test, y_train, y_test = train_test_split(
    X_acp, 
    y_acp, 
    test_size = 0.2, 
    random_state=seed_number, 
    stratify=y_acp
)
# Second split: X_train / y_train are overwritten with the reduced training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size = 0.2, 
    random_state=seed_number, 
    stratify=y_train
)

In [25]:
# Display name -> estimator class for the four model families compared here.
# NOTE(review): these keys ("Logistic Regression", ...) differ from the
# param_spaces keys ("LogisticRegression", ...), and this dict is not
# referenced in the cells visible here.
classifiers = {
    "Logistic Regression": LogisticRegression,
    "Random Forest": RandomForestClassifier,
    "LightGBM": LGBMClassifier,
    "MLPC" : MLPClassifier,
}

In [26]:
# Hyperparameter search spaces, keyed by estimator class name; each entry is
# passed to optimize_model together with the matching estimator class.
# Every hyperparameter maps to a spec dict with a 'type' ('float', 'int' or
# 'categorical') and either 'low'/'high' bounds (optionally 'step' / 'log') or
# a list of 'choices'. A single-element 'choices' list pins the value.
# NOTE(review): 'log': True presumably requests log-scale sampling -- confirm
# against optimize_model.
param_spaces = {
    "LogisticRegression": {
        'C': {'low': 0.001, 'high': 10, 'log': True, 'type':'float'},
        # low == high: max_iter is effectively fixed at 1000.
        'max_iter': {'low': 1000, 'high': 1000, 'step':1, 'type':'int'},
        'penalty': {'choices': ['l2'], 'type':'categorical'},
        "class_weight" : {"choices" : ["balanced"], 'type':'categorical'},
    },
    "RandomForestClassifier": {
        'n_estimators': {'low':10, 'high':150, 'step':20, 'type':'int'},
        'max_depth': {'low':2, 'high':10, 'type':'int'},
        'criterion': {'choices':['gini', 'entropy'], 'type':'categorical'},
        'min_samples_leaf' : {"low" : 1, "high" : 51, "step" : 5, 'type':'int'},
        "max_features" : {"low" : 0.1, "high" : 1.0, "type" : "float"},
        "class_weight" : {"choices" : ["balanced"], 'type':'categorical'},
    },
    "LGBMClassifier": {
        'learning_rate': {'low': 0.01, 'high': 1.0, 'type': 'float', 'log': True},
        "num_leaves" : {"low" : 10, "high" : 100, "step" : 5, 'type':'int'},
        'max_depth': {'low': 2, 'high': 10, 'type': 'int'},
        'min_child_samples': {'low': 1, 'high': 51, 'step': 5, 'type': 'int'},
        'colsample_bytree': {'low': 0.1, 'high': 1.0, 'type': 'float'},
        'reg_alpha': {'low': 0.0, 'high': 1.0, 'type': 'float'},
        'reg_lambda': {'low': 0.0, 'high': 1.0, 'type': 'float'},
        'n_estimators': {'low': 10, 'high': 100, 'step': 10, 'type': 'int'},
        "is_unbalance" : {"choices" : [True], 'type':'categorical'},
        "verbose" : {"choices" : [-1], 'type':'categorical'},

    },
    "MLPClassifier": {
        # Each choice is a full list of hidden-layer widths.
        "hidden_layer_sizes" : {"choices" : [
            [128, 64, 32],
            [128, 64, 32, 16],
            [256, 128, 64, 32, 16],
        ], 'type':'categorical'},
        "alpha" : {'low': 0.0001, 'high': 0.01, 'type': 'float', 'log': True},
        "learning_rate" : {'choices': ['constant', 'invscaling', 'adaptive'], 'type':'categorical'},
        "learning_rate_init" : {'low': 0.001, 'high': 0.1, 'type': 'float', 'log': True},
        "early_stopping" : {'choices': [True], 'type':'categorical'},
        "max_iter" : {"choices" : [50], 'type':'categorical'},
    }
}

In [27]:
# Run a 100-trial hyperparameter search for logistic regression, giving
# optimize_model the training and validation splits, then persist the returned
# model with joblib.
# NOTE(review): presumably the "models/" directory must already exist -- confirm.
study_logistic, model_logistic = optimize_model(LogisticRegression, param_spaces["LogisticRegression"], X_train, y_train, X_val , y_val, n_trials=100)
joblib.dump(model_logistic, "models/home_credit_logistic.joblib")

[I 2023-09-29 23:25:37,808] A new study created in memory with name: no-name-91892f4a-65f1-4840-b128-07f2b6b61615
[I 2023-09-29 23:26:10,260] Trial 0 finished with value: 0.7412824883626019 and parameters: {'C': 0.15676677195506075, 'max_iter': 1000, 'penalty': 'l2', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7412824883626019.
[I 2023-09-29 23:26:56,629] Trial 1 finished with value: 0.7414772743718521 and parameters: {'C': 0.7257005721594281, 'max_iter': 1000, 'penalty': 'l2', 'class_weight': 'balanced'}. Best is trial 1 with value: 0.7414772743718521.
[I 2023-09-29 23:27:30,883] Trial 2 finished with value: 0.7413484931776471 and parameters: {'C': 0.25766385746135895, 'max_iter': 1000, 'penalty': 'l2', 'class_weight': 'balanced'}. Best is trial 1 with value: 0.7414772743718521.
[I 2023-09-29 23:27:58,522] Trial 3 finished with value: 0.7412707491017712 and parameters: {'C': 0.15119336467641012, 'max_iter': 1000, 'penalty': 'l2', 'class_weight': 'balanced'}. Best is tri

['models/home_credit_logistic.joblib']

In [28]:
# Report the best objective value and the hyperparameters that produced it.
print("Score for Logistic Regression: ", study_logistic.best_value)
print(study_logistic.best_params)

Score for Logistic Regression:  0.7415298088164799
{'C': 2.90524273919139, 'max_iter': 1000, 'penalty': 'l2', 'class_weight': 'balanced'}


In [None]:
# Run a 100-trial hyperparameter search for the random forest, giving
# optimize_model the training and validation splits, then persist the returned
# model with joblib.
# NOTE(review): presumably the "models/" directory must already exist -- confirm.
study_rf, model_rf = optimize_model(RandomForestClassifier, param_spaces["RandomForestClassifier"], X_train, y_train, X_val , y_val, n_trials=100)
joblib.dump(model_rf, "models/home_credit_rf.joblib")

In [30]:
# Report the best objective value and the hyperparameters that produced it.
print("Score for Random Forest: ", study_rf.best_value)
print(study_rf.best_params)

Score for Random Forest:  0.7407392205308929
{'n_estimators': 90, 'max_depth': 10, 'criterion': 'gini', 'min_samples_leaf': 51, 'max_features': 0.3435949186368579, 'class_weight': 'balanced'}


In [None]:
# Run a 100-trial hyperparameter search for LightGBM, giving optimize_model the
# training and validation splits, then persist the returned model with joblib.
# NOTE(review): presumably the "models/" directory must already exist -- confirm.
study_lgbm, model_lgbm = optimize_model(LGBMClassifier, param_spaces["LGBMClassifier"], X_train, y_train, X_val , y_val, n_trials=100)
joblib.dump(model_lgbm, "models/home_credit_lgbm.joblib")

In [39]:
# Report the best objective value and the hyperparameters that produced it.
print("Score for LGBM: ", study_lgbm.best_value)
print(study_lgbm.best_params)

Score for LGBM:  0.7519408632926617
{'learning_rate': 0.1525897078045497, 'num_leaves': 25, 'max_depth': 10, 'min_child_samples': 6, 'colsample_bytree': 0.39246210984663443, 'reg_alpha': 0.699615134448418, 'reg_lambda': 0.5409369253349579, 'n_estimators': 70, 'is_unbalance': True, 'verbose': -1}


In [None]:
# Run a 100-trial hyperparameter search for the MLP, giving optimize_model the
# training and validation splits, then persist the returned model with joblib.
# NOTE(review): presumably the "models/" directory must already exist -- confirm.
study_mlp, model_mlp = optimize_model(MLPClassifier, param_spaces["MLPClassifier"], X_train, y_train, X_val , y_val, n_trials=100)
joblib.dump(model_mlp, "models/home_credit_mlp.joblib")

In [41]:
# Report the best objective value and the hyperparameters that produced it.
print("Score for MLP: ", study_mlp.best_value)
print(study_mlp.best_params)

Score for MLP:  0.7433528703800805
{'hidden_layer_sizes': [128, 64, 32], 'alpha': 0.0003527051808306031, 'learning_rate': 'invscaling', 'learning_rate_init': 0.010035211915818264, 'early_stopping': True, 'max_iter': 50}


## Model evaluation

In [42]:
# Reload the tuned models from disk so evaluation can run without repeating
# the hyperparameter searches. The file names must match the ones written by
# the joblib.dump calls in the training cells ("models/home_credit_*.joblib").
model_logistic = joblib.load("models/home_credit_logistic.joblib")
model_rf = joblib.load("models/home_credit_rf.joblib")
model_lgbm = joblib.load("models/home_credit_lgbm.joblib")
model_mlp = joblib.load("models/home_credit_mlp.joblib")

# Display name -> [fitted model, value returned by ks_threshold for that
# model's positive-class probabilities].
# NOTE(review): ks_threshold is computed from the test labels; if it selects a
# decision threshold, choosing it on (X_val, y_val) instead would keep the
# test metrics unbiased -- confirm before changing.
models_dict = {
    name: [model, ks_threshold(y_test, model.predict_proba(X_test)[:, 1])]
    for name, model in (
        ("Logistic Regression", model_logistic),
        ("Random Forest", model_rf),
        ("LightGBM", model_lgbm),
        ("MLP", model_mlp),
    )
}

In [43]:
# Evaluate every model in models_dict on the held-out test split.
get_metrics(models_dict, X_test, y_test)

Unnamed: 0,AUC,Balanced Accuracy,Accuracy,Precision,Recall,F1,Brier Score
Logistic Regression,0.749376,0.686895,0.684097,0.160756,0.690232,0.260777,0.203285
Random Forest,0.741841,0.678449,0.682081,0.157269,0.674119,0.255039,0.181141
LightGBM,0.752688,0.691052,0.694779,0.165285,0.686606,0.266432,0.183641
MLP,0.738606,0.67744,0.656748,0.150792,0.702115,0.248264,0.069195


In [44]:
# Fairness metrics on the test split, using CODE_GENDER == "F" as the group
# indicator.
# NOTE(review): which group the helper treats as the reference, and the sign
# convention of the reported values, are not visible here -- confirm against
# get_fairness_metrics.
get_fairness_metrics(models_dict, X_test, y_test, X_test.CODE_GENDER == "F")

Unnamed: 0,SPD,EOD,GMA
Logistic Regression,-0.178341,-0.163839,0.658686
Random Forest,-0.156615,-0.146156,0.660322
LightGBM,-0.167717,-0.151076,0.671126
MLP,-0.156405,-0.136732,0.634897


## Explainability

**TODO**

*   Add an easy way to include default parameters in Optuna
*   Warnings raised by the pipeline based on the number of unique values in numeric columns
