# Experiments with Taiwan

In [1]:
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.inspection import PartialDependenceDisplay
from lime import lime_tabular
import shap
from cfmining.algorithms import MAPOCAM, BruteForce, Greedy
from cfmining.criteria import PercentileCalculator, PercentileCriterion, PercentileChangesCriterion, NonDomCriterion
from cfmining.predictors import MonotoneClassifier
from cfmining.visualization import buildTable, PlotCounterfactuals
from cfmining.mip_builder import RecourseBuilder
from cfmining.action_set import ActionSet
import joblib


import credit_pipeline.data_exploration as dex
from credit_pipeline.training import *


%load_ext autoreload
%autoreload 2

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


## Loading and Cleaning

In [2]:
seed_number = 0

In [3]:
path = "../data/Taiwan/"
df = dex.read_csv_encoded(path, 'Taiwan.csv')
df.columns = df.iloc[0, :].tolist()
df = df.iloc[1:, :]
df = df.drop(columns = ["ID"])

In [4]:
columns_to_drop = dex.check_missing(df, 50,  False)
columns_to_drop

[]

In [5]:
num_cols = ["LIMIT_BAL", "AGE", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3",
            "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1", "PAY_AMT2",
            "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6", "default payment next month"]
df[num_cols] = df[num_cols].astype(float)

In [6]:
df_cols = df.columns.to_list()
obj_cols = dex.list_by_type(df, ['O'])
obj_cols

['SEX',
 'EDUCATION',
 'MARRIAGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6']

In [7]:
print(dex.list_no_variation_cols(df))
print(dex.list_contin_cols(df))
print(dex.list_by_unique(df, 2))

[]
['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
['SEX', 'default payment next month']


In [8]:
df["SEX"] = df["SEX"].apply(lambda x : "female" if x == "2" else "male")

In [9]:
X_acp = df.iloc[:, (df.columns != "default payment next month")]
y_acp = df["default payment next month"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X_acp, 
    y_acp, 
    test_size = 0.2, 
    random_state=seed_number, 
    stratify=y_acp
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size = 0.2, 
    random_state=seed_number, 
    stratify=y_train
)

## Training Basic Models

In [11]:
n_trials = 100

In [12]:
classifiers = {
    "Logistic Regression": LogisticRegression,
    "Random Forest": RandomForestClassifier,
    "LightGBM": LGBMClassifier,
    "MLPC" : MLPClassifier,
}

In [13]:
param_spaces = {
    "LogisticRegression": {
        'C': {'low': 0.001, 'high': 10, 'log': True, 'type':'float'},
        'max_iter': {'low': 1000, 'high': 1000, 'step':1, 'type':'int'},
        'penalty': {'choices': ["l1", 'l2'], 'type':'categorical'},
        "class_weight" : {"choices" : [None, "balanced"], 'type':'categorical'},
        "solver" : {"choices" : ["liblinear"], "type" : "categorical"}
    },
    "RandomForestClassifier": {
        'n_estimators': {'low':10, 'high':150, 'step':20, 'type':'int'},
        'max_depth': {'low':2, 'high':10, 'type':'int'},
        'criterion': {'choices':['gini', 'entropy'], 'type':'categorical'},
        'min_samples_leaf' : {"low" : 1, "high" : 51, "step" : 5, 'type':'int'},
        "max_features" : {"low" : 0.1, "high" : 1.0, "type" : "float"},
        "class_weight" : {"choices" : [None, "balanced"], 'type':'categorical'},
    },
    "LGBMClassifier": {
        'learning_rate': {'low': 0.01, 'high': 1.0, 'type': 'float', 'log': True},
        "num_leaves" : {"low" : 5, "high" : 100, "step" : 5, 'type':'int'},
        'max_depth': {'low': 2, 'high': 10, 'type': 'int'},
        'min_child_samples': {'low': 1, 'high': 51, 'step': 5, 'type': 'int'},
        'colsample_bytree': {'low': 0.1, 'high': 1.0, 'type': 'float'},
        'reg_alpha': {'low': 0.0, 'high': 1.0, 'type': 'float'},
        'reg_lambda': {'low': 0.0, 'high': 1.0, 'type': 'float'},
        'n_estimators': {'low': 10, 'high': 100, 'step': 10, 'type': 'int'},
        "class_weight" : {"choices" : [None, "balanced"], 'type':'categorical'},
        "verbose" : {"choices" : [-1], 'type':'categorical'},
    },
    "MLPClassifier": {
        "hidden_layer_sizes" : {"choices" : [
            [128, 64, 32],
            [128, 64, 32, 16],
            [256, 128, 64, 32, 16],
        ], 'type':'categorical'},
        "alpha" : {'low': 0.0001, 'high': 0.01, 'type': 'float', 'log': True},
        "learning_rate" : {'choices': ['constant', 'invscaling', 'adaptive'], 'type':'categorical'},
        "learning_rate_init" : {'low': 0.001, 'high': 0.1, 'type': 'float', 'log': True},
        "early_stopping" : {'choices': [True], 'type':'categorical'},
        "max_iter" : {"choices" : [50], 'type':'categorical'},
    }
}

In [14]:
study_logistic, model_logistic = optimize_model(LogisticRegression, param_spaces["LogisticRegression"], X_train, y_train, X_val , y_val, n_trials=n_trials)
joblib.dump(model_logistic, "models/taiwan_logistic.joblib")

  0%|          | 0/100 [00:00<?, ?it/s]

['models/taiwan_logistic.joblib']

In [15]:
print("Score for Logistic Regression: ", study_logistic.best_value)
print(study_logistic.best_params)

Score for Logistic Regression:  0.7743629583279172
{'C': 0.008506332302143265, 'max_iter': 1000, 'penalty': 'l2', 'class_weight': 'balanced', 'solver': 'liblinear'}


In [26]:
study_rf, model_rf = optimize_model(RandomForestClassifier, param_spaces["RandomForestClassifier"], X_train, y_train, X_val , y_val, n_trials=n_trials)
joblib.dump(model_rf, "models/taiwan_rf.joblib")

  0%|          | 0/100 [00:00<?, ?it/s]

['models/taiwan_rf.joblib']

In [27]:
print("Score for Random Forest: ", study_rf.best_value)
print(study_rf.best_params)

Score for Random Forest:  0.7854082215632396
{'n_estimators': 70, 'max_depth': 10, 'criterion': 'entropy', 'min_samples_leaf': 41, 'max_features': 0.6223181749192774, 'class_weight': 'balanced'}


In [17]:
study_lgbm, model_lgbm = optimize_model(LGBMClassifier, param_spaces["LGBMClassifier"], X_train, y_train, X_val , y_val, n_trials=n_trials)
joblib.dump(model_lgbm, "models/taiwan_lgbm.joblib")

  0%|          | 0/100 [00:00<?, ?it/s]

['models/taiwan_lgbm.joblib']

In [18]:
print("Score for LGBM: ", study_lgbm.best_value)
print(study_lgbm.best_params)

Score for LGBM:  0.7877014355542254
{'learning_rate': 0.05699910588443634, 'num_leaves': 45, 'max_depth': 10, 'min_child_samples': 16, 'colsample_bytree': 0.3234057672082941, 'reg_alpha': 0.45527006502059564, 'reg_lambda': 0.9669089166677883, 'n_estimators': 80, 'class_weight': 'balanced', 'verbose': -1}


In [None]:
study_mlp, model_mlp = optimize_model(MLPClassifier, param_spaces["MLPClassifier"], X_train, y_train, X_val , y_val, n_trials=n_trials)
joblib.dump(model_mlp, "models/taiwan_mlp.joblib")

In [None]:
print("Score for MLP: ", study_mlp.best_value)
print(study_mlp.best_params)

Score for MLP:  0.7859573737025651
{'hidden_layer_sizes': [256, 128, 64, 32, 16], 'alpha': 0.0007524655395096057, 'learning_rate': 'constant', 'learning_rate_init': 0.0025686779592416402, 'early_stopping': True, 'max_iter': 50}


## Model evaluation

In [28]:
models = {
    "LogisticRegression" : joblib.load("models/taiwan_logistic.joblib"),
    "MLP" : joblib.load("models/taiwan_mlp.joblib"),
    "RandomForest" : joblib.load("models/taiwan_rf.joblib"),
    "LightGBM" : joblib.load("models/taiwan_lgbm.joblib"),
}
ks_threshold_dict = {}
models_dict = {}
for n, m in models.items():
    ks_threshold_dict[n] = ks_threshold(y_test, m.predict_proba(X_test)[:,1])
    models_dict[n] = [
        m,
        ks_threshold_dict[n]
    ]

In [29]:
get_metrics(models_dict, X_train, y_train).round(3)

Unnamed: 0,AUC,Balanced Accuracy,Accuracy,Precision,Recall,F1,Brier Score
LogisticRegression,0.773,0.708,0.77,0.483,0.597,0.534,0.186
MLP,0.795,0.716,0.721,0.422,0.706,0.529,0.133
RandomForest,0.828,0.737,0.798,0.537,0.627,0.579,0.163
LightGBM,0.858,0.772,0.787,0.513,0.744,0.607,0.159


In [30]:
get_metrics(models_dict, X_val, y_val).round(3)

Unnamed: 0,AUC,Balanced Accuracy,Accuracy,Precision,Recall,F1,Brier Score
LogisticRegression,0.774,0.706,0.771,0.486,0.589,0.533,0.185
MLP,0.786,0.712,0.719,0.419,0.699,0.524,0.136
RandomForest,0.785,0.709,0.783,0.508,0.576,0.54,0.173
LightGBM,0.788,0.712,0.753,0.458,0.638,0.533,0.175


In [31]:
get_metrics(models_dict, X_test, y_test).round(3)

Unnamed: 0,AUC,Balanced Accuracy,Accuracy,Precision,Recall,F1,Brier Score
LogisticRegression,0.762,0.701,0.769,0.481,0.58,0.526,0.186
MLP,0.779,0.711,0.717,0.417,0.699,0.522,0.136
RandomForest,0.782,0.712,0.782,0.507,0.586,0.544,0.174
LightGBM,0.786,0.718,0.754,0.461,0.652,0.54,0.176


In [32]:
models_dict_fairness = {}
for n, m in models_dict.items():
    models_dict_fairness[n] = (m[0].predict_proba(X_test)[:,1] > m[1]).astype(int)

In [33]:
get_fairness_metrics(models_dict_fairness, y_test, X_test.SEX == "female", 0)

Unnamed: 0,DPD,EOD,AOD,APVD,GMA,balanced_accuracy
LogisticRegression,0.047421,0.0388,0.02721,0.013324,0.765189,0.700673
MLP,0.065822,0.056914,0.04455,0.013682,0.713066,0.710294
RandomForest,0.031926,0.02399,0.009129,0.018778,0.779349,0.711768
LightGBM,0.044468,0.034923,0.022814,0.017942,0.751287,0.717265


## Explainability

**TODO**

*   Add easy way to include default parameters in optuna
*   Warnings on pipeline given numeric unique number of values
