# Experiments with German

In [15]:
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.inspection import PartialDependenceDisplay
from lime import lime_tabular
import shap
from cfmining.algorithms import MAPOCAM, BruteForce, Greedy
from cfmining.criteria import PercentileCalculator, PercentileCriterion, PercentileChangesCriterion, NonDomCriterion
from cfmining.predictors import MonotoneClassifier
from cfmining.visualization import buildTable, PlotCounterfactuals
from cfmining.mip_builder import RecourseBuilder
from cfmining.action_set import ActionSet
import joblib


import credit_pipeline.data_exploration as dex
from credit_pipeline.training import *


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading and Cleaning

In [16]:
seed_number = 0

In [17]:
path = "../data/German/"
df = dex.read_csv_encoded(path, 'german_raw.csv')

In [18]:
columns_to_drop = dex.check_missing(df, 50,  False)
columns_to_drop

[]

In [19]:
df_cols = df.columns.to_list()
obj_cols = dex.list_by_type(df, ['O'])
obj_cols

['Gender', 'PurposeOfLoan']

In [20]:
print(dex.list_no_variation_cols(df))
print(dex.list_contin_cols(df))
print(dex.list_by_unique(df, 2))

[]
[]
['GoodCustomer', 'Gender', 'ForeignWorker', 'Single', 'NumberOfLiableIndividuals', 'HasTelephone', 'CheckingAccountBalance_geq_0', 'CheckingAccountBalance_geq_200', 'SavingsAccountBalance_geq_100', 'SavingsAccountBalance_geq_500', 'MissedPayments', 'NoCurrentLoan', 'CriticalAccountOrLoansElsewhere', 'OtherLoansAtBank', 'HasCoapplicant', 'HasGuarantor', 'OwnsHouse', 'RentsHouse', 'Unemployed', 'YearsAtCurrentJob_lt_1', 'YearsAtCurrentJob_geq_4', 'JobClassIsSkilled']


In [7]:
df.GoodCustomer = df.GoodCustomer.apply(lambda x: 1 if x == 1 else 0)

## Training Basic Models

In [8]:
X_acp = df.iloc[:, (df.columns != "GoodCustomer")]
y_acp = df["GoodCustomer"]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X_acp, 
    y_acp, 
    test_size = 0.2, 
    random_state=seed_number, 
    stratify=y_acp
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size = 0.2, 
    random_state=seed_number, 
    stratify=y_train
)

In [10]:
classifiers = {
    "Logistic Regression": LogisticRegression,
    "Random Forest": RandomForestClassifier,
    "LightGBM": LGBMClassifier,
    "MLPC" : MLPClassifier,
}

In [11]:
param_spaces = {
    "LogisticRegression": {
        'C': {'low': 0.001, 'high': 10, 'log': True, 'type':'float'},
        'max_iter': {'low': 1000, 'high': 1000, 'step':1, 'type':'int'},
        'penalty': {'choices': ['l2'], 'type':'categorical'},
        "class_weight" : {"choices" : [None, "balanced"], 'type':'categorical'},
    },
    "RandomForestClassifier": {
        'n_estimators': {'low':10, 'high':150, 'step':20, 'type':'int'},
        'max_depth': {'low':2, 'high':10, 'type':'int'},
        'criterion': {'choices':['gini', 'entropy'], 'type':'categorical'},
        'min_samples_leaf' : {"low" : 1, "high" : 51, "step" : 5, 'type':'int'},
        "max_features" : {"low" : 0.1, "high" : 1.0, "type" : "float"},
        "class_weight" : {"choices" : [None, "balanced"], 'type':'categorical'},
    },
    "LGBMClassifier": {
        'learning_rate': {'low': 0.01, 'high': 1.0, 'type': 'float', 'log': True},
        "num_leaves" : {"low" : 5, "high" : 100, "step" : 5, 'type':'int'},
        'max_depth': {'low': 2, 'high': 10, 'type': 'int'},
        'min_child_samples': {'low': 1, 'high': 51, 'step': 5, 'type': 'int'},
        'colsample_bytree': {'low': 0.1, 'high': 1.0, 'type': 'float'},
        'reg_alpha': {'low': 0.0, 'high': 1.0, 'type': 'float'},
        'reg_lambda': {'low': 0.0, 'high': 1.0, 'type': 'float'},
        'n_estimators': {'low': 10, 'high': 100, 'step': 10, 'type': 'int'},
        "class_weight" : {"choices" : [None, "balanced"], 'type':'categorical'},
        "verbose" : {"choices" : [-1], 'type':'categorical'},

    },
    "MLPClassifier": {
        "hidden_layer_sizes" : {"choices" : [
            [128, 64, 32],
            [128, 64, 32, 16],
            [256, 128, 64, 32, 16],
        ], 'type':'categorical'},
        "alpha" : {'low': 0.0001, 'high': 0.01, 'type': 'float', 'log': True},
        "learning_rate" : {'choices': ['constant', 'invscaling', 'adaptive'], 'type':'categorical'},
        "learning_rate_init" : {'low': 0.001, 'high': 0.1, 'type': 'float', 'log': True},
        "early_stopping" : {'choices': [True], 'type':'categorical'},
        "max_iter" : {"choices" : [50], 'type':'categorical'},
    }
}

In [None]:
study_logistic, model_logistic = optimize_model(LogisticRegression, param_spaces["LogisticRegression"], X_train, y_train, X_val , y_val, n_trials=100)
joblib.dump(model_logistic, "models/german_logistic.joblib")

In [79]:
print("Score for Logistic Regression: ", study_logistic.best_value)
print(study_logistic.best_params)

Score for Logistic Regression:  0.7239583333333334
{'C': 0.0010419804639040255, 'max_iter': 1000, 'penalty': 'l2', 'class_weight': None}


In [None]:
study_rf, model_rf = optimize_model(RandomForestClassifier, param_spaces["RandomForestClassifier"], X_train, y_train, X_val , y_val, n_trials=100)
joblib.dump(model_rf, "models/german_rf.joblib")

In [81]:
print("Score for Random Forest: ", study_rf.best_value)
print(study_rf.best_params)

Score for Random Forest:  0.7658110119047619
{'n_estimators': 130, 'max_depth': 4, 'criterion': 'entropy', 'min_samples_leaf': 1, 'max_features': 0.21386703871573098, 'class_weight': None}


In [None]:
study_lgbm, model_lgbm = optimize_model(LGBMClassifier, param_spaces["LGBMClassifier"], X_train, y_train, X_val , y_val, n_trials=100)
joblib.dump(model_lgbm, "models/german_lgbm.joblib")

In [83]:
print("Score for LGBM: ", study_lgbm.best_value)
print(study_lgbm.best_params)

Score for LGBM:  0.7501860119047619
{'learning_rate': 0.011328189091755097, 'num_leaves': 90, 'max_depth': 3, 'min_child_samples': 6, 'colsample_bytree': 0.3495715373272639, 'reg_alpha': 0.49445533037344624, 'reg_lambda': 0.707992442416903, 'n_estimators': 100, 'class_weight': None, 'verbose': -1}


In [None]:
study_mlp, model_mlp = optimize_model(MLPClassifier, param_spaces["MLPClassifier"], X_train, y_train, X_val , y_val, n_trials=100)
joblib.dump(model_mlp, "models/german_mlp.joblib")

In [85]:
print("Score for MLP: ", study_mlp.best_value)
print(study_mlp.best_params)

Score for MLP:  0.7336309523809524
{'hidden_layer_sizes': [128, 64, 32, 16], 'alpha': 0.005641404688490753, 'learning_rate': 'adaptive', 'learning_rate_init': 0.0011283714315737572, 'early_stopping': True, 'max_iter': 50}


## Model evaluation

In [12]:
model_logistic = joblib.load("models/german_logistic.joblib")
model_rf = joblib.load("models/german_rf.joblib")
model_lgbm = joblib.load("models/german_lgbm.joblib")
model_mlp = joblib.load("models/german_mlp.joblib")
models_dict = {
    "Logistic Regression" : 
        [model_logistic, ks_threshold(y_test, model_logistic.predict_proba(X_test)[:,1])],
    "Random Forest" : 
        [model_rf, ks_threshold(y_test, model_rf.predict_proba(X_test)[:,1])],
    "LightGBM" : 
        [model_lgbm, ks_threshold(y_test, model_lgbm.predict_proba(X_test)[:,1])],
    "MLP" : 
        [model_mlp, ks_threshold(y_test, model_mlp.predict_proba(X_test)[:,1])],
}

In [13]:
get_metrics(models_dict, X_test, y_test)

Unnamed: 0,AUC,Balanced Accuracy,Accuracy,Precision,Recall,F1,Brier Score
Logistic Regression,0.802024,0.753571,0.715,0.910891,0.657143,0.763485,0.198794
Random Forest,0.809048,0.770238,0.725,0.929293,0.657143,0.769874,0.180871
LightGBM,0.800833,0.758333,0.735,0.899083,0.7,0.787149,0.191351
MLP,0.81131,0.760714,0.705,0.935484,0.621429,0.746781,0.169211


In [14]:
get_fairness_metrics(models_dict, X_test, y_test, X_test.Gender == "Female", 1)

Unnamed: 0,DPD,EOD,AOD,APVD,GMA
Logistic Regression,-0.285895,-0.265081,-0.211295,0.006749,0.692425
Random Forest,-0.246829,-0.225432,-0.165829,-0.005284,0.705491
LightGBM,-0.137747,-0.083263,-0.072767,-0.061343,0.730922
MLP,-0.180365,-0.139054,-0.10982,-0.032786,0.698625


## Explainability

**TODO**

*   Add easy way to include default parameters in optuna
*   Warnings on pipeline given numeric unique number of values
