# Experiments with Home Credit

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score
import pickle as pkl

import credit_pipeline.data_exploration as dex
from credit_pipeline.training import *

% load_ext autoreload
% autoreload 2

UsageError: Line magic function `%` not found.


## Loading and Cleaning

In [2]:
# Single seed reused as `random_state` by both train_test_split calls further down.
seed_number = 0

In [3]:
# Relative directory holding the Home Credit CSV files.
path = "../data/HomeCredit/"
# Read the main application table through the project's CSV helper.
df = dex.read_csv_encoded(path, 'application_train.csv')

In [4]:
# Ask the project helper which columns should be discarded because of missing data.
# NOTE(review): 50 is presumably a percent-missing threshold and False a verbosity/plot
# switch -- confirm against dex.check_missing.
columns_to_drop = dex.check_missing(df, 50,  False)
columns_to_drop

['COMMONAREA_MEDI',
 'COMMONAREA_AVG',
 'COMMONAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAPARTMENTS_MEDI',
 'FONDKAPREMONT_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAPARTMENTS_MEDI',
 'FLOORSMIN_AVG',
 'FLOORSMIN_MODE',
 'FLOORSMIN_MEDI',
 'YEARS_BUILD_MEDI',
 'YEARS_BUILD_MODE',
 'YEARS_BUILD_AVG',
 'OWN_CAR_AGE',
 'LANDAREA_MEDI',
 'LANDAREA_MODE',
 'LANDAREA_AVG',
 'BASEMENTAREA_MEDI',
 'BASEMENTAREA_AVG',
 'BASEMENTAREA_MODE',
 'EXT_SOURCE_1',
 'NONLIVINGAREA_MODE',
 'NONLIVINGAREA_AVG',
 'NONLIVINGAREA_MEDI',
 'ELEVATORS_MEDI',
 'ELEVATORS_AVG',
 'ELEVATORS_MODE',
 'WALLSMATERIAL_MODE',
 'APARTMENTS_MEDI',
 'APARTMENTS_AVG',
 'APARTMENTS_MODE',
 'ENTRANCES_MEDI',
 'ENTRANCES_AVG',
 'ENTRANCES_MODE',
 'LIVINGAREA_AVG',
 'LIVINGAREA_MODE',
 'LIVINGAREA_MEDI',
 'HOUSETYPE_MODE']

In [5]:
# Remove the columns flagged above; the result is rebound to `df`.
df = df.drop(columns=columns_to_drop)

In [6]:
# Names of all remaining columns, then the subset the helper reports for dtype code 'O'
# (presumably the object/string columns -- confirm against dex.list_by_type).
df_cols = list(df.columns)
obj_cols = dex.list_by_type(df, ["O"])
obj_cols

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'EMERGENCYSTATE_MODE']

In [7]:
def days_to_years(dataframe, col_name):
    """
    Replace a day-count column with a whole-year column.

    The input frame is never modified; a copy is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that may contain ``col_name``.
    col_name : str
        Column holding day counts (numbers or numeric strings). The new column
        is named ``'YEARS' + col_name[4:]``, e.g. ``DAYS_BIRTH`` -> ``YEARS_BIRTH``.

    Returns
    -------
    pandas.DataFrame
        If ``col_name`` is absent, an unchanged copy. Otherwise a copy in which
        rows whose value is missing or not numeric are dropped, ``col_name`` is
        removed, and the new column holds ``int(abs(days / 365))``.
    """
    df_name = dataframe.copy()

    # Check the frame that was passed in. The previous version tested the
    # notebook-global `df`, which made the result depend on hidden kernel state.
    if col_name not in df_name.columns:
        return df_name

    # Convert strings to numbers; anything unparsable becomes NaN.
    df_name[col_name] = pd.to_numeric(df_name[col_name], errors='coerce')

    # Rows without a usable day count cannot be converted, so they are removed.
    df_name = df_name.dropna(subset=[col_name])

    # Day counts may be negative (days before a reference date), hence abs();
    # int() truncates to completed years.
    df_name['YEARS' + col_name[4:]] = df_name[col_name].map(
        lambda days: int(abs(days / 365))
    )

    # The original day-count column is superseded by the new one.
    return df_name.drop(col_name, axis=1)


# Convert each day-count column to whole years, one column after the other.
for day_col in ("DAYS_EMPLOYED", "DAYS_BIRTH"):
    df = days_to_years(df, day_col)

## Training Basic Models

In [8]:
# Features are every column except the label and the applicant identifier.
is_feature = ~((df.columns == "TARGET") | (df.columns == "SK_ID_CURR"))
X_acp = df.loc[:, is_feature]
y_acp = df["TARGET"]

In [9]:
# Both splits use the same fraction and seed, and both are stratified on the label.
split_options = {"test_size": 0.2, "random_state": seed_number}

# First hold out 20% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X_acp, y_acp, stratify=y_acp, **split_options
)
# ... then carve 20% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)

In [20]:
# Display name -> estimator class (the classes themselves, not instances).
# Note: `param_spaces` is keyed by class name ("LogisticRegression", ...), not by
# these display names, so the two dictionaries cannot be joined on their keys.
classifiers = {
    "Logistic Regression": LogisticRegression,
    "Random Forest": RandomForestClassifier,
    "LightGBM": LGBMClassifier,
    "MLPC" : MLPClassifier,
}

In [21]:
?MLPClassifier

[0;31mInit signature:[0m
[0mMLPClassifier[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mhidden_layer_sizes[0m[0;34m=[0m[0;34m([0m[0;36m100[0m[0;34m,[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mactivation[0m[0;34m=[0m[0;34m'relu'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msolver[0m[0;34m=[0m[0;34m'adam'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malpha[0m[0;34m=[0m[0;36m0.0001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_size[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlearning_rate[0m[0;34m=[0m[0;34m'constant'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlearning_rate_init[0m[0;34m=[0m[0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpower_t[0m[0;34m=[0m[0;36m0.5[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_iter[0m[0;34m=[0m[0;36m200[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshuffle[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0

In [22]:
# Hyperparameter search space per estimator, keyed by the estimator's class name.
# Each parameter maps to a spec dict with a "type" ("float", "int" or "categorical")
# plus either "low"/"high" bounds (optionally "step" / "log") or a list of "choices".
# A spec whose "low" equals its "high" pins the parameter to a single value.
param_spaces = {
    "LogisticRegression": {
        "C": {"low": 0.001, "high": 10, "log": True, "type": "float"},
        # Fixed at 1000 (low == high).
        "max_iter": {"low": 1000, "high": 1000, "step": 1, "type": "int"},
        "penalty": {"choices": ["l2"], "type": "categorical"},
    },
    "RandomForestClassifier": {
        "n_estimators": {"low": 10, "high": 150, "step": 1, "type": "int"},
        "max_depth": {"low": 2, "high": 10, "type": "int"},
        "criterion": {"choices": ["gini", "entropy"], "type": "categorical"},
    },
    "LGBMClassifier": {
        "learning_rate": {"low": 0.01, "high": 1.0, "type": "float", "log": True},
        "num_leaves": {"low": 2, "high": 15, "step": 1, "type": "int"},
        "max_depth": {"low": 1, "high": 10, "type": "int"},
        "min_child_samples": {"low": 1, "high": 50, "step": 1, "type": "int"},
        "subsample": {"low": 0.1, "high": 1.0, "type": "float"},
        "colsample_bytree": {"low": 0.1, "high": 1.0, "type": "float"},
        "reg_alpha": {"low": 0.0, "high": 0.5, "type": "float"},
        "reg_lambda": {"low": 0.0, "high": 1.0, "type": "float"},
        # 'n_estimators': {'low': 50, 'high': 200, 'step': 1, 'type': 'int'},
        # Fixed at -1 (low == high).
        "verbose": {"low": -1, "high": -1, "type": "int"},
    },
    "MLPClassifier": {
        "hidden_layer_sizes": {"low": 1, "high": 10, "step": 1, "type": "int"},
        "alpha": {"low": 0.0001, "high": 0.01, "type": "float", "log": True},
        "learning_rate": {
            "choices": ["constant", "invscaling", "adaptive"],
            "type": "categorical",
        },
        "learning_rate_init": {"low": 0.001, "high": 0.1, "type": "float", "log": True},
    },
}

In [26]:
# Tune logistic regression: fit on the training split, score on the validation split.
study_logistic = optimize_model(
    LogisticRegression,
    param_spaces["LogisticRegression"],
    X_train,
    y_train,
    X_val,
    y_val,
    n_trials=100,
)

[I 2023-09-28 10:43:22,426] A new study created in memory with name: no-name-13bc3690-d567-481f-bb59-7ba14b4d2c10
[I 2023-09-28 10:43:36,949] Trial 0 finished with value: 0.7398696802891076 and parameters: {'C': 0.15676677195506075, 'max_iter': 1000, 'penalty': 'l2'}. Best is trial 0 with value: 0.7398696802891076.
[I 2023-09-28 10:43:57,359] Trial 1 finished with value: 0.7411638934402414 and parameters: {'C': 0.7257005721594281, 'max_iter': 1000, 'penalty': 'l2'}. Best is trial 1 with value: 0.7411638934402414.
[I 2023-09-28 10:44:11,129] Trial 2 finished with value: 0.740479737779758 and parameters: {'C': 0.25766385746135895, 'max_iter': 1000, 'penalty': 'l2'}. Best is trial 1 with value: 0.7411638934402414.
[I 2023-09-28 10:44:21,935] Trial 3 finished with value: 0.7398212426182926 and parameters: {'C': 0.15119336467641012, 'max_iter': 1000, 'penalty': 'l2'}. Best is trial 1 with value: 0.7411638934402414.
[I 2023-09-28 10:44:30,818] Trial 4 finished with value: 0.7380759056486272 

In [28]:
print("Score for Logistic Regression: ", study_logistic.best_value)
print(study_logistic.best_params)
# Persist the study; the context manager guarantees the file is flushed and closed
# even if pickling raises (the bare open() call used before leaked the handle).
with open("study_logistic.pkl", "wb") as study_file:
    pkl.dump(study_logistic, study_file)

Score for Logistic Regression:  0.7412956303231619
{'C': 8.16453047345026, 'max_iter': 1000, 'penalty': 'l2'}


In [None]:
# Tune the random forest: fit on the training split, score on the validation split.
study_rf = optimize_model(
    RandomForestClassifier,
    param_spaces["RandomForestClassifier"],
    X_train,
    y_train,
    X_val,
    y_val,
    n_trials=100,
)

In [15]:
print("Score for Random Forest: ", study_rf.best_value)
print(study_rf.best_params)
# Persist the study; the context manager guarantees the file is flushed and closed
# even if pickling raises (the bare open() call used before leaked the handle).
with open("study_rf.pkl", "wb") as study_file:
    pkl.dump(study_rf, study_file)

Score for Random Forest:  0.7394031490386275
{'n_estimators': 146, 'max_depth': 10, 'criterion': 'entropy'}


In [None]:
# Tune LightGBM: fit on the training split, score on the validation split.
study_lgbm = optimize_model(
    LGBMClassifier,
    param_spaces["LGBMClassifier"],
    X_train,
    y_train,
    X_val,
    y_val,
    n_trials=100,
)

In [18]:
print("Score for LGBM: ", study_lgbm.best_value)
print(study_lgbm.best_params)
# Persist the study; the context manager guarantees the file is flushed and closed
# even if pickling raises (the bare open() call used before leaked the handle).
with open("study_lgbm.pkl", "wb") as study_file:
    pkl.dump(study_lgbm, study_file)

Score for LGBM:  0.7538204864963433
{'learning_rate': 0.16993343789723322, 'num_leaves': 15, 'max_depth': 4, 'min_child_samples': 40, 'subsample': 0.40547037083183, 'colsample_bytree': 0.9811676942834585, 'reg_alpha': 0.26067703219886407, 'reg_lambda': 0.9849015614655063, 'verbose': -1}


In [None]:
# Tune the MLP: fit on the training split, score on the validation split.
study_mlp = optimize_model(
    MLPClassifier,
    param_spaces["MLPClassifier"],
    X_train,
    y_train,
    X_val,
    y_val,
    n_trials=100,
)

In [24]:
print("Score for MLP: ", study_mlp.best_value)
print(study_mlp.best_params)
# Persist the study; the context manager guarantees the file is flushed and closed
# even if pickling raises (the bare open() call used before leaked the handle).
with open("study_mlp.pkl", "wb") as study_file:
    pkl.dump(study_mlp, study_file)

Score for MLP:  0.7447455369100395
{'hidden_layer_sizes': 8, 'alpha': 0.0009663190086858645, 'learning_rate': 'adaptive', 'learning_rate_init': 0.0014289217221067222}


In [29]:
# Side-by-side recap of the best validation score found for each model.
score_lines = (
    ("Score for Logistic Regression: ", study_logistic),
    ("Score for Random Forest: ", study_rf),
    ("Score for LGBM: ", study_lgbm),
    ("Score for MLP: ", study_mlp),
)
for heading, finished_study in score_lines:
    print(heading, finished_study.best_value)

Score for Logistic Regression:  0.7412956303231619
Score for Random Forest:  0.7394031490386275
Score for LGBM:  0.7538204864963433
Score for MLP:  0.7447455369100395


In [None]:
def get_metrics(name_model_dict, X, y, threshold = 0.5):
    """
    Build a table of classification metrics for several models.

    Parameters
    ----------
    name_model_dict : dict
        Maps a display name to either a classifier exposing ``predict_proba``
        or a ``[classifier, threshold]`` pair (list or tuple) that carries a
        model-specific decision threshold.
    X : array-like
        Features, passed unchanged to each classifier's ``predict_proba``.
    y : array-like
        True binary labels for the rows of ``X``.
    threshold : float, default 0.5
        Cut-off applied to the positive-class probability of every model that
        does not bring its own threshold.

    Returns
    -------
    pandas.DataFrame
        One row per metric and one column per model. "Overall AUC" is computed
        from the predicted probabilities; all other metrics are computed from
        the thresholded 0/1 predictions. The " ------" row is a visual
        separator containing empty strings.
    """
    # name -> (hard 0/1 predictions, positive-class probabilities)
    predictions = {}
    for name, model in name_model_dict.items():
        # isinstance (rather than an exact type comparison) also accepts tuples
        # and list subclasses as (classifier, threshold) pairs.
        if isinstance(model, (list, tuple)):
            estimator, cutoff = model[0], model[1]
        else:
            estimator, cutoff = model, threshold

        y_prob = estimator.predict_proba(X)[:, 1]
        y_pred = (y_prob >= cutoff).astype('int')
        predictions[name] = (y_pred, y_prob)

    # metric name -> (function of one prediction vector, use hard predictions?)
    # zero_division=0 is set on precision, recall and F1 alike so that a model
    # predicting a single class yields 0 for all three without warnings.
    metrics = {
        "Overall AUC": (
            lambda scores: roc_auc_score(y, scores), False),
        " ------": (lambda preds: "", True),
        "Balanced Accuracy": (
            lambda preds: balanced_accuracy_score(y, preds), True),
        "Accuracy": (
            lambda preds: accuracy_score(y, preds), True),
        "Precision": (
            lambda preds: precision_score(y, preds, zero_division=0), True),
        "Recall": (
            lambda preds: recall_score(y, preds, zero_division=0), True),
        "F1": (
            lambda preds: f1_score(y, preds, zero_division=0), True),
    }

    rows = {
        metric_name: [
            metric_func(preds if use_preds else scores)
            for preds, scores in predictions.values()
        ]
        for metric_name, (metric_func, use_preds) in metrics.items()
    }
    return pd.DataFrame.from_dict(rows, orient="index", columns=list(predictions))

**TODO**

*   Add an easy way to include default parameters in the Optuna search
*   Improve the `create_pipeline` function
*   Integrate the pipeline package
*   Find suitable hyperparameter search ranges for each classifier
*   Add pipeline warnings based on the number of unique values in numeric columns
