In [2]:
import joblib
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import xgboost as xgb
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.regularizers import l2

In [19]:
# Load the prepared modelling data. joblib.load unpickles the file, which can execute
# arbitrary code, so only load artifacts that come from a trusted source.
loans = joblib.load("./data/final_model_data.joblib")

In [17]:
# Display the first rows to sanity-check the loaded columns.
loans.head()

Unnamed: 0,loan_amnt,term,emp_length,home_ownership,annual_inc,purpose,dti,delinq_2yrs,cr_hist_age_mths,fico_range_low,...,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,recovered_percentage,issue_d,grade,sub_grade,expected_return,date
1909203,2125.0,36 months,3 years,MORTGAGE,45000.0,home_improvement,15.88,1.0,137.0,660.0,...,146436.0,23662.0,3200.0,22483.0,1.0,Aug-2012,C,C5,2737.8,2012-08-01
1909437,28000.0,60 months,10+ years,MORTGAGE,150000.0,debt_consolidation,5.96,1.0,198.0,685.0,...,417242.0,27924.0,31432.0,25523.0,0.446871,Aug-2012,E,E4,46381.2,2012-08-01
1909457,14125.0,36 months,3 years,RENT,36000.0,debt_consolidation,18.93,0.0,107.0,665.0,...,58087.0,57612.0,7900.0,47687.0,1.0,Aug-2012,D,D3,18575.64,2012-08-01
1909558,16000.0,60 months,10+ years,RENT,58900.0,debt_consolidation,33.53,0.0,141.0,700.0,...,83557.0,69752.0,1800.0,69657.0,0.327324,Aug-2012,E,E1,25696.8,2012-08-01
1909566,2800.0,36 months,4 years,RENT,89000.0,debt_consolidation,15.18,0.0,295.0,670.0,...,51892.0,50262.0,2500.0,44992.0,1.0,Aug-2012,C,C2,3509.64,2012-08-01


In [4]:
# Hold out the last 20% of rows as the test set; shuffle=False keeps the original row order.
train, test = train_test_split(loans, shuffle=False, test_size=0.2)

# Separate copies of each split for the two model families.
train_nn = train.copy()
train_xgb = train.copy()
test_nn = test.copy()
test_xgb = test.copy()

print(f"The test set contains {len(test):,} loans.")

# Categorical features that are one-hot encoded.
onehot_cols = ["term", "application_type", "home_ownership", "purpose"]

# Ordinal features mapped to their category order (lowest first).
ordinal_cols = {
    "emp_length": [
        "< 1 year", "1 year", "2 years", "3 years", "4 years", "5 years",
        "6 years", "7 years", "8 years", "9 years", "10+ years",
    ]
}

The test set contains 222,035 loans.


In [5]:
def _build_transformer(onehot_cols, ordinal_cols):
    """Create an unfitted feature transformer for a single cross-validation fold."""
    return DataFrameMapper(
        [
            (onehot_cols, OneHotEncoder(drop="if_binary", handle_unknown="ignore")),
            (
                list(ordinal_cols.keys()),
                OrdinalEncoder(categories=list(ordinal_cols.values())),
            ),
        ],
        # Every column not listed above is standardized.
        default=StandardScaler(),
    )


def _build_nn(input_nodes, output_nodes=1):
    """Create and compile the feed-forward regression network for a single fold."""
    optimizer = SGD(learning_rate=0.002, momentum=0.9)
    regularizer = l2(0.001)  # L2 penalty applied to every Dense kernel

    model = Sequential()
    model.add(Input((input_nodes,)))
    model.add(Dense(64, activation="relu", kernel_regularizer=regularizer))
    model.add(Dropout(0.3, seed=0))
    model.add(Dense(32, activation="relu", kernel_regularizer=regularizer))
    model.add(Dropout(0.3, seed=1))
    model.add(Dense(16, activation="relu", kernel_regularizer=regularizer))
    model.add(Dropout(0.3, seed=2))
    model.add(Dense(output_nodes, kernel_regularizer=regularizer))
    model.compile(optimizer=optimizer, loss="mean_squared_logarithmic_error")
    return model


def run_pipeline_cv(data, onehot_cols, ordinal_cols, model_type="nn", batch_size=None):
    """Train one model per fold of a 5-fold cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame holding the feature columns and the
        ``recovered_percentage`` target.
    onehot_cols : list of str
        Columns to one-hot encode.
    ordinal_cols : dict
        Maps each ordinal column name to its ordered list of categories.
    model_type : {"nn", "xgb"}
        ``"nn"`` trains the Keras network, ``"xgb"`` an ``XGBRegressor``.
    batch_size : int or None
        Forwarded to ``model.fit`` for the neural network; unused for XGBoost.

    Returns
    -------
    tuple of (list, list, list)
        ``(history_list, model_list, transformer_list)``, one entry per fold.
        ``history_list`` holds the Keras history dict for ``"nn"`` and ``None``
        for ``"xgb"``. ``transformer_list[i]`` is the transformer fitted on the
        same training fold as ``model_list[i]``.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``"nn"`` nor ``"xgb"``.
    """
    # Fail fast instead of hitting an unbound `model` after a full fold of preprocessing.
    if model_type not in ("nn", "xgb"):
        raise ValueError(f"model_type must be 'nn' or 'xgb', got {model_type!r}")

    X = data.drop(columns=["recovered_percentage"])
    y = data["recovered_percentage"]

    cv = KFold(n_splits=5, shuffle=True, random_state=0)

    history_list = []
    model_list = []
    transformer_list = []

    for train_idx, valid_idx in cv.split(X):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # A fresh transformer per fold: reusing a single instance would refit it in
        # place on every fold, leaving every list entry pointing at the last fold's fit
        # and feeding earlier models features scaled differently than in training.
        transformer = _build_transformer(onehot_cols, ordinal_cols)
        X_train = transformer.fit_transform(X_train)
        X_valid = transformer.transform(X_valid)

        if model_type == "nn":
            model = _build_nn(input_nodes=X_train.shape[1])
            history = model.fit(
                X_train,
                y_train,
                batch_size=batch_size,
                epochs=25,
                validation_data=(X_valid, y_valid),
                verbose=2,
            )
            history_list.append(history.history)
        else:  # model_type == "xgb"
            model = xgb.XGBRegressor(objective="reg:squarederror")
            model.fit(X_train, y_train)
            history_list.append(None)

        model_list.append(model)
        transformer_list.append(transformer)

    return history_list, model_list, transformer_list

In [6]:
def make_predictions(data, model_list, transformer_list):
    """Average the fold models' predictions and store them on ``data``.

    Each model predicts on ``data`` after it has been transformed by the
    transformer at the same list position. The element-wise mean of those
    predictions is written in place to ``data["model_predictions"]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to score. It must contain the target and metadata columns that
        are dropped below before transforming.
    model_list : list
        Fitted models exposing ``predict``.
    transformer_list : list
        Fitted transformers exposing ``transform``, aligned with ``model_list``.

    Raises
    ------
    ValueError
        If ``model_list`` is empty or the two lists differ in length.
    """
    if len(model_list) == 0:
        raise ValueError("model_list must contain at least one model")
    if len(model_list) != len(transformer_list):
        raise ValueError("model_list and transformer_list must have the same length")

    # The feature frame is the same for every model, so build it once.
    features = data.drop(
        columns=[
            "recovered_percentage",
            "issue_d",
            "date",
            "grade",
            "sub_grade",
            "expected_return",
        ]
    )

    predictions_list = []
    for model, transformer in zip(model_list, transformer_list):
        X_test = transformer.transform(features)
        # Flatten so (n, 1) and (n,) prediction arrays both become 1-D columns.
        predictions_list.append(np.ravel(model.predict(X_test)))

    data["model_predictions"] = np.mean(predictions_list, axis=0)

In [7]:
# Cross-validated training of the neural-network ensemble (metadata columns removed).
nn_history_list, nn_model_list, nn_transformer_list = run_pipeline_cv(
    data=train_nn.drop(columns=["issue_d", "date", "grade", "sub_grade", "expected_return"]),
    onehot_cols=onehot_cols,
    ordinal_cols=ordinal_cols,
    model_type="nn",
    batch_size=128,
)

# Score the held-out set with the neural networks; writes test_nn["model_predictions"].
make_predictions(data=test_nn, model_list=nn_model_list, transformer_list=nn_transformer_list)

# Cross-validated training of the XGBoost ensemble on the same feature set.
xgb_history_list, xgb_model_list, xgb_transformer_list = run_pipeline_cv(
    data=train_xgb.drop(columns=["issue_d", "date", "grade", "sub_grade", "expected_return"]),
    onehot_cols=onehot_cols,
    ordinal_cols=ordinal_cols,
    model_type="xgb",
)

# Score the held-out set with XGBoost; writes test_xgb["model_predictions"].
make_predictions(data=test_xgb, model_list=xgb_model_list, transformer_list=xgb_transformer_list)

# Blend weights for the two model families.
xgb_weight = 0.6
nn_weight = 0.4

# Weighted average of both ensembles' predictions, stored on the shared test frame.
test["model_predictions"] = (
    test_xgb["model_predictions"] * xgb_weight + test_nn["model_predictions"] * nn_weight
)

Epoch 1/25
5551/5551 - 9s - loss: 0.1339 - val_loss: 0.1059 - 9s/epoch - 2ms/step
Epoch 2/25
5551/5551 - 8s - loss: 0.0898 - val_loss: 0.0760 - 8s/epoch - 1ms/step
Epoch 3/25
5551/5551 - 8s - loss: 0.0657 - val_loss: 0.0568 - 8s/epoch - 1ms/step
Epoch 4/25
5551/5551 - 7s - loss: 0.0502 - val_loss: 0.0445 - 7s/epoch - 1ms/step
Epoch 5/25
5551/5551 - 7s - loss: 0.0403 - val_loss: 0.0367 - 7s/epoch - 1ms/step
Epoch 6/25
5551/5551 - 7s - loss: 0.0339 - val_loss: 0.0316 - 7s/epoch - 1ms/step
Epoch 7/25
5551/5551 - 7s - loss: 0.0299 - val_loss: 0.0284 - 7s/epoch - 1ms/step
Epoch 8/25
5551/5551 - 7s - loss: 0.0273 - val_loss: 0.0263 - 7s/epoch - 1ms/step
Epoch 9/25
5551/5551 - 7s - loss: 0.0256 - val_loss: 0.0250 - 7s/epoch - 1ms/step
Epoch 10/25
5551/5551 - 7s - loss: 0.0245 - val_loss: 0.0241 - 7s/epoch - 1ms/step
Epoch 11/25
5551/5551 - 9s - loss: 0.0238 - val_loss: 0.0235 - 9s/epoch - 2ms/step
Epoch 12/25
5551/5551 - 8s - loss: 0.0233 - val_loss: 0.0230 - 8s/epoch - 1ms/step
Epoch 13/25
5

In [8]:
import joblib

# Neural networks: one saved model per fold, then the list of fitted transformers.
for fold, nn_model in enumerate(nn_model_list):
    nn_model.save(f"nn_loan_risk_model_{fold}")
joblib.dump(nn_transformer_list, "nn_data_transformer.joblib")

# XGBoost: one JSON file per fold, then the list of fitted transformers.
for fold, xgb_model in enumerate(xgb_model_list):
    xgb_model.save_model(f"xgb_loan_risk_model_{fold}.json")
joblib.dump(xgb_transformer_list, "xgb_data_transformer.joblib")


2023-04-26 16:13:13.654522: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,64]
	 [[{{node inputs}}]]
2023-04-26 16:13:13.671502: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,32]
	 [[{{node inputs}}]]
2023-04-26 16:13:13.686070: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,16]
	 [[{{node inputs}}]]
2023-04-26 16:13:13

INFO:tensorflow:Assets written to: nn_loan_risk_model_0/assets


INFO:tensorflow:Assets written to: nn_loan_risk_model_0/assets
2023-04-26 16:13:14.787737: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,64]
	 [[{{node inputs}}]]
2023-04-26 16:13:14.806300: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,32]
	 [[{{node inputs}}]]
2023-04-26 16:13:14.823238: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype f

INFO:tensorflow:Assets written to: nn_loan_risk_model_1/assets


INFO:tensorflow:Assets written to: nn_loan_risk_model_1/assets
2023-04-26 16:13:15.785499: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,64]
	 [[{{node inputs}}]]
2023-04-26 16:13:15.799192: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,32]
	 [[{{node inputs}}]]
2023-04-26 16:13:15.812988: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype f

INFO:tensorflow:Assets written to: nn_loan_risk_model_2/assets


INFO:tensorflow:Assets written to: nn_loan_risk_model_2/assets
2023-04-26 16:13:16.909236: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,64]
	 [[{{node inputs}}]]
2023-04-26 16:13:16.922932: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,32]
	 [[{{node inputs}}]]
2023-04-26 16:13:16.936467: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype f

INFO:tensorflow:Assets written to: nn_loan_risk_model_3/assets


INFO:tensorflow:Assets written to: nn_loan_risk_model_3/assets
2023-04-26 16:13:17.759357: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,64]
	 [[{{node inputs}}]]
2023-04-26 16:13:17.773364: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,32]
	 [[{{node inputs}}]]
2023-04-26 16:13:17.786861: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype f

INFO:tensorflow:Assets written to: nn_loan_risk_model_4/assets


INFO:tensorflow:Assets written to: nn_loan_risk_model_4/assets


['xgb_data_transformer.joblib']

In [9]:
def compare_models(test, loan_grades=("A", "B", "C", "D", "E", "F", "G")):
    """Compare LendingClub's grades with grade-sized buckets ranked by the model.

    For each grade, the mean ``recovered_percentage`` of the loans carrying that
    grade is compared with the mean over an equally sized slice of the loans
    ranked by ``model_predictions`` (highest first). Slices are consecutive:
    the first grade takes the top-ranked loans, the next grade the following
    ones, and so on.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain ``grade``, ``recovered_percentage`` and
        ``model_predictions`` columns.
    loan_grades : sequence of str, optional
        Grades to evaluate, in ranking order. Defaults to ``A`` through ``G``.

    Returns
    -------
    dict
        ``lc_<grade>`` and ``ty_<grade>`` keys (lower-cased grade) mapped to
        the respective averages, rounded to 5 decimals.
    """
    model_predictions = "model_predictions"
    recovered_percentage = "recovered_percentage"
    lc_prefix = "lc_"
    ty_prefix = "ty_"
    results = {}

    # The ranking is identical for every grade, so sort once instead of once per grade.
    ranked = test.sort_values(model_predictions, axis="index", ascending=False)

    start_index = 0
    for grade in loan_grades:
        lc_grade = test[test["grade"] == grade]
        lc_grade_avg = round(np.mean(lc_grade[recovered_percentage]), 5)
        lc_key = lc_prefix + grade.lower()
        results[lc_key] = lc_grade_avg

        # Take the next bucket of ranked loans, the same size as LendingClub's grade.
        ty_grade = ranked.iloc[start_index : len(lc_grade) + start_index]
        start_index += len(lc_grade)
        ty_grade_avg = round(np.mean(ty_grade[recovered_percentage]), 5)
        ty_key = ty_prefix + grade.lower()
        results[ty_key] = ty_grade_avg

        print(f"LendingClub gave {len(lc_grade):,} loans in the test set a(n) {grade} grade")
        print(f"Average `recovered_percentage` on LendingClub's grade {grade} loans: {lc_grade_avg}")
        print(f"Average `recovered_percentage` on Ty's grade {grade} loans (with cross-validation): {ty_grade_avg}")

    return results

# Compare LendingClub's grades with the blended model's ranking on the test set
# and print the resulting dictionary of per-grade averages.
results = compare_models(test)
print(results)

LendingClub gave 38,779 loans in the test set a(n) A grade
Average `recovered_percentage` on LendingClub's grade A loans: 0.96021
Average `recovered_percentage` on Ty's grade A loans (with cross-validation): 0.96487
LendingClub gave 61,844 loans in the test set a(n) B grade
Average `recovered_percentage` on LendingClub's grade B loans: 0.90378
Average `recovered_percentage` on Ty's grade B loans (with cross-validation): 0.9082
LendingClub gave 69,218 loans in the test set a(n) C grade
Average `recovered_percentage` on LendingClub's grade C loans: 0.82924
Average `recovered_percentage` on Ty's grade C loans (with cross-validation): 0.83755
LendingClub gave 33,142 loans in the test set a(n) D grade
Average `recovered_percentage` on LendingClub's grade D loans: 0.76535
Average `recovered_percentage` on Ty's grade D loans (with cross-validation): 0.76014
LendingClub gave 12,654 loans in the test set a(n) E grade
Average `recovered_percentage` on LendingClub's grade E loans: 0.70825
Average

In [10]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the blended predictions against the observed recovery.
mae = mean_absolute_error(
    y_true=test["recovered_percentage"],
    y_pred=test["model_predictions"],
)
print("MAE:", round(mae, 5), sep="\n")

MAE:
0.2021
