In [6]:
import joblib
from sklearn.model_selection import train_test_split, KFold

loans = joblib.load("../data/final_model_data.joblib")

loans = loans.head(10000)

train, test = train_test_split(loans, test_size=0.2, shuffle=False)
train_nn, train_xgb, train_classification, test_nn, test_xgb, test_classification = train.copy(), train.copy(), train.copy(), test.copy(), test.copy(), test.copy()
print(f"The test set contains {len(test):,} loans.")

onehot_cols = ["term", "application_type", "home_ownership", "purpose"]
ordinal_cols = {
    "emp_length": [
        "< 1 year",
        "1 year",
        "2 years",
        "3 years",
        "4 years",
        "5 years",
        "6 years",
        "7 years",
        "8 years",
        "9 years",
        "10+ years",
    ]
}
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor


def run_pipeline_cv(data, onehot_cols, ordinal_cols, model_type="nn", batch_size=None):
    X = data.drop(columns=["recovered_percentage"])
    y = data["recovered_percentage"]

    transformer = DataFrameMapper(
        [
            (onehot_cols, OneHotEncoder(drop="if_binary", handle_unknown="ignore")),
            (
                list(ordinal_cols.keys()),
                OrdinalEncoder(categories=list(ordinal_cols.values())),
            ),
        ],
        default=StandardScaler(),
    )

    cv = KFold(n_splits=5, shuffle=True, random_state=0)

    history_list = []
    model_list = []
    transformer_list = []

    for train_idx, valid_idx in cv.split(X):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        X_train = transformer.fit_transform(X_train)
        X_valid = transformer.transform(X_valid)

        if model_type == "xgb":
            model = XGBRegressor(objective="reg:squarederror")

            model.fit(X_train, y_train)

            history_list.append(None)

        model_list.append(model)
        transformer_list.append(transformer)

    return history_list, model_list, transformer_list


def make_predictions(data, model_list, transformer_list):
    predictions_list = []

    for i in range(len(model_list)):
        model = model_list[i]
        transformer = transformer_list[i]

        X_test = transformer.transform(
            data.drop(
                columns=[
                    "recovered_percentage",
                    "issue_d",
                    "date",
                    "grade",
                    "sub_grade",
                    "expected_return",
                ]
            )
        )

        predictions = model.predict(X_test)
        predictions_list.append(predictions)

    data["model_predictions"] = np.mean(predictions_list, axis=0)

# Run the pipeline with cross-validation for XGBoost
xgb_history_list, xgb_model_list, xgb_transformer_list = run_pipeline_cv(
    train_xgb.drop(columns=["issue_d", "date", "grade", "sub_grade", "expected_return"]),
    onehot_cols,
    ordinal_cols,
    model_type="xgb",
)

The test set contains 2,000 loans.


In [7]:
from sklearn_pandas import DataFrameMapper
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense, Dropout
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector as selector

train, test = train_classification, test_classification
print(f"The test set contains {len(test):,} loans.")
onehot_cols = ["term", "application_type", "home_ownership", "purpose"]
ordinal_cols = {
    "emp_length": [
        "< 1 year",
        "1 year",
        "2 years",
        "3 years",
        "4 years",
        "5 years",
        "6 years",
        "7 years",
        "8 years",
        "9 years",
        "10+ years",
    ],
}

def run_pipeline_cv_nn(
        data,
        onehot_cols,
        ordinal_cols,
        batch_size,
):
    X = data.drop(
        columns=["issue_d", "date", "grade", "grade_encoded", "sub_grade", "recovered_percentage", "expected_return"])
    y = data["grade_encoded"]

    transformer = DataFrameMapper(
        [
            (onehot_cols, OneHotEncoder(drop="if_binary", handle_unknown="error")),
            (
                list(ordinal_cols.keys()),
                OrdinalEncoder(categories=list(ordinal_cols.values())),
            ),
        ],
        default=StandardScaler(),
    )

    cv = KFold(n_splits=5, shuffle=True, random_state=0)

    history_list = []
    model_list = []
    transformer_list = []

    for train_idx, valid_idx in cv.split(X):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        print(X_train.keys())
        X_train = transformer.fit_transform(X_train)
        X_valid = transformer.transform(X_valid)

        input_nodes = X_train.shape[1]

        # Define the model architecture
        model = Sequential()
        model.add(Input(shape=input_nodes))
        model.add(Dense(64, activation="relu"))
        model.add(Dropout(0.3, seed=0))
        model.add(Dense(32, activation="relu"))
        model.add(Dropout(0.3, seed=1))
        model.add(Dense(16, activation="relu"))
        model.add(Dropout(0.3, seed=1))
        model.add(Dense(7, activation="softmax"))
        model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

        # # Define the EarlyStopping callback
        # early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

        history = model.fit(
            X_train,
            y_train,
            batch_size=batch_size,
            epochs=300,
            validation_data=(X_valid, y_valid),
            verbose=2,
            # callbacks=[early_stopping],  # Add the EarlyStopping callback
        )

        history_list.append(history.history)
        model_list.append(model)
        transformer_list.append(transformer)

    return history_list, model_list, transformer_list


# Encode loan grades as integers
grade_encoder = {
    "A": 0,
    "B": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
}
train["grade_encoded"] = train["grade"].apply(lambda x: grade_encoder[x])
test["grade_encoded"] = test["grade"].apply(lambda x: grade_encoder[x])

# Run the pipeline with cross-validation
nn_history_list, nn_model_list, nn_transformer_list = run_pipeline_cv_nn(
    train,
    onehot_cols,
    ordinal_cols,
    batch_size=128,
)


def run_pipeline_cv_svm(
        data,
        onehot_cols,
        ordinal_cols,
):
    X = data.drop(
        columns=["issue_d", "date", "grade", "grade_encoded", "sub_grade", "recovered_percentage", "expected_return"])
    y = data["grade_encoded"]

    transformer = DataFrameMapper(
        [
            (onehot_cols, OneHotEncoder(drop="if_binary", handle_unknown="error")),
            (
                list(ordinal_cols.keys()),
                OrdinalEncoder(categories=list(ordinal_cols.values())),
            ),
        ],
        default=StandardScaler(),
    )

    cv = KFold(n_splits=5, shuffle=True, random_state=0)

    history_list = []
    model_list = []
    transformer_list = []

    for train_idx, valid_idx in cv.split(X):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        print("before trans: ")
        print(X_train.shape)
        print(X_valid.shape)
        X_train = transformer.fit_transform(X_train)
        X_valid = transformer.transform(X_valid)

        print("after trans: ")
        print(X_train.shape)
        print(X_valid.shape)

        # Define the parameter grid to search
        param_grid = {
            'C': [0.1, 1, 10],
            'gamma': [0.01, 0.1, 1]
        }

        # Define the SVM model
        svm = SVC(kernel='linear')

        # Use GridSearchCV to search for the best hyperparameters
        grid_search = GridSearchCV(svm, param_grid, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Print the best hyperparameters and their corresponding score
        print("Best hyperparameters:", grid_search.best_params_)
        print("Best score:", grid_search.best_score_)

        history_list.append(None)
        model_list.append(grid_search.best_estimator_)
        transformer_list.append(transformer)

    return history_list, model_list, transformer_list


# Run the pipeline with cross-validation
svm_history_list, svm_model_list, svm_transformer_list = run_pipeline_cv_svm(
    train,
    onehot_cols,
    ordinal_cols,
)

def run_pipeline_cv_dt(data, onehot_cols, ordinal_cols):
    X = data.drop(
        columns=["issue_d", "date", "grade", "grade_encoded", "sub_grade", "recovered_percentage", "expected_return"])
    y = data["grade_encoded"]

    transformer = ColumnTransformer(
        [
            ('onehot', OneHotEncoder(drop="if_binary", handle_unknown="error"), onehot_cols),
            ('ordinal', OrdinalEncoder(categories=list(ordinal_cols.values())), list(ordinal_cols.keys())),
            ('imputer', SimpleImputer(strategy='median'), selector(dtype_include='number'))
        ],
        remainder=StandardScaler()
    )

    cv = KFold(n_splits=5, shuffle=True, random_state=0)

    history_list = []
    model_list = []
    transformer_list = []

    for train_idx, valid_idx in cv.split(X):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        pipeline = Pipeline([
            ('preprocess', transformer),
            ('model', DecisionTreeClassifier())
        ])

        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_valid)

        accuracy = accuracy_score(y_valid, y_pred)
        precision = precision_score(y_valid, y_pred, average='weighted')
        recall = recall_score(y_valid, y_pred, average='weighted')
        f1 = f1_score(y_valid, y_pred, average='weighted')

        history_list.append({
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })
        model_list.append(pipeline)
        transformer_list.append(transformer)

    return history_list, model_list, transformer_list


# Run the pipeline with cross-validation for Decision Trees
dt_history_list, dt_model_list, dt_transformer_list = run_pipeline_cv_dt(
    train,
    onehot_cols,
    ordinal_cols,
)

from sklearn.metrics import accuracy_score
from scipy.stats import mode

# Load the test set and extract the true labels
X_test = test.drop(
    columns=["issue_d", "date", "grade", "grade_encoded", "sub_grade", "recovered_percentage", "expected_return"])
y_test = test["grade_encoded"]

# Make predictions using the ensemble model
predictions_list_nn = []
predictions_list = []

for i in range(len(nn_model_list)):
    transformer = nn_transformer_list[i]
    model = nn_model_list[i]

    X_test_transformed = transformer.transform(X_test)
    y_pred = model.predict(X_test_transformed)
    predictions_list_nn.append(y_pred)

for i in range(len(svm_model_list)):
    transformer = svm_transformer_list[i]
    model = svm_model_list[i]

    X_test_transformed = transformer.transform(X_test)
    y_pred = model.predict(X_test_transformed)
    predictions_list.append(y_pred)

for model in dt_model_list:
    y_pred = model.predict(X_test)
    predictions_list.append(y_pred)

# Convert the predictions to class labels
class_predictions_list = [np.argmax(predictions, axis=1) for predictions in predictions_list_nn]

for predictions in predictions_list:
    class_predictions_list.append(predictions)

# Combine the predictions from each model
class_predictions_array = np.array(class_predictions_list)

# Take the majority vote for each test sample
final_predictions = mode(class_predictions_array, axis=0).mode[0]

The test set contains 2,000 loans.
Index(['loan_amnt', 'term', 'emp_length', 'home_ownership', 'annual_inc',
       'purpose', 'dti', 'delinq_2yrs', 'cr_hist_age_mths', 'fico_range_low',
       'fico_range_high', 'inq_last_6mths', 'inv_mths_since_last_delinq',
       'inv_mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'collections_12_mths_ex_med',
       'inv_mths_since_last_major_derog', 'application_type',
       'annual_inc_joint', 'dti_joint', 'acc_now_delinq', 'tot_coll_amt',
       'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths',
       'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths',
       'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op',
       'inv_mo_sin_rcnt_rev_tl_op', 'inv_mo_sin_rcnt_tl', 'mort_acc',
       'inv_mths_since_recent_bc', 'inv_mths_since_recent_bc_dlq',
       'inv_mths_since_recent_inq', 'inv_mths_since_recent_revol_delinq',
       'num_accts_ever_120_pd', 'num_actv_

In [8]:
# Make predictions for XGBoost
make_predictions(test_xgb, xgb_model_list, xgb_transformer_list)

# Combine the predictions from all models
class_predictions_array = np.array(class_predictions_list)
final_predictions = mode(class_predictions_array, axis=0).mode[0]

# Add predicted return percentage and grade to the test dataset
test_xgb['predicted_return_percentage'] = test_xgb['model_predictions']
test_xgb['predicted_grade'] = final_predictions

# Print the updated dataset
print(test_xgb[['predicted_return_percentage', 'predicted_grade']])


         predicted_return_percentage  predicted_grade
1900401                     0.903391                1
1900402                     0.900195                1
1900403                     0.921092                1
1900404                     0.813835                1
1900405                     0.920004                1
...                              ...              ...
1902619                     0.933729                1
1902621                     0.986809                1
1902622                     0.908161                1
1902623                     0.929424                1
1902624                     0.998673                1

[2000 rows x 2 columns]


In [9]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer



# Calculate the Fraction of Return and predicted Loan Grade for each loan
fraction_of_return = test_xgb["predicted_return_percentage"]
grade = test_xgb['predicted_grade']

# Define the initial thresholds
initial_min_fraction_of_return = 0
initial_max_loan_grade = 6  # Assuming Grade B is the minimum acceptable loan grade

# Initialize variables to track the best threshold values and maximum profit
best_min_fraction_of_return = initial_min_fraction_of_return
best_min_loan_grade = initial_max_loan_grade
max_profit = 0

# Iterate through different threshold values to find the maximum profit
for min_fraction_of_return in np.arange(initial_min_fraction_of_return, 1.0, 0.001):
    for max_loan_grade in [0, 1, 2, 3, 4, 5, 6]:
        # Filter the loans that meet the threshold criteria
        approved_loans = test_xgb[
            (fraction_of_return >= min_fraction_of_return) & (grade <= max_loan_grade)
            ]

        # Calculate the profit for each approved loan
        approved_loans["profit"] = approved_loans["expected_return"]*fraction_of_return - approved_loans["loan_amnt"]

        # Calculate the total company profit
        total_profit = approved_loans["profit"].sum()

        # Update the maximum profit and best threshold values if a higher profit is achieved
        if total_profit > max_profit:
            max_profit = total_profit
            best_min_fraction_of_return = min_fraction_of_return
            best_min_loan_grade = max_loan_grade

# Print the best threshold values and maximum profit
print("Best Threshold Values:")
print(f"Minimum acceptable Fraction of Return: {best_min_fraction_of_return:.2f}")
print(f"Minimum acceptable Loan Grade: {best_min_loan_grade}")
print(f"Maximum Profit: {max_profit:.2f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  approved_loans["profit"] = approved_loans["expected_return"]*fraction_of_return - approved_loans["loan_amnt"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  approved_loans["profit"] = approved_loans["expected_return"]*fraction_of_return - approved_loans["loan_amnt"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

Best Threshold Values:
Minimum acceptable Fraction of Return: 0.69
Minimum acceptable Loan Grade: 1
Maximum Profit: 4629703.15


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  approved_loans["profit"] = approved_loans["expected_return"]*fraction_of_return - approved_loans["loan_amnt"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  approved_loans["profit"] = approved_loans["expected_return"]*fraction_of_return - approved_loans["loan_amnt"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v