In [1]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('C:\\Users\\joelp\\Downloads\\LoanDefaultDataset\\Loan_Default.csv')
print(df)


            ID  year loan_limit             Gender approv_in_adv loan_type  \
0        24890  2019         cf  Sex Not Available         nopre     type1   
1        24891  2019         cf               Male         nopre     type2   
2        24892  2019         cf               Male           pre     type1   
3        24893  2019         cf               Male         nopre     type1   
4        24894  2019         cf              Joint           pre     type1   
...        ...   ...        ...                ...           ...       ...   
148665  173555  2019         cf  Sex Not Available         nopre     type1   
148666  173556  2019         cf               Male         nopre     type1   
148667  173557  2019         cf               Male         nopre     type1   
148668  173558  2019         cf             Female         nopre     type1   
148669  173559  2019         cf             Female         nopre     type1   

       loan_purpose Credit_Worthiness open_credit business_or_c

In [3]:
# Create missing indicator for each row that has missing data, and also replace the missing value with the mean of the column's values
# Also one_hot_encode the dataset

# df.isnull() returns a Series containing values True or False. Taking their sum will return positive number if there is at least one NaN value
columns_with_missing_data = df.columns[df.isnull().sum() > 0]
for column in columns_with_missing_data:
    df[f"{column}_missing"] = df[column].isnull().astype(int)
    # If column is numeric, use mean imputation
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column].fillna(value=df[column].mean(), inplace=True)

columns_to_one_hot_encode = []
for column in df.columns:
    if not pd.api.types.is_numeric_dtype(df[column]):
        columns_to_one_hot_encode.append(column)
        df
        

df_dummies = pd.get_dummies(data=df, columns=columns_to_one_hot_encode, drop_first=False, dtype=int)
df.drop(labels=columns_to_one_hot_encode, axis=1, inplace=True)
df = pd.concat([df, df_dummies], axis=1)

print(df.shape)
print(df.loc[0])

(148670, 113)
ID                         24890.000000
year                        2019.000000
loan_amount               116500.000000
rate_of_interest               4.045476
Interest_rate_spread           0.441656
                              ...      
Region_North-East              0.000000
Region_central                 0.000000
Region_south                   1.000000
Security_Type_Indriect         0.000000
Security_Type_direct           1.000000
Name: 0, Length: 113, dtype: float64


In [4]:
# Remove the first two rows (ID and year not useful features)
df = df.drop(df.columns[[0,1]], axis=1)

# Remove outliers using interquartile range method (IQR)
# for column in df.columns:
#     if pd.api.types.is_numeric_dtype(df[column]):
#         Q1 = df[column].quantile(0.25)
#         Q3 = df[column].quantile(0.75)
#         IQR = Q3 - Q1
#         lower_bound = Q1 - 1.5 * IQR
#         upper_bound = Q3 + 1.5 * IQR
#         df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

print(df.shape)
print(df.head())



(148670, 109)
   loan_amount  rate_of_interest  Interest_rate_spread  Upfront_charges  \
0       116500          4.045476              0.441656      3224.996127   
1       206500          4.045476              0.441656      3224.996127   
2       406500          4.560000              0.200000       595.000000   
3       456500          4.250000              0.681000      3224.996127   
4       696500          4.000000              0.304200         0.000000   

    term  property_value   income  Credit_Score        LTV  Status  ...  \
0  360.0   118000.000000   1740.0           758  98.728814       1  ...   
1  360.0   497893.465696   4980.0           552  72.746457       1  ...   
2  360.0   508000.000000   9480.0           834  80.019685       0  ...   
3  360.0   658000.000000  11880.0           587  69.376900       0  ...   
4  360.0   758000.000000  10440.0           602  91.886544       0  ...   

   age_<25  age_>74  submission_of_application_not_inst  \
0        0        0      

In [5]:
# Convert the data into numpy arrays, ready to be processed by the model
X_df = df.drop(columns=["Status"])
Y_df = df.iloc[:, 9]
X_train_df, X_temp_df, Y_train_df, Y_temp_df = train_test_split(X_df, Y_df, test_size=0.2, random_state=42)
X_cv_df, X_test_df, Y_cv_df, Y_test_df = train_test_split(X_temp_df, Y_temp_df, test_size=0.5, random_state=42)

# Normalize the data
scaler = StandardScaler()
columns_to_standardize = ['loan_amount', 'rate_of_interest', "Interest_rate_spread", "Upfront_charges", "property_value", "income", "Credit_Score", "LTV", "dtir1"]
scaler.fit(X_train_df[columns_to_standardize])
X_train_df[columns_to_standardize] = scaler.transform(X_train_df[columns_to_standardize])
X_cv_df[columns_to_standardize] = scaler.transform(X_cv_df[columns_to_standardize])
X_test_df[columns_to_standardize] = scaler.transform(X_test_df[columns_to_standardize])

X_train = X_train_df.values.T
X_cv = X_cv_df.values.T
X_test = X_test_df.values.T

Y_train = Y_train_df.values.reshape(1, -1)
Y_cv = Y_cv_df.values.reshape(1, -1)
Y_test = Y_test_df.values.reshape(1, -1)

print(X_train.shape)
print(Y_train.shape)

print(X_cv.shape)
print(Y_cv.shape)

(107, 118936)
(1, 118936)
(107, 14867)
(1, 14867)


In [75]:
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

def sigmoid(x):
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)

In [117]:
# Forward propagation methods of the feed-forward neural network (FFN) 

def forward_prop_layer(W, b, Aprev):
    '''
    Inputs:  W - weight matrix for current layer (k x n)
             A - activation matrix from previous layer (n x m)
             b - bias vector for current player (n x 1)

    Outputs: Z - pre-activation output matrix resulting from this layer (k, m)
    '''

    Z_prebatch = np.matmul(W, Aprev) + b
    return Z_prebatch

def forward_prop_batch_norm_layer(gamma, beta, Z_prebatch):
    '''
    Inputs:  Gamma - Gamma vector for current batchnorm layer (k x 1)
             Beta  - Beta vector for current batchnorm layer (k x 1)
             Z     - pre-activation output matrix from regular forward prop layer (k x m)

    Outputs: A     - Activation matrix resulting from this layer
    '''

    u = np.mean(Z_prebatch, axis=1).reshape(-1, 1) # .reshape(-1, 1) makes sure u and var have 2 dimensions (k,1) instead of (k,))
    var = np.var(Z_prebatch, axis=1).reshape(-1, 1)

    Z_norm = (Z_prebatch - u) / np.sqrt(var + (10 ** (-7)))
    Z_updated = np.multiply(gamma, Z_norm) + beta
    new_A = relu(Z_updated)

    return (new_A, Z_updated, Z_norm, u, var)






In [156]:
def forward_prop(X, parameters, num_layers):
    '''
    Inputs:  X - input batch matrix (n x m)
             parameters - dictionary containing weight, scale, shift, and bias matrices
    Outputs: Y - output vector containing predictions (1 x m)
    '''

    A = X
    cache = {"Z0" : np.array([[1]])}

    for l in range(1, num_layers + 1):
        W = parameters[f"W{l}"]
        b = parameters[f"b{l}"]
        gamma = parameters[f"gamma{l}"]
        beta = parameters[f"beta{l}"]

        Z = forward_prop_layer(W, b, A)
        (A_updated, Z_updated, Z_norm, u, var) = forward_prop_batch_norm_layer(gamma, beta, Z)
        
        if l < 5: 
            Z = Z_updated
            A = A_updated
        else:
            A = sigmoid(Z)


        cache[f"A{l}"] = A
        cache[f"Z{l}"] = Z
        cache[f"Z_norm{l}"] = Z_norm
        cache[f"mean{l}"] = u
        cache[f"var{l}"] = var


    cache["A0"] = X
    Y_pred = cache["A5"]

    return (Y_pred, cache)
    



In [155]:
def initialize_parameters(layer_dims):
    '''
    Inputs: layer_dims - list containing number of neurons for each layer
    Outputs: parameters - dictionary containing all the weight, bias, scale, and shift matrices
    '''
    parameters = {}
    adam_cache = {}

    for l in range(1, len(layer_dims)):
        # Use He initialization for relu (hidden layers) to help alleviate vanishing/exploding gradients 
        # Use Glorot initialization for sigmoid (output layer) to help alleviate vanishing/exploding gradients
        standard_dev = np.sqrt(2.0 / layer_dims[l-1]) if l < len(layer_dims) - 1 else np.sqrt(6.0 / (layer_dims[l-1] + layer_dims[l]))

        # Initialize the trainable parameters
        parameters[f"W{l}"] = np.random.normal(loc=0, scale=standard_dev, size=(layer_dims[l], layer_dims[l-1]))
        parameters[f"b{l}"] = np.zeros((layer_dims[l], 1))
        parameters[f"gamma{l}"] = np.ones((layer_dims[l], 1))
        parameters[f"beta{l}"] = np.ones((layer_dims[l], 1))

        # Initialize the adam optimizer parameters as all zeros
        adam_cache[f"VdW{l}"] = np.zeros_like(parameters[f"W{l}"])
        adam_cache[f"SdW{l}"] = np.zeros_like(parameters[f"W{l}"])
        adam_cache[f"Vdb{l}"] = np.zeros_like(parameters[f"b{l}"])
        adam_cache[f"Sdb{l}"] = np.zeros_like(parameters[f"b{l}"])
        adam_cache[f"Vdgamma{l}"] = np.zeros_like(parameters[f"gamma{l}"])
        adam_cache[f"Sdgamma{l}"] = np.zeros_like(parameters[f"gamma{l}"])
        adam_cache[f"Vdbeta{l}"] = np.zeros_like(parameters[f"beta{l}"])
        adam_cache[f"Sdbeta{l}"] = np.zeros_like(parameters[f"beta{l}"])


    return (parameters, adam_cache)


In [154]:
def backprop_layer_including_batchnorm(dY, mean, variance, gamma, Z_norm, Z, Zprev, Aprev, W, regularization_parameter, is_last_layer):
    '''
    Inputs: matrices calculated from forward prop for this layer
            regularization_parameter: regularization parameter lambda from adding regularization to the model
    Outputs: dY_prev_layer - output matrix from batchnorm backprop for this layer
    '''

    m = Z.shape[1]

    # Calculate scale and shift gradients
    dbeta = np.sum(dY, axis=1, keepdims=True)
    dgamma = np.sum(dY * Z_norm, axis=1, keepdims=True)

    # calculate gradient of normalized activations
    dZ_norm = dY * gamma

    # calculate mean and variance gradients
    dvar = np.sum(np.multiply(dZ_norm, Z - mean) * -0.5 * np.power(variance + 10 ** (-7), -1.5), axis=1, keepdims=True)
    dmean = np.sum(dZ_norm * -1 / (np.sqrt(variance + 10 ** (-7))), axis=1, keepdims=True)

    # Calculate the regular backprop gradients
    dZ = dZ_norm / (np.sqrt(variance + 10 ** (-7))) + dvar * 2 * (Z - mean) / m + dmean / m

    # no batchnorm for the last layer
    if is_last_layer:
        dZ = dY
        
    dW = (1/m) * np.matmul(dZ, Aprev.T) + regularization_parameter * W
    db = (1/m) * np.sum(dZ, axis=1, keepdims=True)
    dAprev = np.matmul(W.T, dZ)
    if Zprev.shape != (1,1): dY_for_previous_layer = np.multiply(dAprev, relu_derivative(Zprev))
    else: dY_for_previous_layer = np.array([[0]])

    return (dY_for_previous_layer, dW, db, dgamma, dbeta)



In [153]:
def backprop(A, Y, forward_prop_cache, parameters, num_layers):
    '''
    Inputs: Y - vector containing output predictions from forward prop
            forward_prop_cache: dictionary containing relevant matrices from forward propagation
    Outputs: back_prop_cache: dictionary containing relevant matrices from backpropagation, used for calculating new gradient values
    '''

    backprop_cache = {}

    # Calculate gradient for output layer
    backprop_cache[f"dY{num_layers}"] = A - Y
    dY = A - Y
    #max_norm = 5.0

    # Update gradients for rest of layers
    for l in range(num_layers, 0, -1):
        mean = forward_prop_cache[f"mean{l}"]
        variance = forward_prop_cache[f"var{l}"]
        gamma = parameters[f"gamma{l}"]
        beta = parameters[f"beta{l}"]
        Z_norm = forward_prop_cache[f"Z_norm{l}"]
        Z = forward_prop_cache[f"Z{l}"]
        Zprev = forward_prop_cache[f"Z{l-1}"]
        Aprev = forward_prop_cache[f"A{l-1}"]
        W = parameters[f"W{l}"]

        is_last_layer = True if l == num_layers else False
        (dY, dW, db, dgamma, dbeta) = backprop_layer_including_batchnorm(dY, mean, variance, gamma, Z_norm, Z, Zprev, Aprev, W, 0.0001, is_last_layer)

        backprop_cache[f"dY{l-1}"] = dY
        backprop_cache[f"dW{l}"] = dW
        backprop_cache[f"db{l}"] = db
        backprop_cache[f"dgamma{l}"] = dgamma
        backprop_cache[f"dbeta{l}"] = dbeta


    return backprop_cache




In [15]:
def compute_binary_cross_entropy_cost_without_regularization(y, yhat):
    '''
    Input: y - row vector  containing actual labels of the training examples
           yhat - row vector containing predicted probabilities by model
    Output: total cost of predictions
    '''
    # Clip yhat to avoid log(0) errors (log(0) = -inf)
    yhat = np.clip(yhat, 1e-15, 1 - 1e-15)
    # Compute binary cross-entropy (element-wise)
    cost = -np.mean(y * np.log(yhat) + (1 - y) * np.log(1 - yhat))
    
    return cost

In [165]:
def model(X, Y, learning_rate=0.001, num_iterations=60000):
    '''
    Hidden Layer 1: 220 neurons
    Hidden Layer 2: 100 neurons
    Hidden Layer 3: 50 neurons
    Hidden Layer 4: 30 neurons
    Hidden Layer 5: 1 neuron (sigmoid activation determining output for binary classification)

    Inputs: X - input matrix containing training examples
            Y - row vector containing the correct labels of the training examples
            learning_rate: initial learning rate (alpha) of the model
            num_iterations: 
    '''
    layer_dims = [X.shape[0], 220, 100, 50, 30, 1]
    (parameters, adam_cache) = initialize_parameters(layer_dims)
    num_layers = len(layer_dims) - 1

    beta1 = 0.9
    beta2 = 0.999
    epsilon = 1e-7 
    
    # Split the dataset into mini-batches
    shuffled_indices = np.random.permutation(X.shape[1])
    X = X[:, shuffled_indices]
    Y = Y[:, shuffled_indices]
    m = X.shape[1]
    batch_size = 512
    num_batches = int(np.ceil(m / batch_size))
    batches = []
    for i in range(num_batches):
        start_index = i * batch_size
        end_index = min((i + 1) * batch_size, m)
        curr_batch_X = X[:, start_index:end_index]
        curr_batch_Y = Y[:, start_index:end_index]
        batches.append((curr_batch_X, curr_batch_Y))

    batch_index = 0
    curr_epoch = -1

    for i in range(num_iterations + 1):
        (batch_X, batch_Y) = batches[batch_index]
        batch_index = (batch_index + 1) % num_batches
        if batch_index == 0: curr_epoch += 1

        min_learning_rate = 1e-6
        curr_learning_rate = max(min_learning_rate, learning_rate * np.exp(-0.96 * curr_epoch))

        # Perform forward propagation
        (yhat, forward_prop_cache) = forward_prop(np.array(batch_X), parameters, num_layers)

        # perform backpropagation
        A_last_layer = forward_prop_cache[f"A{num_layers}"]
        backprop_cache = backprop(A_last_layer, np.array(batch_Y), forward_prop_cache, parameters, num_layers)
    
        # Update trainable parameters using Adam optimizer for all the layers
        for l in range(1, num_layers + 1):
            dW = backprop_cache[f"dW{l}"]
            db = backprop_cache[f"db{l}"]
            dgamma = backprop_cache[f"dgamma{l}"]
            dbeta = backprop_cache[f"dbeta{l}"]

            VdW = adam_cache[f"VdW{l}"]
            SdW = adam_cache[f"SdW{l}"]
            Vdb = adam_cache[f"Vdb{l}"]
            Sdb = adam_cache[f"Sdb{l}"]
            Vdgamma = adam_cache[f"Vdgamma{l}"]
            Sdgamma = adam_cache[f"Sdgamma{l}"]
            Vdbeta = adam_cache[f"Vdbeta{l}"]
            Sdbeta = adam_cache[f"Sdbeta{l}"]
            
            adam_cache[f"VdW{l}"] = (beta1 * VdW + (1. - beta1) * dW)
            adam_cache[f"Vdb{l}"] = (beta1 * Vdb + (1. - beta1) * db)
            adam_cache[f"SdW{l}"] = (beta2 * SdW + (1. - beta2) * (dW ** 2))
            adam_cache[f"Sdb{l}"] = (beta2 * Sdb + (1. - beta2) * (db ** 2))
            adam_cache[f"Vdgamma{l}"] = (beta1 * Vdgamma + (1. - beta1) * dgamma)
            adam_cache[f"Vdbeta{l}"] = (beta1 * Vdbeta + (1. - beta1) * dbeta)
            adam_cache[f"Sdgamma{l}"] = (beta2 * Sdgamma + (1. - beta2) * (dgamma ** 2))
            adam_cache[f"Sdbeta{l}"] = (beta2 * Sdbeta + (1. - beta2) * (dbeta ** 2))

            parameters[f"W{l}"] -= learning_rate * adam_cache[f"VdW{l}"] / (np.sqrt(adam_cache[f"SdW{l}"]) + epsilon)
            parameters[f"b{l}"] -= learning_rate * adam_cache[f"Vdb{l}"] / (np.sqrt(adam_cache[f"Sdb{l}"]) + epsilon)
            parameters[f"gamma{l}"] -= learning_rate * adam_cache[f"Vdgamma{l}"] / (np.sqrt(adam_cache[f"Sdgamma{l}"]) + epsilon)
            parameters[f"beta{l}"] -= learning_rate * adam_cache[f"Vdbeta{l}"] / (np.sqrt(adam_cache[f"Sdbeta{l}"]) + epsilon)

            
        if i % 1000 == 0:
            print(f"Current loss for iteration {i}: {compute_binary_cross_entropy_cost_without_regularization(batch_Y, yhat)}")
        

            
    return parameters
        

In [172]:
def make_inferences(X_input, Y_output, parameters):
    (Y_test_predictions, _) = forward_prop(X_input, parameters, 5)
    Y_test_predictions = (Y_test_predictions >= 0.5).astype(int)
    accuracy = np.sum(Y_test_predictions == Y_output) / Y_output.shape[1]
    return (accuracy, Y_test_predictions)

In [168]:
parameters = model(X_train, Y_train, num_iterations=10000)

Current loss for iteration 0: 1.1836756067005112
Current loss for iteration 1000: 0.3960378725094277
Current loss for iteration 2000: 0.22784225409457046
Current loss for iteration 3000: 0.17619029061668623
Current loss for iteration 4000: 0.18216429235222448
Current loss for iteration 5000: 0.10414489897263797
Current loss for iteration 6000: 0.19193754744101318
Current loss for iteration 7000: 0.15970130557893447
Current loss for iteration 8000: 0.08449635169678574
Current loss for iteration 9000: 0.05201329071935923
Current loss for iteration 10000: 0.05668125170625836


In [174]:
# Calculate f1 score
def confusion_matrix_calculations(Y_pred, Y):
    true_positives = np.sum((Y_pred == 1) & (Y == 1))
    false_positives = np.sum((Y_pred == 1) & (Y == 0))
    false_negatives = np.sum((Y_pred == 0) & (Y == 1))

    if true_positives + false_positives > 0:
        precision = true_positives / (true_positives + false_positives)
    else:
        precision = 0

    if true_positives + false_negatives > 0:
        recall = true_positives / (true_positives + false_negatives)
    else:
        recall = 0

    if precision + recall > 0:
        f1_score = 2 * precision * recall / (precision + recall) 
    else:
        f1_score = 0

    return f1_score



In [176]:
cv_accuracy, y_cv_predictions = make_inferences(X_cv, Y_cv, parameters)
print("Cross Validation accuracy: " + str(cv_accuracy))
f1_score = confusion_matrix_calculations(y_cv_predictions, Y_cv)
print("Cross Validation f1_score: " + str(f1_score))

Cross Validation accuracy: 0.9809645523642968
Cross Validation f1_score: 0.9602025031641119


In [178]:
test_accuracy, y_test_predictions = make_inferences(X_test, Y_test, parameters)
print("Test Validation accuracy: " + str(test_accuracy))
f1_score = confusion_matrix_calculations(y_test_predictions, Y_test)
print("Test f1_score: " + str(f1_score))

Test Validation accuracy: 0.9827133920764108
Test f1_score: 0.963374661536269
