In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [None]:
# Load and preprocess data
bc_data = pd.read_csv('Data/loan_application_encoded.csv').fillna(0)
bc_data = bc_data.drop(columns=[
    'CODE_GENDER_XNA', 
    'NAME_EDUCATION_TYPE_Academic degree', 
    'NAME_INCOME_TYPE_Other', 
    'NAME_INCOME_TYPE_Unemployed', 
    'NAME_HOUSING_TYPE_Co-op apartment', 
    'NAME_HOUSING_TYPE_Office apartment',
    'SK_ID_CURR',
    'AMT_CREDIT_y'
])
X = bc_data.drop(columns=['TARGET'])
y = bc_data['TARGET']

# Apply SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2)

# Standardize the feature values 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the binary classification neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64),
    tf.keras.layers.PReLU(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32),
    tf.keras.layers.PReLU(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(16, activation='sigmoid'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
adam_optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=.01)
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)
model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

# Train the model
model.fit(X_train, y_train, epochs=1000, batch_size=1024, validation_split=0.5, callbacks=[early_stopping_callback], class_weight=class_weights_dict)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.9).astype(int) # Convert probabilities to binary predictions
precision = precision_score(y_test, y_pred_binary, zero_division=0)
recall = recall_score(y_test, y_pred_binary, zero_division=0)
f1 = f1_score(y_test, y_pred_binary)

# CONFUSION MATRIX
cm = confusion_matrix(y_test, y_pred_binary)
cm_df = pd.DataFrame(cm, 
                     index=['Actual Negative', 'Actual Positive'], 
                     columns=['Negative', 'Positive'])

# Print the results
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')

In [None]:
print(f'Samples Trained: {len(X_train):,}')
print(f'Samples Tested: {len(X_test):,}')
print("Confusion Matrix:")
cm_df

In [None]:
from sklearn.metrics import roc_auc_score

# Assuming your test set is X_test and y_test
y_pred_probs = model.predict(X_test)

# Calculate AUC
auc = roc_auc_score(y_test, y_pred_probs)
print(f"AUC: {auc:.2f}")

# Random Forest Classifier for feature importance

In [None]:
# Load and preprocess data
rf_data = pd.read_csv('Data/loan_application_encoded.csv').fillna(0)
rf_data = rf_data.drop(columns=[
    'CODE_GENDER_XNA', 
    'NAME_EDUCATION_TYPE_Academic degree', 
    'NAME_INCOME_TYPE_Other', 
    'NAME_INCOME_TYPE_Unemployed', 
    'NAME_HOUSING_TYPE_Co-op apartment', 
    'NAME_HOUSING_TYPE_Office apartment',
    'SK_ID_CURR',
    'AMT_CREDIT_y'
])
X = rf_data.drop(columns=['TARGET'])
y = rf_data['TARGET']

# Apply SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3)

# Standardize the feature values 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# *************************************************************


# Initialize the RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Evaluate the model on the test set
rf_accuracy = rf.score(X_test, y_test)
print(f'Random Forest Test Accuracy: {rf_accuracy:.4f}')

# Get feature importances
importances = rf.feature_importances_

# Get the feature names
feature_names = X.columns  

# Sort the feature importances in descending order and get the indices
indices = np.argsort(importances)[::-1]

# Plot the feature importances in a horizontal bar chart
plt.figure(figsize=(10, 12))  # Adjust the figure size as needed
plt.title('Feature Importance')
plt.barh(range(X_train.shape[1]), importances[indices], align='center')
plt.yticks(range(X_train.shape[1]), [feature_names[i] for i in indices])
plt.gca().invert_yaxis()  # Invert y-axis to have the most important at the top
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, PReLU, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf


# Load and preprocess data
vs_data = pd.read_csv('Data/loan_application_encoded.csv').fillna(0)
vs_data = vs_data.drop(columns=[
    'CODE_GENDER_XNA', 
    'NAME_EDUCATION_TYPE_Academic degree', 
    'NAME_INCOME_TYPE_Other', 
    'NAME_INCOME_TYPE_Unemployed', 
    'NAME_HOUSING_TYPE_Co-op apartment', 
    'NAME_HOUSING_TYPE_Office apartment',
    'SK_ID_CURR',
    'AMT_CREDIT_y'
])

# Separate features and target
X = vs_data.drop(columns=['TARGET'])
y = vs_data['TARGET']

# Store the loan amounts for later use
loan_amounts = vs_data['AMT_CREDIT_x']

# Combine features and loan amounts before resampling
data_with_loan_amounts = X.copy()
data_with_loan_amounts['LOAN_AMOUNT'] = loan_amounts

# Apply SMOTE
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(data_with_loan_amounts, y)

# Now, X_resampled includes the resampled loan amounts
loan_amounts_resampled = X_resampled['LOAN_AMOUNT']
X_resampled = X_resampled.drop(columns=['LOAN_AMOUNT'])

# Split the resampled data
X_train, X_test, y_train, y_test, loan_amounts_train, loan_amounts_test = train_test_split(
X_resampled, y_resampled, loan_amounts_resampled, test_size=0.2)

# Standardize the feature values 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the binary classification neural network model
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64),
    PReLU(),
    Dropout(0.3),
    Dense(32),
    PReLU(),
    Dropout(0.3),
    Dense(16, activation='sigmoid'),
    Dense(1, activation='sigmoid')
])

# Compile the model
adam_optimizer = Adam(learning_rate=.01)
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)
model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

# Train the model
model.fit(X_train, y_train, epochs=1000, batch_size=1024, validation_split=0.5, callbacks=[early_stopping_callback], class_weight=class_weights_dict)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.9).astype(int) # Convert probabilities to binary predictions

# Calculate Precision, Recall, and F1-Score
precision = precision_score(y_test, y_pred_binary, zero_division=0)
recall = recall_score(y_test, y_pred_binary, zero_division=0)
f1 = f1_score(y_test, y_pred_binary)

# CONFUSION MATRIX
cm = confusion_matrix(y_test, y_pred_binary)
cm_df = pd.DataFrame(cm, index=['Actual Negative', 'Actual Positive'], columns=['Predicted Negative', 'Predicted Positive'])

# Financial Impact Calculation
true_negatives_loan_amount = loan_amounts_test[(y_test == 0) & (y_pred_binary.flatten() == 0)].sum()
false_negatives_loan_amount = loan_amounts_test[(y_test == 1) & (y_pred_binary.flatten() == 0)].sum()
false_positives_loan_amount = loan_amounts_test[(y_test == 0) & (y_pred_binary.flatten() == 1)].sum()



Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 14: early stopping


In [11]:
# Print the results
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')
print()
print(cm_df)

Test Loss: 0.1572
Test Accuracy: 0.9498
Precision: 0.9999
Recall: 0.8984
F1-Score: 0.9465

                 Predicted Negative  Predicted Positive
Actual Negative               56801                   3
Actual Positive                5716               50554


In [12]:
all_loans = vs_data['AMT_CREDIT_x'].sum()
test_pct_amt = all_loans * .2
granted_loans = true_negatives_loan_amount+false_negatives_loan_amount
missed_good_loans = false_positives_loan_amount

print("All Loans = ${:,.0f}".format(all_loans))
print("Tested Loans Total Amount= ${:,.0f}".format(test_pct_amt))
print("Granted Loans Amount = ${:,.0f}".format(granted_loans))
print("Good Loans = ${:,.0f}".format(true_negatives_loan_amount), "   ({:.1f}%)".format((true_negatives_loan_amount/granted_loans)*100))
print("Bad Loans = ${:,.0f}".format(false_negatives_loan_amount), "   ({:.1f}%)".format((false_negatives_loan_amount/granted_loans)*100))
print("Missed Good Loans = ${:,.0f}".format(missed_good_loans))
print()
print("Ratio of Bad Loans to Missed Good Loans = {:,.1f}".format((false_negatives_loan_amount/missed_good_loans)))

All Loans = $184,205,824,196
Tested Loans Total Amount= $36,841,164,839
Granted Loans Amount = $37,252,810,560
Good Loans = $34,053,508,310    (91.4%)
Bad Loans = $3,199,302,251    (8.6%)
Missed Good Loans = $1,273,941

Ratio of Bad Loans to Missed Good Loans = 2,511.3


THRESHOLD = .9
Good Loans = $34,126,695,189    (91.3%)
Bad Loans = $3,269,430,841    (8.7%)
Missed Good Loans = $0
Ratio of Bad Loans to Opportunity Cost = 0

THRESHOLD=.5
Good Loans = $34,066,727,604    (91.5%)
Bad Loans = $3,158,148,609    (8.5%)
Missed Good Loans = $8,310,645
Ratio of Bad Loans to Opportunity Cost = 380.0

THRESHOLD=.3
Good Loans = $33,960,204,603    (92.0%)
Bad Loans = $2,964,625,763    (8.0%)
Missed Good Loans = $273,406,572
Ratio of Bad Loans to Missed Good Loans = 10.8

THRESHOLD=.2
Good Loans = $32,413,849,016    (92.7%)
Bad Loans = $2,553,196,626    (7.3%)
Missed Good Loans = $1,729,664,811
Ratio of Bad Loans to Missed Good Loans = 1.5

THRESHOLD=.175
Good Loans = $31,749,593,998    (93.3%)
Bad Loans = $2,297,873,785    (6.7%)
Missed Good Loans = $2,373,905,250
Ratio of Bad Loans to Missed Good Loans = 1.0

THRESHOLD=.15
Good Loans = $30,236,415,988    (93.8%)
Bad Loans = $2,007,389,410    (6.2%)
Missed Good Loans = $3,737,143,102
Ratio of Bad Loans to Missed Good Loans = 0.5

THRESHOLD=.1
Good Loans = $25,457,557,820    (94.9%)
Bad Loans = $1,381,609,020    (5.1%)
Missed Good Loans = $8,680,486,666
Ratio of Bad Loans to Missed Good Loans = 0.2