In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the loan data CSV into a DataFrame
data = pd.read_csv('Loan_Data.csv')

# Predictor columns used by the models, and the column used as the label
features = [
    'credit_lines_outstanding',
    'loan_amt_outstanding',
    'total_debt_outstanding',
    'income',
    'years_employed',
]
target = 'default'

In [3]:
# Quantize a numeric series into equal-width buckets
def quantize_mse(series, num_buckets):
    """Assign each value of `series` to one of `num_buckets` equal-width buckets.

    The range [series.min(), series.max()] is cut into `num_buckets` intervals
    of equal width. Despite the function's name, no mean-squared-error
    optimisation is performed; the edges depend only on the minimum and maximum.

    Parameters
    ----------
    series : array-like with ``.min()`` / ``.max()`` (e.g. a pandas Series)
        Values to quantize.
    num_buckets : int
        Number of buckets to create.

    Returns
    -------
    numpy.ndarray
        Integer bucket labels in ``1..num_buckets``, one per input value.
    """
    boundaries = np.linspace(series.min(), series.max(), num=num_buckets + 1)
    # Digitize against the interior edges only. Including the outer edges would
    # push the maximum value into an extra bucket `num_buckets + 1`, because
    # np.digitize treats each interval as closed on the left and open on the right.
    return np.digitize(series, boundaries[1:-1]) + 1


In [4]:
# Bucket the FICO scores with quantize_mse and store the labels as a new column
num_buckets = 5
data['fico_buckets_mse'] = quantize_mse(series=data['fico_score'], num_buckets=num_buckets)

In [5]:
# Build the feature matrix (base features plus the FICO bucket column)
# and hold out 20% of the rows for testing
X_mse = data[[*features, 'fico_buckets_mse']]
y = data[target]
X_train_mse, X_test_mse, y_train, y_test = train_test_split(
    X_mse,
    y,
    test_size=0.2,
    random_state=2023,
)

In [6]:
# Fit a logistic regression on the training rows of the MSE-bucket feature set
model_mse = LogisticRegression(solver='liblinear')
model_mse.fit(X=X_train_mse, y=y_train)

LogisticRegression(solver='liblinear')

In [7]:
# Predict on the held-out rows
y_pred_mse = model_mse.predict(X_test_mse)

# Compute each metric, then print them under the MSE heading
accuracy_mse = accuracy_score(y_test, y_pred_mse)
confusion_mse = confusion_matrix(y_test, y_pred_mse)
report_mse = classification_report(y_test, y_pred_mse)

print("Model with MSE Quantization:")
print("Accuracy:", accuracy_mse)
print("Confusion Matrix:\n", confusion_mse)
print("Classification Report:\n", report_mse)

Model with MSE Quantization:
Accuracy: 0.982
Confusion Matrix:
 [[1634   21]
 [  15  330]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1655
           1       0.94      0.96      0.95       345

    accuracy                           0.98      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.98      0.98      0.98      2000



In [8]:
# Define the Log-Likelihood quantization function
def quantize_log_likelihood(series, num_buckets):
    min_score = series.min()
    max_score = series.max()
    
    # Use the dynamic programming approach to find the boundaries and probabilities
    # Initialize bucket boundaries and probabilities
    boundaries = [min_score]
    probabilities = []
    
    # Split the problem into subproblems
    subproblems = [(min_score, max_score, num_buckets)]
    
    while subproblems:
        # Pop the last subproblem
        lower_bound, upper_bound, num_subbuckets = subproblems.pop()
        
        # Split the subproblem into subbuckets
        subbucket_boundaries = np.linspace(lower_bound, upper_bound, num=num_subbuckets + 1)
        
        # Initialize subbucket statistics
        subbucket_counts = np.zeros(num_subbuckets, dtype=int)
        subbucket_defaults = np.zeros(num_subbuckets, dtype=int)
        
        for i in range(len(subbucket_boundaries) - 1):
            lower_subbound = subbucket_boundaries[i]
            upper_subbound = subbucket_boundaries[i + 1]
            
            # Calculate the number of records in the subbucket
            subbucket_counts[i] = ((series >= lower_subbound) & (series < upper_subbound)).sum()
            
            # Calculate the number of defaults in the subbucket
            subbucket_defaults[i] = ((series >= lower_subbound) & (series < upper_subbound)).sum()
        
        for i in range(num_subbuckets - 1):
            num_defaults = subbucket_defaults[i]
            num_records = subbucket_counts[i]
            
            # Avoid division by zero
            if num_records == 0:
                probability = 0.0
            else:
                probability = num_defaults / num_records
            EPSILON = 1e-10  # Small positive value to avoid division by zero or taking the logarithm of zero

            # Calculate the probability of default with smoothing
            probability = (num_defaults + EPSILON) / (num_records + 2 * EPSILON)
            # Calculate the log-likelihood for the subbucket
            log_likelihood = num_defaults * np.log(probability) + (num_records - num_defaults) * np.log(1 - probability)
            
            if len(boundaries) < num_buckets:
                boundaries.append(subbucket_boundaries[i + 1])
                probabilities.append(probability)
            else:
                # Replace the last boundary if it's better
                last_log_likelihood = probabilities[-1] * subbucket_counts[-1] * (1 - probabilities[-1])
                if log_likelihood > last_log_likelihood:
                    boundaries[-1] = subbucket_boundaries[i + 1]
                    probabilities[-1] = probability
        
        # Divide the subproblem into two subproblems
        for i in range(1, num_subbuckets):
            subproblems.append((subbucket_boundaries[i - 1], subbucket_boundaries[i], 2))
    
    
    # Map FICO scores to bucket indices
    bucket_indices = np.digitize(series, boundaries) - 1
    
    # Assign probabilities to FICO scores based on the bucket indices
    quantized_probabilities = [probabilities[i] for i in bucket_indices]
    
    return quantized_probabilities

In [None]:
# Split the data into training and testing sets
# Note running this section is slow. Parallel processing my speed up the process for future projects.
num_buckets = 5 
data['fico_buckets_ll'] = quantize_log_likelihood(data['fico_score'], num_buckets)

X_ll = data[features + ['fico_buckets_ll']]
y = data[target]
X_train_ll, X_test_ll, _, _ = train_test_split(X_ll, y, test_size=0.2, random_state=2023)

In [None]:
# Fit a logistic regression on the training rows of the log-likelihood feature set
model_ll = LogisticRegression(solver='liblinear')
model_ll.fit(X=X_train_ll, y=y_train)

In [None]:
# Evaluate the model on the held-out rows
y_pred_ll = model_ll.predict(X_test_ll)

# Print model performance metrics for Log-Likelihood method
print("\nModel with Log-Likelihood Quantization:")
print("Accuracy:", accuracy_score(y_test, y_pred_ll))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ll))
print("Classification Report:\n", classification_report(y_test, y_pred_ll))