In [1]:
import pandas as pd
from scipy.optimize import minimize
import numpy as np

# Load the sample data
data = '../data/Loan_Data.csv'

df = pd.read_csv(data)

In [4]:
# Sort the DataFrame by FICO score
df.sort_values('fico_score', inplace=True)

# Define the number of buckets
num_buckets = 5

# Function to calculate mean squared error
def mean_squared_error(boundaries, data):
    buckets = np.digitize(data, np.sort(boundaries))
    bucket_means = [np.mean(data[buckets == i]) for i in range(1, num_buckets + 1)]
    mse = np.mean((data - np.array(bucket_means)[buckets - 1])**2)
    return mse

# Function to calculate log-likelihood
def log_likelihood(boundaries, data, labels):
    buckets = np.digitize(data, np.sort(boundaries))
    ni = np.bincount(buckets)[1:]
    ki = np.bincount(labels, weights=np.ones(len(labels)))[1:]
    pi = ki / ni
    log_likelihood = np.sum(ki * np.log(pi / (1 - pi)))
    return -log_likelihood

# Initial guess for boundaries
initial_boundaries = np.linspace(df['fico_score'].min(), df['fico_score'].max(), num_buckets + 1)[1:-1]

# Optimize for mean squared error
result_mse = minimize(mean_squared_error, initial_boundaries, args=(df['fico_score'],), method='Nelder-Mead')

# Optimize for log-likelihood
labels = df['default']
result_ll = minimize(log_likelihood, initial_boundaries, args=(df['fico_score'], labels), method='Nelder-Mead')

# Print the results
print("Mean Squared Error Optimal Boundaries:", result_mse.x)
print("Log-Likelihood Optimal Boundaries:", result_ll.x)

  log_likelihood = np.sum(ki * np.log(pi / (1 - pi)))


Mean Squared Error Optimal Boundaries: [846.74226842 864.16909573 877.76493106 875.95738459]
Log-Likelihood Optimal Boundaries: [496.4 584.8 673.2 761.6]
