# Bucket Optimization
In this task, we will provide a general approach to generating k number of bins or buckets for FICO scores with data-driven ranges that yields the most reliable probability of default for each bin. That is, we want to be more confident in the threshold for default using data and statistics. This minimizes our reliance on domain knowledge, but at the same time increases our vulnerability to model inaccuracy caused by dirty or ineffective datasets.

We will only concern ourselves with generating bin ranges through statistical means. We will use a simpler unsupervised learning approach: 1-dimensional K-means clustering -- to create <i>k</i> number of bins that are defined by proximity with respect to one variable (i.e., FICO score). Each ith bin is assigned a label <i>i</i> (i.e., 1, 2, ... k) such that: $1 \leq k < n $ and is treated an ordinal variable. Lower values represent higher creditworthiness.

### Implementation Plan
1. Sort dataframe by FICO scores
2. Initialize k bin ranges by percentile and set a centroid (mean of each bin range )
3. For each sample, compute distance to all 

In [2]:
import pandas as pd
import numpy as np

# load data
file_path = "data/Loan_Data.csv"
df = pd.read_csv(file_path)

# sort data by FICO score
sorted_df = df.sort_values(by = "fico_score", inplace = False)
sorted_df

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.752520,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.830850,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0
...,...,...,...,...,...,...,...,...
9995,3972488,0,3033.647103,2553.733144,42691.62787,5,697,0
9996,6184073,1,4146.239304,5458.163525,79969.50521,8,615,0
9997,6694516,2,3088.223727,4813.090925,38192.67591,5,596,0
9998,3942961,0,3288.901666,1043.099660,50929.37206,2,647,0


In [4]:
def knn(sorted_df, num_bins: int):
    # 1: initialize bins and compute centroids (mean) for each bin
    # divide (min=300, max=850) into k percentiles
    centroids = [i for i in range(min, max, step= (min - max) / num_bins]
    
    # 2: define bin ranges as midpoints of each centroid
    # for i, i+1 in range(i - 1):
    # 

    # 3: for each sample, compute distance to all bin centroids

    # 4: update computed mean of all samples in each bin range as new centroid

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
2092,7264776,1,4457.914800,12233.495010,98913.32028,3,408,0
6556,6901345,3,5281.352243,16411.518010,79905.09892,1,409,1
7001,2585781,4,6734.984475,26384.584390,97668.03091,2,418,1
5521,1252008,5,5176.915602,22990.265430,82417.59227,2,425,1
2629,1337395,5,4271.314690,22756.281030,83475.30929,4,438,1
...,...,...,...,...,...,...,...,...
703,2967189,2,4803.995367,12920.809080,82500.95110,8,828,0
7575,2214311,1,3590.741275,8071.647089,71796.25458,8,831,0
4768,2713140,0,3566.709054,4400.583413,60204.65020,4,831,0
9660,5062008,0,780.213472,273.300211,10258.24788,3,835,0


In [84]:
import numpy as np
import pandas as pd

def compute_sse(prefix_sum, prefix_sq, a, b):
    """
    Compute SSE (sum of squared errors) for scores[a:b].
    prefix_sum, prefix_sq are cumulative sums over sorted scores.
    """
    n = b - a
    total = prefix_sum[b] - prefix_sum[a]
    total_sq = prefix_sq[b] - prefix_sq[a]
    mean = total / n
    return total_sq - 2 * mean * total + n * mean**2


def optimal_bucketing(scores, k):
    """
    Dynamic programming solution for 1D k-means (MSE minimization).
    Input: sorted numpy array 'scores', number of buckets k
    Output: bucket boundaries and assignments
    """
    n = len(scores)

    # Prefix sums for fast SSE computation
    prefix_sum = np.zeros(n + 1)
    prefix_sq = np.zeros(n + 1)
    for i in range(n):
        prefix_sum[i+1] = prefix_sum[i] + scores[i]
        prefix_sq[i+1] = prefix_sq[i] + scores[i]**2

    # DP table and backtracking
    dp = np.full((n+1, k+1), np.inf)
    backtrack = np.zeros((n+1, k+1), dtype=int)
    dp[0,0] = 0

    for j in range(1, k+1):
        for i in range(j, n+1):
            for t in range(j-1, i):
                cost = dp[t,j-1] + compute_sse(prefix_sum, prefix_sq, t, i)
                if cost < dp[i,j]:
                    dp[i,j] = cost
                    backtrack[i,j] = t

    # Recover boundaries
    boundaries = []
    i, j = n, k
    while j > 0:
        t = backtrack[i,j]
        boundaries.append(scores[t])
        i, j = t, j-1
    boundaries = sorted(boundaries[1:] + [scores[-1]])  # drop first, keep max

    return boundaries


def assign_buckets(df, col, boundaries):
    """
    Assign each row in df to a bucket based on boundaries.
    Lower buckets = lower FICO = worse rating.
    """
    df = df.copy()
    df['rating'] = np.digitize(df[col], boundaries, right=True) + 1
    return df


# === Example Usage ===
if __name__ == "__main__":
    # Fake borrower dataset
    # data = {
    #     "fico_score": [500, 510, 520, 600, 610, 700, 710, 720, 800, 810],
    #     "income": [30, 35, 32, 60, 65, 80, 85, 82, 100, 105],
    #     "years_employed": [1,2,2,5,6,10,9,12,20,22],
    # }
    # df = pd.DataFrame(data)

    k = 3
    sorted_scores = np.sort(df['fico_score'].values)
    boundaries = optimal_bucketing(sorted_scores, k)

    df_binned = assign_buckets(df, "fico_score", boundaries)

    print("Bucket boundaries:", boundaries)
    print(df_binned.sort_values("fico_score"))


Bucket boundaries: [408, 600, 850]
      customer_id  credit_lines_outstanding  loan_amt_outstanding  \
2092      7264776                         1           4457.914800   
6556      6901345                         3           5281.352243   
7001      2585781                         4           6734.984475   
5521      1252008                         5           5176.915602   
2629      1337395                         5           4271.314690   
...           ...                       ...                   ...   
703       2967189                         2           4803.995367   
7575      2214311                         1           3590.741275   
4768      2713140                         0           3566.709054   
9660      5062008                         0            780.213472   
2659      1096584                         0           4107.822428   

      total_debt_outstanding       income  years_employed  fico_score  \
2092            12233.495010  98913.32028               3      

In [85]:
# approach 2

def initialize_centroids(scores, k):
    """
    Initialize centroids using evenly spaced quantiles of the data.
    """
    percentiles = np.linspace(0, 100, k + 2)[1:-1]
    return np.percentile(scores, percentiles)


def assign_buckets(scores, centroids):
    """
    Assign each score to nearest centroid.
    """
    boundaries = [(centroids[i] + centroids[i+1]) / 2 for i in range(len(centroids)-1)]
    return np.digitize(scores, boundaries)


def update_centroids(scores, labels, k):
    """
    Recompute centroids as the mean of assigned points.
    """
    new_centroids = []
    for j in range(k):
        cluster_points = scores[labels == j]
        if len(cluster_points) > 0:
            new_centroids.append(cluster_points.mean())
        else:
            # fallback if bucket is empty
            new_centroids.append(np.random.choice(scores))
    return np.array(new_centroids)


def kmeans_1d(df, col, k, max_iter=100, tol=1e-3):
    """
    Perform 1D k-means clustering on a DataFrame column.
    Returns the DataFrame with a new 'rating' column.
    """
    scores = df[col].values
    centroids = initialize_centroids(scores, k)

    for _ in range(max_iter):
        labels = assign_buckets(scores, centroids)
        new_centroids = update_centroids(scores, labels, k)
        shift = np.abs(new_centroids - centroids).max()
        if shift < tol:
            break
        centroids = new_centroids

    # Assign final buckets as ratings (1 = worst, k = best)
    labels = assign_buckets(scores, centroids)
    df = df.copy()
    df['rating'] = labels + 1
    return df, centroids


In [86]:
df_buckets, centroids = kmeans_1d(df, col="fico_score", k=3)
print(df_buckets.sort_values("fico_score"))
print("Centroids:", centroids)

      customer_id  credit_lines_outstanding  loan_amt_outstanding  \
2092      7264776                         1           4457.914800   
6556      6901345                         3           5281.352243   
7001      2585781                         4           6734.984475   
5521      1252008                         5           5176.915602   
2629      1337395                         5           4271.314690   
...           ...                       ...                   ...   
703       2967189                         2           4803.995367   
7575      2214311                         1           3590.741275   
4768      2713140                         0           3566.709054   
9660      5062008                         0            780.213472   
2659      1096584                         0           4107.822428   

      total_debt_outstanding       income  years_employed  fico_score  \
2092            12233.495010  98913.32028               3         408   
6556            16411.518