In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/content/Task 3 and 4_Loan_Data (1).csv')
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


### Preprocessing

In [3]:
data = df.sort_values(by='fico_score')
fico_scores = data['fico_score'].values
defaults = data['default'].values

In [4]:
N = len(fico_scores)
prefix_defaults = np.zeros(N+1)
prefix_counts = np.arange(N+1)

for i in range(1, N+1):
    prefix_defaults[i] = prefix_defaults[i-1] + defaults[i-1]

- The prefix_defaults list gives us the total no. of defaults in the range (0,i).
- If you want to know the no. of defaults between j and i you use this list : prefix_defaults[i] - prefix_defaults[j].

In [6]:
K = 5  # user input
dp = np.full((N+1, K+1), -np.inf)
backtrack = np.zeros((N+1, K+1), dtype=int)

dp[0][0] = 0  # to handle base case

- The dp[i][k] nested list stores the maximum log-loss for splitting upto i data points into k buckets.
- backtrack list stores the index of the best splitting point (fico score).

### Log-Loss Function

In [5]:
def bucket_stats(i, j):
    ni = j - i + 1
    ki = prefix_defaults[j+1] - prefix_defaults[i]
    pi = ki / ni if ni > 0 else 1e-10
    if pi == 0 or pi == 1:
        return -1e6
    return ki * np.log(pi) + (ni - ki) * np.log(1 - pi)

- We are usng the log loss because we want our bins to refect on default probabilities, not just seperate through fico scores.

### Optimization

In [7]:
for i in range(1, N+1):
    for k in range(1, K+1):
        for j in range(k-1, i):
            ll = bucket_stats(j, i-1)
            if dp[j][k-1] + ll > dp[i][k]:
                dp[i][k] = dp[j][k-1] + ll
                backtrack[i][k] = j

- we broke the problem into sub problems, storing each result of the sub-problem.

In [8]:
boundaries = []
i, k = N, K
while k > 0:
    j = backtrack[i][k]
    boundaries.append(fico_scores[j])
    i = j
    k -= 1

boundaries = sorted(boundaries)

In [14]:
print(boundaries)
boundaries = [int(b) for b in boundaries]
print(boundaries)

[np.int64(408), np.int64(521), np.int64(580), np.int64(640), np.int64(697)]
[408, 521, 580, 640, 697]


### Rating

In [11]:
def assign_rating(score, boundaries):
    all_bounds = [-float('inf')] + sorted(boundaries) + [float('inf')]
    for i in range(len(all_bounds)-1):
        if all_bounds[i] < score <= all_bounds[i+1]:
            return len(all_bounds) - 1 - i

In [12]:
df['fico_rating'] = df['fico_score'].apply(lambda x: assign_rating(x, boundaries))

In [13]:
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default,fico_rating
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0,3
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1,4
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0,3
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0,3
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0,3
