In [3]:
import pandas as pd
import numpy as np

# Load the data into a pandas DataFrame
data = pd.read_csv("data/Task_3_and_4_Loan_Data.csv")

# Define the number of buckets
num_buckets = 10

# Compute the boundaries of the buckets using equal-width bins
min_score = data['fico_score'].min()
max_score = data['fico_score'].max()
width = (max_score - min_score) / num_buckets
boundaries = np.arange(min_score, max_score + width, width)

# Map each FICO score to its corresponding bucket
data['Bucket'] = pd.cut(data['fico_score'], bins=boundaries, labels=False)

# Compute the mean default rate for each bucket
default_rates = data.groupby('Bucket')['default'].mean()

# Compute the squared error for each bucket
squared_errors = (default_rates - default_rates.mean()) ** 2

# Compute the total squared error
total_squared_error = squared_errors.sum()

print('Total squared error:', total_squared_error)


Total squared error: 0.6442145853209599


In [4]:
# using the method of maximizing log-likelihood function

from scipy.optimize import minimize

# Load the data into a pandas DataFrame
data = pd.read_csv("data/Task_3_and_4_Loan_Data.csv")

# Define the number of buckets
num_buckets = 10

# Compute the boundaries of the buckets using equal-width bins
min_score = data['fico_score'].min()
max_score = data['fico_score'].max()
width = (max_score - min_score) / num_buckets
boundaries = np.arange(min_score, max_score + width, width)

# Map each FICO score to its corresponding bucket
data['Bucket'] = pd.cut(data['fico_score'], bins=boundaries, labels=False)

# Define the log-likelihood function for a binomial distribution
def log_likelihood(params):
    p = params[0]
    n = params[1]
    y = data['default']
    x = data['Bucket']
    ll = np.sum(y * np.log(p) + (n - y) * np.log(1 - p))
    return -ll

# Find the maximum likelihood estimates for p and n
result = minimize(log_likelihood, [0.5, 1000], method='Nelder-Mead')
p_mle = result.x[0]
n_mle = result.x[1]

print('Maximum likelihood estimates:')
print('p:', p_mle)
print('n:', n_mle)


Maximum likelihood estimates:
p: -0.07187500000000158
n: 1270.3125


  ll = np.sum(y * np.log(p) + (n - y) * np.log(1 - p))


In [11]:
# this is another way. Compute the probability of default in each bucket
num_defaults_in_bucket = np.array([10, 20, 30, 40, 50])
prob_defaults_in_bucket = np.zeros_like(num_defaults_in_bucket)
num_records_in_bucket = np.histogram(data['fico_score'], bins=buckets)[0]
nonzero_mask = num_records_in_bucket != 0
prob_defaults_in_bucket[nonzero_mask] = num_defaults_in_bucket[nonzero_mask] / num_records_in_bucket[nonzero_mask]


In [None]:
import numpy as np

# Define the bucket boundaries and number of records in each bucket
buckets = np.array([0, 200, 400, 600, 800, 1000])
num_records = np.array([1000, 2000, 3000, 4000, 5000])

# Define the number of defaults in each bucket
num_defaults = np.array([10, 20, 30, 40, 50])

# Compute the probability of default in each bucket
prob_defaults = num_defaults / num_records

# Define the function to compute the penalty term
def penalty_term(buckets):
    diffs = np.diff(buckets)
    return np.sum(diffs ** 2)

# Define the function to compute the log-likelihood
def log_likelihood(buckets):
    # Compute the number of records and defaults in each bucket
    num_records_in_bucket = np.histogram(data['fico_score'], bins=buckets)[0]
    num_defaults_in_bucket = np.histogram(data[data['default'] == 1]['fico_score'], bins=buckets)[0]

    # Compute the probability of default in each bucket
    prob_defaults_in_bucket = num_defaults_in_bucket / num_records_in_bucket

    # Compute the penalty term
    penalty = penalty_term(buckets)

    # Compute the log-likelihood
    ll = np.sum(num_defaults_in_bucket * np.log(prob_defaults_in_bucket) + (num_records_in_bucket - num_defaults_in_bucket) * np.log(1 - prob_defaults_in_bucket)) - penalty

    return -ll

# Find the optimal bucket boundaries using dynamic programming
def find_optimal_buckets(num_buckets):
    # Initialize the dynamic programming table
    dp_table = np.zeros((len(num_records) + 1, num_buckets + 1))

    # Fill in the table using dynamic programming
    for i in range(1, len(num_records) + 1):
        for j in range(1, num_buckets + 1):
            if j == 1:
                dp_table[i][j] = log_likelihood([buckets[0], buckets[i]])
            elif i <= j:
                dp_table[i][j] = float('inf')
            else:
                min_val = float('inf')
                for k in range(j - 1, i):
                    val = dp_table[k][j - 1] + log_likelihood([buckets[0], *buckets[k:i], buckets[-1]])
                    if val < min_val:
                        min_val = val
                dp_table[i][j] = min_val

    # Extract the optimal bucket boundaries from the table
    optimal_buckets = [buckets[0]]
    i = len(num_records)
    j = num_buckets
    while j > 1:
        for k in range(j - 1, i):
            if dp_table[k][j - 1] + log_likelihood([buckets[0], *buckets[k:i], buckets[-1]]) == dp_table[i][j]:
                optimal_buckets.append(buckets[k])
                i = k
                j -= 1
                break

    optimal_buckets.append(buckets[-1])
    optimal_buckets.reverse()

    return optimal_buckets

# Find the optimal bucket boundaries for a given number of buckets
optimal_buckets = find_optimal_buckets(5)

print(optimal_buckets)

In [23]:
import pandas as pd
from math import log
import os

cwd = os.getcwd()

print("Current working directory: {0}".format(cwd))

print ("os.getcwd() returns an object of type {0}".format(type(cwd)))

# copy the filepath 
os.chdir ("C:\Users\kmudhanyana\Downloads\JP Morgan & Co virtual internship\data")

x = data['default'].to_list()
y = data['fico_score'].to_list()
n = len(x)
print (len(x), len(y))

default = [0 for i in range(851)]
total = [0 for i in range(851)]

for i in range(n):
    y[i] = int(y[i])
    default[y[i]-300] += x[i]
    total[y[i]-300] += 1
    
for i in range(0, 551):
    default[i] += default[i-1]
    total[i] += total[i-1]
    
import numpy as np
    
def log_likelihood(n, k):
    p = k/n
    if (p==0 or p==1):
        return 0
    return k*np.log(p)+ (n-k)*np.log(1-p)

r = 10
dp = [[[-10**18, 0] for i in range(551)] for j in range(r+1)]

for i in range(r+1):
    for j in range(551):
        if (i==0):
            dp[i][j][0] = 0
        else:
            for k in range(j):
                if (total[j]==total[k]):
                    continue
                if (i==1):
                    dp[i][j][0] = log_likelihood(total[j], default[j])
                else:
                    if (dp[i][j][0] < (dp[i-1][k][0] + log_likelihood(total[j]-total[k], default[j] - default[k]))):
                        dp[i][j][0] = log_likelihood(total[j]-total[k], default[j]-default[k]) + dp[i-1][k][0]
                        dp[i][j][1] = k
                                                     
print (round(dp[r][550][0], 4))
                                                     
k = 550
l = []
while r >= 0:
    l.append(k+300)
    k = dp[r][k][1]
    r -= 1

print(l)


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (3711519784.py, line 12)