# In [None]:
# TASK 4 – LOAN DEFAULT RISK MODEL
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

# Load the loan-application training data; the CSV is expected in the current
# working directory.
df = pd.read_csv("application_train.csv")

# Keep numeric (int64/float64) columns only, so no categorical encoding is
# needed for this simplified project.
df = df.select_dtypes(include=['int64', 'float64'])

# Rows without a label cannot be used for training or evaluation; imputing the
# target would fabricate outcomes. This is a no-op when TARGET has no gaps.
df = df.dropna(subset=['TARGET'])

X = df.drop('TARGET', axis=1)
y = df['TARGET']

# Stratify so both splits keep the same class balance as the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values with medians learned from the training split only;
# medians of the full data set would leak test-set information into training.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Standardise the features, again fitting on the training split only. The
# regularised logistic regression is sensitive to feature scale and its solver
# converges poorly on raw, differently-scaled columns.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Second column of predict_proba corresponds to model.classes_[1], i.e. the
# probability of TARGET == 1 when the label is coded 0/1.
y_prob = model.predict_proba(X_test_scaled)[:, 1]

def business_cost(y_true, y_pred, fp_cost=1000, fn_cost=10000):
    """Return the total cost of the misclassifications in *y_pred*.

    Labels are binary: 0 is the negative class and 1 the positive class.
    Values other than 0/1 are counted as neither a false positive nor a
    false negative.

    Args:
        y_true: Array-like of actual labels.
        y_pred: Array-like of predicted labels, same shape as ``y_true``.
        fp_cost: Cost charged per false positive (actual 0, predicted 1).
        fn_cost: Cost charged per false negative (actual 1, predicted 0).

    Returns:
        ``false_positives * fp_cost + false_negatives * fn_cost``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different shapes.
    """
    # np.asarray drops any pandas index so the comparison is positional.
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{actual.shape} and {predicted.shape}"
        )

    # Count the two error types directly; this stays well defined when only
    # one class is present, where unpacking a confusion matrix would fail.
    false_positives = int(np.count_nonzero((actual == 0) & (predicted == 1)))
    false_negatives = int(np.count_nonzero((actual == 1) & (predicted == 0)))
    return false_positives * fp_cost + false_negatives * fn_cost

# Candidate decision thresholds 0.10, 0.11, ..., 0.89. Built from an integer
# range because np.arange with a fractional step accumulates floating-point
# rounding error in the generated values.
thresholds = np.arange(10, 90) / 100

# Total business cost on the held-out split for each candidate threshold; an
# application is flagged positive when its predicted probability exceeds it.
costs = [
    business_cost(y_test, (y_prob > threshold).astype(int))
    for threshold in thresholds
]

# np.argmin returns the first minimum, so ties resolve to the lowest threshold.
best_index = int(np.argmin(costs))
optimal_threshold = thresholds[best_index]

# NOTE(review): the threshold is chosen and its cost reported on the same
# split, so the minimum cost is an optimistic estimate; consider tuning on a
# separate validation split.
print("Optimal Threshold:", optimal_threshold)
print("Minimum Cost:", costs[best_index])

import matplotlib.pyplot as plt

# Draw total business cost as a function of the decision threshold.
fig, ax = plt.subplots()
ax.plot(thresholds, costs)
ax.set(
    title="Business Cost vs Threshold",
    xlabel="Threshold",
    ylabel="Total Business Cost",
)
plt.show()
