In [2]:
# import important libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

In [6]:
# Step 1: Load the dataset
data = pd.read_csv("Task 3 and 4_Loan_Data (1).csv")
data.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [7]:
# Step 2: View first few rows and check columns
print("Missing Value per column:\n", data.isnull().sum(),"\n")

Missing Value per column:
 customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64 



In [8]:
fico_col = None
default_col = None

for col in data.columns:
    if 'fico' in col.lower():
        fico_col = col
    if 'default' in col.lower() or 'status' in col.lower():
        default_col = col

print(f"Detected FICO column: {fico_col}")
print(f"Detected Default column: {default_col}\n")


Detected FICO column: fico_score
Detected Default column: default



In [9]:
# Drop missing FICO scores
data = data.dropna(subset=[fico_col])

In [10]:
bins = [300, 579, 669, 739, 799, 850]
labels = ['Very_High_Risk', 'High_Risk', 'Medium_Risk', 'Low_Risk', 'Very_Low_Risk']
data['FICO_Category'] = pd.cut(data[fico_col], bins=bins, labels=labels, include_lowest=True)


In [11]:
# Step 6: Encode categories to numeric form
le = LabelEncoder()
data['FICO_Code'] = le.fit_transform(data['FICO_Category'])


In [12]:
# Step 7: Prepare training and testing data
X = data[['FICO_Code']]
y = data[default_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Build logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Step 10: Evaluate performance
print("Model Evaluation Results:\n")
print("AUC Score:", roc_auc_score(y_test, y_prob))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model Evaluation Results:

AUC Score: 0.5739290236842838

Classification Report:
               precision    recall  f1-score   support

           0       0.83      1.00      0.90      1652
           1       0.00      0.00      0.00       348

    accuracy                           0.83      2000
   macro avg       0.41      0.50      0.45      2000
weighted avg       0.68      0.83      0.75      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:

# Step 11: Probability of default by FICO category
pd_default_rate = data.groupby('FICO_Category')[default_col].mean().sort_index()
print("\nAverage Probability of Default by FICO Category:\n")
print(pd_default_rate)



Average Probability of Default by FICO Category:

FICO_Category
Very_High_Risk    0.433634
High_Risk         0.174003
Medium_Risk       0.073802
Low_Risk          0.030227
Very_Low_Risk     0.029412
Name: default, dtype: float64


  pd_default_rate = data.groupby('FICO_Category')[default_col].mean().sort_index()


In [15]:
# Step 12: Optional - Save cleaned data with category column
data.to_csv("FICO_PD_Categorized_Output.csv", index=False)
print("\nProcessed data saved to: FICO_PD_Categorized_Output.csv")



Processed data saved to: FICO_PD_Categorized_Output.csv
