In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Load dataset (replace with your file path)
df = pd.read_csv('home_loan_default_risk.csv')

# Basic info
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

# Preprocessing: impute -> encode -> select features -> split -> scale.
#
# Names produced for later cells: imputer, le, features, X, y,
# X_train, X_test, y_train, y_test, scaler, X_train_scaled, X_test_scaled.

# Original code tried to impute 'AMT_INCOME_TOTAL' and 'AMT_CREDIT'.
# These columns are not present in the loaded DataFrame.
# Based on the df.head() and df.info(), we will use 'LOAN' as the credit amount.
# Other numerical columns that may have missing values are also imputed.
numeric_cols = ['LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC']
# Original code tried to encode 'NAME_CONTRACT_TYPE', which is not present.
# We will use 'REASON' and 'JOB' as categorical features.
categorical_cols = ['REASON', 'JOB']

# Decide the train/test row partition BEFORE computing any statistics, so the
# imputation medians are learned from training rows only (no test-set leakage).
# Splitting row positions with the same test_size/random_state is expected to
# select the same rows as splitting X and y directly, since the shuffle depends
# only on the number of samples and the seed.
row_positions = list(range(len(df)))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Handle missing values: fit medians on the training rows, then fill every row.
imputer = SimpleImputer(strategy='median')
imputer.fit(df[numeric_cols].iloc[train_pos])
df[numeric_cols] = imputer.transform(df[numeric_cols])

# Encode categorical variables, filling NaNs with 'Unknown' first.
# Each column gets its own encoder so that both category->code mappings stay
# available afterwards (a single shared encoder only remembers the last fit).
encoders = {}
for col in categorical_cols:
    df[col] = df[col].fillna('Unknown')
    encoders[col] = LabelEncoder()
    df[f'{col}_ENCODED'] = encoders[col].fit_transform(df[col])
# Kept for backward compatibility: `le` previously ended up fit on 'JOB'.
le = encoders['JOB']

# Feature engineering: Create debt-to-income ratio
# Original code tried to create 'DEBT_TO_INCOME' using 'AMT_CREDIT' and 'AMT_INCOME_TOTAL', which are missing.
# The 'DEBTINC' column already exists and serves this purpose, so we will use it directly.

# Select features and target
# Original code used 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'DEBT_TO_INCOME', 'NAME_CONTRACT_TYPE'.
# Based on the available data, we will use the numerical columns plus the two encoded categoricals.
features = numeric_cols + ['REASON_ENCODED', 'JOB_ENCODED']
X = df[features]
y = df['BAD'] # 'BAD' is likely the target variable indicating default

# Split data using the partition chosen above
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Scale features (statistics come from the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

   BAD  LOAN  MORTDUE     VALUE   REASON     JOB   YOJ  DEROG  DELINQ  \
0    1  1100  25860.0   39025.0  HomeImp   Other  10.5    0.0     0.0   
1    1  1300  70053.0   68400.0  HomeImp   Other   7.0    0.0     2.0   
2    1  1500  13500.0   16700.0  HomeImp   Other   4.0    0.0     0.0   
3    1  1500      NaN       NaN      NaN     NaN   NaN    NaN     NaN   
4    0  1700  97800.0  112000.0  HomeImp  Office   3.0    0.0     0.0   

        CLAGE  NINQ  CLNO  DEBTINC  
0   94.366667   1.0   9.0      NaN  
1  121.833333   0.0  14.0      NaN  
2  149.466667   1.0  10.0      NaN  
3         NaN   NaN   NaN      NaN  
4   93.333333   0.0  14.0      NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5960 entries, 0 to 5959
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   BAD      5960 non-null   int64  
 1   LOAN     5960 non-null   int64  
 2   MORTDUE  5442 non-null   float64
 3   VALUE    5848 non-null   float64
 4  

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Train model
# class_weight='balanced' reweights classes inversely to their frequency in y_train.
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train_scaled, y_train)

# Predictions
y_pred = model.predict(X_test_scaled)
# Column 1 of predict_proba is the probability of the second entry in
# model.classes_ (the positive class when the labels are 0/1).
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

# Evaluate on the held-out test set; without this the predictions above are
# computed but never inspected, and the metric imports go unused.
print(classification_report(y_test, y_pred))
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.3f}")

In [8]:
# Example new data (as a DataFrame)
# The features here must match the 'features' list used in the training step (cell pIxcqV4aALkO)


def _category_code(raw_column, encoded_column, category):
    """Return the integer code that `category` was given during training.

    The code is read from the (raw, encoded) column pair stored in `df`, so it
    always agrees with the encoding the model was trained on instead of relying
    on a hand-written guess.

    Raises:
        ValueError: if `category` does not occur in `df[raw_column]` (or maps
            to more than one code), listing the categories that do occur.
    """
    codes = df.loc[df[raw_column] == category, encoded_column].unique()
    if len(codes) != 1:
        known = sorted(map(str, df[raw_column].unique()))
        raise ValueError(
            f"Cannot encode {category!r} for {raw_column}; known categories: {known}"
        )
    return int(codes[0])


new_applicant_data = pd.DataFrame({
    'LOAN': [15000.0],
    'MORTDUE': [50000.0],
    'VALUE': [70000.0],
    'YOJ': [5.0],
    'DEROG': [0.0],
    'DELINQ': [0.0],
    'CLAGE': [100.0],
    'NINQ': [1.0],
    'CLNO': [10.0],
    'DEBTINC': [30.0],
    # Encoded values are looked up from the training data rather than hard-coded:
    # the integer assigned to a category depends on which categories were present.
    'REASON_ENCODED': [_category_code('REASON', 'REASON_ENCODED', 'DebtCon')],
    'JOB_ENCODED': [_category_code('JOB', 'JOB_ENCODED', 'Office')]
})

# Ensure the order of columns matches the 'features' list used during training
new_applicant_data = new_applicant_data[features]

# Preprocess (only transform, as scaler was fit on training data)
# If new_applicant_data had missing values in numerical columns, you would also apply 'imputer.transform()'
new_data_scaled = scaler.transform(new_applicant_data)

# Predict
prediction = model.predict(new_data_scaled)
probability = model.predict_proba(new_data_scaled)[:, 1]
print(f"Default Prediction: {prediction[0]} (1=Default), Probability: {probability[0]:.2f}")

Default Prediction: 0 (1=Default), Probability: 0.02
