In [1]:
import statistics as st

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the loan training data into a DataFrame.
# NOTE(review): absolute path, presumably a Colab upload location; adjust when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/train_loan.csv')

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [5]:
# Print each column's non-null count and dtype to spot missing values and mistyped columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [6]:
# List the distinct values stored in the Dependents column.
pd.unique(df['Dependents'])

array(['0', '1', '2', '3+', nan], dtype=object)

In [7]:
# Dependents is read as an object column because of the '3+' category;
# map that category to the number 3 so the column can be cast to a numeric dtype.
df['Dependents'] = df['Dependents'].replace({'3+': 3})

In [8]:
# Cast Dependents to a floating-point dtype; missing entries remain NaN.
df['Dependents'] = df['Dependents'].astype(np.float64)


In [9]:
# Count the missing values in every column.
df.isna().sum()

Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


In [10]:
# Fill the gaps in these columns with each column's most frequent value.
# NOTE(review): the modes are computed on the full frame, before the train/test
# split made later in the notebook.
columns = ['Gender', 'Married', 'Self_Employed', 'Dependents', 'Loan_Amount_Term']

most_frequent = {name: df[name].mode().iloc[0] for name in columns}
df = df.fillna(value=most_frequent)

In [11]:
# Credit_History is a 0/1 flag (describe() shows min 0 and max 1), so impute it
# with its most frequent value. A median would land on 0.5, which is not a valid
# flag value, whenever the two classes happen to be evenly split.
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])
# LoanAmount is continuous with a long right tail (max 700 vs. 75th percentile 168),
# so the median is used instead of the mean.
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

In [12]:
# Re-count the missing values per column after the imputation steps.
df.isna().sum()

Unnamed: 0,0
Loan_ID,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0


In [13]:
# One-hot encode the nominal features. drop_first=True removes the first level
# of each feature, leaving k-1 indicator columns for a feature with k levels.
categorical_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

In [14]:
# Preview the frame after one-hot encoding.
df.head(n=5)

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,LP001002,0.0,5849,0.0,128.0,360.0,1.0,Y,True,False,False,False,False,True
1,LP001003,1.0,4583,1508.0,128.0,360.0,1.0,N,True,True,False,False,False,False
2,LP001005,0.0,3000,0.0,66.0,360.0,1.0,Y,True,True,False,True,False,True
3,LP001006,0.0,2583,2358.0,120.0,360.0,1.0,Y,True,True,True,False,False,True
4,LP001008,0.0,6000,0.0,141.0,360.0,1.0,Y,True,False,False,False,False,True


In [15]:
# Encode the target as integers. LabelEncoder numbers the classes in sorted
# order, so 'N' becomes 0 and 'Y' becomes 1.
# LabelEncoder is imported in the notebook's import cell at the top.
le = LabelEncoder()
df['Loan_Status'] = le.fit_transform(df['Loan_Status'])

In [16]:
# Preview the frame with the integer-encoded Loan_Status column.
df.head(n=5)

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,LP001002,0.0,5849,0.0,128.0,360.0,1.0,1,True,False,False,False,False,True
1,LP001003,1.0,4583,1508.0,128.0,360.0,1.0,0,True,True,False,False,False,False
2,LP001005,0.0,3000,0.0,66.0,360.0,1.0,1,True,True,False,True,False,True
3,LP001006,0.0,2583,2358.0,120.0,360.0,1.0,1,True,True,True,False,False,True
4,LP001008,0.0,6000,0.0,141.0,360.0,1.0,1,True,False,False,False,False,True


In [17]:
# Separate the target from the features (Loan_ID is an identifier, not a feature)
# and hold out 20% of the rows as a test set with a fixed seed.
y = df['Loan_Status']
x = df.drop(columns=['Loan_ID', 'Loan_Status'])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

#Model Creation

In [18]:
# Both estimators are stochastic (random feature permutation and tie-breaking in
# the tree, bootstrap sampling in the forest). Fix random_state so a fresh
# "Restart & Run All" reproduces the same fitted models and accuracy figures.
model1 = DecisionTreeClassifier(random_state=42)
model2 = RandomForestClassifier(random_state=42)

In [19]:
# Train both classifiers on the training split.
model1.fit(X=x_train, y=y_train)
model2.fit(X=x_train, y=y_train)

In [21]:
# Model performance: score each fitted classifier on the held-out test split.
# accuracy_score is already imported in the notebook's first cell, so it is not
# re-imported here.

y_pred1 = model1.predict(x_test)
y_pred2 = model2.predict(x_test)

accuracy1 = accuracy_score(y_test, y_pred1)
accuracy2 = accuracy_score(y_test, y_pred2)

print(f"Decision Tree Accuracy: {accuracy1:.4f}")
print(f"Random Forest Accuracy: {accuracy2:.4f}")

Decision Tree Accuracy: 0.7073
Random Forest Accuracy: 0.7724
