In [1]:
#Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle

In [2]:
# Data Exploration

df = pd.read_csv('loan predictor.csv')
df.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y


In [3]:
df = df.dropna()

In [None]:
# Converting all the categorical data into numeric 

replace_dict = {
    'Married': {'No': 0, 'Yes': 1},
    'Gender': {'Male': 1, 'Female': 0},
    'Dependents': {'0': 0, '1': 1, '2': 2, '3+': 3},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Loan_Status': {'N': 0, 'Y': 1}
}

for column, replacements in replace_dict.items():   
    df[column] = df[column].replace(replacements)

  df[column] = df[column].replace(replacements)


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 480 entries, 1 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            480 non-null    object 
 1   Gender             480 non-null    int64  
 2   Married            480 non-null    int64  
 3   Dependents         480 non-null    int64  
 4   Education          480 non-null    int64  
 5   Self_Employed      480 non-null    int64  
 6   ApplicantIncome    480 non-null    int64  
 7   CoapplicantIncome  480 non-null    float64
 8   LoanAmount         480 non-null    float64
 9   Loan_Amount_Term   480 non-null    float64
 10  Credit_History     480 non-null    float64
 11  Property_Area      480 non-null    int64  
 12  Loan_Status        480 non-null    int64  
dtypes: float64(4), int64(8), object(1)
memory usage: 52.5+ KB


In [6]:
# add the applicant and coapplicant income together

df['ApplicantIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df = df.drop('CoapplicantIncome', axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 480 entries, 1 to 613
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Loan_ID           480 non-null    object 
 1   Gender            480 non-null    int64  
 2   Married           480 non-null    int64  
 3   Dependents        480 non-null    int64  
 4   Education         480 non-null    int64  
 5   Self_Employed     480 non-null    int64  
 6   ApplicantIncome   480 non-null    float64
 7   LoanAmount        480 non-null    float64
 8   Loan_Amount_Term  480 non-null    float64
 9   Credit_History    480 non-null    float64
 10  Property_Area     480 non-null    int64  
 11  Loan_Status       480 non-null    int64  
dtypes: float64(4), int64(7), object(1)
memory usage: 48.8+ KB


In [7]:
# split and train dataset

X = df.drop(columns=['Loan_ID','Loan_Status'],axis=1)
y = df['Loan_Status']

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

X_train, X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.3,random_state=42)

print(X.shape, X_train.shape, X_test.shape)


(480, 10) (336, 10) (144, 10)


In [8]:
X.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
       'Property_Area'],
      dtype='object')

In [10]:
# Model training and Evaluation using SVM

rfc = RandomForestClassifier()

rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [11]:
# Metrics Evaluation

def model_result(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Display evaluation metrics
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)


# Result
model_result(y_test, y_pred)

Accuracy: 0.78
Precision: 0.78
Recall: 0.94
F1 Score: 0.85
Confusion Matrix:
[[18 26]
 [ 6 94]]


In [12]:
# Serialising

with open('predictor.pkl', 'wb') as file:
    pickle.dump(rfc, file)