# Loan Approval Prediction

## Exploratory Data Analysis

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
df = pd.read_csv("data/loan_prediction.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Handling null values

In [28]:
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [29]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [30]:
limit = len(df)*0.05
cols_to_drop = df.columns[df.isna().sum() <= limit]
df.dropna(subset=cols_to_drop, inplace=True)
df.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        30
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       48
Property_Area         0
Loan_Status           0
dtype: int64

In [31]:
df['Self_Employed'].value_counts()

Self_Employed
No     451
Yes     72
Name: count, dtype: int64

In [32]:
df['Self_Employed'].fillna("No", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Self_Employed'].fillna("No", inplace=True)


In [33]:
df.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       48
Property_Area         0
Loan_Status           0
dtype: int64

In [34]:
df["Credit_History"].fillna(1.0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Credit_History"].fillna(1.0, inplace=True)


In [35]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [36]:
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [37]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


#### Drop some unnecessary columns

In [38]:
df.drop(["Loan_ID", "Dependents", "Self_Employed", "CoapplicantIncome", "Loan_Amount_Term"], axis=1, inplace=True)

In [39]:
for col in df:
    print(f"{col}: {df[col].unique()}")

Gender: ['Male' 'Female']
Married: ['Yes' 'No']
Education: ['Graduate' 'Not Graduate']
ApplicantIncome: [ 4583  3000  2583  6000  5417  2333  3036  4006 12841  3200  2500  3073
  1853  1299  4950  3596  3510  4887  7660  5955  2600  3717  9560  2799
  4226  1442  3750  4166  3167  4692  3500 12500  3667  3748  3600  1800
  2400  3941  5649  5821  2645  4000  1928  3086  4230  4616 11500  2708
  2132  3366  8080  3357  3029  2609  5726 10750  7100  4300  3208  1875
  5266  1000  3333  3846  1378  3988  2366  8566  5695  2958  6250  3273
  4133  3620  2484  1977  4188  1759  4288  4843  3052 11417  7333  3800
  2071  5316  2929  5050 14583  2214  5568 10408  2137  2957  3692 10513
  6080 20166  2014  2718  3459  4895  3316 14999  4200  5042  6950  2698
 11757  2330 14866  1538 10000  4860  6277  2577  9166  2281  3254 39999
  9538  2980  1863  7933  3089  4167  9323  2439  2237  8000  1820  3522
  5708  4344  3497  2045  5516  6400  1916  4600 33846  3625 39147  2178
  2383  9328  4885 1

### Applying label encoder for categorical data

In [40]:
from sklearn.preprocessing import LabelEncoder
# Encode categorical features
encoder = LabelEncoder()
for col in df.select_dtypes(include="object"):
    df[col] = encoder.fit_transform(df[col])

In [41]:
df.head()

Unnamed: 0,Gender,Married,Education,ApplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
1,1,1,0,4583,128.0,1.0,0,0
2,1,1,0,3000,66.0,1.0,2,1
3,1,1,1,2583,120.0,1.0,2,1
4,1,0,0,6000,141.0,1.0,2,1
5,1,1,0,5417,267.0,1.0,2,1


In [42]:
df['Education'] = df['Education'].replace({0:1, 1:0})
df.head()

Unnamed: 0,Gender,Married,Education,ApplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
1,1,1,1,4583,128.0,1.0,0,0
2,1,1,1,3000,66.0,1.0,2,1
3,1,1,0,2583,120.0,1.0,2,1
4,1,0,1,6000,141.0,1.0,2,1
5,1,1,1,5417,267.0,1.0,2,1


### Splitting Train test data

In [43]:
from sklearn.model_selection import train_test_split
y = df["Loan_Status"]
x = df.drop("Loan_Status", axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [44]:
x_train.shape

(442, 7)

### Scaling data

In [45]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

## Machine Learning

In [46]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train_scaled, y_train)

In [47]:
model.score(x_test_scaled, y_test)

0.7927927927927928

### Save the model

In [48]:
import joblib
joblib.dump(model, 'model.pkl')
print("Model saved as model.pkl")

Model saved as model.pkl
