# Libraries

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import  train_test_split
from sklearn.preprocessing import  StandardScaler
from sklearn.ensemble import  RandomForestClassifier
import plotly.express as px

import warnings
# Silences every warning category for the rest of the session, so
# deprecation/future warnings emitted by later cells will not be shown.
warnings.filterwarnings("ignore")

# Data

In [2]:
# Read the loan-prediction CSV into the working DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='loan_prediction.csv')
df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Remove the Loan_ID column (values such as 'LP001002' look like row identifiers).
# Rebinding df with errors='ignore' instead of using inplace=True keeps this
# cell re-runnable: executing it a second time no longer raises KeyError.
df = df.drop(columns='Loan_ID', errors='ignore')

In [6]:
# Number of missing entries in each column.
df.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
# Missing entries per column as a percentage of all rows.
# Every column is below 10% missing, so dropping the affected rows would be
# an option; the cells further down impute the gaps instead.
df.isna().sum().div(df.shape[0]).mul(100)

Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [12]:
# Fill missing values in the categorical columns with each column's mode.
# The result is assigned back to df[col] instead of calling
# df[col].fillna(..., inplace=True): that chained in-place form is deprecated
# by pandas and does not update df when Copy-on-Write is enabled.
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed']:
    df[col] = df[col].fillna(df[col].mode()[0])

df.isna().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [13]:
# Fill the remaining gaps: median for LoanAmount, mode for Loan_Amount_Term
# and for Credit_History (treated as a categorical column).
#
# mode() returns a Series, and fillna() given a Series aligns it on the index,
# so only rows whose index label appears in that Series would be filled.
# Passing the scalar mode()[0] fills every missing row.
#
# Results are assigned back rather than using the chained
# df[col].fillna(..., inplace=True) form, which pandas deprecates.
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])
df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# Exploratory Data Analysis

In [16]:
# Distribution of the Loan_Status labels as a pie chart.
loan_status_counts = df['Loan_Status'].value_counts()

# values= must be supplied explicitly: with only names=, each label appears
# once in the counts Series, so the slices would be equal-sized instead of
# reflecting the actual counts.
fig = px.pie(loan_status_counts,
             names=loan_status_counts.index,
             values=loan_status_counts.values,
             title='Loan Status Distribution')
fig.show()

In [18]:
# Bar chart of how many rows fall into each Gender category.
gender_counts = df['Gender'].value_counts()
fig = px.bar(
    gender_counts,
    title='Gender distribution',
    x=gender_counts.index,
    y=gender_counts.to_numpy(),
)
fig.show()

In [19]:
# Bar chart of how many rows fall into each Married category.
married_counts = df['Married'].value_counts()
fig = px.bar(
    married_counts,
    title='Married distribution',
    x=married_counts.index,
    y=married_counts.to_numpy(),
)
fig.show()

In [20]:
# Bar chart of how many rows fall into each Education category.
education_counts = df['Education'].value_counts()
fig = px.bar(
    education_counts,
    title='Education distribution',
    x=education_counts.index,
    y=education_counts.to_numpy(),
)
fig.show()

In [21]:
# Row counts per Education level, with bars colored by Loan_Status.
fig = px.histogram(
    df,
    title='Education and Loan Status',
    x='Education',
    color='Loan_Status',
)
fig.show()

In [24]:
# Sunburst chart nesting Education -> Gender -> Loan_Status, colored by Loan_Status.
hierarchy = ['Education', 'Gender', 'Loan_Status']
fig = px.sunburst(
    df,
    path=hierarchy,
    color='Loan_Status',
    title='Education and Loan Status',
)
fig.show()

In [25]:
# Histogram showing how ApplicantIncome is distributed across all rows.
fig = px.histogram(
    df,
    title='Applicant Income Distribution',
    x='ApplicantIncome',
)
fig.show()

In [None]:
# Box plot of ApplicantIncome, with one box per Loan_Status value (via color).
# NOTE(review): the title text looks truncated, and no fig_income.show() is
# visible in this cell — confirm against the rest of the notebook.
fig_income = px.box(df,
                    x='ApplicantIncome',
                    color='Loan_Status',
                    title='Loan status vs appli')