In [25]:
import numpy as np
import pandas as pd
import plotly.express as px

In [26]:
# Load the raw loan dataset from the CSV file in the working directory.
df=pd.read_csv("loan data.csv")

In [27]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587 entries, 0 to 586
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            587 non-null    object 
 1   Gender             575 non-null    object 
 2   Married            584 non-null    object 
 3   Dependents         572 non-null    object 
 4   Education          587 non-null    object 
 5   Self_Employed      558 non-null    object 
 6   ApplicantIncome    587 non-null    int64  
 7   CoapplicantIncome  587 non-null    float64
 8   LoanAmount         566 non-null    float64
 9   Loan_Amount_Term   574 non-null    float64
 10  Credit_History     539 non-null    float64
 11  Property_Area      587 non-null    object 
 12  Loan_Status        587 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 59.7+ KB


In [28]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001073,Male,Yes,2,Not Graduate,No,4226,1040.0,110.0,360.0,1.0,Urban,Y
1,LP001086,Male,No,0,Not Graduate,No,1442,0.0,35.0,360.0,1.0,Urban,N
2,LP001087,Female,No,2,Graduate,,3750,2083.0,120.0,360.0,1.0,Semiurban,Y
3,LP001091,Male,Yes,1,Graduate,,4166,3369.0,201.0,360.0,,Urban,N
4,LP001095,Male,No,0,Graduate,No,3167,0.0,74.0,360.0,1.0,Urban,N


In [29]:
# Count missing values per column.
df.isnull().sum()

Loan_ID               0
Gender               12
Married               3
Dependents           15
Education             0
Self_Employed        29
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term     13
Credit_History       48
Property_Area         0
Loan_Status           0
dtype: int64

In [30]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,587.0,587.0,566.0,574.0,539.0
mean,5453.069847,1598.238365,146.667845,341.811847,0.844156
std,6221.319869,2939.437737,86.128239,65.69106,0.363044
min,150.0,0.0,9.0,12.0,0.0
25%,2885.5,0.0,100.0,360.0,1.0
50%,3816.0,1126.0,128.0,360.0,1.0
75%,5807.5,2264.5,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [31]:
# Fill gaps in the categorical columns with each column's most frequent value.
categorical_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed']
for col in categorical_cols:
    # mode() can return several values on ties; take the first one.
    most_frequent = df[col].mode(dropna=True)[0]
    df[col] = df[col].fillna(most_frequent)


In [32]:

# Loan amount: fill gaps with the column median.
loan_amount_median = df['LoanAmount'].median()
df['LoanAmount'] = df['LoanAmount'].fillna(loan_amount_median)

# Loan term and credit history: fill gaps with the most frequent value
# of each column.
loan_term_mode = df['Loan_Amount_Term'].mode(dropna=True)[0]
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(loan_term_mode)

credit_history_mode = df['Credit_History'].mode(dropna=True)[0]
df['Credit_History'] = df['Credit_History'].fillna(credit_history_mode)

In [33]:
# Re-count missing values per column to confirm the imputation left none.
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

EDA

In [34]:
# Tally each loan outcome and show its share of all rows as a pie chart.
loan_status_count = df['Loan_Status'].value_counts()
fig_loan_status = px.pie(
    loan_status_count,
    names=loan_status_count.index,
    # Slice sizes must be passed explicitly: with only `names`, every unique
    # label is counted once, so all slices would be drawn the same size
    # regardless of the actual counts.
    values=loan_status_count.values,
    color_discrete_sequence=["#FF0303", "#163F61"],
    title='Loan Approval Status',
)
fig_loan_status.show()

In [35]:
# Count rows per gender and draw the totals as a bar chart.
gender_count = df['Gender'].value_counts()
fig_gender = px.bar(
    gender_count,
    x=gender_count.index,
    y=gender_count.values,
    color_discrete_sequence=["#008CFF"],
    title='Gender Distribution',
)
# The counts are passed as a bare array, so name both axes explicitly
# instead of relying on auto-generated labels.
fig_gender.update_layout(xaxis_title='Gender', yaxis_title='Count')
fig_gender.show()

In [36]:
# Count rows per marital status and draw the totals as a bar chart.
married_count = df['Married'].value_counts()
fig_married = px.bar(
    married_count,
    x=married_count.index,
    y=married_count.values,
    title='Marital Status Distribution',
)
# The counts are passed as a bare array, so name both axes explicitly
# instead of relying on auto-generated labels.
fig_married.update_layout(xaxis_title='Married', yaxis_title='Count')
fig_married.show()

In [37]:
# Count rows per education level and draw the totals as a bar chart.
education_count = df['Education'].value_counts()
fig_education = px.bar(
    education_count,
    x=education_count.index,
    y=education_count.values,
    color_discrete_sequence=["#04FF00"],
    title='Education Distribution',
)
# The counts are passed as a bare array, so name both axes explicitly
# instead of relying on auto-generated labels.
fig_education.update_layout(xaxis_title='Education', yaxis_title='Count')
fig_education.show()

In [38]:
# Histogram of the ApplicantIncome column.
fig_applicant_income = px.histogram(
    data_frame=df,
    x='ApplicantIncome',
    title='Applicant Income Distribution',
)
fig_applicant_income.show()

In [39]:
# Box plot of applicant income, one box per loan status.
fig_income = px.box(
    data_frame=df,
    x='Loan_Status',
    y='ApplicantIncome',
    color="Loan_Status",
    title='Loan_Status vs ApplicantIncome',
    color_discrete_sequence=["#FF0303", "#163F61"],
)
fig_income.show()

- The "ApplicantIncome" column contains outliers, which need to be handled first using the IQR method.

In [40]:
# Quartiles and interquartile range of ApplicantIncome.
Q1, Q3 = (df['ApplicantIncome'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Values further than 1.5 * IQR outside the quartiles count as outliers.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows inside the bounds (both ends inclusive).
income_in_range = df['ApplicantIncome'].between(lower_bound, upper_bound)
df = df[income_in_range]

In [41]:
# Box plot of co-applicant income, one box per loan status.
fig_coapplicant_income = px.box(
    data_frame=df,
    x='Loan_Status',
    y='CoapplicantIncome',
    title='Loan_Status vs CoapplicantIncome',
    color="Loan_Status",
)
fig_coapplicant_income.show()

- The "CoapplicantIncome" column also contains outliers.

In [42]:
# Quartiles and interquartile range of CoapplicantIncome.
Q1, Q3 = (df['CoapplicantIncome'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Values further than 1.5 * IQR outside the quartiles count as outliers.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows inside the bounds (both ends inclusive).
coapplicant_in_range = df['CoapplicantIncome'].between(lower_bound, upper_bound)
df = df[coapplicant_in_range]

In [43]:
# Box plot of the loan amount, one box per loan status.
fig_loan_amount = px.box(
    data_frame=df,
    x='Loan_Status',
    y='LoanAmount',
    title='Loan_Status vs LoanAmount',
    color="Loan_Status",
)
fig_loan_amount.show()

In [44]:
# Grouped histogram of credit history, with one bar per loan status.
fig_credit_history = px.histogram(
    df,
    x='Credit_History',
    color='Loan_Status',
    barmode='group',
    color_discrete_sequence=["#13BFBC", "#16C890"],
    # Title spells out the full column name (it was truncated to "Credit_His").
    title='Loan_Status vs Credit_History',
)
fig_credit_history.show()

In [45]:
# Grouped histogram of property area, with one bar per loan status.
fig_property_area = px.histogram(
    data_frame=df,
    x='Property_Area',
    color='Loan_Status',
    title='Loan_Status vs Property_Area',
    barmode='group',
)
fig_property_area.show()

In [46]:
# Re-inspect row count, non-null counts and dtypes after the outlier filtering.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 523 entries, 0 to 586
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            523 non-null    object 
 1   Gender             523 non-null    object 
 2   Married            523 non-null    object 
 3   Dependents         523 non-null    object 
 4   Education          523 non-null    object 
 5   Self_Employed      523 non-null    object 
 6   ApplicantIncome    523 non-null    int64  
 7   CoapplicantIncome  523 non-null    float64
 8   LoanAmount         523 non-null    float64
 9   Loan_Amount_Term   523 non-null    float64
 10  Credit_History     523 non-null    float64
 11  Property_Area      523 non-null    object 
 12  Loan_Status        523 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 57.2+ KB


In [47]:
# Write the prepared frame to CSV; index=False leaves the row index out of the file.
df.to_csv("data prep.csv",index=False)