# Loan Prediction


In [246]:
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

In [247]:
# Make "plotly_dark" the default template for Plotly figures created from here on.
pio.templates.default = "plotly_dark"

## Load the data


In [248]:
# Read the CSV file into a DataFrame, then show its first rows as the cell output.
data = pd.read_csv("train_ctrUa4K.csv")
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Inspecting


In [249]:
# Print each column's dtype and non-null count to spot columns with missing data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [250]:
# Number of missing (NaN) entries in each column.
data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [251]:
# Missing entries per column as a percentage of all rows, rounded to 2 decimals.
print("Percentage of Missing Values")
data.isna().sum().div(len(data)).mul(100).round(2)

Percentage of Missing Values


Loan_ID              0.00
Gender               2.12
Married              0.49
Dependents           2.44
Education            0.00
Self_Employed        5.21
ApplicantIncome      0.00
CoapplicantIncome    0.00
LoanAmount           3.58
Loan_Amount_Term     2.28
Credit_History       8.14
Property_Area        0.00
Loan_Status          0.00
dtype: float64

In [252]:
# Number of distinct values per column; low counts indicate categorical columns.
data.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

## Exploratory Data Analysis


### Univariate Analysis


#### Categorical Variables


In [253]:
# Columns treated as categorical, arranged as a 3x3 grid: each inner list is
# one row of the subplot figure drawn in the next cell.
cat_columns = [
    ["Gender", "Married", "Dependents"],
    ["Education", "Self_Employed", "Loan_Amount_Term"],
    ["Credit_History", "Property_Area", "Loan_Status"],
]
# Row-major flattening of the grid (list concatenation of the three rows).
flattened_columns = sum(cat_columns, [])
print(flattened_columns)

['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']


In [254]:
# One histogram per categorical column, placed on the same 3x3 grid as `cat_columns`.
fig = make_subplots(rows=3, cols=3, subplot_titles=flattened_columns)
for row_number, row_columns in enumerate(cat_columns, start=1):
    for col_number, column_name in enumerate(row_columns, start=1):
        # Plotly subplot rows/cols are 1-based, hence enumerate(..., start=1).
        fig.add_trace(
            go.Histogram(x=data[column_name]),
            row=row_number,
            col=col_number,
        )
fig.update_traces(hoverinfo="y+text")
fig.update_layout(height=1000)
fig.show()

We'll use the mode (the most frequent value) to replace the missing values in these categorical columns.


#### Continuous Variables


In [255]:
# Columns treated as continuous; they are drawn as box plots in the next cell.
cont_columns = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount"]

In [256]:
# One horizontal box plot per continuous column, stacked in a single column of subplots.
fig = make_subplots(rows=3, subplot_titles=cont_columns)
for row_number, column_name in enumerate(cont_columns, start=1):
    box_trace = go.Box(x=data[column_name])
    fig.add_trace(box_trace, row=row_number, col=1)
fig.update_layout(height=1000)
fig.show()

The box plots show many outliers, so we'll use the median, which is robust to them, to fill the missing values.


### Multivariate Analysis

#### Gender and Education

In [257]:
# Grouped histogram of Gender, coloured by Education and normalised to probabilities.
# Columns are referenced by name: `data_frame` is already supplied, so there is no
# need to index `data` again for every argument.
fig = px.histogram(
    data_frame=data,
    x="Gender",
    color="Education",
    barmode="group",
    text_auto=True,
    histnorm="probability",
    title="Gender and Education",
)
fig.show()

#### Loan Amount and Gender

In [258]:
# Grouped probability-density histogram of LoanAmount, one colour per Gender.
# Columns are referenced by name because `data_frame` is already supplied; a title
# is set so the figure stands alone like the other figures in this section.
fig = px.histogram(
    data_frame=data,
    x="LoanAmount",
    color="Gender",
    barmode="group",
    histnorm="probability density",
    text_auto=True,
    title="Loan Amount and Gender",
)
fig.show()

#### Income and Gender

In [259]:
# Horizontal box plots of ApplicantIncome, one box per Gender.
# Columns are referenced by name because the frame is already passed to px.box.
fig = px.box(
    data,
    x="ApplicantIncome",
    y="Gender",
    title="Applicant Income with Gender",
)
fig.show()

In [260]:
# Horizontal box plots of CoapplicantIncome, one box per Gender.
# Columns are referenced by name because the frame is already passed to px.box.
fig = px.box(
    data,
    x="CoapplicantIncome",
    y="Gender",
    title="Co-applicant Income with Gender",
)
fig.show()

#### Income and Loan Amount

In [261]:
# Scatter of LoanAmount (x) against ApplicantIncome (y), coloured by Gender;
# marker size also encodes ApplicantIncome.
# Columns are referenced by name because the frame is already passed to px.scatter.
fig = px.scatter(
    data,
    x="LoanAmount",
    y="ApplicantIncome",
    title="Applicant Income and Loan Amount",
    color="Gender",
    size="ApplicantIncome",
    height=600,
)
fig.show()

## Data Preprocessing

### Missing Values

In [262]:
from sklearn.impute import SimpleImputer


# Continuous variables: fill NaN with the median, chosen because of the outliers.
cont_imputer = SimpleImputer(missing_values=np.nan, strategy="median")
# Categorical variables: fill NaN with the mode (most frequent value).
cat_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

In [263]:
# Fill every categorical column with its own most frequent value. The imputer is
# refitted per column on a one-column frame, and `[:, 0]` takes the single output
# column back out of the 2-D result.
for column_name in flattened_columns:
    data[column_name] = cat_imputer.fit_transform(data[column_name].to_frame())[:, 0]
# LoanAmount is the only continuous column imputed here; it gets the median.
data["LoanAmount"] = cont_imputer.fit_transform(data["LoanAmount"].to_frame())[:, 0]

In [264]:
# Re-check the share of missing entries per column (percent, 2 decimals) after imputation.
print("Percentage of Missing Values after Preprocessing")
data.isna().sum().div(len(data)).mul(100).round(2)

Percentage of Missing Values after Preprocessing


Loan_ID              0.0
Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

### Duplicate Values

In [265]:
# Rows that repeat an earlier row; with pandas' default `keep="first"` the first
# occurrence itself is not flagged.
duplicated_data = data.loc[data.duplicated()]

In [266]:
# Report how many rows are duplicates, as a count and as a percentage of all rows.
print(f"Total no of rows : {len(data)}")
print(f"No of duplicated rows is {len(duplicated_data)}")
print(
    f"The percentage of Duplicated values : {round(len(duplicated_data) / len(data) * 100, 2)}%"
)

Total no of rows : 614
No of duplicated rows is 0
The percentage of Duplicated values : 0.0%


### Dummy Variables

In [267]:
# LabelEncoder's documented public location is sklearn.preprocessing.
# sklearn.calibration only exposes it as an incidental internal import,
# which is not part of its public API and may disappear.
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()

In [268]:
# Drop the identifier column. `errors="ignore"` keeps this cell re-runnable:
# on a second run Loan_ID is already gone and a plain drop would raise KeyError.
data = data.drop(columns="Loan_ID", errors="ignore")
encoded_data = data.copy()

# `le` is refitted for every binary column, so it only remembers the classes of
# the last one. Record each column's label -> integer code mapping here so the
# encoded values stay interpretable.
label_mappings = {}

for column in data.columns:
    if data[column].dtype != "object":
        continue  # numeric columns are left untouched
    if data[column].nunique() <= 2:
        # Binary categorical column: encode in place as 0/1.
        encoded_data[column] = le.fit_transform(encoded_data[column])
        # LabelEncoder assigns codes by position in `classes_`.
        label_mappings[column] = {
            label: code for code, label in enumerate(le.classes_)
        }
    else:
        # More than two categories: expand into one indicator column per category.
        encoded_data = pd.get_dummies(data=encoded_data, columns=[column], dtype=int)

In [269]:
# Display the encoded frame to check the label-encoded and one-hot (dummy) columns.
encoded_data

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,1,0,0,0,5849,0.0,128.0,360.0,1.0,1,1,0,0,0,0,0,1
1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0,1,0,0,1,0,0
2,1,1,0,1,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,0,1
3,1,1,1,0,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,0,0,1
4,1,0,0,0,6000,0.0,141.0,360.0,1.0,1,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,2900,0.0,71.0,360.0,1.0,1,1,0,0,0,1,0,0
610,1,1,0,0,4106,0.0,40.0,180.0,1.0,1,0,0,0,1,1,0,0
611,1,1,0,0,8072,240.0,253.0,360.0,1.0,1,0,1,0,0,0,0,1
612,1,1,0,0,7583,0.0,187.0,360.0,1.0,1,0,0,1,0,0,0,1
