In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the loan-approval dataset from the working directory and preview its first five rows.
stat = pd.read_csv(filepath_or_buffer='loan_approved.csv')
stat.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status (Approved)
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [69]:
# Print each column's dtype and non-null count; a count below the row total marks a column with missing values.
stat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Loan_ID                 614 non-null    object 
 1   Gender                  601 non-null    object 
 2   Married                 611 non-null    object 
 3   Dependents              599 non-null    object 
 4   Education               614 non-null    object 
 5   Self_Employed           582 non-null    object 
 6   ApplicantIncome         614 non-null    int64  
 7   CoapplicantIncome       614 non-null    float64
 8   LoanAmount              592 non-null    float64
 9   Loan_Amount_Term        600 non-null    float64
 10  Credit_History          564 non-null    float64
 11  Property_Area           614 non-null    object 
 12  Loan_Status (Approved)  614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


## 1. Categorical Data (Qualitative)
#### Categorical data represents groupings or classifications. It is divided into two types: Nominal and Ordinal.

### i) Nominal Data (Names, Labels)

#### Nominal data classifies items into distinct categories without any inherent order or rank. Rearranging the categories does not change the meaning of the data, because order does not matter.
#### Examples: Gender, Marital Status, Blood Group, etc.

### ii) Ordinal Data (Ranking)

#### Ordinal data classifies items into categories that have a clear, logical rank or sequence, indicating a hierarchy or preference. The order of the categories is meaningful, so numerical codes can be assigned based on the ranking. However, the distance or magnitude between the categories is unknown or unequal.
#### Examples: Education Level (High School, Bachelor's, Master's), Customer Satisfaction (Low, Medium, High), etc.

## 2. Numerical Data (Quantitative)
#### Numerical data represents quantities or measurements. It is further divided into two types: Discrete and Continuous.

### i) Discrete Data (Countable)

#### Discrete data consists of distinct, separate, countable values measured in whole units, with no intermediate value between them. It can only take on a finite or countably infinite number of specific values, usually whole numbers.
#### Examples: Number of Children, Number of Employees, etc.

### ii) Continuous Data (Measurable)

#### Continuous data represents measurements that can take on any value within a range. A range contains infinitely many possible values, and in principle they can be measured to arbitrary precision. Continuous data can be further divided into two types: Interval and Ratio.
#### Examples: Height, Weight, Temperature, etc.




## Classification of the dataset's columns by data type:
### Nominal = Loan_ID, Gender, Married, Dependents, Education, Self_Employed, Loan_Status (Approved)
### Ordinal = Property_Area
### Discrete = Loan_Amount_Term, Credit_History
### Continuous = ApplicantIncome, CoapplicantIncome, LoanAmount

In [70]:
stat['Loan_ID'].size  # nominal identifier column; its length is the total number of observations

614

In [71]:
stat['Gender'].nunique(dropna=False)  # nominal: number of distinct values, with NaN counted as its own value

3

In [72]:
stat['Gender'].value_counts()  # frequency of each Gender category (NaN is left out by default)

Gender
Male      489
Female    112
Name: count, dtype: int64

In [73]:
stat['Married'].nunique(dropna=False)  # nominal: number of distinct values, with NaN counted as its own value

3

In [74]:
stat['Dependents'].nunique(dropna=False)  # nominal: number of distinct values, with NaN counted as its own value

5

In [75]:
stat['Education'].nunique(dropna=False)  # nominal: number of distinct values (NaN, if present, would count as one)

2

In [76]:
stat['Self_Employed'].nunique(dropna=False)  # nominal: number of distinct values, with NaN counted as its own value

3

In [77]:
stat['ApplicantIncome'].nunique(dropna=False)  # continuous: number of distinct values (NaN, if present, would count as one)

505

In [78]:
stat['CoapplicantIncome'].nunique(dropna=False)  # continuous: number of distinct values (NaN, if present, would count as one)

287

In [79]:
stat['LoanAmount'].nunique(dropna=False)  # continuous: number of distinct values, with NaN counted as its own value

204

In [80]:
stat['Loan_Amount_Term'].nunique(dropna=False)  # discrete: number of distinct values, with NaN counted as its own value

11

In [81]:
stat['Credit_History'].nunique(dropna=False)  # discrete: number of distinct values, with NaN counted as its own value

3

In [82]:
stat['Property_Area'].nunique(dropna=False)  # ordinal: number of distinct values (NaN, if present, would count as one)

3

In [83]:
stat['Loan_Status (Approved)'].nunique(dropna=False)  # nominal: number of distinct values (NaN, if present, would count as one)

2

## Where do we use measures of central tendency?
#### MODE --> used for categorical data
#### MEDIAN, MEAN --> used for numerical data

In [84]:
# Impute missing Gender entries with the column's most frequent value (the mode).
stat.loc[stat['Gender'].isna(), 'Gender'] = stat['Gender'].mode()[0]

In [85]:
print(stat['Gender'].isna().sum())  # NaN count in Gender after imputation

0


In [86]:
print(stat['Self_Employed'].isna().sum())  # NaN count in Self_Employed before imputation

32


In [88]:
# Impute missing Self_Employed entries with the column's most frequent value (the mode).
stat.loc[stat['Self_Employed'].isna(), 'Self_Employed'] = stat['Self_Employed'].mode()[0]

In [89]:
print(stat['Self_Employed'].isna().sum())  # NaN count in Self_Employed after imputation

0


In [91]:
print(stat['LoanAmount'].isna().sum())  # NaN count in LoanAmount before imputation

22


In [92]:
# Impute missing LoanAmount entries with the column median (computed over the non-null values).
stat.loc[stat['LoanAmount'].isna(), 'LoanAmount'] = stat['LoanAmount'].median()

In [93]:
print(stat['LoanAmount'].isna().sum())  # NaN count in LoanAmount after imputation

0


In [94]:
print(stat['Loan_Amount_Term'].isna().sum())  # NaN count in Loan_Amount_Term before imputation

14


In [96]:
# Impute missing Loan_Amount_Term entries with the column median (computed over the non-null values).
stat.loc[stat['Loan_Amount_Term'].isna(), 'Loan_Amount_Term'] = stat['Loan_Amount_Term'].median()

In [97]:
print(stat['Loan_Amount_Term'].isna().sum())  # NaN count in Loan_Amount_Term after imputation

0


In [None]:
print(stat['Credit_History'].isna().sum())  # NaN count in Credit_History before imputation

50


In [100]:
# Impute missing Credit_History entries with the column's most frequent value (the mode).
stat.loc[stat['Credit_History'].isna(), 'Credit_History'] = stat['Credit_History'].mode()[0]

In [102]:
print(stat['Credit_History'].isna().sum())  # NaN count in Credit_History after imputation

0


In [105]:
stat.isna().sum()  # per-column NaN counts for the whole dataset in one call, instead of checking each column separately

Loan_ID                    0
Gender                     0
Married                    3
Dependents                15
Education                  0
Self_Employed              0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Property_Area              0
Loan_Status (Approved)     0
dtype: int64

In [106]:
# Some columns still contain missing values; start with Married,
# imputing its missing entries with the most frequent value (the mode).

stat.loc[stat['Married'].isna(), 'Married'] = stat['Married'].mode()[0]

In [107]:
print(stat['Married'].isna().sum())  # NaN count in Married after imputation

0


In [109]:
# Next, Dependents: impute its missing entries with the most frequent value (the mode).

stat.loc[stat['Dependents'].isna(), 'Dependents'] = stat['Dependents'].mode()[0]

In [110]:
print(stat['Dependents'].isna().sum())  # NaN count in Dependents after imputation

0


In [111]:
stat.isna().sum()  # final check: per-column NaN counts across the whole dataset

Loan_ID                   0
Gender                    0
Married                   0
Dependents                0
Education                 0
Self_Employed             0
ApplicantIncome           0
CoapplicantIncome         0
LoanAmount                0
Loan_Amount_Term          0
Credit_History            0
Property_Area             0
Loan_Status (Approved)    0
dtype: int64