In [2]:
import pandas as pd
import numpy as np

### Rectangular Data (Dataframe)
**Rectangular Data** is a general term for a 2D matrix with rows indicating records(cases) and columns indicating features(variables)

**Key Terms**:
Feature: A column within a table is commonly referred to as a feature - (attribute, input, predictor variable)
Outcome/Target: The result of a prediction - (dependent variable, response, target, output)
Records: A row within a table is commonly referred to as a record. (case, example, instance, observation, pattern, sample)

In [3]:
## Load the dataset
path = "../../datasets/loan_data_set.csv"
loan_df = pd.read_csv(path)

In [4]:
loan_df.shape

(614, 13)

In [5]:
## view the datset
loan_df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [6]:
loan_df.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

## Exploratory Data Analysis

### Structured Data Types:
    There are two basic types of structured data - Numeric and categorical
    ### Numeric: Data that is expressed on a numeric scale. Numeric data comes in two forms.
                 1. Continuous: Data that can take on any value in an interval. ex wind speed or time duration
                 2. Discrete: Data that can take on only integer values. ex count of occurance of an event.

    ### Categorical: Data that can take on only a specific or fixed set of values, representing a set of possible categories. ex types of tv screen (plasma, LCD, LED, etc). It also has two forms.
                 1. Binary: A special case of categorical data with just two categories of values. ex 0/1, true/false, male/female
                 2. Ordinal: Categorical data that has an explicit ordering. ex numerical rating(1,2,3,4, or 5)  

In [7]:
## all column names
loan_df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [8]:
## Check total number of na values
loan_df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
## Numeric columns or real value columns
numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

## Categorical columns
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education','Self_Employed', 'Loan_Amount_Term', 
                    'Credit_History', 'Property_Area', 'Loan_Status']

## Not required
not_req_cols = ["Loan_ID"] 

In [10]:
## drop the loan ID column and set an index for the dataframe
loan_df.drop(not_req_cols, axis=1, inplace=True)
loan_df.reset_index
loan_df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [11]:
## List categorical columns
loan_df.select_dtypes(include=['object']).columns.to_list()

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Loan_Status']

### Estimates of Location (Numerical Data)

    Variables with measured or numerical data can have thousands of distinct values that vary between a maximum and a minimum.
    The first step in exploring the data is getting a "typical value" for each feature(variable) or an estimate of where most of the data is located (i.e. its central tendency) 

### Mean
    -> The most basic estimate of location is mean, or average value.
    -> It is the sum of all values / total no of values.
    -> A variation is called trimmed mean which is calculated by dropping a fixed no of sorted values at each end
       and then taking an average of the remaining values.
    -> A trimmed mean eliminates the influence of extreme values(outliers)

In [12]:
## To calculate the mean of numerical columns
loan_df[numeric_cols].mean()

ApplicantIncome      5403.459283
CoapplicantIncome    1621.245798
LoanAmount            146.412162
dtype: float64

In [13]:
## For trimmed mean - 0.1 means it slices off ‘leftmost’ and ‘rightmost’ 10% of scores.
from scipy.stats import trim_mean
trim_mean(loan_df[numeric_cols],0.1)

array([4292.05894309, 1154.84739835,  138.28455285])

### Median and Robust Estimates
    -> The median is the middle number on a sorted list of the data.
    -> If there is an even number of data values, the middle value is the average of 2 values that divide the sorted data 
       into upper and lower halves.
    -> Median depends on the values in centre of the sorted data, that is why it is less sensitive to outliers.
    -> This is why a median is referred to as a robust estimate of a location.

### Outliers
    -> An oulier is any value that is very distant from any other values in the dataset.
    -> When outliers are the result of bad data, the mean will result in a poor estimate of location, while
       the median will still be valid

In [14]:
## To calculate median
## NOTE: Median values are much more closer to trimmed mean than to the actual mean
loan_df[numeric_cols].median()

ApplicantIncome      3812.5
CoapplicantIncome    1188.5
LoanAmount            128.0
dtype: float64

### Estimates of Variability
    -> Location is just one dimension in summarising a feature. 
    -> A second dimension, variability also referred to as dispersion, measures whether the data values are tightly 
       clustered or spread out. 

### Standard Deviation and Related Estimates
    -> Deviations : The difference between the observed values and the estimate of location. (errors, residuals)
    -> The most widely used estimates of variation are based on the differences, or deviations b/w the estimate of location (central value) and the observed data.
    -> The best known estimates of variability are the variance and standard deviation.

    ### Variance (mean-squared error)
        The sum of squared deviation from the mean divided by n-1 . where n is the no of data values. 
$$
S^2 = \frac{\sum (x_i - \bar{x})^2}{n - 1}
$$      

    ### Standard Deviation
        The square root of the variance
$$
S = \sqrt{Variance} 
$$

In [15]:
## Find the standard deviation (Not robust and sensitive to outliers)
loan_df[numeric_cols].std()

ApplicantIncome      6109.041673
CoapplicantIncome    2926.248369
LoanAmount             85.587325
dtype: float64

### Median Absolute Deviation
    A robust estimate of variability is median absolute deviation from the median or MAD:
    Steps to find MAD
    1. Order the numbers in the dataset and find the median
    2. Subtract the median from each number in the dataset.
    3. Take the absolute value of each difference.
    4. Add all the positive differences and divide this sum by number of data points in the set.

$$
MAD = \frac{\sum \left\lvert(x_i - m)\right\rvert}{n}
$$

Where m - median , x - data point, n - total number of data points in a set

In [16]:
## Calculating Median absolute deviation
## The output from MAD and Std shows the differnce in the values and the effect of outliers on both
## The LoanAmount column contains nan values so we have to omit them in order to get the result else it will be nan

from scipy.stats import median_abs_deviation
median_abs_deviation(loan_df[numeric_cols], nan_policy= "omit")

array([1229.5, 1188.5,   32. ])

#### NOTE 
      -> The Standard Deviation will always be greater than the Median Abs Deviation even in case of normal distribution.
      -> In this case the median abs deviation is multiplied by a constant scaling factor to put the MAD on the same scale 
         as STD .
      -> The commonly used multiplication factor is 1.4826 which means that 50% of the normal distribution falls within the range
         +_MAD

### Estimates based on Percentiles
    -> Another approach to estimate dispersion is based on looking at the spread of sorted data
    -> Statistics based on sorted (ranked) data are referred to as order statistics.
    -> The most basic measure is the range - the difference b/w the largest & smallest numbers.
    -> Range is very sensitive to outliers and not very useful for geenral measure of dispersion of data.

To avoid the sensitivity to outliers, we can look at the range of data after dropping values from each end, and these types of estimates are based on differences between percentiles.

### Percentiles (quantiles)
    -> The value such that P percent of the values take on this value or less and (100-P) percent take on this value or more.
    -> for the formula and more detials refer: https://onlinestatbook.com/2/introduction/percentiles.html

### Inter Quartile Range (IQR)
    -> The difference b/w 75th percentile and 25th percentile.
    -> It is a common measure of variability.
    Further Reading : (Study all the methods in numpy.percentile argument "method")

In [18]:
## To calculate percentiles and IQR
## 1. using pandas
q1 = loan_df[numeric_cols].quantile(0.25)
q3 = loan_df[numeric_cols].quantile(0.75)
print(f"Pandas IQR:\n {q3-q1}")

Pandas IQR:
 ApplicantIncome      2917.50
CoapplicantIncome    2297.25
LoanAmount             68.00
dtype: float64


In [22]:
## Using numpy
for num_col in numeric_cols:
    q3, q1 = np.percentile(loan_df[num_col], [75 ,25])
    np_iqr = q3 - q1
    print(f"Numpy IQR for {num_col}:\n{np_iqr}")

Numpy IQR for ApplicantIncome:
2917.5
Numpy IQR for CoapplicantIncome:
2297.25
Numpy IQR for LoanAmount:
nan


In [21]:
## IF you use percentile directly on columns containing nan values, the result is nan
## To ignore nan values and get the result use np.nanpercentile fucnction
## Since numpy percentile takes only one array, we have to calculate the IQR for each column
for each_col in numeric_cols:
    q3, q1 = np.nanpercentile(loan_df[each_col], [75 ,25])
    np_iqr = q3 - q1
    print(f"Numpy nan IQR for {each_col}:\n{np_iqr}")

Numpy nan IQR for ApplicantIncome:
2917.5
Numpy nan IQR for CoapplicantIncome:
2297.25
Numpy nan IQR for LoanAmount:
68.0


In [23]:
## To sum up all these values
loan_df[numeric_cols].describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
count,614.0,614.0,592.0
mean,5403.459283,1621.245798,146.412162
std,6109.041673,2926.248369,85.587325
min,150.0,0.0,9.0
25%,2877.5,0.0,100.0
50%,3812.5,1188.5,128.0
75%,5795.0,2297.25,168.0
max,81000.0,41667.0,700.0
