In [104]:
# Import the required libraries: Pandas, Numpy, Matplotlib and   Seaborn

import pandas as pd          
import numpy as np 
import seaborn as sns # For mathematical calculations
import matplotlib.pyplot as plt  # For plotting graphs
from datetime import datetime    # To access datetime
from pandas import Series # To work on series
%matplotlib inline
import warnings                   # To ignore the warnings
warnings.filterwarnings("ignore")

In [105]:
#  load the  train csv data set

df=pd.read_csv("train.csv")
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [106]:
df.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y


In [107]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

### Convert A Categorical Variable Into Dummy Variables

In [54]:
# Create a set of dummy variables from the sex variable

df_sex = pd.get_dummies(df['Gender'])
df_sex.sample()


Unnamed: 0,Female,Male
21,0,1


In [55]:
# Join the dummy variables to the main dataframe

df_new = pd.concat([df, df_sex], axis=1)
df_new.sample(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Female,Male
331,LP002098,Male,No,0,Graduate,No,2935,0.0,98.0,360.0,1.0,Semiurban,Y,0,1
50,LP001155,Female,Yes,0,Not Graduate,No,1928,1644.0,100.0,360.0,1.0,Semiurban,Y,1,0


In [56]:
# Alterative for joining the new columns to the Main Dataframe

df_new = df.join(df_sex)
df_new.sample(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Female,Male
430,LP002377,Female,No,1,Graduate,Yes,8624,0.0,150.0,360.0,1.0,Semiurban,Y,1,0
178,LP001616,Male,Yes,1,Graduate,No,3750,0.0,116.0,360.0,1.0,Semiurban,Y,0,1


In [23]:
# Checking the Column types

ctype = df.dtypes.reset_index()
ctype.columns = ["Count", "Column Type"]
ctype.groupby("Column Type").aggregate('count').reset_index()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           614 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [8]:
# Checking the  Missing Values

missing_df = df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['Column_Name', 'Missing_Count']
missing_df = missing_df[missing_df['Missing_Count']>0]
missing_df = missing_df.sort_values(by='Missing_Count')
missing_df

Unnamed: 0,Column_Name,Missing_Count
2,Married,3
1,Gender,13
9,Loan_Amount_Term,14
3,Dependents,15
5,Self_Employed,32
10,Credit_History,50


In [91]:
# Checking the Missing Values in the dataset

missing_df =df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['Column Name', 'Missing Values Count']
missing_df['Filling Factor (%)']=(df.shape[0]-missing_df['Missing Values Count'])/df.shape[0]*100
missing_df.sort_values('Filling Factor (%)').reset_index(drop = True)

Unnamed: 0,Column Name,Missing Values Count,Filling Factor (%)
0,Credit_History,50,91.856678
1,Self_Employed,32,94.788274
2,LoanAmount,22,96.416938
3,Dependents,15,97.557003
4,Loan_Amount_Term,14,97.71987
5,Married,3,99.511401
6,Loan_ID,0,100.0
7,Gender,0,100.0
8,Education,0,100.0
9,ApplicantIncome,0,100.0


### The dropna() method has several additional parameters:

In [34]:
# Drop All rows that contain and have a Missing Value

a=df.dropna()
a.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 480 entries, 1 to 613
Data columns (total 13 columns):
Loan_ID              480 non-null object
Gender               480 non-null object
Married              480 non-null object
Dependents           480 non-null object
Education            480 non-null object
Self_Employed        480 non-null object
ApplicantIncome      480 non-null int64
CoapplicantIncome    480 non-null float64
LoanAmount           480 non-null float64
Loan_Amount_Term     480 non-null float64
Credit_History       480 non-null float64
Property_Area        480 non-null object
Loan_Status          480 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 52.5+ KB


In [41]:
# Drop Missing Values Only if ALL columns are NaN

df.dropna(how='all')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [42]:
# Drop row if it does not have atleast 3 values that are **NOT** NaN

c= df.dropna(thresh=3)
c.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 67.2+ KB


In [48]:
# Drop rows where NaNs appears in Specific Columns (in this case Loan Amount Column)

f=df.dropna(subset=["LoanAmount"])
f.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 592 entries, 1 to 613
Data columns (total 13 columns):
Loan_ID              592 non-null object
Gender               579 non-null object
Married              590 non-null object
Dependents           579 non-null object
Education            592 non-null object
Self_Employed        561 non-null object
ApplicantIncome      592 non-null int64
CoapplicantIncome    592 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     578 non-null float64
Credit_History       543 non-null float64
Property_Area        592 non-null object
Loan_Status          592 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 64.8+ KB


### Similar Case Imputation
### Categorical Variable Missing Values Imputation

In [83]:
# Check the frequency of the  Gender Column Categories: Male and Female:


df["Gender"].value_counts()

Male      489
Female    112
Name: Gender, dtype: int64

Male has the most frequent variable entries  with 489 and Female 112

In [84]:
# Replace the Missing Values in the Gender Column with Male:

df=df.fillna({"Gender": "Male"})
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               614 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [85]:
# Check the total count of Gender Column Values after Imputation


df["Gender"].value_counts()

Male      502
Female    112
Name: Gender, dtype: int64

We have now filled the missing values in the gender column with Male:

In [97]:
# Check the frequency of the Categories in the Married Column

df["Married"].value_counts()

Yes    398
No     213
Name: Married, dtype: int64

Yes category in the Married Column  has the highest frequency: we will use it to impute the column: Yes: 398 No:213

In [98]:
# Replace the Missing Values in the Gender Column with Male:

df=df.fillna({"Married": "Yes"})
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              614 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [100]:
# Check the total count of Gender Column Values after Imputation


df["Married"].value_counts()

Yes    401
No     213
Name: Married, dtype: int64

Yes Category of the Married Collumn has now 401 values and No: 213

In [11]:
# Replace and filling all the Missing Values in all the columns and rows with 0

df.fillna(0)
df.sample(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
457,LP002467,Male,Yes,0,Graduate,No,3708,2569.0,173.0,360.0,1.0,Urban,N
27,LP001073,Male,Yes,2,Not Graduate,No,4226,1040.0,110.0,360.0,1.0,Urban,Y


In [101]:
# Replace and  fill ALL the Continuous Numerical Variables in the dataframe with mean of their respective columns:

df=df.fillna(df.mean())
df.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


In [102]:
# Find the mean of a column:

df["LoanAmount"].mean()

146.41216216216213

In [103]:
#  Mean / Mode / Median Imputation:

# Mean Imputation of the Loan Amount Column using the Loan Amount Mean

df['LoanAmount'].fillna((df['LoanAmount'].mean()), inplace=True)
df.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
