In [83]:
#importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import pi
import seaborn as sns


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


In [84]:
#loading datas

data_train ='C:/Users/KOMAL/OneDrive/Desktop/The Vectors/The-Vectors/data/train.csv' 
data_test = 'C:/Users/KOMAL/OneDrive/Desktop/The Vectors/The-Vectors/data/test.csv'
loan_train = pd.read_csv(data_train)
loan_test = pd.read_csv(data_test)

In [85]:
loan_train_copy = loan_train.copy()
#data preview

loan_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [86]:
loan_train.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [87]:
loan_train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [88]:
#checking for missing values
loan_train.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# Data Cleaning & Preparation

For train data set

In [89]:
#using forward fill, to place 1 or 0 in the place of missing data

loan_train['Credit_History'].fillna(method='ffill', inplace=True)

In [90]:
#filling columns using the median of the values

median_loan = loan_train['Loan_Amount_Term'].median()
loan_train['Loan_Amount_Term'].fillna((median_loan), inplace=True)

median_loan_amount = loan_train['LoanAmount'].median()
loan_train['LoanAmount'].fillna((median_loan_amount), inplace=True)



In [91]:
# Count the values to know which occurs most frequently

loan_train['Self_Employed'].value_counts()

loan_train['Married'].mode()

loan_train['Gender'].mode()

0    Male
Name: Gender, dtype: object

In [92]:
#filling the mode

loan_train['Self_Employed'].fillna('No', inplace=True)

loan_train['Dependents'].fillna(0, inplace=True)

loan_train['Married'].fillna('Yes', inplace=True)

loan_train['Gender'].fillna('Male', inplace=True)


In [93]:
#check if there is still some data missing

loan_train.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

For test data set

In [94]:
# A preview of missing data in the testing set

loan_test.isna().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [95]:
# forward filling

loan_test['Credit_History'].fillna(method='ffill', inplace=True)

# filling missing data using median

median_loan_test = loan_test['Loan_Amount_Term'].median()
loan_test['Loan_Amount_Term'].fillna((median_loan_test), inplace=True)


median_loan_amount_test = loan_test['LoanAmount'].median()
loan_test['LoanAmount'].fillna((median_loan_amount_test), inplace=True)

# filling data using mode

loan_test['Self_Employed'].fillna('No', inplace=True)

loan_test['Dependents'].fillna(0, inplace=True)


loan_test['Gender'].fillna('Male', inplace=True)



In [96]:
#checking for missing data

loan_test.isna().values.any()

False

In [97]:
#checking for duplicate values

loan_train.duplicated().values.any()

loan_test.duplicated().values.any()

False

# Data Pre-processing

In [98]:
loan_train.drop('Loan_ID', axis=1)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [99]:
#first identify all categorical columns & pass into a variable

objectlist_train = loan_train.select_dtypes(include = "object").columns


One-hot encoding

In [100]:
objectlist_train

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [101]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

oh_encoder = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ['Loan_Status','Gender','Education','Self_Employed','Property_Area','Married'])], 
                               remainder='passthrough')
df_onehot = oh_encoder.fit_transform(loan_train)

print(df_onehot[:6])

[[0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 'LP001002' '0' 5849
  0.0 128.0 360.0 1.0]
 [1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 'LP001003' '1' 4583
  1508.0 128.0 360.0 1.0]
 [0.0 1.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 'LP001005' '0' 3000
  0.0 66.0 360.0 1.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 'LP001006' '0' 2583
  2358.0 120.0 360.0 1.0]
 [0.0 1.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 'LP001008' '0' 6000
  0.0 141.0 360.0 1.0]
 [0.0 1.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 'LP001011' '2' 5417
  4196.0 267.0 360.0 1.0]]


# Modelling