# Set Up

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('./data/train.csv')
print(train.shape)
train.head()

(614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Examining the Data

In [2]:
train.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [3]:
# Collecting the 'object' columns
obj_cols = list(train.columns[train.dtypes=='object'])
obj_cols.remove('Loan_ID')
obj_cols

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Loan_Status']

In [4]:
# Showing unique values of each column with 'object' data types
# This will help determine how to preprocess them into number data types
for col in obj_cols:
    print(f"{col}:")
    print(f"{train[col].unique()}\n")

Gender:
['Male' 'Female' nan]

Married:
['No' 'Yes' nan]

Dependents:
['0' '1' '2' '3+' nan]

Education:
['Graduate' 'Not Graduate']

Self_Employed:
['No' 'Yes' nan]

Property_Area:
['Urban' 'Rural' 'Semiurban']

Loan_Status:
['Y' 'N']



# Preprocessing Data (1)

In [12]:
# Creating a copy of the original dataset
ntrain = train.copy()

# Binary Encoding the following columns:
# (Generally 0 means 'no' or implies negation, while 1 the opposite)

# Gender
ntrain.loc[ntrain.Gender == 'Male', 'Gender_encoded'] = 0
ntrain.loc[ntrain.Gender == 'Female', 'Gender_encoded'] = 1
print(ntrain.shape)

# Married
ntrain.loc[ntrain.Married == 'No', 'is_married'] = 0
ntrain.loc[ntrain.Married == 'Yes', 'is_married'] = 1
print(ntrain.shape)

# Married
ntrain.loc[ntrain.Education == 'No', 'is_graduate'] = 0
ntrain.loc[ntrain.Education == 'Yes', 'is_graduate'] = 1
print(ntrain.shape)

# Self_Employed
ntrain.loc[ntrain.Self_Employed == 'No', 'is_self_employed'] = 0
ntrain.loc[ntrain.Self_Employed == 'Yes', 'is_self_employed'] = 1
print(ntrain.shape)

# Loan_Status
ntrain.loc[ntrain.Loan_Status == 'N', 'Loan_Status'] = 0
ntrain.loc[ntrain.Loan_Status == 'Y', 'Loan_Status'] = 1
ntrain.Loan_Status = ntrain.Loan_Status.astype(int)
print(ntrain.shape)

ntrain.head()

(614, 14)
(614, 15)
(614, 16)
(614, 16)
(614, 13)


Unnamed: 0,Loan_ID,Dependents,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Gender_encoded,is_married,is_self_employed
0,LP001002,0,Graduate,5849,0.0,,360.0,1.0,Urban,1,0.0,0.0,0.0
1,LP001003,1,Graduate,4583,1508.0,128.0,360.0,1.0,Rural,0,0.0,1.0,0.0
2,LP001005,0,Graduate,3000,0.0,66.0,360.0,1.0,Urban,1,0.0,1.0,1.0
3,LP001006,0,Not Graduate,2583,2358.0,120.0,360.0,1.0,Urban,1,0.0,1.0,0.0
4,LP001008,0,Graduate,6000,0.0,141.0,360.0,1.0,Urban,1,0.0,0.0,0.0


In [13]:
from sklearn.preprocessing import LabelBinarizer
binarizer = LabelBinarizer()

# Creat a new column called 'has_child'
ntrain.loc[ntrain.Dependents=='0', 'has_child'] = 0
ntrain.loc[ntrain.has_child.isnull(), 'has_child'] = 1

# Dependents
ntrain.Dependents = ntrain.Dependents.astype(str)
results = binarizer.fit_transform(ntrain["Dependents"])
dependents_encoded = pd.DataFrame(results, columns=['0','1','2','3','nan'])

# Property_Area
results = binarizer.fit_transform(ntrain["Property_Area"])
property_area_encoded = pd.DataFrame(results, columns=binarizer.classes_)

ntrain = pd.concat([ntrain, education_encoded, property_area_encoded, dependents_encoded], axis=1, sort=False)
ntrain.drop('Property_Area', axis=1, inplace=True)
ntrain.drop('Dependents', axis=1, inplace=True)

ntrain.head()

ValueError: Shape of passed values is (614, 1), indices imply (614, 2)

# Finding correlation between columns

In [11]:
ntrain.dtypes

Loan_ID               object
Gender                object
Married               object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Loan_Status           object
has_child            float64
Rural                  int64
Semiurban              int64
Urban                  int64
0                      int64
1                      int64
2                      int64
3                      int64
nan                    int64
dtype: object

# Preprocessing Data (2)