In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('./data/train.csv')
print(train.shape)
train.head()

(614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [2]:
train.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [3]:
obj_cols = [col for col in train.columns[train.dtypes=='object']]
obj_cols.remove('Loan_ID')

for col in obj_cols:
    print(f"Column: {col}")
    print(f"Unique values: {train[col].unique()}\n")

Column: Gender
Unique values: ['Male' 'Female' nan]

Column: Married
Unique values: ['No' 'Yes' nan]

Column: Dependents
Unique values: ['0' '1' '2' '3+' nan]

Column: Education
Unique values: ['Graduate' 'Not Graduate']

Column: Self_Employed
Unique values: ['No' 'Yes' nan]

Column: Property_Area
Unique values: ['Urban' 'Rural' 'Semiurban']

Column: Loan_Status
Unique values: ['Y' 'N']



## Encoding

In [4]:
ntrain = train.copy()
print(ntrain.shape)

(614, 13)


In [5]:
# Gender
ntrain.loc[ntrain.Gender == 'Male', 'Gender'] = 0
ntrain.loc[ntrain.Gender == 'Female', 'Gender'] = 1
print(ntrain.shape)

# Married
ntrain.loc[ntrain.Married == 'No', 'Married'] = 0
ntrain.loc[ntrain.Married == 'Yes', 'Married'] = 1
print(ntrain.shape)

# Education
ntrain.loc[ntrain.Education == 'Not Graduate', 'Education'] = 0
ntrain.loc[ntrain.Education == 'Graduate', 'Education'] = 1
print(ntrain.shape)

# Self_Employed
ntrain.loc[ntrain.Self_Employed == 'No', 'Self_Employed'] = 0
ntrain.loc[ntrain.Self_Employed == 'Yes', 'Self_Employed'] = 1
print(ntrain.shape)

# Loan_Status
ntrain.loc[ntrain.Loan_Status == 'No', 'Loan_Status'] = 0
ntrain.loc[ntrain.Loan_Status == 'Yes', 'Loan_Status'] = 1
print(ntrain.shape)

ntrain.head()

(614, 13)
(614, 13)
(614, 13)
(614, 13)
(614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,0,0,0,1,0,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,0,1,1,1,0,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,0,1,0,1,1,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,0,1,0,0,0,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,0,0,0,1,0,6000,0.0,141.0,360.0,1.0,Urban,Y


In [38]:
from sklearn.preprocessing import LabelBinarizer
binarizer = LabelBinarizer()

# Property_Area
results = binarizer.fit_transform(train["Property_Area"])
property_area_encoded = pd.DataFrame(results, columns=binarizer.classes_)

# Dependents
ntrain.Dependents = ntrain.Dependents.astype(str)
results = binarizer.fit_transform(ntrain["Dependents"])
dependents_encoded = pd.DataFrame(results, columns=binarizer.classes_)

ntrain = pd.concat([ntrain, property_area_encoded, dependents_encoded], axis=1, sort=False)
ntrain.drop('Property_Area', axis=1, inplace=True)
ntrain.drop('Dependents', axis=1, inplace=True)

ntrain.head()

Unnamed: 0,Loan_ID,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Rural,Semiurban,Urban,0,1,2,3+,nan
0,LP001002,0,0,1,0,5849,0.0,,360.0,1.0,Y,0,0,1,1,0,0,0,0
1,LP001003,0,1,1,0,4583,1508.0,128.0,360.0,1.0,N,1,0,0,0,1,0,0,0
2,LP001005,0,1,1,1,3000,0.0,66.0,360.0,1.0,Y,0,0,1,1,0,0,0,0
3,LP001006,0,1,0,0,2583,2358.0,120.0,360.0,1.0,Y,0,0,1,1,0,0,0,0
4,LP001008,0,0,1,0,6000,0.0,141.0,360.0,1.0,Y,0,0,1,1,0,0,0,0
