In [1]:
import pandas as pd
import numpy as np

In [2]:
# %pip install keras

## Data Preprocessing
* filling missing values
* converting categories to numbers
* Bringing all variables in range from 0 to 1

In [3]:
df = pd.read_csv("loan_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
df["Education"].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [5]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

### 1. Filling Missing values

In [7]:
#fill categorical variables with mode
#Before this check of all columns where the number of unique values in columns <20
# and missing values are present in this column
cat_cols = [col for col in df.columns if len(df[col].unique()) < 20 and df[col].isnull().sum() > 0]

In [8]:
for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace = True)

In [9]:
df.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [10]:
num_na = [col for col in df.columns if df[col].dtype != "O" and df[col].isnull().sum()> 0]

In [11]:
for col in num_na:
    df[col].fillna(df[col].mean(), inplace = True)

## 2. Converting categories to numeric

In [12]:
print("Columns to be encoded numeric:", df.select_dtypes("O").columns[1:])
yes_no = {"No" : 0, "Yes": 1}
property_area_map = {"Rural":0,"Semiurban":1,"Urban":2}
education_mapping = {"Graduate":1,"Not Graduate":0}
gender_mapping = {"Male": 0,"Female": 1}

dependents_mapping = {"0": 0, "1": 1,"2": 2,"3+": 3}
loan_status_mapping = {"N": 0, "Y" : 1}



Columns to be encoded numeric: Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area', 'Loan_Status'],
      dtype='object')


In [13]:
df["Gender"] = df["Gender"].map(gender_mapping)
df["Married"] = df["Married"].map(yes_no)
df["Dependents"] = df["Dependents"].map(dependents_mapping)
df["Education"] = df["Education"].map(education_mapping)
df["Self_Employed"] = df["Self_Employed"].map(yes_no)
df["Property_Area"] = df["Property_Area"].map(property_area_map)
df["Loan_Status"] = df["Loan_Status"].map(loan_status_mapping)


In [14]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,0,0,0,1,0,5849,0.0,146.412162,360.0,1.0,2,1
1,LP001003,0,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,0,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,0,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,0,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1


In [15]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df.drop("Loan_ID", inplace = True, axis = 1)

In [19]:
scaler.fit_transform(df)

array([[-0.47234264, -1.37208932, -0.73780632, ...,  0.41173269,
         1.22329839,  0.67451931],
       [-0.47234264,  0.72881553,  0.25346957, ...,  0.41173269,
        -1.31851281, -1.48253724],
       [-0.47234264,  0.72881553, -0.73780632, ...,  0.41173269,
         1.22329839,  0.67451931],
       ...,
       [-0.47234264,  0.72881553,  0.25346957, ...,  0.41173269,
         1.22329839,  0.67451931],
       [-0.47234264,  0.72881553,  1.24474546, ...,  0.41173269,
         1.22329839,  0.67451931],
       [ 2.11710719, -1.37208932, -0.73780632, ..., -2.42876026,
        -0.04760721, -1.48253724]])

In [20]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,0,0,1,0,5849,0.0,146.412162,360.0,1.0,2,1
1,0,1,1,1,0,4583,1508.0,128.000000,360.0,1.0,0,0
2,0,1,0,1,1,3000,0.0,66.000000,360.0,1.0,2,1
3,0,1,0,0,0,2583,2358.0,120.000000,360.0,1.0,2,1
4,0,0,0,1,0,6000,0.0,141.000000,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,1,0,0,1,0,2900,0.0,71.000000,360.0,1.0,0,1
610,0,1,3,1,0,4106,0.0,40.000000,180.0,1.0,0,1
611,0,1,1,1,0,8072,240.0,253.000000,360.0,1.0,2,1
612,0,1,2,1,0,7583,0.0,187.000000,360.0,1.0,2,1
