In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import os  # paths to file
import numpy as np  # linear algebra
import pandas as pd  # data processing
import warnings  # warning filter


# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# relevant ML libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# ML models
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Seaborn theme applied to the figures drawn in this notebook.
theme_options = {
    "context": "notebook",
    "style": "darkgrid",
    "palette": "deep",
    "font": "sans-serif",
    "font_scale": 1,
    "color_codes": False,
    "rc": None,
}
sns.set(**theme_options)

# Warning handling: ignore every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the imported libraries -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

<IPython.core.display.Javascript object>

In [3]:
# Directory that holds train.csv and test.csv. Set the LOAN_DATA_DIR
# environment variable to run the notebook on another machine; the fallback
# is the original local path, so existing runs behave exactly as before.
DATA_DIR = os.environ.get(
    "LOAN_DATA_DIR",
    "/Users/hyunahlustig/Desktop/projects/loan_prediction/data",
)

df_train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
df_test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# Preview the first rows of both frames side by side.
display(df_train.head(), df_test.head())

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


<IPython.core.display.Javascript object>

In [4]:
# Map the raw CSV headers to short snake_case names. Renaming by name (rather
# than assigning a positional list) keeps every label attached to the right
# data even if the column order of the files changes, and a single mapping
# serves both frames: the test set simply has no Loan_Status column.
COLUMN_RENAMES = {
    "Loan_ID": "id",
    "Gender": "gender",
    "Married": "married",
    "Dependents": "dependents",
    "Education": "education",
    "Self_Employed": "self_employed",
    "ApplicantIncome": "income",
    "CoapplicantIncome": "co_income",
    "LoanAmount": "loan_amount",
    "Loan_Amount_Term": "loan_term",
    "Credit_History": "credit_history",
    "Property_Area": "property_area",
    "Loan_Status": "loan_status",
}

df_train = df_train.rename(columns=COLUMN_RENAMES)
df_test = df_test.rename(columns=COLUMN_RENAMES)

# rename() silently skips headers it does not find, so verify the outcome
# explicitly. Like the positional assignment it replaces, this raises
# ValueError when a frame does not carry exactly the expected columns.
expected_train_cols = set(COLUMN_RENAMES.values())
expected_test_cols = expected_train_cols - {"loan_status"}

if set(df_train.columns) != expected_train_cols:
    raise ValueError(f"Unexpected train columns: {list(df_train.columns)}")
if set(df_test.columns) != expected_test_cols:
    raise ValueError(f"Unexpected test columns: {list(df_test.columns)}")

<IPython.core.display.Javascript object>

| Column | Description 
| --- | --- | 
| id | unique loan id 
| gender | male or female 
| married | married (Yes), unmarried (No) 
| dependents | number of persons depending on the client
| education | applicant education (graduate/ not graduate)
| self_employed | self employed (yes/no)
| income | applicant's income
| co_income | co-applicant's income
| loan_amount | loan amount in thousands
| loan_term | term of loan in months
| credit_history | credit history meets guidelines (1) or not (0)
| property_area | urban / semi / rural
| loan_status | loan approved (y/n)


In [5]:
# Preview the first rows of each frame (train first, then test).
train_preview = df_train.head()
test_preview = df_test.head()
display(train_preview, test_preview)

Unnamed: 0,id,gender,married,dependents,education,self_employed,income,co_income,loan_amount,loan_term,credit_history,property_area,loan_status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Unnamed: 0,id,gender,married,dependents,education,self_employed,income,co_income,loan_amount,loan_term,credit_history,property_area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


<IPython.core.display.Javascript object>

In [6]:
# Report the (rows, columns) shape of each dataset.
train_shape, test_shape = df_train.shape, df_test.shape
print(f"Shape of train set (row,col): {train_shape}")
print(f"Shape of test set (row,col): {test_shape}")

Shape of train set (row,col): (614, 13)
Shape of test set (row,col): (367, 12)


<IPython.core.display.Javascript object>

In [7]:
# The id column is not needed, so it is dropped from both datasets.
# errors="ignore" keeps this cell safe to re-run: once "id" is gone, a second
# execution is a no-op instead of raising KeyError.
df_train = df_train.drop(columns="id", errors="ignore")
df_test = df_test.drop(columns="id", errors="ignore")

<IPython.core.display.Javascript object>

In [9]:
# Show the head of the train frame followed by the head of the test frame.
display(*(frame.head() for frame in (df_train, df_test)))

Unnamed: 0,gender,married,dependents,education,self_employed,income,co_income,loan_amount,loan_term,credit_history,property_area,loan_status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Unnamed: 0,gender,married,dependents,education,self_employed,income,co_income,loan_amount,loan_term,credit_history,property_area
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


<IPython.core.display.Javascript object>

In [11]:
# Count missing values per training column and list the largest counts first.
train_null_counts = df_train.isnull().sum()
train_null_counts.sort_values(ascending=False)

credit_history    50
self_employed     32
loan_amount       22
dependents        15
loan_term         14
gender            13
married            3
loan_status        0
property_area      0
co_income          0
income             0
education          0
dtype: int64

<IPython.core.display.Javascript object>

In [12]:
# Columns selected for imputation, ordered from most to fewest missing values
# in the training set (counts copied from the null summary printed earlier).
null_cols = [
    "credit_history",  # 50 missing
    "self_employed",  # 32 missing
    "loan_amount",  # 22 missing
    "dependents",  # 15 missing
    "loan_term",  # 14 missing
    "gender",  # 13 missing
    "married",  # 3 missing
]

<IPython.core.display.Javascript object>

In [31]:
# Fill the gaps in the test set: for every listed column, print its value
# counts for inspection, then replace NaN with that column's most frequent
# value (mode() already skips NaN, and ties resolve to its first entry).
# NOTE(review): only df_test is filled here; df_train keeps the nulls listed
# in the summary above -- confirm it is imputed elsewhere.
# NOTE(review): loan_amount is numeric with many distinct values, so the mode
# may be a poor fill value -- consider the median instead.
for col in null_cols:
    counts = df_test[col].value_counts()
    print(f"{col}:\n{counts}\n", "*" * 50)
    most_frequent = df_test[col].mode().iloc[0]
    df_test[col] = df_test[col].fillna(most_frequent)

credit_history:
1.0    308
0.0     59
Name: credit_history, dtype: int64
 **************************************************
self_employed:
No     330
Yes     37
Name: self_employed, dtype: int64
 **************************************************
loan_amount:
150.0    17
125.0    11
110.0    10
187.0     9
100.0     9
         ..
55.0      1
74.0      1
66.0      1
142.0     1
213.0     1
Name: loan_amount, Length: 144, dtype: int64
 **************************************************
dependents:
0     210
2      59
1      58
3+     40
Name: dependents, dtype: int64
 **************************************************
loan_term:
360.0    317
180.0     22
480.0      8
300.0      7
240.0      4
84.0       3
6.0        1
120.0      1
36.0       1
350.0      1
12.0       1
60.0       1
Name: loan_term, dtype: int64
 **************************************************
gender:
Male      297
Female     70
Name: gender, dtype: int64
 **************************************************
married:
Ye

<IPython.core.display.Javascript object>

In [37]:
# Re-check the missing values per training column, largest counts first.
# NOTE(review): the imputation cell above fills df_test only, so these counts
# match the earlier summary -- confirm whether df_test was meant here.
remaining_train_nulls = df_train.isnull().sum()
remaining_train_nulls.sort_values(ascending=False)

credit_history    50
self_employed     32
loan_amount       22
dependents        15
loan_term         14
gender            13
married            3
loan_status        0
property_area      0
co_income          0
income             0
education          0
dtype: int64

<IPython.core.display.Javascript object>