# Loan Prediction

#### Column Description
* **Loan_ID** -	Unique Loan ID
* **Gender** -	Male/ Female
* **Married** -	Applicant married (Y/N)
* **Dependents** -	Number of dependents
* **Education** -	Applicant Education (Graduate/ Not Graduate)
* **Self_Employed** -	Self employed (Y/N)
* **ApplicantIncome** -	Applicant income
* **CoapplicantIncome** -	Coapplicant income
* **LoanAmount** -	Loan amount in thousands
* **Loan_Amount_Term** -	Term of loan in months
* **Credit_History** -	credit history meets guidelines
* **Property_Area** -	Urban/ Semi Urban/ Rural
* **Loan_Status** -	(Target) Loan approved (Y/N)


## Set Up

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training split from the local data directory, then show its
# (rows, columns) shape and the first rows as a quick sanity check.
train = pd.read_csv('./data/train.csv')
print(train.shape)
train.head()

(614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [2]:
# Load the test split the same way as the training split and preview it.
test = pd.read_csv('./data/test.csv')
print(test.shape)
test.head()

(367, 12)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


## Examining the Data

In [3]:
# Per-column dtypes of the training frame (text columns are reported as object).
train.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [4]:
# Collect the names of the text (object-dtype) columns. Loan_ID is the unique
# loan identifier, so it is removed from the list of columns to inspect.
obj_cols = train.columns[train.dtypes == 'object'].tolist()
obj_cols.remove('Loan_ID')
obj_cols

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Loan_Status']

In [5]:
# Print the distinct values (NaN included) of every text column.
for col, values in train[obj_cols].items():
    print(f"{col}:")
    print(f"{values.unique()}\n")

Gender:
['Male' 'Female' nan]

Married:
['No' 'Yes' nan]

Dependents:
['0' '1' '2' '3+' nan]

Education:
['Graduate' 'Not Graduate']

Self_Employed:
['No' 'Yes' nan]

Property_Area:
['Urban' 'Rural' 'Semiurban']

Loan_Status:
['Y' 'N']



## Preprocessing Columns & Values

First, the ```Loan_Status``` column is much more useful in plots and correlations once it is encoded as an integer. In this notebook, I will encode 'Y' as 1 and 'N' as 0.

In [6]:
# Work on a copy so the raw `train` frame stays untouched by the encoding
# steps; the shape and first rows are shown to confirm the copy matches.
ntrain = train.copy()
print(ntrain.shape)
ntrain.head()

(614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
# Encode the categorical columns of `ntrain` as numbers so they can be used in
# correlations. Every encoded column is derived from the raw `train` frame, so
# re-running this cell always produces the same result.
obj_cols = [col for col in train.columns[train.dtypes == 'object'] if col != 'Loan_ID']

# Two-valued columns and the integer code assigned to each label.
BINARY_CODES = {
    'Gender': {'Male': 0, 'Female': 1},
    'Married': {'No': 0, 'Yes': 1},
    'Education': {'Not Graduate': 0, 'Graduate': 1},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Loan_Status': {'N': 0, 'Y': 1},
}

for col, codes in BINARY_CODES.items():
    # Labels without a code (i.e. missing values) become NaN.
    ntrain[col] = train[col].map(codes)
    # Sanity check: encoding a column must not change the frame's shape.
    print(ntrain.shape)

# Property_Area: plain integer labels; the numbers do not imply an ordering.
ntrain['Property_Area'] = train['Property_Area'].map({'Urban': 1, 'Rural': 2, 'Semiurban': 3})

# Dependents: strip the '+' from '3+' before converting. A blanket
# to_numeric(errors='coerce') would silently turn every '3+' into NaN and make
# those rows indistinguishable from genuinely missing values.
ntrain['Dependents'] = pd.to_numeric(train['Dependents'].str.rstrip('+'), errors='coerce')

(614, 13)
(614, 13)
(614, 13)
(614, 13)
(614, 13)


In [8]:
from sklearn.preprocessing import LabelBinarizer
binarizer = LabelBinarizer()

# Property_Area
# One-hot encode the property area: one 0/1 indicator column per class, named
# after binarizer.classes_. The frame reuses train's index so that a later
# column-wise concat aligns rows by label even if the index is not 0..n-1.
results = binarizer.fit_transform(train["Property_Area"])
property_area_encoded = pd.DataFrame(results, columns=binarizer.classes_, index=train.index)

In [22]:
# Show the rows whose Dependents value is missing.
# The raw `train` frame is inspected directly: re-creating `ntrain` from
# `train` here would throw away the encodings applied to `ntrain` earlier,
# and a bare `fillna(...)` call returns a new Series without changing the
# frame, so neither statement belongs in an inspection cell.
train[train.Dependents.isnull()]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
102,LP001350,Male,Yes,,Graduate,No,13650,0.0,,360.0,1.0,Urban,Y
104,LP001357,Male,,,Graduate,No,3816,754.0,160.0,360.0,1.0,Urban,Y
120,LP001426,Male,Yes,,Graduate,No,5667,2667.0,180.0,360.0,1.0,Rural,Y
226,LP001754,Male,Yes,,Not Graduate,Yes,4735,0.0,138.0,360.0,1.0,Urban,N
228,LP001760,Male,,,Graduate,No,4758,0.0,158.0,480.0,1.0,Semiurban,Y
293,LP001945,Female,No,,Graduate,No,5417,0.0,143.0,480.0,0.0,Urban,N
301,LP001972,Male,Yes,,Not Graduate,No,2875,1750.0,105.0,360.0,1.0,Semiurban,Y
332,LP002100,Male,No,,Graduate,No,2833,0.0,71.0,360.0,1.0,Urban,Y
335,LP002106,Male,Yes,,Graduate,Yes,5503,4490.0,70.0,,1.0,Semiurban,Y
346,LP002130,Male,Yes,,Not Graduate,No,3523,3230.0,152.0,360.0,0.0,Rural,N


In [None]:
# Dependents
# Missing values are treated as "no dependents". fillna() returns a new Series,
# so its result has to be kept. The raw `train` column is the source, which
# keeps this cell re-runnable after `ntrain` has lost its Dependents column.
dependents = train['Dependents'].fillna('0')

# has_child is 0 for applicants without dependents and 1 for everyone else.
has_child_flag = (dependents != '0').astype(int)

# One-hot encode the four categories ('0', '1', '2', '3+'). The '+' is stripped
# so the indicator columns are named '0', '1', '2', '3'.
results = binarizer.fit_transform(dependents)
dependents_encoded = pd.DataFrame(
    results,
    columns=[label.rstrip('+') for label in binarizer.classes_],
    index=train.index,
)

# Replace the two categorical columns with their indicator columns. Indicator
# columns added by a previous run are dropped first so the cell is idempotent.
indicator_cols = [*property_area_encoded.columns, *dependents_encoded.columns]
ntrain = pd.concat(
    [
        ntrain
        .drop(columns=['Property_Area', 'Dependents', *indicator_cols], errors='ignore')
        .assign(has_child=has_child_flag),
        property_area_encoded,
        dependents_encoded,
    ],
    axis=1,
    sort=False,
)

ntrain.head()

In [None]:
# Per-column dtypes of the working frame after the preprocessing cells.
ntrain.dtypes

## Finding correlation between columns

In [None]:
print("==== Correlation between columns ==== \n")

# Columns to compare: everything after the first column up to and including
# the seventh column from the end. Each column is paired with every later
# column in that range, and pairs whose absolute correlation exceeds 10% are
# reported.
compared_cols = list(ntrain.columns[1:-6])

for position, col in enumerate(compared_cols[:-1]):
    for next_col in compared_cols[position + 1:]:
        corr = ntrain[col].corr(ntrain[next_col])*100
        if abs(corr) > 10:
            print(f"{col} & {next_col}: {abs(corr):.3f}%\n")

In [None]:
# Gender & Married
# Count of applicants per gender, split by marital status. The title lets the
# figure stand alone; the trailing ';' hides the text repr of the last value.
ax = sns.countplot(data=train, x='Gender', hue='Married')
ax.set_title('Applicants by gender and marital status');

In [None]:
# Gender & LoanAmount
# One point per applicant: loan amount grouped by gender. The title lets the
# figure stand alone; the trailing ';' hides the text repr of the last value.
ax = sns.swarmplot(data=train, x='Gender', y='LoanAmount')
ax.set_title('Loan amount by gender');

In [None]:
# Gender & has_child
# Count of applicants per gender, split by the has_child flag. The title lets
# the figure stand alone; the trailing ';' hides the text repr of the last value.
ax = sns.countplot(data=ntrain, x="Gender", hue='has_child')
ax.set_title('Applicants by gender and has_child');

In [None]:
# Married & LoanAmount
# One point per applicant: loan amount grouped by marital status. The title
# lets the figure stand alone; the trailing ';' hides the text repr.
ax = sns.swarmplot(data=train, x='Married', y='LoanAmount')
ax.set_title('Loan amount by marital status');

In [None]:
# Married & Loan_Amount_Term

In [None]:
# Married & has_child
# Count of applicants per marital status, split by the has_child flag. The
# title lets the figure stand alone; the trailing ';' hides the text repr.
ax = sns.countplot(data=ntrain, x='Married', hue='has_child')
ax.set_title('Applicants by marital status and has_child');

In [None]:
# Education & ApplicantIncome

In [None]:
# Education & LoanAmount

In [None]:
# Self_Employed & ApplicantIncome

In [None]:
# Self_Employed & LoanAmount

In [None]:
# ApplicantIncome & CoapplicantIncome

In [None]:
# ApplicantIncome & LoanAmount

In [None]:
# CoapplicantIncome & LoanAmount

In [None]:
# Credit_History & Loan_Status

In [None]:
# Loan_Status & Rural

In [None]:
# Gender & Married

# Gender & LoanAmount

# Gender & has_child

# Married & LoanAmount

# Married & Loan_Amount_Term

# Married & has_child

# Education & ApplicantIncome

# Education & LoanAmount

# Self_Employed & ApplicantIncome

# Self_Employed & LoanAmount

# ApplicantIncome & CoapplicantIncome

# ApplicantIncome & LoanAmount

# CoapplicantIncome & LoanAmount

# Credit_History & Loan_Status

# Loan_Status & Rural

In [None]:
# Correlation (absolute value, in percent) between Loan_Status and every
# column except the first one and the last four; only values above 8% are
# printed.
for col in ntrain.columns[1:-4]:
    if col == 'Loan_Status':
        # A column always correlates 100% with itself; not worth reporting.
        continue
    corr = ntrain['Loan_Status'].corr(ntrain[col])*100
    if abs(corr) > 8:
        print(f"{col}:{abs(corr):.2f}\n")

In [None]:
# Sanity check: list the distinct values stored in the has_child column.
ntrain.has_child.unique()