In [241]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

df=pd.read_csv("train_loanPrediction.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [242]:
df.apply(lambda x: sum(x.isnull()),axis=0) #to tell the missing values in each column

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [243]:
#to fill the missing values by mean 
df.LoanAmount = df.LoanAmount.fillna(df.LoanAmount.mean())
#since *no* has ~82% so, we will fill the missing values of this column with "no"
df.Self_Employed = df.Self_Employed.fillna('No')
df['LoanAmount_log'] = np.log(df['LoanAmount']) #log function to reduce the extreme effect in Loan amount (which we saw in the previous barplot)
#to decrease the extreme values of ApplicantIncome (which can be ssen from the previous barplot), we can add CoApplicantIncome to compensate the value 
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['TotalIncome_log'] = np.log(df['TotalIncome'])
df['Capacity%'] = ((df['LoanAmount'] / df['TotalIncome']) *100).astype(float) #capacity of each applicant of how well he/she is suited to pay back his loan.
#males are around ~81%, so we'll fill the missing values with "male"
df.Gender = df.Gender.fillna('Male')
#yes has ~65% 
#filling the missing values with yes
df.Married = df.Married.fillna('Yes')
df.Loan_Status = df.Loan_Status.fillna('1')
df.Credit_History = df.Credit_History.fillna('1.0')
new_data ={
    '1':1,
    '2':2,
    '3+':3
}
df=df.replace({'Dependents':new_data})
df=df.dropna()
df['Dependents']=df['Dependents'].astype('int')

In [244]:
#building predictive model 
#we will use scikit-learn (sklearn), for which we need to convert the categorical values into numerical values
from sklearn.preprocessing import LabelEncoder
var_col = ['Gender','Married','Education','Self_Employed','Property_Area','Loan_Status']
le = LabelEncoder()  #Encode labels with value between 0 and n_classes-1.
for i in var_col:
    df[i] = le.fit_transform(df[i]) #Fit label encoder and return encoded labels

In [245]:
df.loc[df['Credit_History']=='1.0']=1
df['Credit_History']=df['Credit_History'].astype('int')

In [247]:
#Import models from scikit learn module:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
from sklearn.model_selection import train_test_split


In [264]:
df.corr()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,LoanAmount_log,TotalIncome,TotalIncome_log,Capacity%
Gender,1.0,0.367951,0.189529,0.10477,0.077298,0.014842,0.122454,0.031292,-0.163925,0.032099,-0.009553,0.071048,-0.063391,0.059559,-0.090733,-0.150498
Married,0.367951,1.0,0.368998,0.107816,0.139682,-0.000705,0.072391,0.049854,-0.237289,0.029084,0.019126,0.128034,-0.117673,0.026123,-0.169993,-0.090056
Dependents,0.189529,0.368998,1.0,0.061786,0.081968,0.108286,-0.023659,0.134948,-0.100864,-0.04318,0.013453,0.038275,0.013544,0.094955,-0.032586,0.041762
Education,0.10477,0.107816,0.061786,1.0,0.277586,-0.237098,-0.139597,-0.353294,-0.462955,0.00484,-0.053607,0.024409,-0.504009,-0.278814,-0.525004,-0.132776
Self_Employed,0.077298,0.139682,0.081968,0.277586,1.0,-0.042956,-0.105703,-0.189705,-0.516403,0.052449,-0.044311,0.100259,-0.506256,-0.080313,-0.539787,-0.317054
ApplicantIncome,0.014842,-0.000705,0.108286,-0.237098,-0.042956,1.0,-0.076943,0.599858,0.173503,-0.053371,-0.012835,-0.056486,0.39191,0.929336,0.406185,-0.188798
CoapplicantIncome,0.122454,0.072391,-0.023659,-0.139597,-0.105703,-0.076943,1.0,0.217507,0.148068,-0.027876,0.004065,-0.072021,0.236886,0.296635,0.253893,-0.082931
LoanAmount,0.031292,0.049854,0.134948,-0.353294,-0.189705,0.599858,0.217507,1.0,0.391714,-0.072516,-0.059484,-0.120146,0.739136,0.655123,0.585566,0.310881
Loan_Amount_Term,-0.163925,-0.237289,-0.100864,-0.462955,-0.516403,0.173503,0.148068,0.391714,1.0,-0.091678,-0.039816,-0.166539,0.78575,0.221054,0.809909,0.417932
Credit_History,0.032099,0.029084,-0.04318,0.00484,0.052449,-0.053371,-0.027876,-0.072516,-0.091678,1.0,0.001332,0.542772,-0.1237,-0.061448,-0.114256,-0.087725


In [306]:
k=df.drop(['Loan_Status','Loan_ID','Dependents','Property_Area',''],axis=1)

In [307]:
x_train,x_test,y_train,y_test=train_test_split(k,df['Loan_Status'])

In [308]:
lf=LogisticRegression()
lf.fit(x_train,y_train)
predictions = lf.predict(x_test)
accuracy = metrics.accuracy_score(predictions,y_test)
accuracy

0.8775510204081632

In [309]:
lf=RandomForestClassifier()
lf.fit(x_train,y_train)
predictions = lf.predict(x_test)
accuracy = metrics.accuracy_score(predictions,y_test)
accuracy

0.8367346938775511