In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
%matplotlib inline

df = pd.read_csv('Loan_Data.csv')

In [3]:
df.shape

(614, 13)

In [4]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [5]:
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y
613,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


In [8]:
print(df.isnull().sum())

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [9]:
df.isnull().sum() * 100 / len(df)

Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [10]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [11]:
df = df.drop('Loan_ID', axis=1)

In [12]:
columns = ['Gender', 'Dependents', 'LoanAmount', 'Loan_Amount_Term']

In [13]:
df = df.dropna(subset=columns)

In [14]:
df.isnull().sum() * 100 / len(df)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        5.424955
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.679928
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [15]:
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [16]:
df.isnull().sum() * 100 / len(df)

Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

In [17]:
df.sample(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
369,Male,Yes,0,Graduate,No,19730,5266.0,570.0,360.0,1.0,Rural,N
59,Male,Yes,2,Not Graduate,No,3357,2859.0,144.0,360.0,1.0,Urban,Y
527,Male,Yes,1,Not Graduate,No,5285,1430.0,161.0,360.0,0.0,Semiurban,Y
504,Male,Yes,0,Not Graduate,No,3814,1483.0,124.0,300.0,1.0,Semiurban,Y
596,Male,Yes,2,Not Graduate,Yes,6383,1000.0,187.0,360.0,1.0,Rural,N
317,Male,Yes,0,Graduate,No,2058,2134.0,88.0,360.0,1.0,Urban,Y
168,Male,No,0,Graduate,No,2237,0.0,63.0,480.0,0.0,Semiurban,N
579,Male,No,0,Graduate,No,3182,2917.0,161.0,360.0,1.0,Urban,Y
574,Male,Yes,3+,Graduate,No,6406,0.0,150.0,360.0,1.0,Semiurban,N
566,Male,No,0,Graduate,No,3333,0.0,70.0,360.0,1.0,Urban,Y


In [18]:
df['Dependents'] = df['Dependents'].replace(to_replace="3+", value='4')
df.sample(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
478,Male,Yes,1,Graduate,Yes,16667,2250.0,86.0,360.0,1.0,Semiurban,Y
447,Male,Yes,0,Graduate,No,3539,1376.0,55.0,360.0,1.0,Rural,N
299,Male,Yes,1,Graduate,No,2014,2925.0,113.0,360.0,1.0,Urban,N
317,Male,Yes,0,Graduate,No,2058,2134.0,88.0,360.0,1.0,Urban,Y
600,Female,No,4,Graduate,No,416,41667.0,350.0,180.0,1.0,Urban,N
313,Male,Yes,2,Graduate,Yes,5746,0.0,144.0,84.0,1.0,Rural,Y
456,Male,Yes,0,Graduate,No,4301,0.0,118.0,360.0,1.0,Urban,Y
613,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N
184,Female,Yes,0,Graduate,No,3625,0.0,108.0,360.0,1.0,Semiurban,Y
190,Male,No,0,Not Graduate,No,4885,0.0,48.0,360.0,1.0,Rural,Y


In [19]:
print(df['Gender'].unique())
print(df['Married'].unique())
print(df['Education'].unique())
print(df['Self_Employed'].unique())
print(df['Property_Area'].unique())
print(df['Loan_Status'].unique())

['Male' 'Female']
['Yes' 'No']
['Graduate' 'Not Graduate']
['No' 'Yes']
['Rural' 'Urban' 'Semiurban']
['N' 'Y']


In [20]:
df['Gender'] = df['Gender'].map({'Male':1, 'Female':0}).astype('int')
df['Married'] = df['Married'].map({'Yes':1, 'No':0}).astype('int')
df['Education'] = df['Education'].map({'Graduate':1, 'Not Graduate':0}).astype('int')
df['Self_Employed'] = df['Self_Employed'].map({'No': 1, 'Yes': 0}).astype('int')
df['Property_Area'] = df['Property_Area'].map({'Rural': 0, 'Urban': 1, 'Semiurban': 2}).astype('int')
df['Loan_Status'] = df['Loan_Status'].map({'N': 0, 'Y':1}).astype('int')

In [21]:
print(df['Gender'].unique())
print(df['Married'].unique())
print(df['Education'].unique())
print(df['Self_Employed'].unique())
print(df['Property_Area'].unique())
print(df['Loan_Status'].unique())

[1 0]
[1 0]
[1 0]
[1 0]
[0 1 2]
[0 1]


In [22]:
df.sample(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
438,1,0,0,1,0,10416,0.0,187.0,360.0,0.0,1,0
49,0,0,0,1,1,4000,2275.0,144.0,360.0,1.0,2,1
118,1,1,0,1,1,5568,2142.0,175.0,360.0,1.0,0,0
105,1,1,1,1,1,3052,1030.0,100.0,360.0,1.0,1,1
4,1,0,0,1,1,6000,0.0,141.0,360.0,1.0,1,1
394,1,1,2,1,1,3100,1400.0,113.0,360.0,1.0,1,1
93,1,0,0,1,1,4133,0.0,122.0,360.0,1.0,2,1
428,1,1,0,1,1,2920,16.120001,87.0,360.0,1.0,0,1
454,1,0,0,1,0,7085,0.0,84.0,360.0,1.0,2,1
222,1,0,0,1,1,2971,2791.0,144.0,360.0,1.0,2,1


In [23]:
X = df.drop('Loan_Status', axis=1)
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,1,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,1,0,3000,0.0,66.0,360.0,1.0,1
3,1,1,0,0,1,2583,2358.0,120.0,360.0,1.0,1
4,1,0,0,1,1,6000,0.0,141.0,360.0,1.0,1
5,1,1,2,1,0,5417,4196.0,267.0,360.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,1,2900,0.0,71.0,360.0,1.0,0
610,1,1,4,1,1,4106,0.0,40.0,180.0,1.0,0
611,1,1,1,1,1,8072,240.0,253.0,360.0,1.0,1
612,1,1,2,1,1,7583,0.0,187.0,360.0,1.0,1


In [24]:
y = df['Loan_Status']
y

1      0
2      1
3      1
4      1
5      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 553, dtype: int64

In [25]:
df.sample(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
428,1,1,0,1,1,2920,16.120001,87.0,360.0,1.0,0,1
163,1,1,2,1,1,4167,1447.0,158.0,360.0,1.0,0,1
394,1,1,2,1,1,3100,1400.0,113.0,360.0,1.0,1,1
486,1,0,2,1,1,3547,0.0,80.0,360.0,0.0,0,0
294,1,1,0,1,1,2383,3334.0,172.0,360.0,1.0,2,1
241,1,1,1,0,1,2510,1983.0,140.0,180.0,1.0,1,0
116,0,1,0,1,1,3167,2283.0,154.0,360.0,1.0,2,1
359,1,1,4,1,1,5167,3167.0,200.0,360.0,1.0,2,1
532,1,0,2,1,1,3588,0.0,110.0,360.0,0.0,0,0
107,1,0,0,0,1,7333,0.0,120.0,360.0,1.0,0,0


In [26]:
feature_scaling_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

In [27]:
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
X[feature_scaling_cols] = st.fit_transform(X[feature_scaling_cols])
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,1,-0.128694,-0.049699,-0.214368,0.279961,1.0,0
2,1,1,0,1,0,-0.394296,-0.545638,-0.952675,0.279961,1.0,1
3,1,1,0,0,1,-0.464262,0.229842,-0.309634,0.279961,1.0,1
4,1,0,0,1,1,0.109057,-0.545638,-0.059562,0.279961,1.0,1
5,1,1,2,1,0,0.011239,0.834309,1.440866,0.279961,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,1,-0.411075,-0.545638,-0.893134,0.279961,1.0,0
610,1,1,4,1,1,-0.208727,-0.545638,-1.262287,-2.468292,1.0,0
611,1,1,1,1,1,0.456706,-0.466709,1.274152,0.279961,1.0,1
612,1,1,2,1,1,0.374659,-0.545638,0.488213,0.279961,1.0,1


In [28]:
model_cv_score = {}

def model_eval(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{model} Accuracy : {accuracy_score(y_test, y_pred)}")
    cv_score = cross_val_score(model, X, y, cv=5)
    print(f"{model} Cross validation score : {np.mean(cv_score)}")
    model_cv_score[model] = round(np.mean(cv_score) * 100, 2)

In [29]:
model = LogisticRegression()
model_eval(model, X, y)

LogisticRegression() Accuracy : 0.8378378378378378
LogisticRegression() Cross validation score : 0.8047829647829647


### Hyperparameter Tuning

In [30]:
from sklearn.model_selection import RandomizedSearchCV

In [31]:
log_reg_grid = {"C":np.logspace(-4, 4, 20), "solver":['liblinear']}
log_reg = RandomizedSearchCV(LogisticRegression(), param_distributions=log_reg_grid, n_iter=20, cv=5, verbose=True)

In [32]:
log_reg.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [33]:
log_reg.best_score_

0.8066011466011467

In [34]:
log_reg.best_params_

{'solver': 'liblinear', 'C': 0.23357214690901212}

In [35]:
import joblib

In [36]:
joblib.dump(log_reg, 'loan_prediction_model')

['loan_prediction_model']

In [37]:
model = joblib.load('loan_prediction_model')

In [38]:
model_test_data = pd.DataFrame({
    'Gender': 1,
    'Married': 1,
    'Dependents': 2,
    'Education': 0,
    'Self_Employed': 0,
    'ApplicantIncome': 2889,
    'CoapplicantIncome': 0.0,
    'LoanAmount': 45,
    'Loan_Amount_Term': 180,
    'Credit_History': 0,
    'Property_Area': 1
}, index=[0])

In [39]:
model_test_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,2,0,0,2889,0.0,45,180,0,1


In [40]:
result = model.predict(model_test_data)

In [41]:
if result == 1:
    print("Loan Approved")
else:
    print("You are not eligible for this loan")

Loan Approved
