In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import joblib
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset used for all modelling below.
# NOTE(review): presumably the output of an earlier cleaning/encoding step (file name) — confirm its provenance.
data = pd.read_csv('data_cleaning.csv')

## Store Feature Matrix In X And Response (Target) In Vector y

In [3]:
# Every column except the target is a feature; the target is 'Loan_Status'.
X = data.drop(columns=['Loan_Status'])
y = data.loc[:, 'Loan_Status']

##  Feature Scaling

In [4]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
1,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
2,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
3,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
4,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1,1


In [5]:
# Standardise only the four continuous columns; the remaining features are
# left as the integer codes shown in data.head().
# NOTE(review): the scaler is fitted on every row before any train/test split
# or cross-validation happens, so held-out rows influence the scaling.
cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]
st = StandardScaler()
X[cols] = st.fit(X[cols]).transform(X[cols])

## Splitting The Dataset Into The Training Set And Test Set & Applying K-Fold Cross Validation 

In [6]:
# Maps each evaluated estimator instance to its mean CV accuracy in percent.
model_df={}
# Define model evaluation function
def model_val(model, x, y, test_size=0.20, random_state=42, cv=5):
    """Report hold-out accuracy and mean cross-validated score for ``model``.

    The estimator is fitted on the training part of a single train/test split
    and its accuracy on the test part is printed. It is then scored with
    ``cross_val_score`` on the full ``x``/``y``; the mean of those scores is
    printed and stored (as a percentage rounded to two decimals) in the
    module-level ``model_df`` dict, keyed by the estimator instance itself.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Left fitted on the training split when the function returns.
    x, y : feature matrix and target vector.
    test_size : float, default 0.20
        Fraction of rows held out for the single-split accuracy.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is repeatable.
    cv : int, default 5
        ``cv`` argument forwarded to ``cross_val_score``.

    Returns
    -------
    None
        Kept that way on purpose so calling cells do not gain an ``Out[]``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(f"{model} accuracy is {accuracy_score(y_test, y_pred)}")

    # Compute the mean once and reuse it for both the report and the registry.
    mean_cv_score = np.mean(cross_val_score(model, x, y, cv=cv))
    print(f"{model} avg cross val score is {mean_cv_score}")
    model_df[model] = round(mean_cv_score * 100, 2)

####  Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression with the library's default hyperparameters.
model = LogisticRegression()
model_val(model=model, x=X, y=y)

LogisticRegression() accuracy is 0.8018018018018018
LogisticRegression() avg cross val score is 0.8047829647829647


#### SVC

In [8]:
from sklearn import svm
# Baseline: support-vector classifier with the library's default hyperparameters.
model = svm.SVC()
model_val(model=model, x=X, y=y)

SVC() accuracy is 0.8018018018018018
SVC() avg cross val score is 0.7938902538902539


#### DecisionTree

In [9]:
from sklearn.tree import DecisionTreeClassifier
# Baseline: a single decision tree with the library's default hyperparameters.
model = DecisionTreeClassifier()
model_val(model=model, x=X, y=y)

DecisionTreeClassifier() accuracy is 0.7477477477477478
DecisionTreeClassifier() avg cross val score is 0.7107125307125307


#### RandomForest

In [10]:
from sklearn.ensemble import RandomForestClassifier
# Baseline: random forest with the library's default hyperparameters.
model = RandomForestClassifier()
model_val(model=model, x=X, y=y)

RandomForestClassifier() accuracy is 0.7567567567567568
RandomForestClassifier() avg cross val score is 0.7921048321048321


#### GradientBoosting

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
# Baseline: gradient boosting with the library's default hyperparameters.
model = GradientBoostingClassifier()
model_val(model=model, x=X, y=y)

GradientBoostingClassifier() accuracy is 0.7927927927927928
GradientBoostingClassifier() avg cross val score is 0.7776085176085176


In [12]:
model_df

{LogisticRegression(): 80.48,
 SVC(): 79.39,
 DecisionTreeClassifier(): 71.07,
 RandomForestClassifier(): 79.21,
 GradientBoostingClassifier(): 77.76}

## Hyperparameter Tuning

In [13]:
from sklearn.model_selection import RandomizedSearchCV

#### Logistic Regression

In [14]:
# Search space for logistic regression: 20 log-spaced values of C between
# 1e-4 and 1e4, always with the 'liblinear' solver.
log_reg_grid = dict(
    C=np.logspace(-4, 4, num=20),
    solver=['liblinear'],
)

In [15]:
# n_iter equals the number of combinations in log_reg_grid (20 x 1), so the
# search evaluates 20 candidates with 5-fold cross-validation each.
rs_log_reg = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=log_reg_grid,
    cv=5,
    n_iter=20,
    verbose=True,
)
rs_log_reg.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [16]:
rs_log_reg.best_score_

0.8047829647829647

In [17]:
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': 0.23357214690901212}

#### SVC

In [18]:
# Search space for the SVC: four values of C, linear kernel only.
svc_grid = dict(C=[0.25, 0.50, 0.75, 1], kernel=["linear"])

In [19]:
# svc_grid only holds 4 combinations, so although n_iter asks for 20 the
# search ends up fitting 4 candidates (see the fit log below).
rs_svc = RandomizedSearchCV(
    estimator=svm.SVC(),
    param_distributions=svc_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

In [20]:
# Run the SVC search on the full feature matrix; folds come from cv=5 set on rs_svc.
rs_svc.fit(X,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [21]:
rs_svc.best_score_

0.8066011466011467

In [22]:
rs_svc.best_params_

{'kernel': 'linear', 'C': 0.25}

#### Random Forest

In [23]:
# Search space for the random forest.
# 'auto' is no longer an accepted max_features value (removed in
# scikit-learn 1.3); because warnings are silenced at the top of the notebook,
# candidates using it would fail quietly and score NaN. For classifiers it
# used to mean 'sqrt', so only valid options are listed here.
rf_grid = {
    'n_estimators': np.arange(10, 1000, 10),
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 3, 5, 10, 20, 30],
    'min_samples_split': [2, 5, 20, 50, 100],
    'min_samples_leaf': [1, 2, 5, 10],
}

In [24]:
# Evaluate 20 parameter combinations drawn from rf_grid, each with 5-fold CV.
# NOTE(review): no random_state is set, so the sampled combinations (and the
# reported best parameters) are not expected to repeat between runs.
rs_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=rf_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

In [28]:
# Run the random-forest search on the full feature matrix; folds come from cv=5 set on rs_rf.
rs_rf.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [29]:
rs_rf.best_score_

0.8066175266175266

In [30]:
rs_rf.best_params_

{'n_estimators': 920,
 'min_samples_split': 5,
 'min_samples_leaf': 5,
 'max_features': 'sqrt',
 'max_depth': 20}

##### LogisticRegression
1. LogisticRegression score Before Hyperparameter Tuning: 80.48
2. LogisticRegression score after Hyperparameter Tuning: 80.48 
    
------------------------------------------------------
##### SVC
1. SVC score Before Hyperparameter Tuning: 79.39
2. SVC score after Hyperparameter Tuning: 80.66
    
--------------------------------------------------------
##### RandomForestClassifier
1. RandomForestClassifier score Before Hyperparameter Tuning: 79.21
2. RandomForestClassifier score after Hyperparameter Tuning: 80.66 


### Save The Model

In [62]:
# Rebuild the features straight from `data`, i.e. without the standardisation
# applied earlier, so the saved model is trained on the same raw values the
# prediction form passes in.
X = data.drop(columns='Loan_Status')
y = data.loc[:, 'Loan_Status']

In [31]:
# Final model, trained on all rows with the hyperparameters reported by the
# random-forest search (rs_rf.best_params_).
rf = RandomForestClassifier(
    n_estimators=920,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features='sqrt',
    max_depth=20,
    random_state=42,  # fixed seed so the persisted model can be rebuilt exactly
)
rf.fit(X, y)

In [32]:
data.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [33]:
import joblib

In [36]:
joblib.dump(rf,'loan_status_predict')

['loan_status_predict.pkl']

In [37]:
model = joblib.load('loan_status_predict')

In [38]:
import pandas as pd
# One hand-made applicant used to smoke-test the reloaded model; the columns
# are listed in the same order as the training features.
df = pd.DataFrame(
    data=dict(
        Gender=1,
        Married=1,
        Dependents=1,
        Education=0,
        Self_Employed=0,
        ApplicantIncome=2000,
        CoapplicantIncome=0.0,
        LoanAmount=4500000,
        Loan_Amount_Term=18,
        Credit_History=0,
        Property_Area=2,
    ),
    index=[0],
)

In [39]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,1,0,0,2000,0.0,4500000,18,0,2


In [40]:
# Predicted class 1 is reported as an approval, anything else as a rejection.
result = model.predict(df)
print("Loan Approved" if result == 1 else "Loan Not Approved")

Loan Not Approved


### GUI

In [42]:
from tkinter import *
import joblib
import pandas as pd

In [43]:
def show_entry():
    """Read the eleven form fields, run the saved model and show the verdict.

    Relies on the module-level Tk widgets ``master`` and ``e1`` .. ``e11``.
    Each field is parsed with ``float``; the values go into a one-row
    DataFrame whose columns follow the training column order, and the model
    stored in 'loan_status_predict.pkl' predicts the class. Class 1 is shown
    as "Loan approved", anything else as "Loan Not Approved".

    The verdict (or a message about invalid input / a missing model file) is
    written to one label in grid row 31. That label is created on the first
    call and re-used afterwards, so successive clicks replace the previous
    text instead of stacking a new widget on top of the old one.
    """
    # Single persistent output label, remembered on the function object and
    # recreated if the form has been rebuilt on a different root window.
    status = getattr(show_entry, "_status_label", None)
    if status is None or status.master is not master:
        status = Label(master)
        status.grid(row=31)
        show_entry._status_label = status

    columns = (
        'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term', 'Credit_History', 'Property_Area',
    )
    fields = (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11)

    # Empty or non-numeric input makes float() raise ValueError; tell the
    # user in the window instead of letting the callback fail unseen.
    try:
        values = [float(field.get()) for field in fields]
    except ValueError:
        status.config(text="Please enter a numeric value in every field")
        return

    # NOTE: joblib.load unpickles the file and can execute code stored in it;
    # only load a model file you created yourself.
    try:
        model = joblib.load('loan_status_predict.pkl')
    except FileNotFoundError:
        status.config(text="Model file 'loan_status_predict.pkl' not found")
        return

    df = pd.DataFrame(dict(zip(columns, values)), index=[0])
    result = model.predict(df)

    # predict() returns one label per row; the frame has exactly one row.
    if result[0] == 1:
        status.config(text="Loan approved")
    else:
        status.config(text="Loan Not Approved")

master = Tk()
master.title("Loan Status Prediction Using Machine Learning")

# Create the heading first and place it afterwards: .grid() returns None, so
# chaining it onto the constructor would leave `label` bound to None.
label = Label(master, text="Loan Status Prediction", bg="black", fg="white")
label.grid(row=0, columnspan=2)

# One prompt per model feature, in the same order as the training columns.
# The Dependents prompt now lists 0 (it occurs in the data preview) and the
# Self_Employed prompt now matches the column name (1 means self-employed).
# NOTE(review): the remaining code hints are unchanged — confirm them against
# the encoding used when the CSV was produced.
field_prompts = [
    "Gender [1:Male ,0:Female]",
    "Married [1:Yes,0:No]",
    "Dependents [0,1,2,3,4]",
    "Education[1:Educated, 0: Uneducated]",
    "Self_Employed [1:Yes, 0:No]",
    "ApplicantIncome",
    "CoapplicantIncome",
    "LoanAmount",
    "Loan_Amount_Term",
    "Credit_History",
    "Property_Area[0:Rural, 1:Urban, 2:Semi-Urban]",
]

# Row 0 holds the heading, so prompts and their entry boxes start at row 1.
entries = []
for row, prompt in enumerate(field_prompts, start=1):
    Label(master, text=prompt).grid(row=row)
    entry = Entry(master)
    entry.grid(row=row, column=1)
    entries.append(entry)

# show_entry() reads the inputs through these module-level names.
e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11 = entries

Button(master, text="Predict", command=show_entry).grid()

mainloop()