In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [72]:
# load dataset in pandas
df = pd.read_csv("Loan_dataset.csv")

In [73]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [74]:
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [75]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [76]:
df.isnull()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,False,False,False,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,False,False,False,False,False,False,False,False,False,False,False,False,False
610,False,False,False,False,False,False,False,False,False,False,False,False,False
611,False,False,False,False,False,False,False,False,False,False,False,False,False
612,False,False,False,False,False,False,False,False,False,False,False,False,False


In [77]:
# getting total number of null values
total_null_count = df.isnull().sum().sum()
print(f"Total number of null values are: {total_null_count}")

Total number of null values are: 149


In [78]:
# get name of columns with missing values
df.columns[df.isnull().any()]


Index(['Gender', 'Married', 'Dependents', 'Self_Employed', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [79]:
# dropping the rows with null values
df_cleaned = df.dropna()
df = df_cleaned

In [80]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


In [81]:
# checking the null after dropping the rows with null values
df.isnull().sum().sum()

0

In [82]:
# setting string values to numeric as 1 and 0
df['Gender']  = df['Gender'].map({'Male': 1, 'Female': 0}).astype(int)
df['Married'] = df['Married'].map({'Yes': 1, 'No': 0}).astype(int)
df['Education'] = df['Education'].map({'Graduate': 1, 'Not Graduate': 0}).astype(int)
df['Self_Employed'] = df['Self_Employed'].map({'Yes': 1, 'No': 0}).astype(int)
df['Property_Area'] = df['Property_Area'].map({'Urban':1, 'Rural':0, 'Semiurban':2}).astype(int)
df['Loan_Status'] = df['Loan_Status'].map({'Y':1, 'N':0}).astype(int)

In [83]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
5,LP001011,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1,1


In [84]:
df.describe()

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
count,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0
mean,0.820833,0.647917,0.797917,0.1375,5364.23125,1581.093583,144.735417,342.05,0.854167,1.108333,0.691667
std,0.383892,0.478118,0.401973,0.344734,5668.251251,2617.692267,80.508164,65.212401,0.353307,0.822906,0.462287
min,0.0,0.0,0.0,0.0,150.0,0.0,9.0,36.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,0.0,2898.75,0.0,100.0,360.0,1.0,0.0,0.0
50%,1.0,1.0,1.0,0.0,3859.0,1084.5,128.0,360.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,0.0,5852.5,2253.25,170.0,360.0,1.0,2.0,1.0
max,1.0,1.0,1.0,1.0,81000.0,33837.0,600.0,480.0,1.0,2.0,1.0


In [85]:
# splitting data for test purpose
x = df[['Gender','Married','Education','Self_Employed',
        'ApplicantIncome','CoapplicantIncome','LoanAmount',
        'Loan_Amount_Term','Credit_History','Property_Area']]
y = df['Loan_Status']

In [86]:
print(y.unique())
print(y.value_counts())
print(x.dtypes)
print(x.head())

[0 1]
Loan_Status
1    332
0    148
Name: count, dtype: int64
Gender                 int64
Married                int64
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
dtype: object
   Gender  Married  Education  Self_Employed  ApplicantIncome  \
1       1        1          1              0             4583   
2       1        1          1              1             3000   
3       1        1          0              0             2583   
4       1        0          1              0             6000   
5       1        1          1              1             5417   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
1             1508.0       128.0             360.0             1.0   
2                0.0        66.0             360.0             1.0   
3             2358.0       120.0 

In [87]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 0)


In [88]:
print(x.shape, x_train.shape, x_test.shape)

(480, 10) (384, 10) (96, 10)


In [89]:
# logistic regression
clf = LogisticRegression(max_iter=10000, random_state=0)
clf.fit(x_train, y_train)

In [90]:
# training the data
acc = accuracy_score(y_test, clf.predict(x_test)) *100
print(f"Logistic Regression Accuracy: {acc}")

Logistic Regression Accuracy: 75.0


In [91]:
# testing the data
x_test_prediction = clf.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)
print(f"Accuracy of Testing data: {test_data_accuracy}")

Accuracy of Testing data: 0.75


In [92]:
# random forest classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(x_train, y_train)

In [93]:
y_prediction = rf_clf.predict(x_test)
accuracy = accuracy_score(y_test,y_prediction) *100
print(f"Accuracy of Testing data: {accuracy}")

Accuracy of Testing data: 77.08333333333334


In [94]:
classification_rep = classification_report(y_test, y_prediction)
print(f"classification report\n{classification_rep}")

classification report
              precision    recall  f1-score   support

           0       0.88      0.43      0.58        35
           1       0.75      0.97      0.84        61

    accuracy                           0.77        96
   macro avg       0.81      0.70      0.71        96
weighted avg       0.80      0.77      0.75        96



Making Predictive System

In [None]:
input_data = (0, 0, 1, 0, 0, 2900, 0.0, 71.0, 360.0, 1.0)

#changing input data to numpy arrays
input_data_as_numpy_array = np.asarray(input_data)

#Reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = rf_clf.predict(input_data_reshaped)
if prediction[0] == 0:
    print("Loan Will Not Be Approved")
else:
    print("Loan Is Approved")

Loan Will Not Be Approved




In [None]:
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,LP002978,0,0,0,1,0,2900,0.0,71.0,360.0,1.0,0,1
610,LP002979,1,1,3+,1,0,4106,0.0,40.0,180.0,1.0,0,1
611,LP002983,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,1,1
612,LP002984,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,1,1
613,LP002990,0,0,0,1,1,4583,0.0,133.0,360.0,0.0,2,0


In [103]:
input_data = (0, 0, 1, 0, 2900, 0.0, 71.0, 360.0, 1.0, 0)

#changing input data to numpy arrays
input_data_as_numpy_array = np.asarray(input_data)

#Reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = clf.predict(input_data_reshaped)
if prediction[0] == 0:
    print("Loan Will Not Be Approved")
else:
    print("Loan Is Approved")

Loan Is Approved




SAVING THE TRAINED MODEL

In [104]:
import pickle

In [105]:
filename = "Loan_dataset.sav"
pickle.dump(rf_clf, open(filename, 'wb'))
