In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [16]:
data = pd.read_csv('loan_data_set.csv')

In [3]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
data.shape

(614, 13)

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             614 non-null    object 
 2   Married            614 non-null    object 
 3   Dependents         614 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      614 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    float64
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [17]:
categorical_variables=['Gender','Married','Dependents','Education','Self_Employed','Credit_History', 'Property_Area', 'Loan_Status']
print('Categorical variable are :\n')
for i in range (len(categorical_variables)):
    print(categorical_variables[i])

Categorical variable are :

Gender
Married
Dependents
Education
Self_Employed
Credit_History
Property_Area
Loan_Status


In [18]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [21]:
data['Gender'] = data['Gender'].fillna(data['Gender'].mode()[0])
data['Married'] = data['Married'].fillna(data['Married'].mode()[0])
data['Dependents'] = data['Dependents'].fillna(data['Dependents'].mode()[0])
data['Self_Employed'] = data['Self_Employed'].fillna(data['Self_Employed'].mode()[0])
data['Credit_History'] = data['Credit_History'].fillna(data['Credit_History'].mode()[0])

data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].median())
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].median())

In [22]:
data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [24]:
dummies1=pd.get_dummies(data['Gender'])
dummies2=pd.get_dummies(data['Married'])
dummies2=dummies2.rename(columns = {'Yes':'Married_yes','No':'NotMarried'})
dummies3=pd.get_dummies(data['Dependents'])
dummies3=dummies3.rename(columns = {'0': 'Dependents_0','1': 'Dependents_1','2': 'Dependents_2','3+': 'Dependents_3+'})
dummies4=pd.get_dummies(data['Education'])
dummies5=pd.get_dummies(data['Self_Employed'])
dummies5=dummies5.rename(columns = {'Yes':'Self_Employed_yes','No':'NotSelf_Employed'})
dummies6=pd.get_dummies(data['Property_Area'])

In [25]:
data= pd.concat([data,dummies1],axis=1)
data= pd.concat([data,dummies2],axis=1)
data= pd.concat([data,dummies3],axis=1)
data= pd.concat([data,dummies4],axis=1)
data= pd.concat([data,dummies5],axis=1)
data= pd.concat([data,dummies6],axis=1)


data=data.drop(['Loan_ID','Gender','Married','Property_Area','Dependents','Self_Employed','Education','NotMarried','NotSelf_Employed','Not Graduate','Female'] , axis=1)

In [27]:
data['Loan_Status']= data['Loan_Status'].replace(['Y'],1).infer_objects(copy=False)
data['Loan_Status']= data['Loan_Status'].replace(['N'],0).infer_objects(copy=False)

In [28]:
data.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Male,Married_yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Graduate,Self_Employed_yes,Rural,Semiurban,Urban
0,5849,0.0,128.0,360.0,1.0,1,True,False,True,False,False,False,True,False,False,False,True
1,4583,1508.0,128.0,360.0,1.0,0,True,True,False,True,False,False,True,False,True,False,False
2,3000,0.0,66.0,360.0,1.0,1,True,True,True,False,False,False,True,True,False,False,True
3,2583,2358.0,120.0,360.0,1.0,1,True,True,True,False,False,False,False,False,False,False,True
4,6000,0.0,141.0,360.0,1.0,1,True,False,True,False,False,False,True,False,False,False,True


In [30]:
X=data.drop(columns='Loan_Status')
Y=pd.DataFrame(data['Loan_Status'])

In [31]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [32]:
classifier = RandomForestClassifier(n_estimators=1000,max_features=15,max_depth=5,bootstrap=True)
classifier.fit(X_train,Y_train)
predictions = classifier.predict(X_test)
accuracyScores = accuracy_score(predictions, Y_test)
print(accuracyScores)

  return fit_method(estimator, *args, **kwargs)


0.7804878048780488


In [35]:
X_test

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Male,Married_yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Graduate,Self_Employed_yes,Rural,Semiurban,Urban
350,9083,0.0,228.0,360.0,1.0,True,True,True,False,False,False,True,False,False,True,False
377,4310,0.0,130.0,360.0,1.0,True,True,True,False,False,False,True,False,False,True,False
163,4167,1447.0,158.0,360.0,1.0,True,True,False,False,True,False,True,False,True,False,False
609,2900,0.0,71.0,360.0,1.0,False,False,True,False,False,False,True,False,True,False,False
132,2718,0.0,70.0,360.0,1.0,True,False,True,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,3716,0.0,42.0,180.0,1.0,True,True,True,False,False,False,True,False,True,False,False
312,2507,0.0,56.0,360.0,1.0,False,False,True,False,False,False,True,False,True,False,False
248,2882,1843.0,123.0,480.0,1.0,True,True,False,True,False,False,True,False,False,True,False
11,2500,1840.0,109.0,360.0,1.0,True,True,False,False,True,False,True,False,False,False,True


In [33]:
import pickle

In [34]:
filename = 'model.sav'
pickle.dump(classifier, open(filename, 'wb'))