In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('loan_data.csv')

In [3]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


# Data Cleaning

### Droping unnecessary column

In [5]:
df.drop(['Loan_ID'], axis=1, inplace=True)

In [6]:
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### Filling the missing values (Numerical)

In [7]:
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mean())
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean())
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mean())

### Droping the missing values remaining (Categorical)

In [8]:
df.dropna(inplace=True)

In [9]:
df.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [10]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Replacing object type into int

In [11]:
df['Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [12]:
df['Gender'] = df['Gender'].map({'Male':1,'Female':0}).astype('int')
df['Married'] = df['Married'].map({'Yes':1,'No':0}).astype('int')
df['Education'] = df['Education'].map({'Graduate':1,'Not Graduate':0}).astype('int')
df['Self_Employed'] = df['Self_Employed'].map({'Yes':1,'No':0}).astype('int')
df['Property_Area'] = df['Property_Area'].map({'Urban':1,'Rural':0,'Semiurban':2}).astype('int')
df['Loan_Status'] = df['Loan_Status'].map({'Y':1,'N':0}).astype('int')

In [13]:
df['Dependents'] = df['Dependents'].replace(to_replace="3+",value='3')

In [14]:
df.sample(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
87,1,1,0,1,0,2500,2118.0,104.0,360.0,1.0,2,1
339,0,0,0,1,0,4160,0.0,71.0,360.0,1.0,2,1
356,1,1,2,1,0,8333,3167.0,165.0,360.0,1.0,0,1
96,0,1,0,1,0,2484,2302.0,137.0,360.0,1.0,2,1
12,1,1,2,1,0,3073,8106.0,200.0,360.0,1.0,1,1


# Exploratory Data Analysis

### Creating a new column for Total Income

In [15]:
df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,1,0,0,1,0,5849,0.0,146.412162,360.0,1.0,1,1,5849.0
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0,6091.0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1,3000.0
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1,4941.0
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1,6000.0


In [16]:
skewness = df.skew().sort_values(ascending=False)
skewness

ApplicantIncome      6.881481
Total_Income         5.859785
CoapplicantIncome     5.73659
LoanAmount           2.578663
Self_Employed         2.11487
Dependents           0.979788
Property_Area        -0.16091
Married              -0.64658
Loan_Status         -0.821489
Education           -1.341337
Gender              -1.650104
Credit_History      -1.972706
Loan_Amount_Term    -2.463236
dtype: object

### Fixing the right skewed data by Log Transformation

In [17]:
df['ApplicantIncome'] = np.log(df['ApplicantIncome'])
df['CoapplicantIncome'] = np.log(df['CoapplicantIncome'])
df['Total_Income'] = np.log(df['Total_Income'])

skewness = df.skew().sort_values(ascending=False)
skewness

  result = getattr(ufunc, method)(*inputs, **kwargs)


LoanAmount           2.578663
Self_Employed         2.11487
Dependents           0.979788
Total_Income         0.895323
ApplicantIncome      0.604036
Property_Area        -0.16091
Married              -0.64658
Loan_Status         -0.821489
Education           -1.341337
Gender              -1.650104
Credit_History      -1.972706
Loan_Amount_Term    -2.463236
CoapplicantIncome         NaN
dtype: object

In [18]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,1,0,0,1,0,8.674026,-inf,146.412162,360.0,1.0,1,1,8.674026
1,1,1,1,1,0,8.430109,7.31854,128.0,360.0,1.0,0,0,8.714568
2,1,1,0,1,1,8.006368,-inf,66.0,360.0,1.0,1,1,8.006368
3,1,1,0,0,0,7.856707,7.765569,120.0,360.0,1.0,1,1,8.505323
4,1,0,0,1,0,8.699515,-inf,141.0,360.0,1.0,1,1,8.699515


### Droping unnecessary columns

In [19]:
df.drop(['CoapplicantIncome'], axis=1, inplace=True)

# Train-Test Split

In [20]:
X = df.drop(columns=['Loan_Status'], axis=1)
y = df['Loan_Status']

In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Model Training

In [22]:
def classify(model, x, y):
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    model.fit(x_train, y_train)
    print("Accuracy is", model.score(x_test, y_test)*100)

In [23]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
classify(model, X, y)

Accuracy is 83.45323741007195


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
classify(model, X, y)

Accuracy is 69.06474820143885


In [25]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
model = RandomForestClassifier()
classify(model, X, y)

Accuracy is 82.73381294964028


In [26]:
from sklearn import svm
model = svm.SVC()
classify(model, X, y)

Accuracy is 70.50359712230215


In [42]:
model = RandomForestClassifier(n_estimators=100, min_samples_split=25, max_depth=7, max_features=1,random_state=42)
classify(model, X, y)

Accuracy is 82.73381294964028


CONCLUSION:

The Loan Status is heavily dependent on the Credit History for Predictions.
The Logistic Regression algorithm gives us the maximum Accuracy (79% approx) compared to the other 3 Machine Learning Classification Algorithms.