In [149]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [150]:
df = pd.read_csv("E:\loan approval dataset.csv")
print(df.shape)
print(df.info())

(614, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 43.2+ KB
None


The above output shows that there are 8 categorical (object) and 1 numerical variables - four float and two integers. The output also shows that few of the variables in the data have less than 614 observations, suggesting the presence of missing values.

# Finding Missing Values

In [151]:
df.apply(lambda x: sum(x.isnull()),axis=0) 

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# Missing Values for Continuous Variables

The most widely used method for treating missing values in numeric variables is by replacing them with the measures of central tendency.

In [152]:
df['LoanAmount'].fillna(df['LoanAmount'].mean(), inplace=True)
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean(), inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mean(), inplace=True)
df.apply(lambda x: sum(x.isnull()),axis=0) 

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

# Missing Values for Categorical Variables

In [153]:
df['Gender'].value_counts()

Male      489
Female    112
Name: Gender, dtype: int64

In [154]:
df['Gender'].fillna('Male',inplace=True)
df['Gender'].value_counts()

Male      502
Female    112
Name: Gender, dtype: int64

In [155]:
df['Married'].value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [156]:
df['Married'].fillna('Yes',inplace=True)
df['Married'].value_counts()

Yes    401
No     213
Name: Married, dtype: int64

In [157]:
df['Self_Employed'].fillna('No',inplace=True)
df['Married'].value_counts()

Yes    401
No     213
Name: Married, dtype: int64

In [158]:
df.apply(lambda x: sum(x.isnull()),axis=0) 

Loan_ID               0
Gender                0
Married               0
Dependents           15
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

# Deleting the Missing Records

The missing values of the variable 'Gender' has been replaced. However, the data still has 15 missing values in the variable 'Dependents'.

In [159]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 599 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            599 non-null    object 
 1   Gender             599 non-null    object 
 2   Married            599 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          599 non-null    object 
 5   Self_Employed      599 non-null    object 
 6   ApplicantIncome    599 non-null    int64  
 7   CoapplicantIncome  599 non-null    float64
 8   LoanAmount         599 non-null    float64
 9   Loan_Amount_Term   599 non-null    float64
 10  Credit_History     599 non-null    float64
 11  Property_Area      599 non-null    object 
 12  Loan_Status        599 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 46.8+ KB


The data is free from missing values and we can now work on the feature engineering part. We will start with the categorical variables.

# Feature Engineering for the Categorical Variables

# Encoding Categorical Variables

For modeling using scikit-learn, all the variables should be numeric, so we will have to change the labels. There are two common techniques of performing this.
1.One Hot Encoding
2.Dummy Encoding

Dummy Encoding

In this technique, the features are encoded so that there is no duplication of the information. It can be achieved by passing in the argument drop_first=True to the .get_dummies function, as done in the first four lines of code below. The fifth line of code drops the category 'S_Male', while the sixth line prints the information about the data.

In [160]:
df.drop('Loan_ID',axis = 1,inplace=True)

In [161]:
df.drop('Property_Area',axis = 1,inplace=True)
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Y


In [162]:
from sklearn.preprocessing import LabelEncoder

In [163]:
le = LabelEncoder()
df['Self_Employed'] = le.fit_transform(df.Self_Employed)
df['Gender'] = le.fit_transform(df.Gender)

In [164]:
df['Education'] = le.fit_transform(df.Education)
df['Married'] = le.fit_transform(df.Married)
df['Dependents'] = le.fit_transform(df.Dependents)
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,Y
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,N
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,Y
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,Y
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,Y


In [165]:
df['Loan_Status'] = le.fit_transform(df.Loan_Status)
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,1


In [166]:
df.Loan_Status.value_counts()

1    413
0    186
Name: Loan_Status, dtype: int64

In [167]:
X = df.iloc[:,:-1]
y = df.Loan_Status
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0


In [168]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=10)

In [169]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [170]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_predict))
pd.crosstab(y_test,y_predict)

0.7


col_0,0,1
Loan_Status,Unnamed: 1_level_1,Unnamed: 2_level_1
0,8,43
1,11,118


In [171]:
43/52


0.8269230769230769

In [172]:
118/129

0.9147286821705426

In [173]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.
