In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
insurance = pd.read_excel('INSURANCE.xlsx', sheet_name='TRAINING')
insurance.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
insurance.drop('Loan_ID', axis=1, inplace=True)
insurance.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


#### Missing Values

In [4]:
null_count = insurance.isnull().sum()

In [5]:
percentage = insurance.isnull().sum()*100/insurance.shape[0]

In [6]:
pd.concat([null_count, percentage], axis=1, keys=['null_count','percentage']).sort_values(by='percentage',ascending=False)

Unnamed: 0,null_count,percentage
Credit_History,50,8.143322
Self_Employed,32,5.211726
LoanAmount,22,3.583062
Dependents,15,2.442997
Loan_Amount_Term,14,2.28013
Gender,13,2.117264
Married,3,0.488599
Education,0,0.0
ApplicantIncome,0,0.0
CoapplicantIncome,0,0.0


In [7]:
# fill missing `Gender` values with the mode
insurance.Gender.fillna(insurance.Gender.mode()[0],inplace=True)
# fill missing `Married` values with the mode
insurance.Married.fillna(insurance.Married.mode()[0],inplace=True)
# fill missing `Dependents` values with the mode
insurance.Dependents.fillna(insurance.Dependents.mode()[0],inplace=True)
# fill missing `Self_Employed` values with the mode
insurance.Self_Employed.fillna(insurance.Self_Employed.mode()[0],inplace=True)
# fill missing `LoanAmount` values with the median
insurance.LoanAmount.fillna(insurance.LoanAmount.median(),inplace=True)
# fill missing `Loan_Amount_Term` values with the mode
insurance.Loan_Amount_Term.fillna(insurance.Loan_Amount_Term.mode()[0],inplace=True)
# fill missing `Credit_History` values with the mode
insurance.Credit_History.fillna(insurance.Credit_History.mode()[0],inplace=True)

In [8]:
insurance.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [9]:
insurance.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0.0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [10]:
code_numeric = {
    'Male':1,
    'Female':0,
    'Graduate':1,
    'Not Graduate':0,
    'Yes':1,
    'No':0,
    'Y':1,
    'N':0
}

In [11]:
insurance = insurance.applymap(lambda x: code_numeric.get(x) if x in code_numeric else x)

In [12]:
insurance.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0.0,1,0,5849,0.0,128.0,360.0,1.0,Urban,1
1,1,1,1.0,1,0,4583,1508.0,128.0,360.0,1.0,Rural,0
2,1,1,0.0,1,1,3000,0.0,66.0,360.0,1.0,Urban,1
3,1,1,0.0,0,0,2583,2358.0,120.0,360.0,1.0,Urban,1
4,1,0,0.0,1,0,6000,0.0,141.0,360.0,1.0,Urban,1


In [13]:
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    int64  
 1   Married            614 non-null    int64  
 2   Dependents         614 non-null    float64
 3   Education          614 non-null    int64  
 4   Self_Employed      614 non-null    int64  
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    int64  
dtypes: float64(5), int64(6), object(1)
memory usage: 57.7+ KB


In [14]:
insurance = insurance.astype(dtype={'Dependents':'int64','Loan_Amount_Term':'int64','Credit_History':'int64'})

In [15]:
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    int64  
 1   Married            614 non-null    int64  
 2   Dependents         614 non-null    int64  
 3   Education          614 non-null    int64  
 4   Self_Employed      614 non-null    int64  
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    int64  
 9   Credit_History     614 non-null    int64  
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    int64  
dtypes: float64(2), int64(9), object(1)
memory usage: 57.7+ KB


In [16]:
def dummify(data, columns):
    for col in columns:
        x = pd.get_dummies(data[col],prefix=col,drop_first=True)
        data = data.drop(col, 1)
        data = pd.concat([data, x], axis=1)
    return data

In [17]:
insurance = dummify(insurance,['Property_Area','Loan_Status'])

  data = data.drop(col, 1)


In [18]:
insurance.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_1
0,1,0,0,1,0,5849,0.0,128.0,360,1,0,1,1
1,1,1,1,1,0,4583,1508.0,128.0,360,1,0,0,0
2,1,1,0,1,1,3000,0.0,66.0,360,1,0,1,1
3,1,1,0,0,0,2583,2358.0,120.0,360,1,0,1,1
4,1,0,0,1,0,6000,0.0,141.0,360,1,0,1,1


### Class Imbalance

In [19]:
insurance.Loan_Status_1.value_counts()

1    422
0    192
Name: Loan_Status_1, dtype: int64

In [20]:
loan_stat_0 = insurance[insurance['Loan_Status_1'] == 0]

In [21]:
loan_stat_0.sample(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_1
251,0,0,2,1,0,3427,0.0,138.0,360,1,0,1,0
487,1,1,1,1,0,18333,0.0,500.0,360,1,0,1,0
308,1,0,0,1,0,20233,0.0,480.0,360,1,0,0,0


In [22]:
insurance_balanced = pd.concat([insurance, loan_stat_0], axis=0)

In [23]:
insurance_balanced.Loan_Status_1.value_counts()

1    422
0    384
Name: Loan_Status_1, dtype: int64

In [24]:
insurance_balanced.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_1
0,1,0,0,1,0,5849,0.0,128.0,360,1,0,1,1
1,1,1,1,1,0,4583,1508.0,128.0,360,1,0,0,0
2,1,1,0,1,1,3000,0.0,66.0,360,1,0,1,1
3,1,1,0,0,0,2583,2358.0,120.0,360,1,0,1,1
4,1,0,0,1,0,6000,0.0,141.0,360,1,0,1,1


### X and Y

In [25]:
insurance_balanced.head(1)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_1
0,1,0,0,1,0,5849,0.0,128.0,360,1,0,1,1


In [26]:
X = insurance_balanced.drop('Loan_Status_1', axis=1)
y = insurance_balanced['Loan_Status_1']

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

#### Scaling features

In [28]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train.iloc[:, 5:9] = sc.fit_transform(X_train.iloc[:, 5:9])
X_test.iloc[:, 5:9] = sc.fit_transform(X_test.iloc[:, 5:9])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


## Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression
classifier_LR = LogisticRegression(max_iter=200)
classifier_LR.fit(X_train, y_train)

y_pred_LR = classifier_LR.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
cm = confusion_matrix(y_test, y_pred_LR)
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred_LR))
print('F1 Score:', f1_score(y_test, y_pred_LR))
print('Precision:', precision_score(y_test, y_pred_LR))
print('Recall:', recall_score(y_test, y_pred_LR))

[[33 37]
 [11 81]]
Accuracy: 0.7037037037037037
F1 Score: 0.7714285714285714
Precision: 0.6864406779661016
Recall: 0.8804347826086957


## K-Nearest Neighbors

In [32]:
from sklearn.neighbors import KNeighborsClassifier
classifier_KNN = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p = 2)
classifier_KNN.fit(X_train, y_train)

y_pred_KNN = classifier_KNN.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
cm = confusion_matrix(y_test, y_pred_KNN)
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred_KNN))
print('F1 Score:', f1_score(y_test, y_pred_KNN))
print('Precision:', precision_score(y_test, y_pred_KNN))
print('Recall:', recall_score(y_test, y_pred_KNN))

[[42 28]
 [28 64]]
Accuracy: 0.654320987654321
F1 Score: 0.6956521739130435
Precision: 0.6956521739130435
Recall: 0.6956521739130435


## Support Vector Machine

In [33]:
from sklearn.svm import SVC
classifier_SVM = SVC(kernel = 'linear', random_state = 0)
classifier_SVM.fit(X_train, y_train)

y_pred_SVM = classifier_SVM.predict(X_test)

cm = confusion_matrix(y_test, y_pred_SVM)
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred_SVM))
print('F1 Score:', f1_score(y_test, y_pred_SVM))
print('Precision:', precision_score(y_test, y_pred_SVM))
print('Recall:', recall_score(y_test, y_pred_SVM))

[[27 43]
 [ 3 89]]
Accuracy: 0.7160493827160493
F1 Score: 0.7946428571428571
Precision: 0.6742424242424242
Recall: 0.967391304347826


## Support Vector Machine - Kernel

In [34]:
from sklearn.svm import SVC
classifier_Kernel = SVC(kernel = 'rbf', random_state = 0)
classifier_Kernel.fit(X_train, y_train)

y_pred_Kernel = classifier_Kernel.predict(X_test)

cm = confusion_matrix(y_test, y_pred_Kernel)
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred_Kernel))
print('F1 Score:', f1_score(y_test, y_pred_Kernel))
print('Precision:', precision_score(y_test, y_pred_Kernel))
print('Recall:', recall_score(y_test, y_pred_Kernel))

[[33 37]
 [ 8 84]]
Accuracy: 0.7222222222222222
F1 Score: 0.7887323943661971
Precision: 0.6942148760330579
Recall: 0.9130434782608695


## Naive Bayes

In [35]:
from sklearn.naive_bayes import GaussianNB
classifier_NB = GaussianNB()
classifier_NB.fit(X_train, y_train)

y_pred_NB = classifier_NB.predict(X_test)

cm = confusion_matrix(y_test, y_pred_NB)
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred_NB))
print('F1 Score:', f1_score(y_test, y_pred_NB))
print('Precision:', precision_score(y_test, y_pred_NB))
print('Recall:', recall_score(y_test, y_pred_NB))

[[30 40]
 [ 4 88]]
Accuracy: 0.7283950617283951
F1 Score: 0.8
Precision: 0.6875
Recall: 0.9565217391304348


## Decision Tree

In [36]:
from sklearn.tree import DecisionTreeClassifier
classifier_D3 = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_D3.fit(X_train, y_train)

y_pred_D3 = classifier_D3.predict(X_test)

cm = confusion_matrix(y_test, y_pred_D3)
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred_D3))
print('F1 Score:', f1_score(y_test, y_pred_D3))
print('Precision:', precision_score(y_test, y_pred_D3))
print('Recall:', recall_score(y_test, y_pred_D3))

[[45 25]
 [25 67]]
Accuracy: 0.691358024691358
F1 Score: 0.7282608695652174
Precision: 0.7282608695652174
Recall: 0.7282608695652174


## Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier
classifier_RF = RandomForestClassifier(n_estimators = 150, criterion = 'entropy', random_state = 0)
classifier_RF.fit(X_train, y_train)

y_pred_RF = classifier_RF.predict(X_test)

cm = confusion_matrix(y_test, y_pred_RF)
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred_RF))
print('F1 Score:', f1_score(y_test, y_pred_RF))
print('Precision:', precision_score(y_test, y_pred_RF))
print('Recall:', recall_score(y_test, y_pred_RF))

[[42 28]
 [ 9 83]]
Accuracy: 0.7716049382716049
F1 Score: 0.8177339901477833
Precision: 0.7477477477477478
Recall: 0.9021739130434783
