# **Importing Libraries**

In [None]:
#Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import eli5
from pdpbox import pdp, get_dataset, info_plots
import shap
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# **Reading and Loading the Dataset**

In [None]:
# Load the loan-approval training set from the Kaggle input directory.
csv_path = "/kaggle/input/loan-approval-prediction/Training Dataset.csv"
df = pd.read_csv(csv_path)

# **Exploratory Data Analysis**

In [None]:
# First look at the raw data: display the top 5 rows.
df.head(n=5)

In [None]:
# Print a summary of the raw frame (number of entries, column names,
# dtypes and non-null counts) to see its size and where values are missing.
df.info()

In [None]:
# Count the missing values in each column of the raw frame.
df.isna().sum()

In [None]:
# Drop every row that contains at least one missing value.
# .copy() makes `data` an independent frame, so the column drop and the
# label-encoding assignments in later cells modify it directly instead of
# triggering pandas' SettingWithCopyWarning; the raw `df` stays untouched.
data = df.dropna().copy()
data.shape

# **Data PreProcessing And Visualization**

In [None]:
# Boolean mask over the columns of `data`: True where the dtype is object.
obj = data.dtypes.eq('object')

In [None]:
# 'Loan_ID' holds unique identifiers that are unrelated to any other column,
# so it is removed before modelling.
# Re-binding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because the column is already gone.
data = data.drop(columns=['Loan_ID'], errors='ignore')

Convert the categorical values to numerical values.

In [None]:
# Replace every object-dtype column of `data` with integer labels.
# A single LabelEncoder instance is re-fitted for each column in turn.
label_encoder = preprocessing.LabelEncoder()
obj = data.dtypes.eq('object')
for col in obj.index[obj.values]:
    data[col] = label_encoder.fit_transform(data[col])

In [None]:
# Count the object-dtype (categorical) columns that remain after encoding.
# The check must look at `data`, the frame that was label-encoded; the raw
# `df` still holds the original strings and would always report the
# pre-encoding count.
obj = (data.dtypes == 'object')
print("Categorical Variables:", int(obj.sum()))

# **Data Visualisation**

In [None]:
# Correlation heatmap of the label-encoded features.
# Use `data` (encoded in the cell above) rather than the raw `df`: `df` still
# contains string columns, which DataFrame.corr() rejects on pandas >= 2.0
# and silently drops on older versions, hiding most features from the plot.
# `linewidths` is the parameter name documented by seaborn.heatmap.
plt.figure(figsize=(16, 8))
sns.heatmap(data.corr(), cmap='BrBG', fmt='.2f', linewidths=2, annot=True)

In [None]:
# Bar plot of 'Married' against 'Gender', with a separate bar colour for each
# 'Loan_Status' value. Plots from `data`, so any column that was object-dtype
# has already been label-encoded to integers at this point.
sns.catplot(x='Gender', y='Married', hue='Loan_Status', kind='bar', data=data)

The original dataset contains missing values; the affected rows were dropped above, so we proceed with the remaining complete rows to see how the models perform.

# **Splitting the DataSet**

In [None]:
# Target is the 'Loan_Status' column; every other column is a feature.
Y = data['Loan_Status']
X = data.drop(columns=['Loan_Status'])
X.shape, Y.shape

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

# **Model Training & Evaluation**

We will use 4 models for this problem:
1. K-Nearest Neighbors classifier
2. Random Forest classifier
3. Support Vector Classifier (SVC)
4. Logistic Regression

In [None]:
# The four candidate classifiers compared in this section.
knnc = KNeighborsClassifier(n_neighbors = 3)
rfc = RandomForestClassifier(n_estimators = 7, criterion = 'entropy', random_state=7)
svc = SVC()
lrc = LogisticRegression()

# Fit each model on the training split and report accuracy on BOTH splits.
# Training accuracy alone only shows how well a model reproduces data it has
# already seen; the held-out test accuracy is the figure that estimates how
# it will behave on new applicants.
for c in (rfc, knnc, svc, lrc):
    c.fit(X_train, Y_train)
    train_accuracy = 100*metrics.accuracy_score(Y_train, c.predict(X_train))
    Y_pred = c.predict(X_test)
    test_accuracy = 100*metrics.accuracy_score(Y_test, Y_pred)
    print("Training accuracy of ", c.__class__.__name__, '=', train_accuracy)
    print("Test accuracy of ", c.__class__.__name__, '=', test_accuracy)

**Cross-Validation**

In [None]:
# Pipeline under test: default SimpleImputer followed by a 100-tree random forest.
my_pipeline = Pipeline(steps=[
    ('preprocessor', SimpleImputer()),
    ('model', RandomForestClassifier(n_estimators=100, random_state=0)),
])

# 5-fold cross-validation over the full X, Y. sklearn reports this metric as
# *negative* MAE, so the sign is flipped back to get positive errors.
# NOTE(review): MAE is computed on the label-encoded class values of Y - confirm
# this is the intended metric for a classifier.
scores = -cross_val_score(
    my_pipeline,
    X,
    Y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print("MAE scores:\n", scores)

In [None]:
# Average the per-fold scores from the cross-validation cell.
print("Average MAE score (across experiments):", scores.mean(), sep="\n")

In [None]:
# Small random forest (7 trees, fixed seed) fitted on the training split;
# this is the model inspected by the explainability cells below.
my_model = RandomForestClassifier(n_estimators=7, random_state=0)
my_model = my_model.fit(X_train, Y_train)

# **Explainability**

In [None]:
# Feature importance via permutation: shuffle one feature at a time and
# measure how much the fitted model's score drops.
# The importances are computed on the held-out split (X_test, Y_test) rather
# than on the rows the forest was trained on, so they reflect what the model
# relies on to generalise instead of what it memorised during training.
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(my_model, random_state=1).fit(X_test, Y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

Here we can see which features are most important for the model's predictions.

In [None]:
# Pick one training row (by position) to explain; more rows could be used if desired.
row_to_show = 2
data_for_prediction = X_train.iloc[row_to_show]

# The model expects a 2-D input, so reshape the single row to shape (1, n_features).
data_for_prediction_array = data_for_prediction.to_numpy().reshape(1, -1)

# Predicted class probabilities for that row.
my_model.predict_proba(data_for_prediction_array)

In [None]:
# Build a SHAP tree explainer for the fitted random forest (my_model).
explainer = shap.TreeExplainer(my_model)

# Compute SHAP values for the single training row selected in the previous cell
# (data_for_prediction is one row of X_train).
shap_values = explainer.shap_values(data_for_prediction)

In [None]:
# Load SHAP's JavaScript so the interactive force plot can render in the notebook.
shap.initjs()
# Force plot for the single row: index [1] selects the second entry of both the
# explainer's expected values and the SHAP values.
# NOTE(review): this assumes shap_values is indexed per class (entry 1 = the
# class encoded as 1) - confirm against the installed shap version, since the
# return layout of shap_values differs between releases.
shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)

Summary Plot

In [None]:
# Build a SHAP tree explainer for the fitted random forest (my_model).
explainer = shap.TreeExplainer(my_model)

# Calculate SHAP values for every row of X_train rather than a single row,
# so the summary plot has more data to draw from.
shap_values = explainer.shap_values(X_train)

# Make the plot. Index [1] selects the second entry of shap_values.
# NOTE(review): presumably the values for the class encoded as 1 - confirm the
# per-class layout against the installed shap version.
shap.summary_plot(shap_values[1], X_train)

SHAP Dependence Contribution Plots

In [None]:
# Build a SHAP tree explainer for the fitted random forest (my_model).
explainer = shap.TreeExplainer(my_model)

# Calculate SHAP values for the full feature matrix X (training and test rows
# together). This is what we will plot.
shap_values = explainer.shap_values(X)

# Dependence plot of 'LoanAmount', coloured by 'ApplicantIncome' as the
# interaction feature. Index [1] selects the second entry of shap_values.
# NOTE(review): presumably the class encoded as 1 - confirm against the shap version.
shap.dependence_plot('LoanAmount', shap_values[1], X, interaction_index="ApplicantIncome")