## Installing and Importing Dependencies

In [1]:
!pip install kagglehub pandas scikit-learn



In [2]:
import kagglehub
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

### Loading Dataset

##### Using Kagglehub

In [3]:
path = kagglehub.dataset_download("architsharma01/loan-approval-prediction-dataset")
print("Path to dataset files:", path)
csv_file = os.path.join(path, 'loan_approval_dataset.csv')
supe_data = pd.read_csv(csv_file)

Path to dataset files: C:\Users\kalat\.cache\kagglehub\datasets\architsharma01\loan-approval-prediction-dataset\versions\1


##### Download Directly from website

###### Loan Prediction Dataset --> https://www.kaggle.com/datasets/architsharma01/loan-approval-prediction-dataset/data

## Preprocessing Data

In [4]:
loan_data = supe_data.copy()
loan_data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


##### Checking Null Values

In [5]:
loan_data.isnull().sum()

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

##### Checking Duplicates

In [6]:
loan_data.duplicated().sum()

0

In [7]:
loan_data.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

##### Encoding Categorical Columns

In [8]:
label_encoder = LabelEncoder()
loan_data.iloc[:, 2] = label_encoder.fit_transform(loan_data.iloc[:, 2])  ## encoding education column

loan_data.iloc[:, 3] = label_encoder.fit_transform(loan_data.iloc[:, 3])  ## encoding self_employed column

In [9]:
loan_data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


## Splitting Data

In [10]:
X_loan = loan_data.iloc[:, 1:12]
y_loan = loan_data[' loan_status']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_loan, y_loan, test_size = 0.25, random_state = 274)

## Decision Tree for Classification

#### Fitting Decision Tree

In [12]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

#### Making Predictions

In [13]:
y_pred = decision_tree.predict(X_test)

#### Evaluation

In [14]:
print('Accuracy Score        :', accuracy_score(y_test, y_pred))
print('Confusion Matrix      :\n', confusion_matrix(y_test, y_pred))
print('Classification Report :\n', classification_report(y_test, y_pred))

Accuracy Score        : 0.9728464419475655
Confusion Matrix      :
 [[658  14]
 [ 15 381]]
Classification Report :
               precision    recall  f1-score   support

    Approved       0.98      0.98      0.98       672
    Rejected       0.96      0.96      0.96       396

    accuracy                           0.97      1068
   macro avg       0.97      0.97      0.97      1068
weighted avg       0.97      0.97      0.97      1068



#### Optimizing using GridSearchCV

##### Declaring parameters that we want to optimize

In [15]:
param_grid = {
    'criterion' : ['gini', 'entropy'],
}

##### Fitting GridSearchCV

In [16]:
grid_search = GridSearchCV(decision_tree, param_grid = param_grid, cv = 10, scoring = 'accuracy')
grid_search.fit(X_train, y_train)

##### Extracting Best Parameters

In [17]:
grid_search.best_params_

{'criterion': 'gini'}

##### Fitting Optimized Model

In [18]:
decision_tree_cv = DecisionTreeClassifier(criterion = 'entropy')
decision_tree_cv.fit(X_train, y_train)

##### Evaluation

In [19]:
y_pred_cv = decision_tree_cv.predict(X_test)
print('Accuracy Score        :', accuracy_score(y_test, y_pred_cv))
print('Confusion Matrix      :\n', confusion_matrix(y_test, y_pred_cv))
print('Classification Report :\n', classification_report(y_test, y_pred_cv))

Accuracy Score        : 0.9794007490636704
Confusion Matrix      :
 [[661  11]
 [ 11 385]]
Classification Report :
               precision    recall  f1-score   support

    Approved       0.98      0.98      0.98       672
    Rejected       0.97      0.97      0.97       396

    accuracy                           0.98      1068
   macro avg       0.98      0.98      0.98      1068
weighted avg       0.98      0.98      0.98      1068



### Comparative Analysis before and after using GridSearchCV

In [20]:
print('Accuracy Score before GridSearchCV   :', accuracy_score(y_test, y_pred))
print('Accuracy Score after GridSearchCV    :', accuracy_score(y_test, y_pred_cv))

print('Confusion Matrix before GridSearchCV :\n', confusion_matrix(y_test, y_pred))
print('Confusion Matrix after GridSearchCV  :\n', confusion_matrix(y_test, y_pred_cv))

Accuracy Score before GridSearchCV   : 0.9728464419475655
Accuracy Score after GridSearchCV    : 0.9794007490636704
Confusion Matrix before GridSearchCV :
 [[658  14]
 [ 15 381]]
Confusion Matrix after GridSearchCV  :
 [[661  11]
 [ 11 385]]


## Random Forest for Classification

#### Fitting Random Forest

In [21]:
random_class = RandomForestClassifier(25)
random_class.fit(X_train, y_train)

#### Evaluation

In [22]:
y_pred_rd = random_class.predict(X_test)
print('Accuracy Score        :', accuracy_score(y_test, y_pred_rd))
print('Confusion Matrix      :\n', confusion_matrix(y_test, y_pred_rd))
print('Classification Report :\n', classification_report(y_test, y_pred_rd))

Accuracy Score        : 0.9709737827715356
Confusion Matrix      :
 [[654  18]
 [ 13 383]]
Classification Report :
               precision    recall  f1-score   support

    Approved       0.98      0.97      0.98       672
    Rejected       0.96      0.97      0.96       396

    accuracy                           0.97      1068
   macro avg       0.97      0.97      0.97      1068
weighted avg       0.97      0.97      0.97      1068



#### Optimizing using GridSearchCV

##### Declaring parameters that we want to optimize

In [23]:
grid_params_rf = {
    'n_estimators' : [10, 25, 50, 100, 200],
    'criterion'    : ['gini', 'entropy'],
}

##### Fitting GridSearchCV

In [24]:
grid_search_rf = GridSearchCV(random_class , param_grid = grid_params_rf, cv = 10, scoring = 'accuracy')
grid_search_rf.fit(X_train, y_train)

##### Extracting Best Parameters

In [25]:
grid_search_rf.best_params_

{'criterion': 'gini', 'n_estimators': 200}

##### Fitting Optimized Model

In [26]:
random_class_cv = RandomForestClassifier(n_estimators = 100, criterion = 'entropy')
random_class_cv.fit(X_train, y_train)

##### Making Predictions

In [27]:
y_pred_rd_cv = random_class_cv.predict(X_test)

##### Evaluation

In [28]:
print('Accuracy Score        :', accuracy_score(y_test, y_pred_rd_cv))
print('Confusion Matrix      :\n', confusion_matrix(y_test, y_pred_rd_cv))
print('Classification Report :\n', classification_report(y_test, y_pred_rd_cv))

Accuracy Score        : 0.9831460674157303
Confusion Matrix      :
 [[665   7]
 [ 11 385]]
Classification Report :
               precision    recall  f1-score   support

    Approved       0.98      0.99      0.99       672
    Rejected       0.98      0.97      0.98       396

    accuracy                           0.98      1068
   macro avg       0.98      0.98      0.98      1068
weighted avg       0.98      0.98      0.98      1068



### Comparative analysis before and after using GridSearchCV

In [29]:
print('Accuracy Score before GridSearchCV   :', accuracy_score(y_test, y_pred_rd))
print('Accuracy Score after GridSearchCV    :', accuracy_score(y_test, y_pred_rd_cv))

print('Confusion Matrix before GridSearchCV :\n', confusion_matrix(y_test, y_pred_rd))
print('Confusion Matrix after GridSearchCV  :\n', confusion_matrix(y_test, y_pred_rd_cv))

Accuracy Score before GridSearchCV   : 0.9709737827715356
Accuracy Score after GridSearchCV    : 0.9831460674157303
Confusion Matrix before GridSearchCV :
 [[654  18]
 [ 13 383]]
Confusion Matrix after GridSearchCV  :
 [[665   7]
 [ 11 385]]


In [35]:
random_class_cv.predict([[1, 0, 1, 9000000, 33500000, 2, 748, 7900000, 9800000, 25700000, 11500000]])

array([' Approved'], dtype=object)