In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [2]:
data = pd.read_csv('loan_approval_dataset.csv')

In [3]:
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
data[data['education']=='Graduate']

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status


In [5]:
# df.columns
# df['education'].str.strip().unique()

In [6]:
df = data.copy()
for col in df.columns:
    if df[col].dtype=='O':
        df[col] = df[col].str.strip()


In [7]:
# Cleaning --> EDA --> train_test_split --> Feature Engg.(Encoding,scaling,etc)--> model train --> Tuning

In [8]:
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [9]:
df.shape

(4269, 13)

In [10]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [11]:
X.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000


In [12]:
y.head()

0    Approved
1    Rejected
2    Rejected
3    Rejected
4    Rejected
Name: loan_status, dtype: object

In [13]:
X.shape

(4269, 12)

In [14]:
y.shape

(4269,)

In [15]:
4269*0.75

3201.75

In [16]:
4269*0.25

1067.25

In [17]:
X.drop('loan_id',axis=1,inplace=True)

In [18]:
# train_test
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.75)

In [19]:
df.head(11)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
5,6,0,Graduate,Yes,4800000,13500000,10,319,6800000,8300000,13700000,5100000,Rejected
6,7,5,Graduate,No,8700000,33000000,4,678,22500000,14800000,29200000,4300000,Approved
7,8,2,Graduate,Yes,5700000,15000000,20,382,13200000,5700000,11800000,6000000,Rejected
8,9,0,Graduate,Yes,800000,2200000,20,782,1300000,800000,2800000,600000,Approved
9,10,5,Not Graduate,No,1100000,4300000,10,388,3200000,1400000,3300000,1600000,Rejected


In [20]:
# X_train.shape
X_train.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
497,2,Not Graduate,No,8100000,24800000,6,307,200000,4000000,31100000,10500000
1339,5,Not Graduate,Yes,8800000,26600000,16,545,19100000,16600000,30400000,7600000
2607,4,Not Graduate,No,5400000,12600000,18,786,300000,2500000,21000000,6600000
628,2,Not Graduate,Yes,4500000,12500000,12,418,400000,4300000,12400000,4300000
3364,1,Graduate,Yes,5200000,16300000,6,524,11000000,3400000,20700000,4600000


In [21]:
# X_test.shape
y_train.head()

497     Rejected
1339    Rejected
2607    Approved
628     Rejected
3364    Rejected
Name: loan_status, dtype: object

# Encoding

In [22]:
ohe = OneHotEncoder(drop='first')
# label_enc = LabelEncoder()

In [23]:
numerical = []
categorical = []

for col in X.columns:
    if X[col].dtype=='O':
        categorical.append(col)
    else:
        numerical.append(col)


In [24]:
categorical

['education', 'self_employed']

In [25]:
X_train_enc = ohe.fit_transform(X_train[categorical])
X_test_enc = ohe.transform(X_test[categorical])

In [26]:
X_train_enc

<3201x2 sparse matrix of type '<class 'numpy.float64'>'
	with 3218 stored elements in Compressed Sparse Row format>

In [27]:
X_train_enc.toarray()

array([[1., 0.],
       [1., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 1.],
       [0., 1.]])

In [28]:
X_test_enc.toarray()

array([[0., 0.],
       [1., 1.],
       [0., 1.],
       ...,
       [1., 1.],
       [1., 0.],
       [1., 1.]])

In [29]:
X_train[numerical].head()

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
497,2,8100000,24800000,6,307,200000,4000000,31100000,10500000
1339,5,8800000,26600000,16,545,19100000,16600000,30400000,7600000
2607,4,5400000,12600000,18,786,300000,2500000,21000000,6600000
628,2,4500000,12500000,12,418,400000,4300000,12400000,4300000
3364,1,5200000,16300000,6,524,11000000,3400000,20700000,4600000


In [30]:
X_train_enc_df = pd.DataFrame(data=X_test_enc.toarray(),columns=ohe.get_feature_names_out())
X_train_enc_df.head()

Unnamed: 0,education_Not Graduate,self_employed_Yes
0,0.0,0.0
1,1.0,1.0
2,0.0,1.0
3,0.0,0.0
4,0.0,1.0


In [31]:
X_train_num = X_train[numerical].reset_index(drop=True)

In [32]:
X_train_num

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,2,8100000,24800000,6,307,200000,4000000,31100000,10500000
1,5,8800000,26600000,16,545,19100000,16600000,30400000,7600000
2,4,5400000,12600000,18,786,300000,2500000,21000000,6600000
3,2,4500000,12500000,12,418,400000,4300000,12400000,4300000
4,1,5200000,16300000,6,524,11000000,3400000,20700000,4600000
...,...,...,...,...,...,...,...,...,...
3196,3,8300000,25900000,8,696,14400000,12200000,23500000,7100000
3197,2,6900000,18200000,20,396,15600000,4300000,22900000,5400000
3198,1,9000000,33500000,2,748,7900000,9800000,25700000,11500000
3199,1,3200000,8100000,10,670,9100000,3800000,9200000,1900000


In [33]:
X_train = pd.concat([X_train_num,X_train_enc_df],axis=1)

In [34]:
X_train

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,education_Not Graduate,self_employed_Yes
0,2,8100000,24800000,6,307,200000,4000000,31100000,10500000,0.0,0.0
1,5,8800000,26600000,16,545,19100000,16600000,30400000,7600000,1.0,1.0
2,4,5400000,12600000,18,786,300000,2500000,21000000,6600000,0.0,1.0
3,2,4500000,12500000,12,418,400000,4300000,12400000,4300000,0.0,0.0
4,1,5200000,16300000,6,524,11000000,3400000,20700000,4600000,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
3196,3,8300000,25900000,8,696,14400000,12200000,23500000,7100000,,
3197,2,6900000,18200000,20,396,15600000,4300000,22900000,5400000,,
3198,1,9000000,33500000,2,748,7900000,9800000,25700000,11500000,,
3199,1,3200000,8100000,10,670,9100000,3800000,9200000,1900000,,


In [35]:
X_train = pd.concat([X_train_num,X_train_enc_df],axis=1)

In [36]:
X_train

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,education_Not Graduate,self_employed_Yes
0,2,8100000,24800000,6,307,200000,4000000,31100000,10500000,0.0,0.0
1,5,8800000,26600000,16,545,19100000,16600000,30400000,7600000,1.0,1.0
2,4,5400000,12600000,18,786,300000,2500000,21000000,6600000,0.0,1.0
3,2,4500000,12500000,12,418,400000,4300000,12400000,4300000,0.0,0.0
4,1,5200000,16300000,6,524,11000000,3400000,20700000,4600000,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
3196,3,8300000,25900000,8,696,14400000,12200000,23500000,7100000,,
3197,2,6900000,18200000,20,396,15600000,4300000,22900000,5400000,,
3198,1,9000000,33500000,2,748,7900000,9800000,25700000,11500000,,
3199,1,3200000,8100000,10,670,9100000,3800000,9200000,1900000,,


# 
<h1 style='color:red'>07-08-2024

In [37]:
X_test_enc_df = pd.DataFrame(data=X_test_enc.toarray(),columns=ohe.get_feature_names_out())
X_test_enc_df.head()

Unnamed: 0,education_Not Graduate,self_employed_Yes
0,0.0,0.0
1,1.0,1.0
2,0.0,1.0
3,0.0,0.0
4,0.0,1.0


In [38]:
X_test_num = X_test[numerical].reset_index(drop=True)

In [39]:
X_test_num.head()

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,0,4200000,10300000,10,506,9300000,4500000,12400000,6000000
1,3,6400000,15200000,14,467,15500000,9100000,18400000,3200000
2,5,2300000,5600000,8,642,2600000,1900000,8600000,2500000
3,2,8000000,31900000,16,806,19900000,1800000,23200000,5400000
4,3,8000000,26300000,12,476,21700000,6600000,16400000,9000000


In [40]:
# X_test_num.shape

In [41]:
# X_train_num.shape

In [42]:
X_test = pd.concat([X_test_num,X_test_enc_df],axis=1)
X_test.head()

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,education_Not Graduate,self_employed_Yes
0,0,4200000,10300000,10,506,9300000,4500000,12400000,6000000,0.0,0.0
1,3,6400000,15200000,14,467,15500000,9100000,18400000,3200000,1.0,1.0
2,5,2300000,5600000,8,642,2600000,1900000,8600000,2500000,0.0,1.0
3,2,8000000,31900000,16,806,19900000,1800000,23200000,5400000,0.0,0.0
4,3,8000000,26300000,12,476,21700000,6600000,16400000,9000000,0.0,1.0


# Label Encoding

In [43]:
label_enc = LabelEncoder()

In [44]:
y_train_enc = label_enc.fit_transform(y_train)

In [45]:
y_train_enc[:2]

array([1, 1])

In [46]:
y_train.head(2)

497     Rejected
1339    Rejected
Name: loan_status, dtype: object

In [47]:
label_enc.classes_

array(['Approved', 'Rejected'], dtype=object)

In [48]:
label_enc.inverse_transform([1,0,0,0,1])

array(['Rejected', 'Approved', 'Approved', 'Approved', 'Rejected'],
      dtype=object)

In [49]:
y_test_enc = label_enc.transform(y_test)

In [50]:
y_train_enc

array([1, 1, 0, ..., 0, 0, 0])

## Training the model
<h1 style:

In [51]:
model = DecisionTreeClassifier()

In [52]:
model.fit(X_train,y_train_enc)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test_enc,y_pred)
print(accuracy)


0.9719101123595506


In [53]:
# accuracy,percision,recall,confusion matrix, auc,pr-curve

In [54]:
y_pred

array([1, 1, 0, ..., 0, 1, 0])

In [55]:
y_test = y_test.reset_index(drop=True)

In [56]:
X_test['y_actual'] = y_test

In [57]:
X_test

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,education_Not Graduate,self_employed_Yes,y_actual
0,0,4200000,10300000,10,506,9300000,4500000,12400000,6000000,0.0,0.0,Rejected
1,3,6400000,15200000,14,467,15500000,9100000,18400000,3200000,1.0,1.0,Rejected
2,5,2300000,5600000,8,642,2600000,1900000,8600000,2500000,0.0,1.0,Approved
3,2,8000000,31900000,16,806,19900000,1800000,23200000,5400000,0.0,0.0,Approved
4,3,8000000,26300000,12,476,21700000,6600000,16400000,9000000,0.0,1.0,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...
1063,2,3900000,9300000,8,591,10900000,2700000,9600000,3300000,1.0,1.0,Approved
1064,3,7500000,26000000,2,342,13900000,1900000,26900000,11200000,1.0,1.0,Approved
1065,4,3200000,12800000,8,638,4300000,2600000,8000000,3800000,1.0,1.0,Approved
1066,3,9500000,20400000,14,376,2300000,6600000,25900000,8400000,1.0,0.0,Rejected


In [58]:
y_pred_inv = label_enc.inverse_transform(y_pred)

In [59]:
y_pred_inv

array(['Rejected', 'Rejected', 'Approved', ..., 'Approved', 'Rejected',
       'Approved'], dtype=object)

In [60]:
X_test['y_pred'] = y_pred_inv

In [61]:
X_test

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,education_Not Graduate,self_employed_Yes,y_actual,y_pred
0,0,4200000,10300000,10,506,9300000,4500000,12400000,6000000,0.0,0.0,Rejected,Rejected
1,3,6400000,15200000,14,467,15500000,9100000,18400000,3200000,1.0,1.0,Rejected,Rejected
2,5,2300000,5600000,8,642,2600000,1900000,8600000,2500000,0.0,1.0,Approved,Approved
3,2,8000000,31900000,16,806,19900000,1800000,23200000,5400000,0.0,0.0,Approved,Approved
4,3,8000000,26300000,12,476,21700000,6600000,16400000,9000000,0.0,1.0,Rejected,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063,2,3900000,9300000,8,591,10900000,2700000,9600000,3300000,1.0,1.0,Approved,Approved
1064,3,7500000,26000000,2,342,13900000,1900000,26900000,11200000,1.0,1.0,Approved,Rejected
1065,4,3200000,12800000,8,638,4300000,2600000,8000000,3800000,1.0,1.0,Approved,Approved
1066,3,9500000,20400000,14,376,2300000,6600000,25900000,8400000,1.0,0.0,Rejected,Rejected


# Hyperparam Tuning

In [62]:
model

In [63]:
model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [64]:
params = {
         'criterion':['entropy', 'gini'],
         'max_depth': [3,4,5,6,7],
         'max_features': [3,4,5,6,7],
         'min_samples_leaf': [10,12,14,16],
         'min_samples_split': [20,22,24],
        }

In [65]:
2*5*5*4*3

600

In [66]:
grid_search_cv = GridSearchCV(estimator=model,
                             param_grid=params,
                             cv=10)

In [78]:
grid_search_cv.fit(X_train,y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [79]:
# np.arange(3,15)

In [80]:
grid_search_cv.best_params_

{'criterion': 'entropy',
 'max_depth': 7,
 'max_features': 7,
 'min_samples_leaf': 12,
 'min_samples_split': 22}

In [81]:
grid_search_cv.best_score_

0.9746933411214952

In [82]:
y_train_enc

array([1, 1, 0, ..., 0, 0, 0])

In [83]:
best_model = grid_search_cv.best_estimator_

In [73]:
best_model.fit(X_train,y_train_enc)

In [74]:
X_test.head()

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,education_Not Graduate,self_employed_Yes,y_actual,y_pred
0,0,4200000,10300000,10,506,9300000,4500000,12400000,6000000,0.0,0.0,Rejected,Rejected
1,3,6400000,15200000,14,467,15500000,9100000,18400000,3200000,1.0,1.0,Rejected,Rejected
2,5,2300000,5600000,8,642,2600000,1900000,8600000,2500000,0.0,1.0,Approved,Approved
3,2,8000000,31900000,16,806,19900000,1800000,23200000,5400000,0.0,0.0,Approved,Approved
4,3,8000000,26300000,12,476,21700000,6600000,16400000,9000000,0.0,1.0,Rejected,Rejected


In [75]:
best_pred = best_model.predict(X_test.iloc[:,:-2])

In [76]:
best_pred

array([1, 1, 0, ..., 0, 1, 0])

In [77]:
accuracy_score(y_test_enc,best_pred)
print(accuracy)

0.9719101123595506


# Scalling Test