In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV


In [2]:
df = pd.read_csv('cleaned_data.csv')

## Data splitting

In [18]:
df.sample()

Unnamed: 0.1,Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,...,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
8642,8642,0.0,0.0,0,0.0,1,0,0,0,2,...,False,False,False,True,False,False,False,False,False,True


In [23]:
df.drop(['Unnamed: 0'] , axis = 1 , inplace= True)

In [3]:
y = df['Approved_Flag']
x = df.drop(['Approved_Flag'], axis = 1 )

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## ML Model Training

#### 3.1 Random forest

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


rf_classifier = RandomForestClassifier(n_estimators = 200, random_state=42)

rf_classifier.fit(x_train, y_train)


y_pred = rf_classifier.predict(x_test)

print('Random Forest Classifier Report:')
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print ()
print(f'Accuracy: {accuracy}')
print ()
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)


for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()

Random Forest Classifier Report:
              precision    recall  f1-score   support

          P1       0.84      0.70      0.77      1014
          P2       0.79      0.93      0.86      5045
          P3       0.44      0.21      0.28      1325
          P4       0.72      0.72      0.72      1029

    accuracy                           0.76      8413
   macro avg       0.70      0.64      0.66      8413
weighted avg       0.74      0.76      0.74      8413


Accuracy: 0.7648876738381077

Class p1:
Precision: 0.8439716312056738
Recall: 0.7041420118343196
F1 Score: 0.767741935483871

Class p2:
Precision: 0.7940283400809717
Recall: 0.9330029732408325
F1 Score: 0.8579239952610953

Class p3:
Precision: 0.4444444444444444
Recall: 0.20528301886792452
F1 Score: 0.2808466701084151

Class p4:
Precision: 0.7224926971762414
Recall: 0.7210884353741497
F1 Score: 0.7217898832684825



**Inference:** That random forest is not able to predict P3 as it has very less f1 score

#### 3.2 XGBoost

In [6]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

xgb_classifier = XGBClassifier(objective='multi:softmax',  num_class=4)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_train_xg, x_test_xg, y_train_xg, y_test_xg = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

xgb_classifier.fit(x_train_xg, y_train_xg)
y_pred_xg = xgb_classifier.predict(x_test_xg)

print('XGB Classifier Report:')
print(classification_report(y_test_xg, y_pred_xg))

accuracy = accuracy_score(y_test_xg, y_pred_xg)
print ()
print(f'Accuracy: {accuracy:.2f}')
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()

XGB Classifier Report:
              precision    recall  f1-score   support

           0       0.82      0.77      0.80      1014
           1       0.82      0.91      0.87      5045
           2       0.47      0.29      0.36      1325
           3       0.72      0.74      0.73      1029

    accuracy                           0.78      8413
   macro avg       0.71      0.68      0.69      8413
weighted avg       0.75      0.78      0.76      8413


Accuracy: 0.78

Class p1:
Precision: 0.8439716312056738
Recall: 0.7041420118343196
F1 Score: 0.767741935483871

Class p2:
Precision: 0.7940283400809717
Recall: 0.9330029732408325
F1 Score: 0.8579239952610953

Class p3:
Precision: 0.4444444444444444
Recall: 0.20528301886792452
F1 Score: 0.2808466701084151

Class p4:
Precision: 0.7224926971762414
Recall: 0.7210884353741497
F1 Score: 0.7217898832684825



**Inference:** That XGBoost is better than random forest but also is not able to predict P3 as it has very less f1 score

In [7]:
from sklearn.model_selection import GridSearchCV
param_grid = {
  # 'colsample_bytree': [ 0.5, 0.9],
  'learning_rate'   : [0.001, 0.01, 0.1, 1],
  'max_depth'       : [3, 5,],
  'alpha'           : [1, 100],
  'n_estimators'    : [50,100,200]
}
xgb_classifier = XGBClassifier()

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


x_train_xg, x_test_xg, y_train_xg, y_test_xg = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

# Perform grid search
xgb_grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
xgb_grid_search.fit(x_train_xg, y_train_xg)

# Best parameters and score
print("Best parameters for Logistic Regression:", xgb_grid_search.best_params_)
print("Best score for Logistic Regression:", xgb_grid_search.best_score_)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters for Logistic Regression: {'alpha': 1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best score for Logistic Regression: 0.7794717396146502


#### 3.3 Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(max_depth=20, min_samples_split=10)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)
print('Decision Tree Classifier Report:')
print(classification_report(y_test, y_pred))


accuracy = accuracy_score(y_test, y_pred)
print ()
print(f"Accuracy: {accuracy:.2f}")
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()


Decision Tree Classifier Report:
              precision    recall  f1-score   support

          P1       0.73      0.72      0.73      1014
          P2       0.81      0.83      0.82      5045
          P3       0.34      0.31      0.32      1325
          P4       0.63      0.63      0.63      1029

    accuracy                           0.71      8413
   macro avg       0.63      0.62      0.62      8413
weighted avg       0.70      0.71      0.71      8413


Accuracy: 0.71

Class p1:
Precision: 0.7279046673286991
Recall: 0.722879684418146
F1 Score: 0.7253834735279565

Class p2:
Precision: 0.8092373374733165
Recall: 0.8265609514370664
F1 Score: 0.817807413218278

Class p3:
Precision: 0.33797054009819966
Recall: 0.31169811320754715
F1 Score: 0.3243031016882607

Class p4:
Precision: 0.6314258001939864
Recall: 0.6326530612244898
F1 Score: 0.6320388349514564



**Inference:** Less accuracy.

#### 3.4 KNN

In [9]:
# K-Nearest Neighbors (KNN)
knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train)
knn_predictions = knn_model.predict(x_test)
print('KNN Accuracy:', accuracy_score(y_test, knn_predictions))
print('KNN Classification Report:')
print(classification_report(y_test, knn_predictions))

y_pred = knn_predictions
accuracy = accuracy_score(y_test, y_pred)
print ()
print(f"Accuracy: {accuracy:.2f}")
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()

KNN Accuracy: 0.5564008082729109
KNN Classification Report:
              precision    recall  f1-score   support

          P1       0.31      0.14      0.19      1014
          P2       0.62      0.86      0.72      5045
          P3       0.22      0.10      0.14      1325
          P4       0.21      0.06      0.10      1029

    accuracy                           0.56      8413
   macro avg       0.34      0.29      0.29      8413
weighted avg       0.47      0.56      0.49      8413


Accuracy: 0.56

Class p1:
Precision: 0.31042128603104213
Recall: 0.13806706114398423
F1 Score: 0.19112627986348124

Class p2:
Precision: 0.6157335223245924
Recall: 0.8610505450941526
F1 Score: 0.7180165289256197

Class p3:
Precision: 0.22372881355932203
Recall: 0.09962264150943397
F1 Score: 0.13785900783289817

Class p4:
Precision: 0.20504731861198738
Recall: 0.06316812439261418
F1 Score: 0.09658246656760773



In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Define the parameter grid for KNeighborsClassifier
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Initialize the KNeighborsClassifier
knn_model = KNeighborsClassifier()

# Perform grid search
knn_grid_search = GridSearchCV(estimator=knn_model, param_grid=knn_param_grid, cv=5, n_jobs=-1, verbose=2)
knn_grid_search.fit(x_train, y_train)

# Best parameters and score
print("Best parameters for KNN:", knn_grid_search.best_params_)
print("Best score for KNN:", knn_grid_search.best_score_)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
Best score for KNN: 0.5856884349828023


#### 3.5 Logistic Regression

In [11]:
# Logistic Regression
log_reg_model = LogisticRegression(max_iter=200)
log_reg_model.fit(x_train, y_train)
log_reg_predictions = log_reg_model.predict(x_test)
print('Logistic Regression Accuracy:', accuracy_score(y_test, log_reg_predictions))
print('Logistic Regression Classification Report:')
print(classification_report(y_test, log_reg_predictions))

y_pred = log_reg_predictions
accuracy = accuracy_score(y_test, y_pred)
print ()
print(f"Accuracy: {accuracy:.2f}")
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()

Logistic Regression Accuracy: 0.6130987757042672
Logistic Regression Classification Report:
              precision    recall  f1-score   support

          P1       0.57      0.30      0.39      1014
          P2       0.62      0.95      0.75      5045
          P3       0.00      0.00      0.00      1325
          P4       0.39      0.04      0.07      1029

    accuracy                           0.61      8413
   macro avg       0.39      0.32      0.30      8413
weighted avg       0.49      0.61      0.51      8413


Accuracy: 0.61

Class p1:
Precision: 0.5666041275797373
Recall: 0.2978303747534517
F1 Score: 0.39043309631544926

Class p2:
Precision: 0.6195316520844055
Recall: 0.9544103072348861
F1 Score: 0.7513458687680424

Class p3:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0

Class p4:
Precision: 0.3867924528301887
Recall: 0.03984450923226433
F1 Score: 0.07224669603524228



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## XGBoost is the best model with accuracy of 78

#### Note : The overall accuracy score of the model is less because they are not able to identify P3 category 

In [31]:
df2 = pd.read_excel("data2.xlsx")

In [44]:
for i in ['P1' , 'P2', 'P3' , 'P4']:
    max_credit_score = df2[df2['Approved_Flag'] == i]['Credit_Score'].max()
    min_credit_score = df2[df2['Approved_Flag'] == i]['Credit_Score'].min()

    print(f'For {i} categorry.The value of maximun and minimun = {min_credit_score} - {max_credit_score}')

For P1 categorry.The value of maximun and minimun = 701 - 811
For P2 categorry.The value of maximun and minimun = 669 - 700
For P3 categorry.The value of maximun and minimun = 489 - 776
For P4 categorry.The value of maximun and minimun = 469 - 658


#### Note: For P3 categorry.The value of maximun and minimun = 489 - 776
#### So it has has wide range of pages so it is difficuit for machine to predict it