 ##  To extract and display important features for predicting Credit Score Category using a Decision Tree


In [16]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
# -------------------------
#  Load the dataset
# -------------------------
df = pd.read_csv("data.csv")

# ---------------------------------------------
#  Drop irrelevant or unused columns
# ---------------------------------------------
df = df.drop(columns=["ApplicationDate", "LoanApproved", "RiskScore"])

In [18]:
# ------------------------------------------------------
# Convert CreditScore to Categorical Labels
# ------------------------------------------------------
def credit_score_category(score):
    if score < 580:
        return "Poor"
    elif score < 670:
        return "Fair"
    elif score < 740:
        return "Good"
    else:
        return "Excellent"

df['CreditScoreCategory'] = df['CreditScore'].apply(credit_score_category)
df = df.drop(columns=["CreditScore"])

In [19]:
# ------------------------------------------------
#  Encode Categorical Columns Numerically
# ------------------------------------------------
categorical_cols = df.select_dtypes(include='object').columns
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# ------------------------------------------
#  Prepare Features and Target Labels
# ------------------------------------------
X = df.drop(columns=['CreditScoreCategory'])
y = df['CreditScoreCategory']

# -------------------------------
#  Split into Train & Test
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [20]:

# ----------------------------------------
#  Train Decision Tree Classifier (all features)
# ----------------------------------------
clf = DecisionTreeClassifier(criterion="gini", max_depth=None, random_state=42)
clf.fit(X_train, y_train)

# -------------------------------
#  Extract Important Features
# -------------------------------
importance = pd.Series(clf.feature_importances_, index=X.columns)
threshold = 0.05
selected_features = importance[importance > threshold].index.tolist()

print("\n All Feature Importances:")
print(importance.sort_values(ascending=False))

print(f"\n Selected Features (importance > {threshold}):")
print(selected_features)



 All Feature Importances:
BaseInterestRate              0.525263
LoanDuration                  0.231543
LoanAmount                    0.217971
MonthlyLoanPayment            0.010285
Experience                    0.001426
SavingsAccountBalance         0.001223
UtilityBillsPaymentHistory    0.001048
InterestRate                  0.000892
CreditCardUtilizationRate     0.000780
TotalAssets                   0.000746
MonthlyDebtPayments           0.000745
TotalLiabilities              0.000704
CheckingAccountBalance        0.000698
DebtToIncomeRatio             0.000671
NetWorth                      0.000585
Age                           0.000576
PaymentHistory                0.000522
JobTenure                     0.000520
TotalDebtToIncomeRatio        0.000506
MonthlyIncome                 0.000503
NumberOfOpenCreditLines       0.000481
LengthOfCreditHistory         0.000465
NumberOfDependents            0.000430
AnnualIncome                  0.000357
MaritalStatus                 0.00022

In [21]:

# ---------------------------------------------
# Train Model on Selected Features Only
# ---------------------------------------------
X_sel = df[selected_features]

X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(
    X_sel, y, test_size=0.2, random_state=42
)

clf_sel = DecisionTreeClassifier(criterion="gini", max_depth=None, random_state=42)
clf_sel.fit(X_train_sel, y_train_sel)

# -----------------------------------------
# Evaluate Model on Selected Features
# -----------------------------------------
y_pred_sel = clf_sel.predict(X_test_sel)
accuracy_sel = accuracy_score(y_test_sel, y_pred_sel)

print(f"\n Model Accuracy (Selected Features): {accuracy_sel:.4f}")
print("\n Classification Report:")
print(classification_report(y_test_sel, y_pred_sel))


 Model Accuracy (Selected Features): 0.9910

 Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     19150
           1       0.88      0.85      0.86       318
           2       0.99      0.99      0.99     20532

    accuracy                           0.99     40000
   macro avg       0.95      0.94      0.95     40000
weighted avg       0.99      0.99      0.99     40000



In [15]:

# Confusion matrix
# conf_matrix = confusion_matrix(y_test_sel, y_pred_sel)
# plt.figure(figsize=(8, 6))
# sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
#             xticklabels=clf_sel.classes_, yticklabels=clf_sel.classes_)
# plt.title("Confusion Matrix (Selected Features)")
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.tight_layout()
# plt.show()
