 ## Extract and display the important features for predicting Credit Score Category using a Decision Tree


In [None]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# -------------------------
#  Load the dataset
# -------------------------
# Columns treated as irrelevant/unused for this analysis; they are removed
# immediately after loading.
unused_columns = ["ApplicationDate", "LoanApproved", "RiskScore"]

# Read the raw CSV and discard the unused columns in a single step.
df = pd.read_csv("data.csv").drop(columns=unused_columns)
# print(df.head())  # uncomment to preview the first rows

In [None]:
# ------------------------------------------------------
# Convert CreditScore to Categorical Labels
# ------------------------------------------------------
def credit_score_category(score):
    """Return the category label for a numeric credit score.

    Bands (each upper bound is exclusive):
        score < 580         -> "Poor"
        580 <= score < 670  -> "Fair"
        670 <= score < 740  -> "Good"
        otherwise           -> "Excellent"

    Note: a value for which every ``<`` comparison is False (for example
    NaN) falls through to "Excellent".
    """
    # Ordered (exclusive upper bound, label) pairs; the first bound the score
    # is below decides the category.
    bands = ((580, "Poor"), (670, "Fair"), (740, "Good"))
    for upper_bound, label in bands:
        if score < upper_bound:
            return label
    return "Excellent"

# Derive the categorical label from the raw score, then replace the raw
# CreditScore column with it (the new column ends up last, as before).
score_categories = df['CreditScore'].apply(credit_score_category)
df = df.drop(columns=["CreditScore"]).assign(CreditScoreCategory=score_categories)

In [None]:
# ------------------------------------------------
#  Encode Categorical Columns Numerically
# ------------------------------------------------
# Every object-dtype column is replaced by integer codes.
# NOTE(review): this is expected to include the string-valued
# CreditScoreCategory target as well - confirm against the loaded dtypes.
categorical_cols = df.select_dtypes(include='object').columns

# Fit one encoder per categorical column and keep it, keyed by column name,
# so the integer codes can be mapped back to the original labels later.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}

# Overwrite each categorical column with its encoded values.
for col, le in label_encoders.items():
    df[col] = le.transform(df[col])

# ------------------------------------------
#  Prepare Features and Target Labels
# ------------------------------------------
# The target is the credit score category; every other column is a feature.
target_column = 'CreditScoreCategory'
y = df[target_column]
X = df.drop(columns=[target_column])

# -------------------------------
#  Split into Train & Test
# -------------------------------
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:

# ----------------------------------------
#  Train Decision Tree Classifier (all features)
# ----------------------------------------
# Gini criterion, no depth limit, fixed seed for repeatable results.
clf = DecisionTreeClassifier(
    criterion="gini", max_depth=None, random_state=42
).fit(X_train, y_train)

# -------------------------------
#  Extract Important Features
# -------------------------------
# Pair each importance value with its feature name.
importance = pd.Series(clf.feature_importances_, index=X.columns)

# Keep only the features whose importance is strictly above the threshold.
threshold = 0.05
selected_features = [
    feature for feature, weight in importance.items() if weight > threshold
]

ranked_importance = importance.sort_values(ascending=False)
print("\n All Feature Importances:")
print(ranked_importance)

print(f"\n Selected Features (importance > {threshold}):")
print(selected_features)


In [None]:

# ---------------------------------------------
# Train Model on Selected Features Only
# ---------------------------------------------
# If no feature cleared the importance threshold, the feature matrix would
# have zero columns and the classifier would reject it with an unhelpful
# message. Fail early with an explicit explanation instead.
if not selected_features:
    raise ValueError(
        "No feature exceeded the importance threshold; "
        "lower the threshold before training on selected features."
    )

X_sel = df[selected_features]

# Same test_size and random_state as the all-features split, so the
# selected-feature model is evaluated with identical split settings.
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(
    X_sel, y, test_size=0.2, random_state=42
)

clf_sel = DecisionTreeClassifier(criterion="gini", max_depth=None, random_state=42)
clf_sel.fit(X_train_sel, y_train_sel)

# -----------------------------------------
# Evaluate Model on Selected Features
# -----------------------------------------
y_pred_sel = clf_sel.predict(X_test_sel)
accuracy_sel = accuracy_score(y_test_sel, y_pred_sel)

print(f"\n Model Accuracy (Selected Features): {accuracy_sel:.4f}")
print("\n Classification Report:")

# The target was label-encoded to integers earlier, so a plain report would
# list opaque codes. When the target's encoder is available, show the
# original category names instead; otherwise fall back to the plain report.
target_encoder = label_encoders.get('CreditScoreCategory')
if target_encoder is None:
    print(classification_report(y_test_sel, y_pred_sel))
else:
    # LabelEncoder codes are the positions of the labels in classes_.
    class_codes = list(range(len(target_encoder.classes_)))
    class_names = [str(name) for name in target_encoder.classes_]
    print(
        classification_report(
            y_test_sel,
            y_pred_sel,
            labels=class_codes,
            target_names=class_names,
        )
    )

In [None]:

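# Confusion matrix heatmap for the selected-features model
# (currently disabled; uncomment the lines below to plot it)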
# conf_matrix = confusion_matrix(y_test_sel, y_pred_sel)
# plt.figure(figsize=(8, 6))
# sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
#             xticklabels=clf_sel.classes_, yticklabels=clf_sel.classes_)
# plt.title("Confusion Matrix (Selected Features)")
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.tight_layout()
# plt.show()
