In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import re

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
# from ydata_profiling import ProfileReport
import scipy.stats as stats 
from scipy.stats import mode

# Suppress warnings
import warnings 
warnings.filterwarnings("ignore")

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler,RobustScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight

# Model selection and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score,RandomizedSearchCV
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, accuracy_score, auc, precision_recall_curve, average_precision_score

# Machine learning models
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
import xgboost as xgb
import lightgbm as lgb

# Visualization of feature importances
from yellowbrick.model_selection import FeatureImportances

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('float_format', '{:f}'.format)

In [14]:
df = pd.read_csv('../data/processed/train_cleaned.csv')
print('This dataset has %d rows dan %d columns.\n' % df.shape)
df.head()

This dataset has 100000 rows dan 27 columns.



Unnamed: 0,ID,Customer_ID,Month,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,5634,3392,1,23,821000265.0,12,19114.12,1824.843333,3.0,4,3.0,4.0,135,3.0,7.0,11.27,4.0,2,809.98,26.82262,265.0,1,49.574949,80.415295,2,312.494089,Good
1,5635,3392,2,23,821000265.0,12,19114.12,1584.78875,3.0,4,3.0,4.0,135,3.0,5.0,11.27,4.0,1,809.98,31.94496,265.0,1,49.574949,118.280222,3,284.629162,Good
2,5636,3392,3,14,821000265.0,12,19114.12,1583.388333,3.0,4,3.0,4.0,135,3.0,7.0,9.27,4.0,1,809.98,28.609352,267.0,1,49.574949,81.699521,4,331.209863,Good
3,5637,3392,4,23,821000265.0,12,19114.12,1569.50875,3.0,4,3.0,4.0,135,5.0,4.0,6.27,4.0,1,809.98,31.377862,268.0,1,49.574949,199.458074,5,223.45131,Good
4,5638,3392,5,23,821000265.0,12,19114.12,1824.843333,3.0,4,3.0,4.0,135,6.0,0.0,11.27,4.0,1,809.98,24.797347,269.0,1,49.574949,41.420153,1,341.489231,Good


In [15]:
df = df.drop([
    "ID", "Customer_ID",  "SSN", "Type_of_Loan"], axis=1)

df.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,1,23,12,19114.12,1824.843333,3.0,4,3.0,4.0,3.0,7.0,11.27,4.0,2,809.98,26.82262,265.0,1,49.574949,80.415295,2,312.494089,Good
1,2,23,12,19114.12,1584.78875,3.0,4,3.0,4.0,3.0,5.0,11.27,4.0,1,809.98,31.94496,265.0,1,49.574949,118.280222,3,284.629162,Good
2,3,14,12,19114.12,1583.388333,3.0,4,3.0,4.0,3.0,7.0,9.27,4.0,1,809.98,28.609352,267.0,1,49.574949,81.699521,4,331.209863,Good
3,4,23,12,19114.12,1569.50875,3.0,4,3.0,4.0,5.0,4.0,6.27,4.0,1,809.98,31.377862,268.0,1,49.574949,199.458074,5,223.45131,Good
4,5,23,12,19114.12,1824.843333,3.0,4,3.0,4.0,6.0,0.0,11.27,4.0,1,809.98,24.797347,269.0,1,49.574949,41.420153,1,341.489231,Good


In [16]:
df["Credit_Score"].value_counts()

Credit_Score
Standard    53174
Poor        28998
Good        17828
Name: count, dtype: int64

In [17]:
label_encoder = LabelEncoder()
df["Credit_Score"] = label_encoder.fit_transform(df["Credit_Score"])

In [18]:
X = df.drop(columns=['Credit_Score'])
y = df['Credit_Score']

In [19]:
y.value_counts()

Credit_Score
2    53174
1    28998
0    17828
Name: count, dtype: int64

In [20]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [22]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# Split into train (75%) and temp (25%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_res, y_res, test_size=1 - train_ratio, stratify=y_res, random_state=42
)

# Split temp (25%) into validation (15%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=test_ratio / (test_ratio + validation_ratio),
    stratify=y_temp,
    random_state=42
)

In [25]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Print the shapes of the datasets
print(f"X_train shape: {X_train_scaled.shape}")
print(f"X_val shape: {X_val_scaled.shape}")
print(f"X_test shape: {X_test_scaled.shape}")
print(f"y_train distribution:\n{y_train.value_counts(normalize=True)}")
print(f"y_val distribution:\n{y_val.value_counts(normalize=True)}")
print(f"y_test distribution:\n{y_test.value_counts(normalize=True)}")

X_train shape: (119641, 22)
X_val shape: (23928, 22)
X_test shape: (15953, 22)
y_train distribution:
Credit_Score
0   0.333339
1   0.333331
2   0.333331
Name: proportion, dtype: float64
y_val distribution:
Credit_Score
1   0.333333
0   0.333333
2   0.333333
Name: proportion, dtype: float64
y_test distribution:
Credit_Score
2   0.333354
1   0.333354
0   0.333292
Name: proportion, dtype: float64


In [30]:
def evaluate_model(model, X, y, dataset_name):
    # Predict class probabilities
    y_pred_proba = model.predict_proba(X)  # Returns probabilities

    # Predict class labels
    y_pred = model.predict(X)

    print(f"--- Evaluation for {dataset_name} ---")
    print(f"Accuracy: {accuracy_score(y, y_pred):.4f}")
    print(f"Recall: {recall_score(y, y_pred, average='weighted'):.4f}")
    print(f"F1 Score: {f1_score(y, y_pred, average='weighted'):.4f}")
    
    # Multi-class ROC AUC
    if len(y.unique()) > 2:  # Multi-class
        print(f"ROC AUC: {roc_auc_score(y, y_pred_proba, multi_class='ovr'):.4f}")
    else:  # Binary
        print(f"ROC AUC: {roc_auc_score(y, y_pred_proba[:, 1]):.4f}")
    
    print("\nClassification Report:")
    print(classification_report(y, y_pred))

In [31]:
xgb_model = xgb.XGBClassifier(
    random_state=42, 
    eval_metric="mlogloss", 
    use_label_encoder=False, 
    tree_method="hist"
)

# LightGBM Model
lgb_model = lgb.LGBMClassifier(
    random_state=42, 
    objective="multiclass"
)

In [32]:
#  Train XGBoost
xgb_model.fit(X_train_scaled, y_train)
# Evaluate XGBoost
evaluate_model(xgb_model, X_train_scaled, y_train, "Training Data (XGBoost)")
evaluate_model(xgb_model, X_val_scaled, y_val, "Validation Data (XGBoost)")
evaluate_model(xgb_model, X_test_scaled, y_test, "Test Data (XGBoost)")

# Train LightGBM
lgb_model.fit(X_train_scaled, y_train)
# Evaluate LightGBM
evaluate_model(lgb_model, X_train_scaled, y_train, "Training Data (LightGBM)")
evaluate_model(lgb_model, X_val_scaled, y_val, "Validation Data (LightGBM)")
evaluate_model(lgb_model, X_test_scaled, y_test, "Test Data (LightGBM)")

--- Evaluation for Training Data (XGBoost) ---
Accuracy: 0.8517
Recall: 0.8517
F1 Score: 0.8506
ROC AUC: 0.9619

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.93      0.89     39881
           1       0.86      0.86      0.86     39880
           2       0.84      0.77      0.80     39880

    accuracy                           0.85    119641
   macro avg       0.85      0.85      0.85    119641
weighted avg       0.85      0.85      0.85    119641

--- Evaluation for Validation Data (XGBoost) ---
Accuracy: 0.8107
Recall: 0.8107
F1 Score: 0.8093
ROC AUC: 0.9405

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.85      7976
           1       0.83      0.81      0.82      7976
           2       0.78      0.72      0.75      7976

    accuracy                           0.81     23928
   macro avg       0.81      0.81      0.81     23928
weighted avg       

In [34]:
bag_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=10, random_state=42),
    n_estimators=200, 
    random_state=42
)

In [35]:
bag_model.fit(X_train_scaled, y_train)

# Evaluate the model on Training, Validation, and Test sets
print("\nBagging Classifier Performance:")
evaluate_model(bag_model, X_train_scaled, y_train, "Training Data (Bagging)")
evaluate_model(bag_model, X_val_scaled, y_val, "Validation Data (Bagging)")
evaluate_model(bag_model, X_test_scaled, y_test, "Test Data (Bagging)")


Bagging Classifier Performance:
--- Evaluation for Training Data (Bagging) ---
Accuracy: 0.7787
Recall: 0.7787
F1 Score: 0.7769
ROC AUC: 0.9087

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81     39881
           1       0.81      0.79      0.80     39880
           2       0.77      0.68      0.72     39880

    accuracy                           0.78    119641
   macro avg       0.78      0.78      0.78    119641
weighted avg       0.78      0.78      0.78    119641

--- Evaluation for Validation Data (Bagging) ---
Accuracy: 0.7516
Recall: 0.7516
F1 Score: 0.7493
ROC AUC: 0.8928

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.85      0.79      7976
           1       0.79      0.76      0.77      7976
           2       0.73      0.64      0.68      7976

    accuracy                           0.75     23928
   macro avg       0.75      0.75      0

In [42]:
# from catboost import CatBoostClassifier

# cat_model = CatBoostClassifier(
#     random_seed=42, 
#     loss_function='MultiClass', 
#     eval_metric='Accuracy',
#     verbose=100, 
#     iterations=500
# )
# cat_model.fit(X_train_scaled, y_train, eval_set=(X_val_scaled, y_val))


# # Evaluate the model on Training, Validation, and Test sets
# print("\nBagging Classifier Performance:")
# evaluate_model(bag_model, X_train_scaled, y_train, "Training Data (Bagging)")
# evaluate_model(bag_model, X_val_scaled, y_val, "Validation Data (Bagging)")
# evaluate_model(bag_model, X_test_scaled, y_test, "Test Data (Bagging)")
