In [76]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import re

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
# from ydata_profiling import ProfileReport
import scipy.stats as stats 
from scipy.stats import mode

# Suppress warnings
import warnings 
warnings.filterwarnings("ignore")

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler,RobustScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight

# Model selection and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score,RandomizedSearchCV
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, accuracy_score, auc, precision_recall_curve, average_precision_score

# Machine learning models
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
import xgboost as xgb
import lightgbm as lgb

# Visualization of feature importances
from yellowbrick.model_selection import FeatureImportances

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('float_format', '{:f}'.format)

In [38]:
df = pd.read_csv('../data/processed/credit_score_cleaned_train.csv')
print('This dataset has %d rows dan %d columns.\n' % df.shape)
df.head()

This dataset has 100000 rows dan 28 columns.



Unnamed: 0,id,customer_id,month,name,age,ssn,occupation,annual_income,monthly_inhand_salary,total_emi_per_month,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.8434,49.574947,3,4,3,4,"['Auto Loan', 'Credit-Builder Loan', 'Personal...",3,7,11.27,4,Good,809.98,26.82262,265,No,80.4153,High_spent_Small_value_payments,312.49408,2
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.8434,49.574947,3,4,3,4,"['Auto Loan', 'Credit-Builder Loan', 'Personal...",-1,7,11.27,4,Good,809.98,31.94496,266,No,118.28022,Low_spent_Large_value_payments,284.62915,2
2,0x1604,CUS_0xd40,March,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.8434,49.574947,3,4,3,4,"['Auto Loan', 'Credit-Builder Loan', 'Personal...",3,7,11.27,4,Good,809.98,28.609352,267,No,81.699524,Low_spent_Medium_value_payments,331.20987,2
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.8434,49.574947,3,4,3,4,"['Auto Loan', 'Credit-Builder Loan', 'Personal...",5,4,6.27,4,Good,809.98,31.377861,268,No,199.45807,Low_spent_Small_value_payments,223.45131,2
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.8434,49.574947,3,4,3,4,"['Auto Loan', 'Credit-Builder Loan', 'Personal...",6,4,11.27,4,Good,809.98,24.797346,269,No,41.420155,High_spent_Medium_value_payments,341.48923,2


In [39]:
# List of unique loan values
unique_loan_types = ['Auto Loan', 'Credit-Builder Loan', 'Debt Consolidation Loan', 'Home Equity Loan', 'Mortgage Loan',
                     'No Loan', 'Not Specified', 'Payday Loan', 'Personal Loan', 'Student Loan']

# Adding a new column for each unique loan type and checking how many times it appears
for loan_type in unique_loan_types:
    # Replacing '-' and spaces with underscores, converting other characters to lowercase
    cleaned_loan_type = loan_type.replace(' ', '_').replace('-', '_').lower()

    # Counting how many times the loan_type value appears in each row
    df[cleaned_loan_type] = df['type_of_loan'].apply(lambda x: x.count(loan_type))


In [40]:
df.loc[df['delay_from_due_date'] < 0, 'delay_from_due_date'] = None

In [41]:
lower, upper = 0.5, 29.98

def handle_outliers_for_changed_credit_limit(df):
    df['changed_credit_limit'] = np.where(df['changed_credit_limit'] < lower, lower, df['changed_credit_limit'])
    df['changed_credit_limit'] = np.where(df['changed_credit_limit'] > upper, upper, df['changed_credit_limit'])
    return df

df = handle_outliers_for_changed_credit_limit(df)

In [42]:
# # Filtrer uniquement les colonnes numériques
# numeric_df = df.select_dtypes(include=['number'])

# # Identifier les colonnes avec des valeurs négatives
# negative_columns = (numeric_df < 0).any()

# # Afficher les colonnes avec des valeurs négatives et leur nombre
# for column in negative_columns[negative_columns].index:
#     negative_count = (numeric_df[column] < 0).sum()
#     print(f"Colonne '{column}' : {negative_count} valeurs négatives.")

In [43]:
# def impute_missing_values_for_column(data, column_name):
#     """
#     Impute missing values for a specific numeric column using a decision tree.
#     """
#     if column_name not in data.columns:
#         raise ValueError(f"La colonne '{column_name}' n'existe pas dans le DataFrame.")
    
#     if data[column_name].isnull().any():
#         print(f"Imputing missing values for column: {column_name}")
        
#         # Create a temporary DataFrame without missing values in the relevant column
#         temp_df = data.dropna(subset=[column_name])
        
#         # Separate features (X) and target (y) for imputation
#         features = [col for col in temp_df.columns if col != column_name and data[col].dtype != 'object']  # Exclude categorical features
#         X = temp_df[features]
#         y = temp_df[column_name]
        
#         # Use a regression model for numeric columns
#         model = DecisionTreeRegressor(random_state=42)
        
#         # Train the model with non-missing data
#         model.fit(X, y)
        
#         # Identify indices with missing values in the column
#         missing_values_index = data[data[column_name].isnull()].index
        
#         # Select features for rows with missing values
#         X_missing = data.loc[missing_values_index, features]
        
#         # Predict the missing values and impute them in the column
#         imputed_values = model.predict(X_missing)
#         data.loc[missing_values_index, column_name] = imputed_values
        
#         print(f"Imputation for '{column_name}' complete.")
#     else:
#         print(f"No missing values to impute for column: {column_name}")

# # Appliquer la fonction uniquement pour la colonne delay_from_due_date
# impute_missing_values_for_column(df, 'delay_from_due_date')


Imputing missing values for column: delay_from_due_date
Imputation for 'delay_from_due_date' complete.


In [45]:
df["credit_score"].value_counts(normalize=True)  # 2:Good, 1: Standard, 0: Poor

credit_score
1   0.531740
0   0.289980
2   0.178280
Name: proportion, dtype: float64

In [46]:
df = df.drop([
    "id", "customer_id", "name", "ssn", "type_of_loan"], axis=1)

df.head()

Unnamed: 0,month,age,occupation,annual_income,monthly_inhand_salary,total_emi_per_month,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score,auto_loan,credit_builder_loan,debt_consolidation_loan,home_equity_loan,mortgage_loan,no_loan,not_specified,payday_loan,personal_loan,student_loan
0,January,23,Scientist,19114.12,1824.8434,49.574947,3,4,3,4,3.0,7,11.27,4,Good,809.98,26.82262,265,No,80.4153,High_spent_Small_value_payments,312.49408,2,1,1,0,1,0,0,0,0,1,0
1,February,23,Scientist,19114.12,1824.8434,49.574947,3,4,3,4,3.0,7,11.27,4,Good,809.98,31.94496,266,No,118.28022,Low_spent_Large_value_payments,284.62915,2,1,1,0,1,0,0,0,0,1,0
2,March,23,Scientist,19114.12,1824.8434,49.574947,3,4,3,4,3.0,7,11.27,4,Good,809.98,28.609352,267,No,81.699524,Low_spent_Medium_value_payments,331.20987,2,1,1,0,1,0,0,0,0,1,0
3,April,23,Scientist,19114.12,1824.8434,49.574947,3,4,3,4,5.0,4,6.27,4,Good,809.98,31.377861,268,No,199.45807,Low_spent_Small_value_payments,223.45131,2,1,1,0,1,0,0,0,0,1,0
4,May,23,Scientist,19114.12,1824.8434,49.574947,3,4,3,4,6.0,4,11.27,4,Good,809.98,24.797346,269,No,41.420155,High_spent_Medium_value_payments,341.48923,2,1,1,0,1,0,0,0,0,1,0


In [47]:
payment_mapping = {
    'High_spent_Large_value_payments': 6,
    #Successfully managing large debts provides the most positive contribution to the credit score.
    'High_spent_Medium_value_payments': 5,
    #Medium-value payments with high spending positively impact the credit score.
    'High_spent_Small_value_payments': 4,
    #Small payments can negatively affect the credit score if debts accumulate over time.
    'Low_spent_Large_value_payments': 3,  #shows quick financial responsibility, positively affecting the credit score.
    'Low_spent_Medium_value_payments': 2,  #contributes positively to the credit score by demonstrating debt management.
    'Low_spent_Small_value_payments': 1
    #may limit the credit history and provide minimal contribution to the credit score
}

df['payment_behaviour'] = df['payment_behaviour'].map(payment_mapping)

df['payment_behaviour'] = pd.to_numeric(df['payment_behaviour'], downcast='integer')

In [48]:
# Convert the payment_of_min_amount column to numerical values
df['payment_of_min_amount'] = df['payment_of_min_amount'].map({'Yes': 1, 'No': 0})

df['payment_of_min_amount'] = pd.to_numeric(df['payment_of_min_amount'], downcast='integer')

In [49]:
label_encoder = LabelEncoder()
df['occupation'] = label_encoder.fit_transform(df['occupation'])

In [50]:
month_map = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8
}

#Mapping
df['month'] = df['month'].map(month_map)

df['month'] = pd.to_numeric(df['month'], downcast='integer')

In [72]:
# Separate features and target variable
X = df.drop("credit_score", axis=1)
y = df.credit_score

In [59]:
y.value_counts()

credit_score
1    53174
0    28998
2    17828
Name: count, dtype: int64

In [74]:
label_encoder = LabelEncoder()
non_numeric_columns = X.select_dtypes(include=['object', 'category']).columns
for col in non_numeric_columns:
    X[col] = label_encoder.fit_transform(X[col])

In [75]:
# smote for unbalanced data
smote = SMOTE()
X, y = smote.fit_resample(X, y)

In [63]:
# Set ratios for splitting
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# Step 1: Split into train (75%) and temp (25%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=1 - train_ratio, stratify=y, random_state=42)

# Step 2: Split temp (25%) into validation (15%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=test_ratio / (test_ratio + validation_ratio), 
    stratify=y_temp, 
    random_state=42
)

In [67]:
# Columns to apply RobustScaler
robust_columns = ['total_emi_per_month', 'amount_invested_monthly', 'monthly_balance']

# Columns to apply StandardScaler (all columns except robust columns)
standard_columns = [col for col in X_train.columns if col not in robust_columns]

# Create the ColumnTransformer
scaler = ColumnTransformer(
    transformers=[
        ('standard', StandardScaler(), standard_columns),
        ('robust', RobustScaler(), robust_columns)
    ]
)

# Apply scaling to training validation and test data 
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [77]:
# Calculate correlation matrix for the feature set X
corr_matrix = X.corr()

# Initialize an empty set to store features to drop
features_to_remove = set()

# Set a threshold for high correlation
threshold = 0.9

# Iterate through the correlation matrix and find highly correlated features
for col in corr_matrix.columns:
    for row in corr_matrix.index:
        # Avoid comparing a feature with itself
        if col != row and abs(corr_matrix.loc[row, col]) > threshold:
            # Add the feature with the higher index to the removal set
            if corr_matrix.columns.get_loc(row) < corr_matrix.columns.get_loc(col):
                features_to_remove.add(col)

# Convert the set to a list for easier handling
features_to_remove = list(features_to_remove)

# Display the highly correlated features
print("Highly correlated features (correlation > 0.9):", features_to_remove)

# Drop the highly correlated features from the dataset
X = X.drop(columns=features_to_remove)

# Display the updated feature set shape
print("Shape of X after removing highly correlated features:", X.shape)

# Continue with your train-test-validation split and scaling
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# Step 1: Split into train (75%) and temp (25%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=1 - train_ratio, stratify=y, random_state=42)

# Step 2: Split temp (25%) into validation (15%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=test_ratio / (test_ratio + validation_ratio),
    stratify=y_temp,
    random_state=42
)

# Scaling numeric data
robust_columns = ['total_emi_per_month', 'amount_invested_monthly', 'monthly_balance']
standard_columns = [col for col in X_train.columns if col not in robust_columns]

scaler = ColumnTransformer(
    transformers=[
        ('standard', StandardScaler(), standard_columns),
        ('robust', RobustScaler(), robust_columns)
    ]
)

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Print the shapes of train, validation, and test sets
print(f"X_train shape: {X_train_scaled.shape}")
print(f"X_val shape: {X_val_scaled.shape}")
print(f"X_test shape: {X_test_scaled.shape}")


Highly correlated features (correlation > 0.9): ['monthly_inhand_salary']
Shape of X after removing highly correlated features: (159522, 31)
X_train shape: (119641, 31)
X_val shape: (23928, 31)
X_test shape: (15953, 31)


In [78]:
def evaluate_model(model, X, y, dataset_name="Dataset"):
    y_pred = model.predict(X)
    print(f"Evaluation on {dataset_name}:")
    print("Accuracy:", accuracy_score(y, y_pred))
    print("Precision:", precision_score(y, y_pred, average='weighted'))
    print("Recall:", recall_score(y, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y, y_pred, average='weighted'))
    print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
    print("Classification Report:\n", classification_report(y, y_pred))
    print("\n")

In [80]:
# Train XGBoost
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='mlogloss', tree_method='hist', scale_pos_weight=compute_class_weight('balanced', classes=y.unique(), y=y).tolist())
xgb_model.fit(X_train_scaled, y_train)

# Evaluate XGBoost
evaluate_model(xgb_model, X_train_scaled, y_train, "Training Data (XGBoost)")
evaluate_model(xgb_model, X_val_scaled, y_val, "Validation Data (XGBoost)")
evaluate_model(xgb_model, X_test_scaled, y_test, "Test Data (XGBoost)")

Evaluation on Training Data (XGBoost):
Accuracy: 0.8591954263170652
Precision: 0.8600861901275055
Recall: 0.8591954263170652
F1 Score: 0.8576636278855538
Confusion Matrix:
 [[34829  2666  2386]
 [ 4863 30447  4570]
 [  352  2009 37519]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87     39881
           1       0.87      0.76      0.81     39880
           2       0.84      0.94      0.89     39880

    accuracy                           0.86    119641
   macro avg       0.86      0.86      0.86    119641
weighted avg       0.86      0.86      0.86    119641



Evaluation on Validation Data (XGBoost):
Accuracy: 0.8295720494817787
Precision: 0.829741882324793
Recall: 0.8295720494817787
F1 Score: 0.8274987629353684
Confusion Matrix:
 [[6724  711  541]
 [1148 5756 1072]
 [  89  517 7370]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84     

In [81]:
# Train LightGBM
lgbm_model = lgb.LGBMClassifier(random_state=42, class_weight='balanced')
lgbm_model.fit(X_train_scaled, y_train)

# Evaluate LightGBM
evaluate_model(lgbm_model, X_train_scaled, y_train, "Training Data (LightGBM)")
evaluate_model(lgbm_model, X_val_scaled, y_val, "Validation Data (LightGBM)")
evaluate_model(lgbm_model, X_test_scaled, y_test, "Test Data (LightGBM)")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2561
[LightGBM] [Info] Number of data points in the train set: 119641, number of used features: 31
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Evaluation on Training Data (LightGBM):
Accuracy: 0.7998846549259869
Precision: 0.8007360089464904
Recall: 0.7998846549259869
F1 Score: 0.7972008872673227
Confusion Matrix:
 [[32507  3722  3652]
 [ 6476 27070  6334]
 [  671  3087 36122]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82     39881
           1       0.80      0.68      0.73     39880
           2       0.78      0.91      0.84     39880

    