In [144]:
# Install scikit-learn if not already installed
#Import libraries
import pandas as pd #For data manipulation and analysis
import os #For file and directory manipulation
import numpy as np #For numerical operations
import matplotlib.pyplot as plt #For data visualization
import seaborn as sns #For statistical data visualization
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder #For data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

Data Preprocessing

In [145]:
# Load data
filepath = 'train.csv'  # Replace with your actual path
df_Train = pd.read_csv(filepath, sep=';')
# Display info and head
df_Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [146]:

display(df_Train.head(10))

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [147]:
# Drop duplicates
#before = df_Train.shape[0]
#df_Train = df_Train.drop_duplicates()
#after = df_Train.shape[0]
#print(f"Duplicates dropped: {before - after}")
#print(f"New shape: {df_Train.shape}")


In [148]:
# Check missing values
#print(df_Train.isnull().sum())

In [149]:

# Impute missing values (here: drop rows with any missing values)
#df_Train = df_Train.dropna()
#print(df_Train.isnull().sum())
#print(f"Dataset shape after dropping missing values: {df_Train.shape}")

In [150]:
# List of columns to apply ratio-based imputation
columns_to_impute = ['Gender', 'Dependents', 'Self_Employed', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# Create a copy for imputation (optional)
df_imputed = df_Train.copy()

# Loop through columns and fill missing values based on observed value distributions
for col in columns_to_impute:
    if df_imputed[col].isnull().any():
        value_ratios = df_imputed[col].value_counts(normalize=True, dropna=True)
        missing_mask = df_imputed[col].isnull()
        imputed_values = np.random.choice(
            value_ratios.index,
            size=missing_mask.sum(),
            p=value_ratios.values
        )
        df_imputed.loc[missing_mask, col] = imputed_values

# Replace df_Train with the imputed version (optional)
df_Train = df_imputed

#Save the cleaned data to a new CSV file
df_Train.to_csv('cleaned_data.csv', index=False)

# Check for remaining missing values
print(df_Train.isnull().sum())
print(f"Dataset shape after imputation: {df_Train.shape}")

Loan_ID              0
Gender               0
Married              3
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64
Dataset shape after imputation: (614, 13)


In [151]:
# Encode categorical variables
# Encode categorical variables
label_cols = df_Train.select_dtypes(include=['object']).columns.tolist()
le = LabelEncoder()
for col in label_cols:
    df_Train[col] = le.fit_transform(df_Train[col].astype(str))



Modelling | XGboost

In [152]:
# Define features and target
X = df_Train.drop(columns=['Loan_ID', 'Loan_Status', 'Property_Area', 'Education', 'Married','Gender' ], axis=1)
y = df_Train['Loan_Status']

# First split: 70% train, 30% temp
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)

# Second split: 15% validation, 15% test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)


In [153]:
def decode_vector(vector):
    max_depth = int(3 + vector[0] * 7)
    learning_rate = 0.01 + vector[1] * 0.29
    subsample = 0.5 + vector[2] * 0.5
    colsample_bytree = 0.5 + vector[3] * 0.5
    return max_depth, learning_rate, subsample, colsample_bytree

def evaluate_fitness(vector):
    max_depth, learning_rate, subsample, colsample_bytree = decode_vector(vector)
    model = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        n_estimators=30,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        tree_method='hist',
        random_state=42,
        verbosity=0
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return accuracy_score(y_val, preds)

def four_vector_optimization(pop_size=5, iterations=5, alpha=0.1):
    population = np.random.rand(pop_size, 4)
    fitness = np.array([evaluate_fitness(ind) for ind in population])
    best_idx = np.argmax(fitness)
    best_vector = population[best_idx].copy()
    
    for _ in range(iterations):
        for i in range(pop_size):
            new_vector = population[i] + alpha * (best_vector - population[i]) + alpha * np.random.rand(4)
            new_vector = np.clip(new_vector, 0, 1)
            new_fitness = evaluate_fitness(new_vector)
            if new_fitness > fitness[i]:
                population[i] = new_vector
                fitness[i] = new_fitness
                if new_fitness > fitness[best_idx]:
                    best_idx = i
                    best_vector = new_vector.copy()
                    
    return decode_vector(best_vector), fitness[best_idx]

# Optimize and train model
best_params, best_acc = four_vector_optimization()
print("Best Hyperparameters:")
print(f" Max_depth: {best_params[0]}")
print(f" Learning_rate: {best_params[1]:.4f}")
print(f" Subsample: {best_params[2]:.4f}")
print(f" Colsample_bytree: {best_params[3]:.4f}")
print(f"Validation Accuracy: {best_acc:.4f}")

Best Hyperparameters:
 Max_depth: 5
 Learning_rate: 0.0783
 Subsample: 0.5092
 Colsample_bytree: 0.9736
Validation Accuracy: 0.8478


In [154]:
best_params, best_acc = four_vector_optimization()
max_depth, learning_rate, subsample, colsample_bytree = best_params

#Initialize and train the XGBoost model with optimized hyperparameters
xgb_model = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=100,  # You may keep or adjust this
    max_depth=max_depth,
    learning_rate=learning_rate,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7419354838709677

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.62      0.60        29
           1       0.82      0.80      0.81        64

    accuracy                           0.74        93
   macro avg       0.70      0.71      0.70        93
weighted avg       0.75      0.74      0.74        93


Confusion Matrix:
[[18 11]
 [13 51]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


4 - Vector Optimization 

Explainability 

In [155]:
import shap

# Create a SHAP explainer for your trained XGBoost model
explainer = shap.TreeExplainer(xgb_model)

# Calculate SHAP values for the test set
shap_values = explainer.shap_values(X_test)



In [156]:

from IPython.display import display, HTML

# Show full text in DataFrame columns
pd.set_option('display.max_colwidth', None)

num_applicants = 50  # Show top 50 applicants
num_features = 5    # Top 5 contributing features

explanations = []

# Helper function for value readability
def readable_value(fname, val):
    if isinstance(val, (bool, np.bool_)):
        return "Yes" if val else "No"
    if fname.endswith("_Yes") or fname.endswith("_No"):
        return "Yes" if val == 1 else "No"
    if "_" in fname and (val == 1 or val == 0):
        base, cat = fname.rsplit("_", 1)
        if val == 1:
            return f"is {cat}"
        else:
            return f"is not {cat}"
    return val

# Loop through top applicants in X_test
for i in range(min(num_applicants, X_test.shape[0])):
    shap_row = shap_values[i]
    feature_values = X_test.iloc[i]
    prediction = xgb_model.predict(X_test.iloc[[i]])[0]
    outcome = 'Approved' if prediction == 1 else 'Rejected'

    top_idx = np.argsort(np.abs(shap_row))[::-1][:num_features]
    reasons = []
    tips = set()

    for idx in top_idx:
        fname = X_test.columns[idx]
        val = feature_values[fname]
        val_str = readable_value(fname, val)
        impact = shap_row[idx]

        # Approved: show only positive contributions
        if outcome == 'Approved' and impact > 0:
            reasons.append(f"<b>{fname}</b> contributed positively to loan approval.")
        
        # Rejected: show only negative contributions and give advice
        elif outcome == 'Rejected' and impact < 0:
            reasons.append(f"<b>{fname}</b> contributed negatively to loan approval.")
            
            # Add actionable tips only if negative
            if 'income' in fname.lower():
                tips.add("Try to increase your income or add a co-applicant.")
            elif 'credit' in fname.lower():
                tips.add("Focus on improving your credit score by managing debt responsibly.")
            elif 'loan' in fname.lower() and 'amount' in fname.lower():
                tips.add("Consider applying for a smaller loan amount or increasing your repayment term.")
            elif 'dependents' in fname.lower():
                tips.add("Reducing financial dependents or applying jointly may help.")
            elif 'history' in fname.lower() or 'defaults' in fname.lower():
                tips.add("Maintain a clean repayment history to improve approval chances.")

    loan_id = feature_values['Loan_ID'] if 'Loan_ID' in feature_values else f"Loan_{i+1}"
    advice_text = " ".join(tips) if outcome == 'Rejected' and tips else "No specific advice. Keep up the good financial habits."

    explanations.append({
        'Loan_Id': loan_id,
        'Outcome': outcome,
        'Reasons': " ".join(reasons) if reasons else "No major contributing factors found.",
        'Advice': advice_text
    })

explanation_df = pd.DataFrame(explanations)

# Display nicely
display(HTML(explanation_df.to_html(escape=False)))

Unnamed: 0,Loan_Id,Outcome,Reasons,Advice
0,Loan_1,Approved,Credit_History contributed positively to loan approval. ApplicantIncome contributed positively to loan approval.,No specific advice. Keep up the good financial habits.
1,Loan_2,Approved,ApplicantIncome contributed positively to loan approval. Credit_History contributed positively to loan approval. Dependents contributed positively to loan approval.,No specific advice. Keep up the good financial habits.
2,Loan_3,Rejected,LoanAmount contributed negatively to loan approval. CoapplicantIncome contributed negatively to loan approval. Loan_Amount_Term contributed negatively to loan approval.,Try to increase your income or add a co-applicant. Consider applying for a smaller loan amount or increasing your repayment term.
3,Loan_4,Rejected,Credit_History contributed negatively to loan approval. ApplicantIncome contributed negatively to loan approval. CoapplicantIncome contributed negatively to loan approval. Dependents contributed negatively to loan approval.,Try to increase your income or add a co-applicant. Reducing financial dependents or applying jointly may help. Focus on improving your credit score by managing debt responsibly.
4,Loan_5,Approved,Credit_History contributed positively to loan approval. ApplicantIncome contributed positively to loan approval. LoanAmount contributed positively to loan approval.,No specific advice. Keep up the good financial habits.
5,Loan_6,Rejected,Credit_History contributed negatively to loan approval. Loan_Amount_Term contributed negatively to loan approval. CoapplicantIncome contributed negatively to loan approval.,Consider applying for a smaller loan amount or increasing your repayment term. Try to increase your income or add a co-applicant. Focus on improving your credit score by managing debt responsibly.
6,Loan_7,Rejected,Credit_History contributed negatively to loan approval. CoapplicantIncome contributed negatively to loan approval. Dependents contributed negatively to loan approval.,Try to increase your income or add a co-applicant. Reducing financial dependents or applying jointly may help. Focus on improving your credit score by managing debt responsibly.
7,Loan_8,Approved,LoanAmount contributed positively to loan approval. Credit_History contributed positively to loan approval. ApplicantIncome contributed positively to loan approval. CoapplicantIncome contributed positively to loan approval.,No specific advice. Keep up the good financial habits.
8,Loan_9,Rejected,ApplicantIncome contributed negatively to loan approval. LoanAmount contributed negatively to loan approval. CoapplicantIncome contributed negatively to loan approval.,Try to increase your income or add a co-applicant. Consider applying for a smaller loan amount or increasing your repayment term.
9,Loan_10,Approved,LoanAmount contributed positively to loan approval. Dependents contributed positively to loan approval. Credit_History contributed positively to loan approval. ApplicantIncome contributed positively to loan approval.,No specific advice. Keep up the good financial habits.
