<a href="https://colab.research.google.com/github/jovitaand/Default-of-Credit-Card-Clients/blob/main/Default_Of_Credit_Card_Clients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import gc
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm as lgb
import xgboost as xgb

# Raise pandas' display limit to 100 columns for DataFrame output in this notebook.
pd.set_option('display.max_columns', 100)

In [None]:
# Location of the raw workbook inside the Colab runtime.
file_path = '/content/default of credit card clients.xlsx'

# Open the workbook only long enough to list its sheets. The context manager
# closes the underlying file handle once the names are read, instead of
# leaving the ExcelFile open for the rest of the session.
with pd.ExcelFile(file_path) as data:
    # checking if there are any other excel sheets
    sheet_names = data.sheet_names
print(sheet_names)

In [None]:
import pandas as pd

# Read the workbook into a DataFrame and preview the first rows.
data = pd.read_excel(io="/content/default of credit card clients.xlsx")
data.head()

In [None]:
# Count the missing values in every column
data.isna().sum()

In [None]:
# Print the dtype and non-null count of every column.
data.info()  # looks like we would need to delete the first row and shift the content up

In [None]:
# Column names assigned by hand from the observed layout of the sheet
column_names = (
    ["ID", "LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE"]
    + ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
    + [f"BILL_AMT{month}" for month in range(1, 7)]
    + [f"PAY_AMT{month}" for month in range(1, 7)]
    + ["default.payment.next.month"]
)
data.columns = column_names

# Drop the first row (the duplicated header row) and renumber the rest from 0
data = data.iloc[1:].reset_index(drop=True)

# Coerce every column except the first (ID) to numeric;
# cells that cannot be parsed become NaN
data = data.assign(
    **{
        name: pd.to_numeric(data[name], errors='coerce')
        for name in data.columns[1:]
    }
)

# Show the cleaned frame to confirm the structure
data


In [None]:
# Print the dtype and non-null count of every column of `data`.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

# Feature Engineering

The training and testing datasets have been enhanced with the following new features:
1. `CREDIT_UTILIZATION`: The total bill amount as a proportion of the credit limit.
2. `AVG_REPAY_DELAY`: The average repayment delay across the last six months
3. `MAX_REPAY_DELAY`: The maximum repayment delay in the last six months.
4. `TOTAL_PAYMENTS`: The total payment amount over the last six months.
5. `TOTAL_BILLS`: The total bill amount over the last six months.
6. `PAYMENT_TO_BILL_RATIO`: The ratio of total payments to total bills.

In [None]:
# Feature Engineering: Adding meaningful features to the training and testing datasets

def add_features(data):
    """Return a copy of ``data`` extended with six engineered credit features.

    Added columns, in this order:

    * ``CREDIT_UTILIZATION``: sum of ``BILL_AMT1..6`` divided by ``LIMIT_BAL``.
    * ``AVG_REPAY_DELAY``: row-wise mean of ``PAY_0``, ``PAY_2`` .. ``PAY_6``.
    * ``MAX_REPAY_DELAY``: row-wise maximum of the same columns.
    * ``TOTAL_PAYMENTS``: sum of ``PAY_AMT1..6``.
    * ``TOTAL_BILLS``: sum of ``BILL_AMT1..6``.
    * ``PAYMENT_TO_BILL_RATIO``: ``TOTAL_PAYMENTS / TOTAL_BILLS``, set to 0
      wherever ``TOTAL_BILLS`` is 0.

    Args:
        data: DataFrame containing ``LIMIT_BAL`` and the ``PAY_*``,
            ``BILL_AMT*`` and ``PAY_AMT*`` columns listed above.

    Returns:
        A new DataFrame with the extra columns; the input is left untouched.
    """
    bill_cols = [f'BILL_AMT{month}' for month in range(1, 7)]
    payment_cols = [f'PAY_AMT{month}' for month in range(1, 7)]
    delay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

    # Work on a copy so the caller's frame (possibly a slice of another
    # frame) is never modified behind its back.
    data = data.copy()

    # Each total is computed once and reused below.
    total_bills = data[bill_cols].sum(axis=1)
    total_payments = data[payment_cols].sum(axis=1)

    # Credit Utilization Rate
    data['CREDIT_UTILIZATION'] = total_bills / data['LIMIT_BAL']

    # Average and Maximum Repayment Delay
    data['AVG_REPAY_DELAY'] = data[delay_cols].mean(axis=1)
    data['MAX_REPAY_DELAY'] = data[delay_cols].max(axis=1)

    # Total Payments and Total Bills
    data['TOTAL_PAYMENTS'] = total_payments
    data['TOTAL_BILLS'] = total_bills

    # Payment-to-Bill Ratio.
    # A plain division yields +/-inf (not NaN) when payments are non-zero and
    # bills are zero, so fillna(0) alone would miss those rows. Masking the
    # zero denominators first turns every division by zero into NaN, which
    # fillna(0) then maps to 0.
    ratio = total_payments / total_bills.where(total_bills != 0)
    data['PAYMENT_TO_BILL_RATIO'] = ratio.fillna(0)

    return data

# Enrich both the training and the testing features with the engineered columns.
# NOTE: X_train_balanced and X_test are created in the train/test split cell
# under "Model Training" further down; that cell has to run before this one.
X_train_balanced, X_test = (
    add_features(frame) for frame in (X_train_balanced, X_test)
)

# Preview the first rows of the enriched training data
X_train_balanced.head()


We are going to downsize the data by retaining only the important features:
1. `LIMIT_BAL` - Credit Limit
2. `SEX`- Gender
3. `EDUCATION` - Educational Level
4. `MARRIAGE` - Marital Status
5. `AGE` - Age of the individual
6. `CREDIT_UTILIZATION` - Credit utilization rate
7. `AVG_REPAY_DELAY` - Average repayment delay
8. `MAX_REPAY_DELAY` - Maximum repayment delay
9. `TOTAL_PAYMENTS` - Total payment amounts
10. `TOTAL_BILLS` - Total billed amounts
11. `PAYMENT_TO_BILL_RATIO` - Ratio of total payments to total bills

In [None]:
# Columns kept in the downsized frame `data1`:
# five raw attributes followed by the six engineered features
selected_features = [
    "LIMIT_BAL",
    "SEX",
    "EDUCATION",
    "MARRIAGE",
    "AGE",
    "CREDIT_UTILIZATION",
    "AVG_REPAY_DELAY",
    "MAX_REPAY_DELAY",
    "TOTAL_PAYMENTS",
    "TOTAL_BILLS",
    "PAYMENT_TO_BILL_RATIO",
]

# Build data1 from the balanced training features, keeping only those columns
data1 = X_train_balanced.loc[:, selected_features]

# Preview the first rows of the downsized dataset
data1.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical summary of the selected features (displayed in a later cell)
feature_stats = data1.describe()

# Pairwise correlations between the selected features
correlation_matrix = data1.corr()

# Heatmap of the correlation matrix, annotated with two-decimal coefficients
_, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

# One histogram (with KDE overlay) per feature, each on its own figure
for column_name in data1.columns:
    _, hist_ax = plt.subplots(figsize=(8, 4))
    sns.histplot(
        data1[column_name],
        kde=True,
        bins=30,
        color="blue",
        edgecolor="black",
        ax=hist_ax,
    )
    hist_ax.set_title(f"Distribution of {column_name}")
    hist_ax.set_xlabel(column_name)
    hist_ax.set_ylabel("Frequency")
    hist_ax.grid(axis="y")
    plt.show()

### Interpretation of Visualizations

#### 1. Feature Correlation Heatmap
**High Correlations:**
- **CREDIT_UTILIZATION and TOTAL_BILLS**: These features are strongly correlated, as higher credit utilization is often driven by higher total bills.
- **TOTAL_BILLS and LIMIT_BAL**: Individuals with higher credit limits tend to have higher total bills.

**Weak Correlations:**
- **AGE**, **MARRIAGE**, **SEX**, and most demographic features show weak correlations with numeric features like **CREDIT_UTILIZATION**, suggesting less direct influence on financial behavior.

---

#### 2. Feature Distributions
- **LIMIT_BAL (Credit Limit):**
  - The distribution is right-skewed, with most values clustering below 300,000.
  - Few individuals have exceptionally high limits, creating outliers.

- **AGE:**
  - A fairly normal distribution, peaking around 30–40 years.
  - Few data points for ages above 60.

- **CREDIT_UTILIZATION:**
  - A right-skewed distribution, with a peak near lower utilization rates.
  - A small number of extreme outliers suggest heavy over-utilization.

- **AVG_REPAY_DELAY and MAX_REPAY_DELAY:**
  - Peaks near 0 indicate that most individuals either repay on time or experience minor delays.
  - A smaller portion of extreme delays shows heightened risk behaviors.

- **TOTAL_PAYMENTS and TOTAL_BILLS:**
  - Both are heavily right-skewed, with the majority of values concentrated at lower ranges.
  - Extreme values likely indicate higher-income individuals or atypical billing patterns.

- **PAYMENT_TO_BILL_RATIO:**
  - Distribution indicates most individuals pay a small fraction of their bills regularly.
  - Negative or extreme high ratios likely result from specific billing anomalies or refunds.


In [None]:
# Display the statistical summary (the `data1.describe()` result stored earlier) as plain output
feature_stats

This summary highlights:

* Wide ranges in `TOTAL_PAYMENTS` and `TOTAL_BILLS`.
* Potential outliers in features like `CREDIT_UTILIZATION`, `PAYMENT_TO_BILL_RATIO`, and `TOTAL_BILLS` with extreme values.
* Some rows may require further review or capping for outliers.

# Next Steps
1. Handle Outliers:

* Address extreme values in key features like `CREDIT_UTILIZATION`, `TOTAL_BILLS`, and `PAYMENT_TO_BILL_RATIO`.

2. Feature Scaling:

* Apply scaling (e.g., MinMaxScaler or StandardScaler) to normalize the feature ranges, ensuring all features contribute proportionally to model training.

3. Model Training:

* Train a predictive model (e.g., logistic regression, random forest) using the downsized dataset.
Evaluate using metrics like precision, recall, and balanced accuracy.

4. Fairness and Explainability:

* Use Fairlearn for fairness analysis and LIME/SHAP to interpret feature importance.

In [None]:
# Coerce every column of data1 to numeric; values that cannot be parsed become NaN
data1_numeric = data1.apply(pd.to_numeric, errors='coerce')

# Per-column count of problematic values: missing entries and +/- infinity
nan_per_column = data1_numeric.isna().sum()
inf_per_column = np.isinf(data1_numeric).sum()
problematic_summary = {
    "NaN_Count": nan_per_column,
    "Inf_Count": inf_per_column,
}

problematic_summary_df = pd.DataFrame(problematic_summary)
problematic_summary_df


### Problematic Values Identified:
1. The `PAYMENT_TO_BILL_RATIO` column contains 91 infinite values.
2. No NaN values were detected in the dataset

### Solution:
* Replace infinite values in `PAYMENT_TO_BILL_RATIO` with a reasonable approximation, such as the column's maximum finite value.

In [None]:
# Import the RobustScaler
from sklearn.preprocessing import RobustScaler

# The previous check can report infinite values (e.g. in PAYMENT_TO_BILL_RATIO),
# and scikit-learn's input validation rejects infinities. Cap them at each
# column's largest / smallest finite value, as described in the "Solution"
# note above. Finite values and NaNs are left unchanged.
finite_values = data1_numeric.replace([np.inf, -np.inf], np.nan)
data1_finite = data1_numeric.clip(
    lower=finite_values.min(),
    upper=finite_values.max(),
    axis=1,
)

# Define the RobustScaler instance
scaler = RobustScaler()

# Scale every feature and rebuild a DataFrame with the original column names
scaled_data = pd.DataFrame(
    scaler.fit_transform(data1_finite),
    columns=data1_finite.columns,
)

# Display the first few rows of the scaled dataset
scaled_data.head()


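All the features are now on a comparable scale: `RobustScaler` centers each feature on its median and scales it by its interquartile range, which limits the influence of outliers.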

# Model Training

Let’s begin by training baseline predictive models. Here's the plan:

1. **Split the dataset:**
   - Separate features (X) and target (y) using the preprocessed data.
   - Use the `default.payment.next.month` column as the target variable.

2. **Train the models:**
   - Train a Random Forest classifier and an XGBoost classifier on the class-balanced training set.

3. **Evaluate performance:**
   - Calculate metrics like balanced accuracy, precision, recall, and F1-score, and inspect the confusion matrix.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score
from xgboost import XGBClassifier

# Name of the label column
target_col = "default.payment.next.month"

# Separate features and target.
# The identifier column is called "ID" (see the column renaming cell). The
# previous code dropped "data_ID" with errors="ignore", which silently did
# nothing and left the row identifier among the model features.
X = data.drop(columns=[target_col, "ID"])
y = data[target_col]

# Split the dataset into training (70%) and testing (30%) sets, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Combine training features and target so both are resampled together
train_data = X_train.copy()
train_data[target_col] = y_train

# Separate the two classes (label 0 is treated as the majority, label 1 as the minority)
majority_class = train_data[train_data[target_col] == 0]
minority_class = train_data[train_data[target_col] == 1]

# Oversample the minority class. Only the training split is resampled;
# the test split is left as it came out of train_test_split.
minority_oversampled = resample(
    minority_class,
    replace=True,  # Sample with replacement
    n_samples=len(majority_class),  # Match the number of majority class samples
    random_state=42
)

# Combine oversampled minority class with majority class
balanced_train_data = pd.concat([majority_class, minority_oversampled])

# Separate features and target again
X_train_balanced = balanced_train_data.drop(columns=[target_col])
y_train_balanced = balanced_train_data[target_col]

# Check the distribution of the target variable before and after oversampling
before_balance = y_train.value_counts()
after_balance = y_train_balanced.value_counts()

print("Distribution before balancing:")
print(before_balance)
print("\nDistribution after balancing:")
print(after_balance)

In [None]:
# Numeric-only views of the features, shared by the Random Forest below and
# the XGBoost model in the next cell
X_train_balanced_numeric = X_train_balanced.select_dtypes(include=["number"])
X_test_numeric = X_test.select_dtypes(include=["number"])

# Train a Random Forest Classifier on the balanced dataset.
# It is fitted on the same numeric-only matrix that XGBoost uses, so both
# models see identical features and no non-numeric column is fed to the forest.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_balanced_numeric, y_train_balanced)

# Predict on the test set
y_pred = rf_model.predict(X_test_numeric)

# Evaluate the model's performance
conf_matrix = confusion_matrix(y_test, y_pred)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
classification_report_summary = classification_report(y_test, y_pred)

# Display the results
print("Confusion Matrix:")
print(conf_matrix)
print("\nBalanced Accuracy:", f"{balanced_acc:.2%}")
print("\nClassification Report:")
print(classification_report_summary)

In [None]:
# Fit an XGBoost classifier on the balanced, numeric-only training features
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb_model.fit(X_train_balanced_numeric, y_train_balanced)

# Predict labels for the numeric-only test features
y_pred_xgb = xgb_model.predict(X_test_numeric)

# Score the predictions against the test labels
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
balanced_acc_xgb = balanced_accuracy_score(y_test, y_pred_xgb)
classification_report_xgb = classification_report(y_test, y_pred_xgb)

# Report the results
print("\nXGBoost Confusion Matrix:", conf_matrix_xgb, sep="\n")
print(f"\nXGBoost Balanced Accuracy: {balanced_acc_xgb:.2%}")
print("\nXGBoost Classification Report:", classification_report_xgb, sep="\n")