# Data Description:

In [1]:
import zipfile

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline as pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures


In [2]:
# Location of the loan-default CSV that the next cell reads.
file_location = "/kaggle/input/loan-default/Loan_default.csv"


In [3]:

# Read the CSV at `file_location` into a pandas DataFrame.
Data = pd.read_csv(filepath_or_buffer=file_location)

# Show the first five rows as a quick check that the load worked.
Data.head(n=5)


Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [4]:


# Drop the 'LoanID' column: it is an identifier and not useful for prediction.
# errors='ignore' keeps this cell idempotent -- re-running it after the column
# is already gone is a no-op instead of raising KeyError.
Data = Data.drop(columns='LoanID', errors='ignore')

# Verify that the column has been dropped by checking the DataFrame shape.
print("Updated Data Shape:", Data.shape)


Updated Data Shape: (255347, 17)


In this step, we drop the 'LoanID' column from the dataset. Often, certain columns are identifiers that don't provide useful information for modeling, and dropping them can reduce noise and improve model performance.

In [5]:


# Keep only the integer- and float-typed columns of `Data`.
# The recorded output shows the 'Default' target is among them.
num = Data.select_dtypes(include=['int', 'float'])

# List the retained column names as a check.
print("Selected Numerical Features:", list(num.columns))


Selected Numerical Features: ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Default']


In [6]:

def ColumnTrans(cat):
    """
    Encode every column of a DataFrame as integer codes.

    Within each column, every distinct value is replaced by its position in
    the order of first appearance: the first value seen becomes 0, the next
    new value becomes 1, and so on. Each column gets its own mapping.

    Parameters:
    ----------
    cat : DataFrame
        A pandas DataFrame containing the categorical columns to encode.
        It is NOT modified by this function.

    Returns:
    -------
    DataFrame
        A new DataFrame with the same index and column labels as `cat`,
        whose values are the integer codes.
    """
    # Encode into a copy so the caller's frame is left untouched. Mutating the
    # argument made results depend on how often the calling cell was re-run.
    encoded = cat.copy()

    for column in encoded.columns:
        # Series.unique() returns values in order of first appearance, so
        # enumerate() assigns codes 0, 1, 2, ... in that order.
        value_map = {value: index for index, value in enumerate(cat[column].unique())}

        # Replace each value by its integer code.
        encoded[column] = cat[column].map(value_map)

    return encoded


In [7]:


# Select the categorical columns (object type) from the DataFrame.
cat = Data.select_dtypes(include='object')

# Apply the custom ColumnTrans function to transform categorical columns into numerical format.
cat = ColumnTrans(cat)

# Verify the transformation by displaying the first few rows of the transformed categorical data.
print("Transformed Categorical Columns (first few rows):")
print(cat.head())


Transformed Categorical Columns (first few rows):
   Education  EmploymentType  MaritalStatus  HasMortgage  HasDependents  \
0          0               0              0            0              0   
1          1               0              1            1              1   
2          1               1              0            0              0   
3          2               0              1            1              1   
4          0               1              0            1              0   

   LoanPurpose  HasCoSigner  
0            0            0  
1            0            0  
2            1            1  
3            2            1  
4            1            1  


In [8]:


# Concatenate the numerical and transformed categorical features along the columns (axis=1).
df = pd.concat([num, cat], axis=1)

# Verify the combined DataFrame by displaying the first few rows.
print("Combined DataFrame (first few rows):")
print(df.head())


Combined DataFrame (first few rows):
   Age  Income  LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  \
0   56   85994       50587          520              80               4   
1   69   50432      124440          458              15               1   
2   46   84208      129188          451              26               3   
3   32   31713       44799          743               0               3   
4   60   20437        9139          633               8               4   

   InterestRate  LoanTerm  DTIRatio  Default  Education  EmploymentType  \
0         15.23        36      0.44        0          0               0   
1          4.81        60      0.68        0          1               0   
2         21.17        24      0.31        1          1               1   
3          7.07        24      0.23        0          2               0   
4          6.51        48      0.73        0          0               1   

   MaritalStatus  HasMortgage  HasDependents  LoanPurpose  Ha

In [9]:


# Separate features (X) from the target variable (y)
x1 = df.drop(columns='Default')  # Features: All columns except 'Default'
y1 = df['Default']  # Target: 'Default' column

# Verify the separation by displaying the shapes of the features and target.
print("Shape of Features (X):", x1.shape)
print("Shape of Target (y):", y1.shape)


Shape of Features (X): (255347, 16)
Shape of Target (y): (255347,)


In [10]:


# Initialize the resampling techniques
ros = RandomOverSampler()  # Random Over-Sampling to balance the dataset by increasing the minority class.
rus = RandomUnderSampler()  # Random Under-Sampling to balance the dataset by decreasing the majority class.
smote = SMOTE()  # SMOTE (Synthetic Minority Over-sampling Technique) to generate synthetic examples for the minority class.

# Verify that the resampling methods are correctly initialized
print("Resampling techniques initialized:")
print("RandomOverSampler:", ros)
print("RandomUnderSampler:", rus)
print("SMOTE:", smote)


Resampling techniques initialized:
RandomOverSampler: RandomOverSampler()
RandomUnderSampler: RandomUnderSampler()
SMOTE: SMOTE()


In [11]:


# Apply Random Over-Sampling to increase the minority class
x2, y2 = ros.fit_resample(x1, y1)
print("Shape after RandomOverSampler (ROS):", x2.shape, y2.shape)

# Apply SMOTE to generate synthetic samples for the minority class
x3, y3 = smote.fit_resample(x2, y2)
print("Shape after SMOTE:", x3.shape, y3.shape)

# Apply Random Under-Sampling to decrease the majority class
x, y = rus.fit_resample(x3, y3)
print("Shape after RandomUnderSampler (RUS):", x.shape, y.shape)

# Final balanced dataset
print("Final Balanced Dataset Shape (X):", x.shape)
print("Final Balanced Dataset Shape (Y):", y.shape)


Shape after RandomOverSampler (ROS): (451388, 16) (451388,)
Shape after SMOTE: (451388, 16) (451388,)
Shape after RandomUnderSampler (RUS): (451388, 16) (451388,)
Final Balanced Dataset Shape (X): (451388, 16)
Final Balanced Dataset Shape (Y): (451388,)


In [12]:


# Split the balanced dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets to verify the split
print("Shape of Training Features (X_train):", x_train.shape)
print("Shape of Testing Features (X_test):", x_test.shape)
print("Shape of Training Target (y_train):", y_train.shape)
print("Shape of Testing Target (y_test):", y_test.shape)


Shape of Training Features (X_train): (361110, 16)
Shape of Testing Features (X_test): (90278, 16)
Shape of Training Target (y_train): (361110,)
Shape of Testing Target (y_test): (90278,)


In [13]:


# Initialize the Random Forest Classifier with 2000 estimators (trees)
model = RandomForestClassifier(n_estimators=2000, random_state=42)

# Display the model's parameters to verify the configuration
print("Random Forest Classifier initialized with parameters:")
print(model)


Random Forest Classifier initialized with parameters:
RandomForestClassifier(n_estimators=2000, random_state=42)


In [None]:

# Fit the Random Forest on the training split.
model.fit(X=x_train, y=y_train)

# Report that fitting has finished.
print("Model training complete with Random Forest Classifier.")


In [None]:


# Predict a class label for every row of the test features.
prediction = model.predict(X=x_test)

# Print the shape of the prediction array as a check.
print("Shape of predictions:", prediction.shape)


In [None]:
# The data was imbalanced, so check that the model predicts both classes:
# number of test rows predicted as class 0.
np.sum(prediction == 0)

In [None]:
# Number of test rows predicted as class 1.
np.sum(prediction == 1)

In [None]:


# The metric functions used here are imported once in the notebook's import
# cell rather than re-imported mid-notebook.

# Score the Random Forest predictions against the test labels:
# accuracy, precision, recall, and F1 score.
accuracy = accuracy_score(y_test, prediction)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)

# Display the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


In [None]:
# xgboost (as `xgb`) and the metric functions are imported once in the
# notebook's import cell rather than re-imported here.

# Initialize XGBoost Classifier
# NOTE(review): n_estimators=20 with learning_rate=0.01 is far fewer boosting
# rounds than the 2000 used for the other models in this notebook -- confirm
# this is intentional and not a typo.
xgb_model = xgb.XGBClassifier(
    n_estimators=20,
    learning_rate=0.01,
    max_depth=6,
    random_state=42,
    eval_metric='logloss'
)

# Train the XGBoost model
print("Training XGBoost model...")
xgb_model.fit(x_train, y_train)
print("XGBoost model training complete.")

# Make predictions
xgb_prediction = xgb_model.predict(x_test)

# Check prediction distribution (how many test rows fall in each class)
print(f"Predictions for class 0: {(xgb_prediction == 0).sum()}")
print(f"Predictions for class 1: {(xgb_prediction == 1).sum()}")

# Calculate evaluation metrics
xgb_accuracy = accuracy_score(y_test, xgb_prediction)
xgb_precision = precision_score(y_test, xgb_prediction)
xgb_recall = recall_score(y_test, xgb_prediction)
xgb_f1 = f1_score(y_test, xgb_prediction)

# Display XGBoost metrics
print("\n=== XGBoost Model Performance ===")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1 Score: {xgb_f1:.4f}")

In [None]:
# lightgbm (as `lgb`) and the metric functions are imported once in the
# notebook's import cell rather than re-imported here.

# Initialize LightGBM Classifier
lgb_model = lgb.LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=6,
    random_state=42,
    verbose=-1
)

# Train the LightGBM model
print("Training LightGBM model...")
lgb_model.fit(x_train, y_train)
print("LightGBM model training complete.")

# Make predictions
lgb_prediction = lgb_model.predict(x_test)

# Check prediction distribution (how many test rows fall in each class)
print(f"Predictions for class 0: {(lgb_prediction == 0).sum()}")
print(f"Predictions for class 1: {(lgb_prediction == 1).sum()}")

# Calculate evaluation metrics
lgb_accuracy = accuracy_score(y_test, lgb_prediction)
lgb_precision = precision_score(y_test, lgb_prediction)
lgb_recall = recall_score(y_test, lgb_prediction)
lgb_f1 = f1_score(y_test, lgb_prediction)

# Display LightGBM metrics
print("\n=== LightGBM Model Performance ===")
print(f"Accuracy: {lgb_accuracy:.4f}")
print(f"Precision: {lgb_precision:.4f}")
print(f"Recall: {lgb_recall:.4f}")
print(f"F1 Score: {lgb_f1:.4f}")