In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/loan-data/loan_data.csv


# Introduction
This project, "Loan Approval Classifier", uses a loan dataset with roughly 9,500 rows (9,578) and 14 columns. Its goal is to predict whether a loan will *not* be fully paid back (the `not.fully.paid` target), rather than whether it will be. Key steps in the project include:

1. Data exploration and preprocessing: The data is cleaned and preprocessed, including handling missing values, removing unnecessary columns, encoding categorical features, and scaling numerical features.

1. Handling class imbalance: The SMOTE technique is applied to balance the classes in the training data.

1. Splitting the data: The dataset is split into training and testing sets.

1. Model training and selection: The RandomForestClassifier and CatBoostClassifier are combined into a custom classifier called CombinedClassifier. A Pipeline object is created, and hyperparameters are tuned using GridSearchCV. Evaluation metrics such as accuracy, precision, recall, and F1 score are used to assess the models' performance.

1. Saving the model: The best-performing model is saved to disk using the joblib library.

1. Creating a web app: A Gradio web app is built to allow users to interact with the trained model and make predictions.

1. Deployment: The web app is deployed as a Space on Hugging Face.

TIP: When deploying the app, the preprocessing step is critical: the number of features the model was trained on and the number of features the web app supplies must be the same.

# Data exploration

The target class 'not.fully.paid' is imbalanced: 8,045 rows are '0' and 1,533 rows are '1'. We address this with the SMOTE method in the 'Preprocessing and Balancing the Data' step.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
import datetime as dt

# Read the loan dataset into a DataFrame
file_path = '/kaggle/input/loan-data/loan_data.csv' # Replace with the path to your dataset file
df = pd.read_csv(filepath_or_buffer=file_path)

# Log when profiling starts, then build the pandas_profiling report.
# The report is the last expression of the cell, so the notebook renders it.
start_time = dt.datetime.now()
print(f"Started at  {start_time}")
report = ProfileReport(df)
report


Started at  2023-04-15 13:51:45.864215


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



# Features Engineering

In [3]:
from sklearn.preprocessing import StandardScaler

def feature_engineering(df):
    """Return a copy of ``df`` with four ratio features appended.

    New columns (numerator / denominator):
      * ``income_to_debt``        = ``log.annual.inc`` / ``dti``
      * ``credit_utilization``    = ``revol.bal`` / ``revol.util``
      * ``fico_to_income``        = ``fico`` / ``log.annual.inc``
      * ``installment_to_income`` = ``installment`` / ``log.annual.inc``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six source columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched so the calling cell can be
        re-run safely.

    Notes
    -----
    Infinite or missing values in the *new* columns (e.g. from a zero
    denominator) are set to 0. Missing values already present in the
    original columns are deliberately left as they are, so a later
    imputation step can still handle them.
    """
    # (numerator, denominator) for each engineered ratio, in output order.
    ratio_specs = {
        'income_to_debt': ('log.annual.inc', 'dti'),
        'credit_utilization': ('revol.bal', 'revol.util'),
        'fico_to_income': ('fico', 'log.annual.inc'),
        'installment_to_income': ('installment', 'log.annual.inc'),
    }

    # Copy first: mutating the caller's frame makes the notebook cell non-idempotent.
    engineered = df.copy()
    for feature, (numerator, denominator) in ratio_specs.items():
        engineered[feature] = engineered[numerator] / engineered[denominator]

    # Clean up only what feature creation produced: x/0 gives +/-inf and 0/0 gives NaN.
    new_columns = list(ratio_specs)
    engineered[new_columns] = (
        engineered[new_columns].replace([np.inf, -np.inf], np.nan).fillna(0)
    )

    return engineered


# Add the engineered ratio columns to the loan data
df = df.pipe(feature_engineering)

# Preview the top five rows of the result
print(df.head(n=5))


   credit.policy             purpose  int.rate  installment  log.annual.inc  \
0              1  debt_consolidation    0.1189       829.10       11.350407   
1              1         credit_card    0.1071       228.22       11.082143   
2              1  debt_consolidation    0.1357       366.86       10.373491   
3              1  debt_consolidation    0.1008       162.34       11.350407   
4              1         credit_card    0.1426       102.92       11.299732   

     dti  fico  days.with.cr.line  revol.bal  revol.util  inq.last.6mths  \
0  19.48   737        5639.958333      28854        52.1               0   
1  14.29   707        2760.000000      33623        76.7               0   
2  11.63   682        4710.000000       3511        25.6               1   
3   8.10   712        2699.958333      33667        73.2               1   
4  14.97   667        4066.000000       4740        39.5               0   

   delinq.2yrs  pub.rec  not.fully.paid  income_to_debt  credit_util

# Preprocessing and Balancing the Data(SMOTE)

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

def preprocess_data(df):
    """Impute, derive, select and standardise the model's input features.

    Steps:
      1. Fill missing numeric values with the column median.
      2. Derive ``installment_to_income_ratio`` (``installment`` /
         ``log.annual.inc``) and ``credit_history`` (``days.with.cr.line``
         in whole years).
      3. Keep the 12 numeric features plus the ``not.fully.paid`` target.
      4. Standardise the 12 features with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Loan data containing the raw columns referenced above.

    Returns
    -------
    pandas.DataFrame
        A new 13-column frame (12 scaled features followed by the target).
        The input frame is not modified.

    Notes
    -----
    * The previous version one-hot encoded ``purpose`` and then discarded the
      encoded columns when selecting the 12 features. That dead step is
      removed; it relied on ``OneHotEncoder(sparse=...)`` and
      ``get_feature_names``, which current scikit-learn no longer provides.
    * NOTE(review): the scaler is fitted on every row passed in. When this is
      called before the train/test split, test-set statistics leak into the
      training features. Fixing that requires fitting the scaler on the
      training split only, which is outside this function's contract.
    """
    numerical_features = [
        'int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
        'revol.bal', 'revol.util', 'inq.last.6mths', 'delinq.2yrs', 'pub.rec',
        'installment_to_income_ratio', 'credit_history'
    ]
    target_column = 'not.fully.paid'

    # Handle missing values with the median. numeric_only=True skips the string
    # 'purpose' column, which pandas >= 2.0 otherwise rejects with a TypeError.
    # fillna without inplace returns a new frame, so the caller's data is untouched.
    df = df.fillna(df.median(numeric_only=True))

    # Calculate additional features
    df['installment_to_income_ratio'] = df['installment'] / df['log.annual.inc']
    df['credit_history'] = (df['days.with.cr.line'] / 365).round()

    # Keep only the 12 features (plus the target), in a fixed order
    df = df[numerical_features + [target_column]].copy()

    # Scale numerical features to zero mean and unit variance
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    return df

# Run the preprocessing step on the engineered frame
preprocessed_df = preprocess_data(df)

# Separate predictors from the target, then hold out 20% of the rows for testing
y = preprocessed_df['not.fully.paid']
X = preprocessed_df.drop(columns=['not.fully.paid'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Preview the first five preprocessed rows
print(preprocessed_df.head(5))

# SMOTE: oversample the minority class on the training split only
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Recombine the resampled features and target to inspect the class balance
loan_df = pd.concat([X_train_smote, y_train_smote], axis='columns')
loan_df['not.fully.paid'].value_counts()


  if __name__ == "__main__":


   int.rate  installment  log.annual.inc       dti      fico  revol.bal  \
0 -0.139318     2.463099        0.680388  0.998505  0.688825   0.353732   
1 -0.578868    -0.438854        0.244031  0.244540 -0.101303   0.495018   
2  0.486484     0.230708       -0.908659 -0.141885 -0.759742  -0.397073   
3 -0.813544    -0.757022        0.680388 -0.654697  0.030385   0.496321   
4  0.743509    -1.043992        0.597961  0.343326 -1.154806  -0.360663   

   revol.util  inq.last.6mths  delinq.2yrs   pub.rec  \
0    0.182704       -0.716989    -0.299730 -0.237003   
1    1.030602       -0.716989    -0.299730 -0.237003   
2   -0.730683       -0.262470    -0.299730 -0.237003   
3    0.909966       -0.262470    -0.299730 -0.237003   
4   -0.251586       -0.716989     1.531147 -0.237003   

   installment_to_income_ratio  credit_history  not.fully.paid  
0                     2.465358        0.366773               0  
1                    -0.457185       -0.655114               0  
2                

0    6434
1    6434
Name: not.fully.paid, dtype: int64

# Model Selection

In [5]:
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score

# Candidate classifiers to compare, keyed by display name (all seeded with 42)
models = dict([
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('CatBoost', CatBoostClassifier(random_state=42, verbose=0)),
    ('LightGBM', LGBMClassifier(random_state=42)),
])

# Function to train and evaluate each model
def evaluate_models(models, X_train, y_train, X_test, y_test):
    """Fit each model, then print hold-out and 5-fold cross-validated scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted classifier. Each classifier is
        fitted in place on ``(X_train, y_train)``.
    X_train, y_train
        Training features and labels; also used for cross-validation.
    X_test, y_test
        Hold-out features and labels used for the accuracy / F1 figures.

    Returns
    -------
    None
        Results are printed, one section per model.

    Notes
    -----
    If the training data was oversampled before being passed in (the notebook
    passes SMOTE-resampled data), every CV fold contains synthetic points
    interpolated from rows in the other folds, so the cross-validated scores
    are optimistic. Judge the models by the hold-out figures.
    """
    # Imported here because this cell only imports cross_val_score.
    from sklearn.model_selection import cross_validate

    for name, model in models.items():
        print(f"Evaluating {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Using cross-validation for a more reliable performance estimation.
        # One cross_validate call scores both metrics on the same five fits,
        # instead of running the whole 5-fold procedure once per metric.
        cv_scores = cross_validate(model, X_train, y_train, cv=5, scoring=['accuracy', 'f1'])
        cv_accuracy = np.mean(cv_scores['test_accuracy'])
        cv_f1 = np.mean(cv_scores['test_f1'])

        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  F1-score: {f1:.4f}")
        print(f"  Cross-validated Accuracy: {cv_accuracy:.4f}")
        print(f"  Cross-validated F1-score: {cv_f1:.4f}")
        print()

# Fit and score every candidate on the SMOTE-balanced training data and the hold-out test split
evaluate_models(
    models=models,
    X_train=X_train_smote,
    y_train=y_train_smote,
    X_test=X_test,
    y_test=y_test,
)


Evaluating Random Forest...
  Accuracy: 0.8090
  F1-score: 0.1903
  Cross-validated Accuracy: 0.9050
  Cross-validated F1-score: 0.8953

Evaluating CatBoost...
  Accuracy: 0.8330
  F1-score: 0.1304
  Cross-validated Accuracy: 0.8779
  Cross-validated F1-score: 0.7922

Evaluating LightGBM...
  Accuracy: 0.8278
  F1-score: 0.1270
  Cross-validated Accuracy: 0.8744
  Cross-validated F1-score: 0.7892



In [6]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.base import ClassifierMixin
import joblib

# Define a combined classifier
class CombinedClassifier(BaseEstimator, ClassifierMixin):
    """Two-model ensemble that averages the members' predicted labels.

    Parameters
    ----------
    clf1, clf2 : estimator
        Classifiers exposing ``fit(X, y)`` and ``predict(X)``. They are fitted
        in place by :meth:`fit`.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels seen in ``fit``.
    n_features_in_ : int
        Number of columns in the ``X`` passed to ``fit``.

    Notes
    -----
    ``predict`` rounds the mean of the two label vectors with ``np.round``,
    which rounds halves to the nearest even number. For 0/1 labels a split
    vote (mean 0.5) therefore yields 0, so the ensemble predicts 1 only when
    *both* members predict 1.
    """

    def __init__(self, clf1, clf2):
        # Only constructor parameters are stored here. Fitted attributes
        # (trailing underscore) are created in fit(), otherwise
        # check_is_fitted() would treat a fresh instance as already fitted.
        self.clf1 = clf1
        self.clf2 = clf2

    def fit(self, X, y):
        """Fit both member classifiers on ``(X, y)`` and return ``self``."""
        self.clf1.fit(X, y)
        self.clf2.fit(X, y)
        # scikit-learn utilities look up classes_ on classifiers.
        self.classes_ = np.unique(y)
        # Record the training width so wrappers can query n_features_in_.
        self.n_features_in_ = X.shape[1]
        return self

    def predict(self, X):
        """Return integer labels from the rounded mean of both members' labels."""
        # Flatten first: adding an (n,) vector to an (n, 1) column would
        # otherwise broadcast into an (n, n) matrix.
        y_pred1 = np.asarray(self.clf1.predict(X)).ravel()
        y_pred2 = np.asarray(self.clf2.predict(X)).ravel()
        y_pred = np.round((y_pred1 + y_pred2) / 2).astype(int)
        return y_pred

# Pipelines to tune, keyed by name (a single-step pipeline wrapping the combined model)
pipelines = {
    'CombinedClassifier': Pipeline(steps=[
        ('classifier', CombinedClassifier(
            clf1=RandomForestClassifier(random_state=42),
            clf2=CatBoostClassifier(random_state=42, verbose=0),
        )),
    ]),
}

# Grid-search space per pipeline; every list holds one value, so a single candidate is evaluated
params = {
    'CombinedClassifier': dict(
        classifier__clf1__n_estimators=[500],
        classifier__clf1__max_depth=[None],
        classifier__clf2__iterations=[750],
        classifier__clf2__learning_rate=[0.2],
    ),
}

# Metrics reported for every cross-validation run
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Function to perform hyperparameter tuning and cross-validated evaluation
def evaluate_pipelines(pipelines, params, X_train, y_train):
    """Grid-search each pipeline and report its cross-validated metrics.

    Parameters
    ----------
    pipelines : dict
        Maps a name to an unfitted pipeline. The pipelines are not modified;
        GridSearchCV works on clones.
    params : dict
        Maps the same names to a GridSearchCV parameter grid.
    X_train, y_train
        Data used for the 5-fold search and for the final refit.

    Returns
    -------
    dict
        Maps each name to the best pipeline, refitted on all of
        ``(X_train, y_train)`` with the parameters that maximised accuracy.

    Notes
    -----
    The metrics come from the module-level ``scoring`` list.

    GridSearchCV already (a) refits the winning configuration on the full
    training data and (b) stores the per-metric fold means for every
    candidate. Both are reused here instead of fitting the pipeline again and
    repeating the same 5-fold cross-validation, which gave the same numbers
    at the cost of six extra fits per pipeline.
    """
    best_models = {}

    for name, pipeline in pipelines.items():
        print(f"Evaluating {name}...")

        # Hyperparameter tuning using GridSearchCV
        grid_search = GridSearchCV(pipeline, params[name], cv=5, scoring=scoring, refit='accuracy', verbose=1, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        print(f"  Best Parameters: {best_params}")

        # refit='accuracy' means best_estimator_ is already trained on the full data
        best_models[name] = grid_search.best_estimator_

        # Cross-validated evaluation: fold means of the winning candidate
        best_index = grid_search.best_index_
        for metric in scoring:
            mean_score = grid_search.cv_results_[f'mean_test_{metric}'][best_index]
            print(f"  Cross-validated {metric}: {mean_score:.4f}")

        print()

    return best_models

# Evaluate the pipelines and obtain the best models
best_models = evaluate_pipelines(pipelines, params, X_train_smote, y_train_smote)

# Save the best-performing models, one file per pipeline name
for name, model in best_models.items():
    joblib.dump(model, f"{name}_best_model.pkl")

import joblib
# Save the model used by the web app to disk. It is selected explicitly by
# name rather than through whatever `model` the loop above happened to leave behind.
joblib.dump(best_models['CombinedClassifier'], 'loan_classifier.joblib')


Evaluating CombinedClassifier...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
  Best Parameters: {'classifier__clf1__max_depth': None, 'classifier__clf1__n_estimators': 500, 'classifier__clf2__iterations': 750, 'classifier__clf2__learning_rate': 0.2}
  Cross-validated accuracy: 0.8762
  Cross-validated precision: 0.9670
  Cross-validated recall: 0.7857
  Cross-validated f1: 0.7943



['loan_classifier.joblib']

# Conclusion
After the Gradio web app is built, we upload the application to the Hugging Face Hub (URL: https://huggingface.co/spaces/joshchentw/LoanApprovalPrediction).