In [1]:
import sys
import warnings

In [2]:
# Add the parent directory to the import path; presumably this is where the
# project-local modules imported further down (preprocessing,
# feature_engineering, config) live — TODO confirm.
sys.path.append("../")

In [3]:
# Ignore every FutureWarning and UserWarning raised from here on.
# NOTE(review): this is a blanket filter, so library deprecation notices are
# hidden as well — confirm that is intended.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

In [4]:
import pandas as pd
import numpy as np
import logging

import lightgbm as lgb

In [5]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

In [6]:
from typing import Dict, List, Tuple
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder

In [7]:
from preprocessing import create_target_variable, correct_data_types, seperate_train_inference_data
from feature_engineering import (create_duration_features, encode_categorical_features, impute_with_zero,
                                impute_missing_value, feature_selection)
#from model_pipeline import train_loan_model, make_predictions

from config import generate_model_predictions, cat_feat_list, num_feat_list

In [8]:
# Columns handed to the generic missing-value imputation step.
impute_features_column_names_list = [
    "loan_amnt",
    "funded_amnt",
    "int_rate",
    "annual_inc",
    "dti",
    "delinq_2yrs",
    "mths_since_last_delinq",
    "open_acc",
    "mths_since_first_cr_line",
    "term",
]

# Columns whose missing entries are filled with zero.
impute_zero_column_list = [
    "mths_since_last_delinq",
    "mths_since_first_cr_line",
]

# Columns excluded from training because of high multicollinearity.
features_to_drop = [
    "funded_amnt",
]

In [9]:
# Configure the root logger (INFO level, timestamp/thread/level prefix) and
# obtain the logger used by the functions defined in this notebook.
logging.basicConfig(
    format="%(asctime)s [%(threadName)s] [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [10]:
# Read the loan CSV (path is relative to the notebook's working directory)
# and display five randomly chosen rows as a sanity check.
# NOTE(review): sample() is unseeded, so the displayed rows differ per run.
input_data_file_path = "../data/loan_data.csv"
raw_data = pd.read_csv(input_data_file_path)
raw_data.sample(5)

Unnamed: 0,id,loan_amnt,funded_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,loan_status,...,out_prncp,total_pymnt,total_rec_prncp,total_rec_int,wtd_loans,interest_rate,int_rate2,num_rate,numrate,int_rate3
983,1508735,15000,15000,36 months,17.77,540.56,2 years,MORTGAGE,96818.0,Current,...,8489.36,9725.66,6510.64,3215.02,,,17.77%,,,17.77
614,1331373,9925,9925,36 months,20.49,371.34,< 1 year,RENT,44000.0,Current,...,5164.17,7420.91,4760.83,2660.08,,,20.49%,,,20.49
2518,5030330,14825,14825,36 months,7.62,461.97,10+ years,MORTGAGE,90000.0,Current,...,11816.11,3695.76,3008.89,686.87,,,7.62%,,,7.62
6516,6617653,12000,12000,36 months,18.85,438.97,10+ years,RENT,78000.0,Current,...,10440.26,2628.18,1559.74,1068.44,,,18.85%,,,18.85
965,1499431,27000,27000,36 months,16.29,953.12,2 years,MORTGAGE,145000.0,Current,...,6930.68,24288.27,20069.32,4218.95,,,16.29%,,,16.29


In [11]:
# List the column labels of the loaded frame.
raw_data.columns

Index(['id', 'loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
       'emp_length', 'home_ownership', 'annual_inc', 'loan_status', 'purpose',
       'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'mths_since_last_delinq', 'open_acc', 'revol_bal', 'total_acc',
       'out_prncp', 'total_pymnt', 'total_rec_prncp', 'total_rec_int',
       'wtd_loans', 'interest_rate', 'int_rate2', 'num_rate', 'numrate',
       'int_rate3'],
      dtype='object')

In [12]:
# Count rows per raw loan_status value to see the class distribution.
raw_data.loan_status.value_counts()

loan_status
Current               8122
Fully Paid             951
Charged Off            218
Late (31-120 days)     148
In Grace Period         48
Late (16-30 days)       21
Default                 16
Name: count, dtype: int64

In [13]:
# Print per-column non-null counts and dtypes to spot missing values.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      10000 non-null  int64  
 1   loan_amnt               10000 non-null  int64  
 2   funded_amnt             10000 non-null  int64  
 3   term                    9524 non-null   object 
 4   int_rate                9524 non-null   float64
 5   installment             9524 non-null   float64
 6   emp_length              9119 non-null   object 
 7   home_ownership          9524 non-null   object 
 8   annual_inc              9524 non-null   float64
 9   loan_status             9524 non-null   object 
 10  purpose                 9524 non-null   object 
 11  addr_state              10000 non-null  object 
 12  dti                     9524 non-null   float64
 13  delinq_2yrs             9524 non-null   float64
 14  earliest_cr_line        9524 non-null  

In [14]:
# Add the target column via the project helper, then count its values.
# The output recorded for this cell shows three labels: "inference", "good"
# and "bad".
# NOTE(review): raw_data is rebound to the helper's result, so re-running this
# cell feeds the already-processed frame back in — confirm the helper is
# idempotent.
raw_data = create_target_variable(raw_data)
raw_data.predicted_loan_status.value_counts()

2023-12-17 01:16:05,741 [MainThread] [INFO] Generating target variable.


predicted_loan_status
inference    8815
good          951
bad           234
Name: count, dtype: int64

In [15]:
# Build duration features relative to a reference date, then inspect the
# resulting mths_since_first_cr_line column.
# NOTE(review): the reference date is the wall-clock time of execution, so the
# derived values (and any model trained on them) change from run to run;
# consider pinning a fixed date for reproducibility.
today_date = datetime.today()
raw_data = create_duration_features(raw_data, today_date)
raw_data.mths_since_first_cr_line.value_counts()

mths_since_first_cr_line
295.3    14
294.4    13
298.1    13
258.0    13
293.2    13
         ..
618.2     1
198.5     1
478.2     1
345.7     1
524.4     1
Name: count, Length: 3110, dtype: int64

In [16]:
# Encode the categorical columns named in cat_feat_list; the helper returns
# the updated frame together with the names of the encoded columns.
raw_data, categorical_column_names_list_encoded = encode_categorical_features(raw_data, cat_feat_list)

2023-12-17 01:16:05,783 [MainThread] [INFO] Finished Encoding the categorical variables


In [17]:
# Show the encoded column names returned by the encoding step.
categorical_column_names_list_encoded

['emp_length Encoded',
 'home_ownership Encoded',
 'purpose Encoded',
 'addr_state Encoded']

In [18]:
# Apply the project's dtype correction to the columns in num_feat_list
# (presumably a numeric cast — see correct_data_types for the exact rule).
raw_data = correct_data_types(raw_data, num_feat_list)

2023-12-17 01:16:05,808 [MainThread] [INFO] Corrected data types


In [19]:
# Re-check the target distribution after encoding and dtype correction; the
# recorded counts match the ones obtained right after target creation.
raw_data.predicted_loan_status.value_counts()

predicted_loan_status
inference    8815
good          951
bad           234
Name: count, dtype: int64

In [87]:
def feature_selection_infer(
    df: pd.DataFrame,
    categorical_column_names_list_encoded: List[str],
    num_feat_list: List[str]
) -> pd.DataFrame:
    """
    Select the identifier and feature columns kept for inference.

    No column is dropped for multicollinearity here: every name in
    ``num_feat_list`` is retained (the prediction step in this notebook reads
    'funded_amnt' from the result, for example).

    Args:
        df (pd.DataFrame): A pandas dataframe containing the inference data
        categorical_column_names_list_encoded (List[str]): List of encoded categorical column names
        num_feat_list (List[str]): List of numerical column names

    Returns:
        pd.DataFrame: An independent copy of ``df`` restricted to 'id', the
        encoded categorical columns and the numerical columns, in that order.
        A name that appears more than once is selected only once.

    Raises:
        KeyError: If any requested column is missing from ``df``.
    """
    # dict.fromkeys removes repeated names while keeping first-seen order, so
    # the result never carries duplicate column labels.
    all_feature_list = list(dict.fromkeys(
        ['id'] + categorical_column_names_list_encoded + num_feat_list
    ))

    # Explicit copy: later assignments on the result must not warn about, or
    # write through to, the source frame.
    return df[all_feature_list].copy()

In [88]:
# Zero-fill the columns listed in impute_zero_column_list on the full frame.
raw_data = impute_with_zero(raw_data, impute_zero_column_list)

# Split into the rows used for training and the rows to score; the log output
# recorded for this cell reports 1185 training and 8815 inference rows.
input_data, inference_data = seperate_train_inference_data(raw_data)

# Impute the remaining missing values on each subset with a separate call.
# NOTE(review): whether impute_missing_value derives its fill values from the
# frame it receives is not visible here — confirm the inference set should not
# reuse statistics computed on the training set.
input_data = impute_missing_value(input_data, impute_features_column_names_list,
                                  categorical_column_names_list_encoded)

inference_data = impute_missing_value(inference_data, impute_features_column_names_list,
                                     categorical_column_names_list_encoded)

# Training selection receives features_to_drop and also returns the final
# numerical feature names.
input_data, final_num_feature_list = feature_selection(
    input_data, categorical_column_names_list_encoded, num_feat_list, features_to_drop)

# Inference selection keeps 'id' and every column in num_feat_list.
inference_data = feature_selection_infer(
    inference_data, categorical_column_names_list_encoded, num_feat_list)

2023-12-17 01:56:10,040 [MainThread] [INFO] Identified 1185 data points for model training
2023-12-17 01:56:10,041 [MainThread] [INFO] Identified 8815 data points for making inferences


In [89]:
# Confirm 'funded_amnt' is still present in the inference frame; the
# prediction step reads this column for its output table.
inference_data['funded_amnt']

0       18000.0
1       15675.0
2       16500.0
4        6400.0
6        6250.0
         ...   
9995    10000.0
9996    15000.0
9997     3500.0
9998    10000.0
9999     4000.0
Name: funded_amnt, Length: 8815, dtype: float64

In [45]:
# Down-sample the "good" class to 900 rows and keep every "bad" row.
# The fixed random_state makes the sample, and therefore the trained model and
# its validation metrics, repeatable across kernel restarts.
input_data_good = input_data[input_data['predicted_loan_status']=='good'].sample(n=900, random_state=1)
input_data_bad = input_data[input_data['predicted_loan_status']=='bad']
input_data_balanced = pd.concat([input_data_good, input_data_bad])

In [24]:
# Model input columns: numerical features first, then the encoded categorical
# ones. This order is reused when slicing the inference frame.
final_feature_list = final_num_feature_list + categorical_column_names_list_encoded
final_feature_list

['loan_amnt',
 'int_rate',
 'installment',
 'annual_inc',
 'dti',
 'delinq_2yrs',
 'mths_since_last_delinq',
 'open_acc',
 'mths_since_first_cr_line',
 'term',
 'revol_bal',
 'total_acc',
 'out_prncp',
 'emp_length Encoded',
 'home_ownership Encoded',
 'purpose Encoded',
 'addr_state Encoded']

In [25]:
def train_lightgbm_model(
    X_train: pd.DataFrame,
    Y_train: pd.Series,
    feature_name: List[str],
    categorical_feature: List[str],
    label: str,
) -> lgb.LGBMClassifier:
    """Fits a LightGBM binary classifier and returns the fitted estimator.

    Args:
        X_train (pd.DataFrame): Training features.
        Y_train (pd.Series): Training labels aligned with ``X_train``.
        feature_name (List[str]): Feature names forwarded to ``fit``; they
        should list the columns of ``X_train`` in the same order.
        categorical_feature (List[str]): Names of the encoded categorical
        features, forwarded to ``fit``.
        label (str): Name of the target/label column. Currently unused; kept
        so existing callers keep working.

    Returns:
        lgb.LGBMClassifier: The fitted scikit-learn style LightGBM estimator
        (not a raw ``lgb.Booster``).
    """
    logger.info("Training LightGBM")
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
    }

    logger.info("Defined model hyperparameters")

    logger.info("Starting model training")

    model_object = lgb.LGBMClassifier(**params)
    model_object.fit(
        X_train,
        Y_train,
        feature_name=feature_name,
        categorical_feature=categorical_feature,
    )

    logger.info("Finished training the model")

    return model_object

In [90]:
def return_lightgbm_prediction(
    model_object: lgb.LGBMClassifier,
    inference_features: pd.DataFrame,
    final_feature_list: List[str],
    default_label: str = "bad",
) -> pd.DataFrame:
    """Returns the prediction from the model object on each observation in the
    inference dataset.

    Args:
        model_object (lgb.LGBMClassifier): A fitted LightGBM classifier; it
        must expose ``predict``, ``predict_proba`` and ``classes_``.
        inference_features (pd.DataFrame): A dataframe with the features for
        the inference dataset. Must also contain 'id' and 'funded_amnt'.
        final_feature_list (List[str]): Columns, in training order, passed to
        the model.
        default_label (str): Class label whose probability is reported as
        ``default_probability``.

    Returns:
        pd.DataFrame: One row per observation with columns ``ID``,
        ``funded_loan_amount``, ``prediction`` and ``default_probability``.

    Raises:
        ValueError: If ``default_label`` is not one of the model's classes.
    """
    logger.info("Starting predictions on the inference dataset")

    # Slice once so both calls see exactly the same feature matrix.
    model_inputs = inference_features[final_feature_list]
    predictions = model_object.predict(model_inputs)
    probabilities = model_object.predict_proba(model_inputs)

    # predict_proba columns follow model.classes_; look the column up by label
    # instead of assuming the default class happens to sit at index 0.
    class_labels = list(model_object.classes_)
    if default_label not in class_labels:
        raise ValueError(
            f"Label {default_label!r} not found in model classes {class_labels}"
        )
    default_column = class_labels.index(default_label)

    predictions_df = pd.DataFrame(
        {
            "ID": inference_features['id'].to_numpy().tolist(),
            "funded_loan_amount": inference_features['funded_amnt'].to_numpy().tolist(),
            "prediction": predictions,
            "default_probability": probabilities[:, default_column],
        }
    )
    logger.info("Predictions generated successfully")

    return predictions_df

In [27]:
def train_loan_model(
    df: pd.DataFrame,
    final_feature_list: List[str],
    categorical_column_names_list: List[str],
    target_column_name: str = "predicted_loan_status",
) -> Tuple[lgb.LGBMClassifier, np.ndarray, pd.DataFrame, pd.DataFrame]:
    """Fits the LightGBM loan classifier and evaluates it on a hold-out split.

    Rows without a target are discarded, the remainder is split 70/30
    (``random_state=1``), the model is fitted on the training part, and
    accuracy, the classification report and the confusion matrix for the
    validation part are logged.

    Args:
        df (pd.DataFrame): Labelled data holding every column in
        ``final_feature_list`` plus ``target_column_name``. Not modified.
        final_feature_list (List[str]): Feature columns, in the order the
        model should see them.
        categorical_column_names_list (List[str]): Encoded categorical
        feature names (a subset of ``final_feature_list``).
        target_column_name (str): Name of the label column.

    Returns:
        Tuple: ``(model, predictions, gain_importance_df, metrics_df)`` where
        ``predictions`` are the labels predicted for the validation rows,
        ``gain_importance_df`` lists each feature with its total gain sorted
        in descending order, and ``metrics_df`` is the classification report
        as a dataframe.
    """
    # Build a filtered frame instead of mutating the caller's data in place,
    # and keep the result of reset_index so the index really is 0..n-1.
    labelled = df.dropna(subset=[target_column_name]).reset_index(drop=True)

    # Select the features explicitly and in final_feature_list order, so the
    # column order matches the names handed to LightGBM and the slice used at
    # inference time.
    features = labelled[final_feature_list]
    target = labelled[target_column_name]
    X_train, X_val, Y_train, Y_val = train_test_split(
        features, target, random_state=1, test_size=0.3
    )

    model_object_lightgbm = train_lightgbm_model(
        X_train,
        Y_train,
        final_feature_list,
        categorical_column_names_list,
        target_column_name
    )

    predictions = model_object_lightgbm.predict(X_val)

    accuracy = accuracy_score(Y_val, predictions)
    logger.info(f"Accuracy score from validation set: {accuracy}")

    logger.info(f"Validation Metrics below:\n{classification_report(Y_val, predictions)}")
    metrics_df = pd.DataFrame(
        classification_report(Y_val, predictions, output_dict=True)
    ).transpose()

    logger.info(f"Confusion Matrix:\n{confusion_matrix(Y_val, predictions)}")

    # feature_importances_ defaults to split counts; ask the booster for gain
    # so the 'Gain' column holds what its name says.
    gain_values = model_object_lightgbm.booster_.feature_importance(importance_type="gain")
    gain_importance_df = pd.DataFrame(
        {'Feature': final_feature_list, 'Gain': gain_values}
    ).sort_values(by='Gain', ascending=False)

    return model_object_lightgbm, predictions, gain_importance_df, metrics_df

In [46]:
# Train and evaluate on the down-sampled data; keeps the fitted model, the
# validation predictions, the feature-importance table and the metrics table.
# NOTE(review): the execution counts recorded in this notebook are out of
# order (this cell is In [46] while the data-preparation cell above is
# In [88]) — re-run top to bottom on a fresh kernel before trusting outputs.
loan_model, predictions, feat_importance, metrics_df \
= train_loan_model(input_data_balanced, final_feature_list, categorical_column_names_list_encoded)

2023-12-17 01:21:04,705 [MainThread] [INFO] Training LightGBM
2023-12-17 01:21:04,706 [MainThread] [INFO] Defined model hyperparameters
2023-12-17 01:21:04,707 [MainThread] [INFO] Starting model training
2023-12-17 01:21:04,795 [MainThread] [INFO] Finished training the model
2023-12-17 01:21:04,799 [MainThread] [INFO] Accuracy score from validation set: 0.7683284457478006
2023-12-17 01:21:04,813 [MainThread] [INFO] Validation Metrics below:
              precision    recall  f1-score   support

         bad       0.41      0.18      0.25        73
        good       0.81      0.93      0.86       268

    accuracy                           0.77       341
   macro avg       0.61      0.55      0.56       341
weighted avg       0.72      0.77      0.73       341

2023-12-17 01:21:04,829 [MainThread] [INFO] Confusion Matrix:
[[ 13  60]
 [ 19 249]]




In [47]:
# Display the validation classification report as a table.
metrics_df

Unnamed: 0,precision,recall,f1-score,support
bad,0.40625,0.178082,0.247619,73.0
good,0.805825,0.929104,0.863085,268.0
accuracy,0.768328,0.768328,0.768328,0.768328
macro avg,0.606038,0.553593,0.555352,341.0
weighted avg,0.720286,0.768328,0.731328,341.0


In [54]:
# Inspect the exact feature matrix that will be passed to the model.
# The recorded output shows NaN in revol_bal, total_acc and out_prncp for some
# rows; those columns are not in impute_features_column_names_list.
inference_data[final_feature_list]

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,mths_since_last_delinq,open_acc,mths_since_first_cr_line,term,revol_bal,total_acc,out_prncp,emp_length Encoded,home_ownership Encoded,purpose Encoded,addr_state Encoded
0,18000.0,10.16,383.870000,72804.0,16.73,0.0,0.0,21.0,340.5,60.0,8751.0,49.0,13263.18,1,0,1,17
1,15675.0,8.90,497.740000,100000.0,9.10,0.0,0.0,16.0,361.5,36.0,20650.0,45.0,15294.25,1,0,10,41
2,16500.0,7.90,333.780000,42000.0,10.43,0.0,0.0,9.0,370.3,60.0,2229.0,17.0,12966.64,2,0,10,28
4,6400.0,15.80,224.380000,34000.0,32.40,0.0,47.0,6.0,313.6,36.0,4915.0,15.0,4430.59,2,4,2,39
6,6250.0,7.51,194.450000,33600.0,18.05,0.0,0.0,7.0,223.3,36.0,5174.0,10.0,2072.55,7,0,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10000.0,14.09,277.777778,62000.0,16.94,0.0,0.0,10.0,0.0,36.0,,,,11,5,13,15
9996,15000.0,14.09,416.666667,62000.0,16.94,0.0,0.0,10.0,0.0,36.0,,,,11,5,13,4
9997,3500.0,14.09,97.222222,62000.0,16.94,0.0,0.0,10.0,0.0,36.0,,,,11,5,13,28
9998,10000.0,14.09,277.777778,62000.0,16.94,0.0,0.0,10.0,0.0,36.0,,,,11,5,13,4


In [91]:
# Score the inference rows with the trained model.
predictions_df = return_lightgbm_prediction(loan_model, inference_data, final_feature_list)

2023-12-17 01:56:24,784 [MainThread] [INFO] Starting predictions on the inference dataset
2023-12-17 01:56:24,825 [MainThread] [INFO] Predictions generated successfully


In [92]:
# Count how many inference rows were predicted as each class.
predictions_df.prediction.value_counts()

prediction
good    8739
bad       76
Name: count, dtype: int64

In [93]:
# Display the prediction table (ID, funded amount, predicted class and the
# probability reported as default_probability).
predictions_df

Unnamed: 0,ID,funded_loan_amount,prediction,default_probability
0,571203,18000.0,good,0.224998
1,694891,15675.0,good,0.182151
2,784712,16500.0,good,0.190942
3,974654,6400.0,good,0.213906
4,1042871,6250.0,good,0.213906
...,...,...,...,...
8810,10105778,10000.0,good,0.103975
8811,10109949,15000.0,good,0.103975
8812,10112187,3500.0,good,0.103975
8813,10119897,10000.0,good,0.103975
