In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("Data/credit_risk_data.csv")
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [28]:
print(df.columns)
print(df.info())

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 3258

In [29]:
from sklearn.base import BaseEstimator, TransformerMixin


class WoEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols, smoothing=0.5):
        self.cols = cols
        self.smoothing = smoothing


    def fit(self, X, y):
        self.woe_maps_ = {}
        self.iv_ = {}
        for col in self.cols:
            df = pd.DataFrame({'x': X[col], 'y': y})
            stats = df.groupby('x')['y'].agg(['count','sum'])
            stats['non_default'] = stats['count'] - stats['sum']
            stats['dist_default'] = (stats['sum'] + self.smoothing) / (y.sum() + self.smoothing)
            stats['dist_non_default'] = (stats['non_default'] + self.smoothing) / ((len(y)-y.sum()) + self.smoothing)
            stats['woe'] = np.log(stats['dist_non_default'] / stats['dist_default'])
            stats['iv'] = (stats['dist_non_default'] - stats['dist_default']) * stats['woe']
            self.woe_maps_[col] = stats['woe'].to_dict()
            self.iv_[col] = stats['iv'].sum()
        return self


    def transform(self, X):
        X = X.copy()
        for col in self.cols:
            X[col + '_woe'] = X[col].map(self.woe_maps_[col]).fillna(0)
            X.drop(columns=[col], inplace=True)
        return X

In [6]:
numeric_cols = df.select_dtypes(['int','float'])
categorical_cols = df.select_dtypes(['object'])

In [15]:
print("Shape of Data :", df.shape,"TotalColumns in Data : ",len(df.columns))
print("Shape Of Numeric Cols :" , numeric_cols.shape)
print("Shape Of Categorical Cols :",categorical_cols.shape)

Shape of Data : (32581, 12) TotalColumns in Data :  12
Shape Of Numeric Cols : (32581, 8)
Shape Of Categorical Cols : (32581, 4)


In [19]:
y = df['loan_status']
X = df.drop(columns=["loan_status"])

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score
import xgboost as xgb
import lightgbm as lgb

In [30]:
X_woe = X.copy()
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X_woe, y, test_size=0.2, stratify=y, random_state=42)


woe = WoEEncoder(cols=categorical_cols)
X_train_w = woe.fit_transform(X_train_w, y_train_w)
X_test_w = woe.transform(X_test_w)

In [31]:
xgb_model = xgb.XGBClassifier(
    n_estimators= 200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree =0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

In [45]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
    verbosity=-1,
    n_jobs=-1
)

In [46]:
baseline_xgbModel =xgb_model.fit(X_train_w,y_train_w)
baseline_lgbModel=lgb_model.fit(X_train_w,y_train_w)

In [40]:
y_pred_w = xgb_model.predict(X_test_w)
y_proba_w = xgb_model.predict_proba(X_test_w)[:,1]

y_pred_w_lgb = lgb_model.predict(X_test_w)
y_proba_w_lgb = lgb_model.predict_proba(X_test_w)[:,1]



In [47]:
print("========== XGB ===========")
print("Classification Report-XGB",classification_report(y_test_w,y_pred_w))
print("Accuracy Score-XGB : ",accuracy_score(y_test_w,y_pred_w))
print("Precion Score-XBG : ",precision_score(y_test_w,y_pred_w))
print("\n")

print("========== LGB ===========")

print("Classification Report-LGB",classification_report(y_test_w,y_pred_w_lgb))
print("Accuracy Score-LGB : ",accuracy_score(y_test_w,y_pred_w_lgb))
print("Precion Score-LGB : ",precision_score(y_test_w,y_pred_w_lgb))

Classification Report-XGB               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5095
           1       0.97      0.72      0.83      1422

    accuracy                           0.93      6517
   macro avg       0.95      0.86      0.89      6517
weighted avg       0.94      0.93      0.93      6517

Accuracy Score-XGB :  0.934632499616388
Precion Score-XBG :  0.9715909090909091


Classification Report-LGB               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5095
           1       0.97      0.72      0.83      1422

    accuracy                           0.94      6517
   macro avg       0.95      0.86      0.90      6517
weighted avg       0.94      0.94      0.93      6517

Accuracy Score-LGB :  0.9353997237992941
Precion Score-LGB :  0.9726156751652503
