In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display

# Display settings: show every column of a DataFrame and format floats
# with two decimal places.
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format

# Configure the matplotlib backend to render plots inline in IPython
%matplotlib inline

In [2]:
# Load the loan data set.
# 'NA' is passed as a list so it is treated as a missing-value marker in
# every column; a dict would be interpreted as per-column markers keyed by
# column name (and there is no column called 'NA').
df_loan = pd.read_csv('loan.csv', sep=',', header=0, na_values=['NA'])
df_loan.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
0,5000,36 months,10.65,10.0,RENT,24000.0,credit_card,AZ,27.65,0,83.7,9,0,26,verified
1,2500,60 months,15.27,0.0,RENT,30000.0,car,GA,1.0,0,9.4,4,1,12,verified
2,2400,36 months,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0,98.5,10,0,10,not verified
3,10000,36 months,13.49,10.0,RENT,49200.0,other,CA,20.0,0,21.0,37,0,15,verified
4,5000,36 months,7.9,3.0,RENT,36000.0,wedding,AZ,11.2,0,28.3,12,0,7,verified


In [3]:
target = 'bad_loan'

# Columns carried into the analysis.
# Note: the list also contains the target column ('bad_loan').
vars = [
    'loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
    'annual_inc', 'purpose', 'dti', 'delinq_2yrs',
    'revol_util', 'total_acc', 'bad_loan', 'longest_credit_length',
]

# Partition the selected columns by dtype:
# object columns -> varsc, numeric columns -> varsn
varsc = df_loan[vars].select_dtypes(include='object').columns.tolist()
varsn = df_loan[vars].select_dtypes(include='number').columns.tolist()

print(varsc, varsn)

['term', 'home_ownership', 'purpose'] ['loan_amnt', 'int_rate', 'emp_length', 'annual_inc', 'dti', 'delinq_2yrs', 'revol_util', 'total_acc', 'bad_loan', 'longest_credit_length']


In [4]:
def calculate_woe(df, feature, target):
    """
    Compute a weight-of-evidence (WoE) value for every level of `feature`.

    Rows of `df` are grouped by `feature`; within each group the non-null
    `target` values are counted ("total") and summed ("events"), and the
    difference is taken as "non_events".
    (Assumes `target` is a 0/1 indicator - TODO confirm with the caller.)

    Each level's share of events / non-events is smoothed by adding 0.5 to
    both the level count and the overall sum, and the WoE is
    ln(non_event_share / event_share).

    Returns:
    --------
    dict
        Mapping of feature level -> WoE value.
    """
    counts = (
        df.groupby(feature)[target]
        .agg(['count', 'sum'])
        .rename(columns={'count': 'total', 'sum': 'events'})
    )

    events = counts['events']
    non_events = counts['total'] - events

    # Smoothed share of all events / non-events that falls into each level
    event_share = (events + 0.5) / (events.sum() + 0.5)
    non_event_share = (non_events + 0.5) / (non_events.sum() + 0.5)

    return np.log(non_event_share / event_share).to_dict()

In [5]:
# Replace every categorical column with its weight-of-evidence encoding.
# Start from a fresh copy so the raw frame stays untouched.
df_loan_woe = df_loan.copy()

for vc in varsc:
    woe_col = f'{vc}_woe'
    woe_dict = calculate_woe(df_loan, vc, target)
    df_loan_woe[woe_col] = df_loan_woe[vc].map(woe_dict)
    df_loan_woe = df_loan_woe.drop(columns=vc)
    # Guarded append: re-running this cell must not register the same
    # column twice in varsn (that would yield duplicate columns downstream).
    if woe_col not in varsn:
        varsn.append(woe_col)

df_loan_woe[varsn].head()

Unnamed: 0,loan_amnt,int_rate,emp_length,annual_inc,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,term_woe,home_ownership_woe,purpose_woe
0,5000,10.65,10.0,24000.0,27.65,0,83.7,9,0,26,0.31,-0.13,0.5
1,2500,15.27,0.0,30000.0,1.0,0,9.4,4,1,12,-0.99,-0.13,-0.32
2,2400,15.96,10.0,12252.0,8.72,0,98.5,10,0,10,0.31,-0.13,-0.75
3,10000,13.49,10.0,49200.0,20.0,0,21.0,37,0,15,0.31,-0.13,-0.46
4,5000,7.9,3.0,36000.0,11.2,0,28.3,12,0,7,0.31,-0.13,-0.47


In [6]:
from typing import List
import statsmodels.api as sm

def stepwise_logistic_regression(df: pd.DataFrame
                                 , predictors: List[str]
                                 , target: str
                                 , significance_level: float = 0.05) -> dict:
    """
    Perform stepwise logistic regression (both forward and backward selection)

    Each iteration first tries to add the candidate whose coefficient has the
    smallest p-value (if below `significance_level`), then refits and removes
    the included predictor with the largest p-value (if above
    `significance_level`). The loop stops when an iteration changes nothing,
    when no candidates remain, or when a previously seen set of included
    predictors is reached again (which would otherwise cycle forever).

    Progress ("Added ...", "Removed ...") and candidates whose model could not
    be fitted ("Skipped ...", reported once per candidate) are printed.

    Parameters:
    -----------
    df : pandas DataFrame
        The input DataFrame containing both predictors and the target variable.
    predictors : List[str]
        A list of column names representing the predictor variables in the DataFrame.
        Duplicates are ignored and the target column, if listed, is never
        used as a predictor.
    target : str
        The name of the column representing the target variable in the DataFrame.
    significance_level : float, optional (default=0.05)
        Significance level for adding or removing predictors.

    Returns:
    --------
    dict
        A dictionary containing the final model details:
            'selected_predictors': A list of the predictor variables selected in the final model.
            'summary': A pandas DataFrame with the final model summary.
            'model': The final fitted statsmodels logistic regression model.
    """

    # Candidate pool: only the caller-supplied predictors (order-preserving
    # de-duplication) and never the target itself.
    candidates = [p for p in dict.fromkeys(predictors) if p != target]

    # has_constant='add' guarantees that a 'const' column exists even when one
    # of the candidate columns happens to be constant.
    X_with_constant = sm.add_constant(df[candidates], has_constant='add')
    y = df[target].values

    # Initialize variables
    included = []
    excluded = list(candidates)

    def get_model(X, y, columns):
        """Helper function to create logistic regression model"""
        return sm.Logit(y, X[columns]).fit(disp = 0, method = "newton")

    def summary_to_dataframe(model):
        """Convert statsmodels summary to pandas DataFrame"""
        conf = model.conf_int()
        return pd.DataFrame({
            'Coefficient': model.params,
            'Std Err': model.bse,
            'z-value': model.tvalues,
            'P>|z|': model.pvalues,
            '[0.025': conf[0],
            '0.975]': conf[1],
        })

    reported_failures = set()            # candidates already reported as skipped
    seen_states = {frozenset(included)}  # included-sets reached so far (cycle guard)

    while excluded:
        changed = False

        # Forward selection: p-value of each candidate when added to the current set
        p_forward = {}
        for feature in excluded:
            try:
                model = get_model(X_with_constant, y, ['const'] + included + [feature])
            except Exception as exc:
                # Best effort: a candidate whose model cannot be fitted is
                # skipped, but the reason is reported instead of hidden.
                if feature not in reported_failures:
                    reported_failures.add(feature)
                    print(f"Skipped {feature}: {type(exc).__name__}: {exc}")
                continue
            p_value = model.pvalues[feature]
            # A NaN p-value cannot be ranked; leaving it in could make min()
            # return it and block a genuinely significant candidate.
            if pd.notna(p_value):
                p_forward[feature] = p_value

        # Add the most significant feature
        if p_forward:
            best_feature = min(p_forward, key=p_forward.get)
            if p_forward[best_feature] < significance_level:
                included.append(best_feature)
                excluded.remove(best_feature)
                changed = True
                print(f"Added {best_feature} with p-value {p_forward[best_feature]}")

        # Backward elimination
        if included:
            current_model = get_model(X_with_constant, y, ['const'] + included)
            # Label-based lookup, so the constant is excluded by name
            p_backward = current_model.pvalues.loc[included]

            # Remove least significant feature if above threshold
            worst_feature, worst_p = max(p_backward.items(), key=lambda item: item[1])
            if worst_p > significance_level:
                included.remove(worst_feature)
                excluded.append(worst_feature)
                changed = True
                print(f"Removed {worst_feature} with p-value {worst_p}")

        # Stop if no changes were made, or if this set of predictors was
        # already reached before (the same add/remove sequence would repeat).
        state = frozenset(included)
        if not changed or state in seen_states:
            break
        seen_states.add(state)

    # Final model
    final_predictors = included
    final_model = get_model(X_with_constant, y, ['const'] + final_predictors)

    return {
        'selected_predictors': final_predictors,
        'summary': summary_to_dataframe(final_model),
        'model': final_model
    }

In [7]:
from sklearn.metrics import auc, roc_curve

# Perform stepwise logistic regression on the WoE-encoded frame
result = stepwise_logistic_regression(df_loan_woe, varsn, target)

print()
print("Selected Features:")
for predictor in result['selected_predictors']:
    print(predictor)

print()
print("Final Model Summary:")
# preds is the same object as result['summary'], so the 'Odds' column
# (exp of each coefficient) is added to the stored summary in place.
preds = result['summary']
preds['Odds'] = np.exp(preds['Coefficient'])
display(preds)

# Score the same frame that was passed to the stepwise fit; the summary index
# supplies the model's column names (constant first) in fitted order.
X = sm.add_constant(df_loan_woe)
cols = list(preds.index)
ypred = result['model'].predict(X[cols])

# Store the predictions on the frame (overwrites 'ypred' if the cell is re-run)
df_loan_woe['ypred'] = ypred

# Calculate the false positive rate (FPR), true positive rate (TPR), and thresholds
# NOTE(review): y is read from df_loan while ypred comes from df_loan_woe -
# this assumes both frames hold the same rows in the same order; confirm.
y = df_loan[target].values
fpr, tpr, thresholds = roc_curve(y, ypred)

# Calculate the AUC (Area Under the Curve) and derive Gini = 2 * AUC - 1
roc_auc = auc(fpr, tpr)
gini = 2.0*roc_auc-1.0
print('Fit statistics:')
print(f'AUC: {roc_auc:.5}')
print(f'Gini: {gini:.5}')

Added term_woe with p-value 4.548929681837627e-13
Added purpose_woe with p-value 4.370720608487864e-05
Added int_rate with p-value 4.141650217575633e-05
Added annual_inc with p-value 0.0003016729934227592
Added revol_util with p-value 0.018780511743367676
Added home_ownership_woe with p-value 0.03940522190283071

Selected Features:
term_woe
purpose_woe
int_rate
annual_inc
revol_util
home_ownership_woe

Final Model Summary:


Unnamed: 0,Coefficient,Std Err,z-value,P>|z|,[0.025,0.975],Odds
const,-2.57,0.42,-6.16,0.0,-3.39,-1.76,0.08
term_woe,-0.83,0.18,-4.71,0.0,-1.18,-0.49,0.43
purpose_woe,-1.29,0.26,-5.03,0.0,-1.79,-0.79,0.28
int_rate,0.08,0.03,2.56,0.01,0.02,0.13,1.08
annual_inc,-0.0,0.0,-2.78,0.01,-0.0,-0.0,1.0
revol_util,0.01,0.0,2.37,0.02,0.0,0.02,1.01
home_ownership_woe,-1.03,0.5,-2.06,0.04,-2.0,-0.05,0.36


Fit statistics:
AUC: 0.72719
Gini: 0.45438
