In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
import statsmodels.api as sm

In [5]:
def clean(df):
    return df[~df['bank_fico_buckets_20'].isin(['Exception', 'Missing'])].copy()

def convert_fico(score):
    if score == '<= 560':
        return 560
    elif score =='761+':
        return 761

    bounds = score.split('-')
    return (int(bounds[0]) + int(bounds[1])) / 2

complete = pd.read_csv('data/training_data.csv',low_memory=False)
train, test = train_test_split(complete, test_size=0.1)
train_clean, test_clean = clean(train), clean(test)

train_clean['fico_score'] = train_clean['bank_fico_buckets_20'] \
    .apply(lambda score: convert_fico(score))

test_clean['fico_score'] = test_clean['bank_fico_buckets_20'] \
    .apply(lambda score: convert_fico(score))

In [7]:
def normalize(X):
    return (X - X.mean()) / X.std()

features = [
    'financial_active',
    'promotion_flag',
    'ever_delinquent_flg',
    'stmt_balance',
    'prev_balance',
    'credit_limit_amt',
    'promo_bal_amt',
    'fico_score'
]

X = train_clean[features]

X = normalize(X)

X = sm.add_constant(X)

y = train_clean['charge_off']

model = sm.Logit(y, X)

results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.008898
         Iterations 14


0,1,2,3
Dep. Variable:,charge_off,No. Observations:,5147578.0
Model:,Logit,Df Residuals:,5147569.0
Method:,MLE,Df Model:,8.0
Date:,"Sat, 25 Mar 2023",Pseudo R-squ.:,0.5226
Time:,18:13:46,Log-Likelihood:,-45803.0
converged:,True,LL-Null:,-95949.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-12.8400,0.057,-226.714,0.000,-12.951,-12.729
financial_active,1.3097,0.023,57.864,0.000,1.265,1.354
promotion_flag,0.3634,0.009,39.656,0.000,0.345,0.381
ever_delinquent_flg,0.1431,0.008,17.005,0.000,0.127,0.160
stmt_balance,-1.1015,0.028,-38.655,0.000,-1.157,-1.046
prev_balance,0.3691,0.005,71.938,0.000,0.359,0.379
credit_limit_amt,-5.5182,0.048,-115.128,0.000,-5.612,-5.424
promo_bal_amt,-1.1326,0.039,-29.355,0.000,-1.208,-1.057
fico_score,-1.3859,0.009,-146.373,0.000,-1.404,-1.367


In [11]:
# Getting our predictions.
X = test_clean[features]
X = normalize(X)
X = sm.add_constant(X)

predictions = results.predict(X).round()
predictions

123310     0.0
348157     0.0
1201095    0.0
4367768    0.0
4947906    0.0
          ... 
1178400    0.0
1274247    0.0
1373461    0.0
2689923    0.0
4992849    0.0
Length: 572029, dtype: float64

In [12]:
predictions.value_counts()

0.0    571569
1.0       460
dtype: int64

In [16]:
confusion_matrix(test_clean['charge_off'], predictions)

array([[570366,     55],
       [  1203,    405]], dtype=int64)

In [17]:
f1_score(test_clean['charge_off'], predictions)

0.39168278529980655