In [3]:
## Example Implementation

In [17]:
# import packages
import roc_utils as ru
from scipy.stats import kendalltau
from scipy.stats import t
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from scipy.stats import norm

In [2]:
# read the credit-default CSV (path is relative to the working directory)
default_dataset = pd.read_csv(filepath_or_buffer='credit_default.csv')

In [3]:
# preview the first five rows of the loaded frame
default_dataset.head(n=5)

Unnamed: 0,Income,Age,Loan,Loan to Income,Default
0,66155.9251,59.017015,8106.532131,0.122537,0
1,34415.15397,48.117153,6564.745018,0.190752,0
2,57317.17006,63.108049,8020.953296,0.13994,0
3,42709.5342,45.751972,6103.64226,0.142911,0
4,66952.68885,18.584336,8770.099235,0.13099,1


In [4]:
# column names, dtypes and non-null counts, to confirm nothing is missing
default_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Income          2000 non-null   float64
 1   Age             2000 non-null   float64
 2   Loan            2000 non-null   float64
 3   Loan to Income  2000 non-null   float64
 4   Default         2000 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 78.2 KB


In [5]:
# summary statistics (count, mean, std, quartiles) for every numeric column
default_dataset.describe()

Unnamed: 0,Income,Age,Loan,Loan to Income,Default
count,2000.0,2000.0,2000.0,2000.0,2000.0
mean,45331.600018,40.927143,4444.369695,0.098403,0.1415
std,14326.327119,13.26245,3045.410024,0.05762,0.348624
min,20014.48947,18.055189,1.37763,4.9e-05,0.0
25%,32796.45972,29.062492,1939.708847,0.047903,0.0
50%,45789.11731,41.382673,3974.719418,0.099437,0.0
75%,57791.28167,52.596993,6432.410625,0.147585,0.0
max,69995.68558,63.971796,13766.05124,0.199938,1.0


In [6]:
# class balance of the target: number of rows per value of the Default column
default_dataset.loc[:, 'Default'].value_counts()

Default
0    1717
1     283
Name: count, dtype: int64

In [7]:
# separate the feature columns (independent variables) from the
# Default column, which is the target (dependent variable)
X = default_dataset.drop(columns='Default')
y = default_dataset['Default']

In [8]:
# hold out 25% of the rows as a test set; the fixed random_state keeps
# the split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [9]:
# instantiate the two classifiers whose ROC curves will be compared:
# a gradient-boosted tree model and a plain logistic regression
gradient_boosting_classifier = xgb.XGBClassifier(
    n_jobs=12,
    objective='binary:logistic',
)
logit_model = LogisticRegression()

In [10]:
# train each classifier on the training split; the value returned by
# fit() is kept as the handle used for prediction below
logit_model = logit_model.fit(X=X_train, y=y_train)
xgbc_model = gradient_boosting_classifier.fit(X=X_train, y=y_train)

In [11]:
# score the test set with each model
# predict_proba gives one probability column per class; keep only column 1
# (rather than hard class labels) because the ROC comparison needs
# continuous scores
xgbc_model_pred = xgbc_model.predict_proba(X_test)[:, 1]
log_model_pred = logit_model.predict_proba(X_test)[:, 1]

In [15]:
# Compare the two models' ROC curves on the test set (Liu-Moench methodology).
# Both helpers live in the project-local roc_utils module and are passed the
# true labels, each model's predicted probabilities, and scipy's kendalltau.
# NOTE(review): presumably kendalltau is used to correlate the two score
# vectors, and the "AVG CORR / AVG AREA / R VALUE" lines in the cell output
# appear to be printed from inside roc_utils -- confirm against that module.
# NOTE(review): the logged AVG AREA (~1.977) is greater than 1, which an
# average of two AUC values cannot be -- verify the roc_utils computation.
t_stat = ru.roc_t_stat(y_test, log_model_pred, xgbc_model_pred, kendalltau)
print('calulated t-stat: ', t_stat)
# z-score for the same model comparison
z_score = ru.roc_z_score(y_test, log_model_pred, xgbc_model_pred, kendalltau)
print('calculate z-score: ', z_score)

AVG CORR:  0.4871443431609619
AVG AREA:  1.9774273909345
R VALUE:  0.3
calulated t-stat:  -1.4863076004063425
calculate z-score:  -0.00859981024424306


In [20]:
# Two-sided p-values for the ROC comparison statistics computed above.
# NOTE(review): n - 1 degrees of freedom is carried over unchanged from the
# original cell -- confirm it matches the Liu-Moench reference.
deg_freedom = len(y_test) - 1
# survival function (upper tail) doubled gives the two-sided p-value
p_val = t.sf(abs(t_stat), deg_freedom) * 2
print('P-Value T-Test: ', p_val)
# Use norm.sf instead of 1 - norm.cdf: the subtraction loses all precision
# once cdf rounds to 1.0 (large |z|) and would report a p-value of exactly 0.
# This also mirrors the t.sf call used for the t-test.
p_val_z = 2 * norm.sf(abs(z_score))
print('P-Value Z-Test: ', p_val_z)

P-Value T-Test:  0.13782930303502713
P-Value Z-Test:  0.9931384287569576
