### This notebook requires scikit-learn 1.0.2 (ignore error message regarding OSError)

In [1]:
# Please make sure the current python environment is NOT running any python instances with sklearn imported or this will cause error
!pip install scikit-learn==1.0.2 --user



In [2]:
# Confirm that the kernel really imports scikit-learn 1.0.2.
# If the version differs, re-run cell 1 and check again.
import sklearn

if sklearn.__version__ == '1.0.2':
    print("Good to go!")
else:
    print(f"Please re-run cell 1: scikit-learn version still not satisfied - currently {sklearn.__version__}")

Good to go!


In [3]:
import warnings
warnings.filterwarnings("ignore")
import sys
import pickle as pkl
sys.path.insert(1, '../src')
from creditrisk_pipeline import *
# Imported explicitly (after the star import) because later cells call np.save,
# np.concatenate, np.append and np.unique; do not rely on the star import for `np`.
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
import time
import yaml

### Load model class and data sorted by issue date

In [4]:
# Instantiate the project's model wrapper, pointing it at the YAML config file.
cr = CreditRiskModel(
    model_config_path="../config/creditrisk_config.yaml",
)

In [5]:
# Parse the same YAML config into a plain Python object for direct key lookups
# in later cells (e.g. config['benchmark_testset_id']).
# The context manager guarantees the file handle is closed after parsing;
# a bare open() passed inline would leave it open until garbage collection.
with open('../config/creditrisk_config.yaml') as config_file:
    config = yaml.safe_load(config_file)

In [6]:
# Load the loan records through the model helper, passing 2015-01-01 as the
# earliest issue date. The recorded output shows the loan_status value counts.
loans = cr.load_data_from_csv(earliest_issue_date='2015-01-01')

Fully Paid     863756
Charged Off    170660
Name: loan_status, dtype: int64


### Sample and save the 10K most recent data points as the portfolio selection set

In [7]:
# Keep the last 10,000 rows of `loans` as the held-out portfolio selection set.
# NOTE(review): "most recent" relies on `loans` being sorted by issue date — confirm in load_data_from_csv.
test_set = loans.tail(10000)

In [8]:
# Persist the IDs of the held-out rows so later runs can rebuild the same test set.
# The destination path comes from the 'benchmark_testset_id' config entry.
benchmark_ids = test_set['id'].tolist()
np.save(config['benchmark_testset_id'], benchmark_ids)

### Split train/test. For the test set, use the 10K custom test IDs for consistent evaluation instead of a seeded random sample.

In [9]:
# Split features/labels with the custom test set enabled (use_custom_set=True),
# then display the shapes of the two feature frames.
(
    X_train, X_test,
    y_train, y_test,
    X, y,
) = cr.split_data(loans, use_custom_set=True)
X_train.shape, X_test.shape

((120000, 141), (10000, 141))

In [10]:
# Persist both splits through the model helper; the second call flags the data
# as non-training via is_train_data=False.
cr.save_data(X_train, y_train)
cr.save_data(X_test, y_test, is_train_data=False)

# Show the class distribution of the test labels.
y_test.value_counts()

0    9419
1     581
Name: default, dtype: int64

### Train Pipeline

In [11]:
# Fit the pipeline on the training split and report how long it took.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if the system time
# is adjusted while training runs.
start = time.perf_counter()
pipe = cr.fit(X_train, y_train)
end = time.perf_counter()
print('Training time in seconds:', end - start)

Training time in seconds: 221.90713334083557


In [12]:
# Render the fitted pipeline via the model helper so its steps can be inspected.
cr.draw_pipeline(pipe)

In [13]:
# Rebuild the training data as it looks after the first three pipeline steps
# (pipe[:3]) and append the target as a trailing 'default' column.
preprocessing_steps = pipe[:3]
pipeline_cleaned_train = pd.DataFrame(
    np.concatenate(
        [
            preprocessing_steps.transform(X_train),
            y_train.values.reshape(-1, 1),  # column vector so it stacks beside the features
        ],
        axis=1,
    ),
    columns=np.append(preprocessing_steps.get_feature_names_out(), 'default'),
)
# Optional CSV export, intentionally left disabled:
# pipeline_cleaned_train.to_csv('../data/train/pipeline_cleaned_train.csv')

### Verify class rebalancing within the pipeline

In [14]:
# Replay the two resampling steps (pipeline indices 3 and 4) by hand on the
# preprocessed training data and print the class counts after each one.
# NOTE(review): fit_resample is called on the pipeline's own step objects, so they
# are re-fitted here — confirm this is harmless before the pipeline is saved.
X_train_preprocessed = pipe[:3].transform(X_train)

# Step 3: labelled as oversampling in the printed output.
resampled_X, resampled_y = pipe[3].fit_resample(X_train_preprocessed, y_train)
oversampled_counts = np.unique(resampled_y, return_counts=True)
print('After oversampling:', oversampled_counts)

# Step 4: labelled as undersampling; it consumes the output of step 3.
resampled_X, resampled_y = pipe[4].fit_resample(resampled_X, resampled_y)
undersampled_counts = np.unique(resampled_y, return_counts=True)
print('After undersampling', undersampled_counts)

After oversampling: (array([0, 1], dtype=uint8), array([97538, 80250], dtype=int64))
After undersampling (array([0, 1], dtype=uint8), array([80258, 80250], dtype=int64))


### Training/Test AUC

In [15]:
# Score the fitted pipeline on the training split with the helper's default metric;
# the recorded output shows it prints and returns the ROC AUC.
cr.evaluate_pipeline(pipe, X_train, y_train)

ROC AUC: 0.796


0.7961552835128164

In [16]:
# Score the fitted pipeline on the held-out test split with the helper's default metric;
# the recorded output shows it prints and returns the ROC AUC.
cr.evaluate_pipeline(pipe, X_test, y_test)

ROC AUC: 0.708


0.708428179829871

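### Training/Test Accuracy

Note: the test set is heavily imbalanced (9,419 samples of class 0 vs. 581 of class 1), so always predicting class 0 would already reach 0.942 accuracy. Read the accuracy figures below alongside the AUC rather than on their own.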

In [17]:
# Score the fitted pipeline on the training split, requesting the 'accuracy' metric.
cr.evaluate_pipeline(pipe, X_train, y_train, 'accuracy')

Accuracy: 0.834


0.8343166666666667

In [18]:
# Score the fitted pipeline on the held-out test split, requesting the 'accuracy' metric.
cr.evaluate_pipeline(pipe, X_test, y_test, 'accuracy')

Accuracy: 0.927


0.9269

### Save pipeline into directory

In [19]:
# Persist the fitted pipeline through the model helper.
# NOTE(review): the destination is chosen inside save_pipeline — presumably from the config; verify.
cr.save_pipeline(pipe)