In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [15]:
!wget -P "../data" -nc 'https://raw.githubusercontent.com/gastonstat/CreditScoring/master/CreditScoring.csv'

File ‘../data/CreditScoring.csv’ already there; not retrieving.



In [16]:
df = pd.read_csv('../data/CreditScoring.csv')
df.columns = df.columns.str.lower()

In [17]:
# map numerical codes to their real values. The real values were from the R code found under the same repo where the csv was download.
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}
marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}
records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}
job_values = {
    1: 'fixed',
    2: 'part-time',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.status = df.status.map(status_values)
df.home = df.home.map(home_values)
df.marital = df.marital.map(marital_values)
df.records = df.records.map(records_values)
df.job = df.job.map(job_values)

In [18]:
# Looks like we have some unusual max numbers. We need them to 'na' as they are really missing values.
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=np.nan)

# We have only one record where status is 'unk'. This is not useful for us. So delete the record.
df = df[df.status != 'unk'].reset_index()

In [19]:
# data is now clean. Now do the train, val, test split.

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)
print(type(df_train))
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
len(df_train), len(df_val), len(df_test)

y_train = (df_train.status == 'default').astype('int')
y_val = (df_val.status == 'default').astype('int')
y_test = (df_test.status == 'default').astype('int')

del df_train['status']
del df_val['status']
del df_test['status']
del df_train['index']
del df_val['index']
del df_test['index']

<class 'pandas.core.frame.DataFrame'>


# Model
## Train

In [20]:
#
# train the model
#

dv = DictVectorizer(sparse=False)

df_full_train = df_full_train.reset_index(drop=True)
y_full_train = (df_full_train.status == 'default').astype(int).values
del df_full_train['status']
dicts_full_train = df_full_train.to_dict(orient='records')

In [21]:

X_full_train = dv.fit_transform(dicts_full_train)
# dm_full_train = xgb.DMatrix(X_full_train,
#                             label=y_full_train,
#                             feature_names=dv.get_feature_names_out())
dm_full_train = xgb.DMatrix(X_full_train,
                            label=y_full_train)  # removed the feature names because otherwise we must pass the feature names when predicting (in the bentoml service. If not the predict.run() fails with an error.)

xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight': 1,

    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1
}

evals_result = {}
auc_summary = []

model = xgb.train(params=xgb_params,
                  dtrain=dm_full_train,
                  num_boost_round=175,
                  evals_result=evals_result)

#

# BentoML

Problem with Pickle is we might not capture what's required by different ML frameworks (e.g. xgboost). BentoML has framework specific methods we can call to perform when serialising the model, so it knows what to capture with the model file.  So it has everything it needs when we deserialize it to perform prediction.

* You can start bentoml service manually or have a bentomlfile.yaml in the directory (similar to Dokerfile.yaml).

In [22]:
import bentoml

bentoml.xgboost.save_model("credit_risk_model", model,
                           custom_objects={
                               "dictVectorizer": dv
                           },
                           signatures={
                               "predict": {
                                   "batchable": True,
                                   "batch_dim": 0
                               }
                           })

Model(tag="credit_risk_model:5eueuasrlsjce6cp", path="/Users/kaushalya/bentoml/models/credit_risk_model/5eueuasrlsjce6cp/")

## Test

In [23]:
# test the model
#
dicts_test = df_test.to_dict(orient='records')
X_test = dv.transform(dicts_test)
dm_test = xgb.DMatrix(X_test,
                      feature_names=dv.get_feature_names_out())  # make sure you do not do this before saving the model. Otherwise, this go insert the feature_names in the model. We do not want to include the feature names in the model. See my other comment in the block where we train the model.

y_pred = model.predict(dm_test)

## Measure performance

In [24]:
#
# measure performance
#
roc_auc_score(y_test, y_pred)

0.8216607203948976

In [25]:
dv.feature_names_

['age',
 'amount',
 'assets',
 'debt',
 'expenses',
 'home=ignore',
 'home=other',
 'home=owner',
 'home=parents',
 'home=private',
 'home=rent',
 'home=unk',
 'income',
 'index',
 'job=fixed',
 'job=freelance',
 'job=others',
 'job=part-time',
 'job=unk',
 'marital=divorced',
 'marital=married',
 'marital=separated',
 'marital=single',
 'marital=unk',
 'marital=widow',
 'price',
 'records=no',
 'records=yes',
 'seniority',
 'time']

# Sample record for testing

In [26]:
df_train.iloc[0].to_dict()

{'seniority': 10,
 'home': 'owner',
 'time': 36,
 'age': 36,
 'marital': 'married',
 'records': 'no',
 'job': 'freelance',
 'expenses': 75,
 'income': 0.0,
 'assets': 10000.0,
 'debt': 0.0,
 'amount': 1000,
 'price': 1400}