In [27]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [28]:
!wget -P data -nc 'https://raw.githubusercontent.com/gastonstat/CreditScoring/master/CreditScoring.csv'

File ‘data/CreditScoring.csv’ already there; not retrieving.



In [29]:
df = pd.read_csv('data/CreditScoring.csv')
df.columns = df.columns.str.lower()

In [30]:
# map numerical codes to their real values. The real values were from the R code found under the same repo where the csv was download.
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}
home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}
marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}
records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}
job_values = {
    1: 'fixed',
    2: 'part-time',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.status = df.status.map(status_values)
df.home = df.home.map(home_values)
df.marital = df.marital.map(marital_values)
df.records = df.records.map(records_values)
df.job = df.job.map(job_values)

In [31]:
# Looks like we have some unusual max numbers. We need them to 'na' as they are really missing values.
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=np.nan)

# We have only one record where status is 'unk'. This is not useful for us. So delete the record.
df = df[df.status != 'unk'].reset_index()

In [32]:
# data is now clean. Now do the train, val, test split.

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)
print(type(df_train))
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
len(df_train), len(df_val), len(df_test)

y_train = (df_train.status == 'default').astype('int')
y_val = (df_val.status == 'default').astype('int')
y_test = (df_test.status == 'default').astype('int')

del df_train['status']
del df_val['status']
del df_test['status']

<class 'pandas.core.frame.DataFrame'>


# Gradient Boosting with XGBoost

In [33]:
#
# train the model
#

dv = DictVectorizer(sparse=False)

df_full_train = df_full_train.reset_index(drop=True)
y_full_train = (df_full_train.status == 'default').astype(int).values
del df_full_train['status']
dicts_full_train = df_full_train.to_dict(orient='records')

X_full_train = dv.fit_transform(dicts_full_train)
dm_full_train = xgb.DMatrix(X_full_train,
                            label=y_full_train,
                            feature_names=dv.get_feature_names_out())

xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight': 1,

    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1
}

evals_result = {}
auc_summary = []

model = xgb.train(params=xgb_params,
                  dtrain=dm_full_train,
                  num_boost_round=175,
                  evals_result=evals_result)

#
# test the model
#
dicts_test = df_test.to_dict(orient='records')
X_test = dv.transform(dicts_test)
dm_test = xgb.DMatrix(X_test, feature_names=dv.get_feature_names_out())

y_pred = model.predict(dm_test)

In [34]:
#
# measure performance
#
roc_auc_score(y_test, y_pred)

0.8270917843702653