# Decision Trees and Ensemble Learning 

## 6.1 Credit risk scoring project

- Dataset: https://github.com/gastonstat/CreditScoring

## 6.2 Data Cleaning and Preparation

- Downloading the dataset
- Re-encoding the categorical variables
- Doing the train/test split

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

In [2]:
# !wget "https://github.com/gastonstat/CreditScoring/raw/master/CreditScoring.csv"

In [3]:
# Load the credit scoring data from the local CSV file into a DataFrame.
df = pd.read_csv('CreditScoring.csv')

In [4]:
# Lower-case every column name so columns can be referenced uniformly.
df.columns = df.columns.str.lower()

# Lookup tables translating the integer codes stored in the CSV into
# readable labels. In every table the code 0 maps to 'unk'.
status_values = {1: 'ok', 2: 'default', 0: 'unk'}

home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk',
}

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk',
}

records_values = {1: 'no', 2: 'yes', 0: 'unk'}

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk',
}

# Decode each categorical column; a code that is absent from its table
# is turned into NaN by Series.map.
for column, labels in (
    ('status', status_values),
    ('home', home_values),
    ('marital', marital_values),
    ('records', records_values),
    ('job', job_values),
):
    df[column] = df[column].map(labels)

# 99999999 appears to be used as a missing-value marker in these numeric
# columns; swap it for NaN so it is not treated as a real amount.
for c in ('income', 'assets', 'debt'):
    df[c] = df[c].replace(99999999, np.nan)

# Drop the rows whose status decoded to 'unk' and renumber the index.
known_status = df['status'] != 'unk'
df = df[known_status].reset_index(drop=True)

In [5]:
# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
# and give both parts a fresh 0..n-1 index.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=11)
)

# Binary target: 1 when the status is 'default', 0 otherwise.
# pop() removes the 'status' column from the feature frame while returning it.
y_train = (df_train.pop('status') == 'default').astype('int').values
y_test = (df_test.pop('status') == 'default').astype('int').values

In [6]:
# Turn each row into a {column: value} dict, replacing missing values with 0.
train_dicts = df_train.fillna(0).to_dict(orient='records')
test_dicts = df_test.fillna(0).to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse the fitted
# vectorizer to encode the test rows. sparse=False yields dense arrays.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_test = dv.transform(test_dicts)

In [7]:
# Disabled alternative, kept for reference: build the DMatrix with the
# vectorizer's feature names attached.
#features = dv.get_feature_names_out()
#dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)

# Wrap the training matrix and its labels in an XGBoost DMatrix;
# no feature names are passed in this version.
dtrain = xgb.DMatrix(X_train, label=y_train)

In [8]:
# Training configuration passed to xgb.train.
xgb_params = {
    'eta': 0.1,  # learning rate (step-size shrinkage per boosting round)
    'max_depth': 3,  # maximum depth of each tree
    # minimum sum of instance weight (hessian) required in a child node;
    # larger values make the trees more conservative
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'seed': 1,  # for reproducibility
    'nthread': 4,  # number of parallel threads used by XGBoost
    'verbosity': 0,  # 0 = silent (1 = warnings, 2 = info, 3 = debug)
}

# num_boost_round is the number of boosting rounds (plays the same role
# as n_estimators in the scikit-learn style API).
model = xgb.train(xgb_params, dtrain, num_boost_round=175)

In [9]:
# Here we are actually calling special xgboost code,
# based on their documentation, and we are saving
# the model in the way it is supposed to be saved
# for this particular version.

# We tag it with a version ID, so that any time you call
# save_model you know that you are getting that specific model
# at that particular moment in the notebook.

# Every time you call save_model you are going to get a different
# unique tag.

import bentoml

In [10]:
# Signature options for the model's "predict" method: calls may be batched,
# with batching along dimension 0.
predict_signature = {
    "batchable": True,
    "batch_dim": 0,
}

# Save the trained booster under the name "credit_risk_model" and store the
# fitted DictVectorizer alongside it as a custom object.
bentoml.xgboost.save_model(
    "credit_risk_model",
    model,
    custom_objects={"dictVectorizer": dv},
    signatures={"predict": predict_signature},
)

Model(tag="credit_risk_model:qnvlhxsqdwqk5ahg", path="/Users/humbertorodriguez/bentoml/models/credit_risk_model/qnvlhxsqdwqk5ahg/")

In [11]:
import json

In [12]:
# Take the first test row as a plain dict and pretty-print it as JSON,
# e.g. for use as a sample request payload.
request = df_test.iloc[0].to_dict()
print(json.dumps(request, indent=2))

{
  "seniority": 3,
  "home": "owner",
  "time": 36,
  "age": 26,
  "marital": "single",
  "records": "no",
  "job": "freelance",
  "expenses": 35,
  "income": 0.0,
  "assets": 60000.0,
  "debt": 3000.0,
  "amount": 800,
  "price": 1000
}
