# Sample Weighting with XGBoost
I want to test sample weighting for optimizing an XGBoost model to perform just as well on a group of samples.

In [1]:
import itertools
from collections import OrderedDict

import numpy as np
import pandas as pd
import scipy as sc
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from skopt import gp_minimize
from skopt.space import Integer, Real
from skopt.utils import use_named_args

Prep the input data for modeling.

In [2]:
df_full = pd.read_parquet("data/application_train_proc.parquet")

In [3]:
df_full.shape

(307507, 136)

In [4]:
df_full.dtypes.value_counts()

float64    70
int64      41
object     16
int8        9
dtype: int64

Drop `object` columns and convert data to `np.float32` to keeps things consistent for XGBoost modeling.

In [5]:
drop_cols = df_full.dtypes.pipe(lambda x: x[x.eq("object")]).index.difference(
    ["code_gender", "name_contract_type"]
)
df = df_full.drop(columns=[*drop_cols, "cat_name_contract_type", "cat_code_gender"])
# For now we will just start off testing with a sample of the data to speed things up.
df = df.sample(200_000, random_state=123)

In [2]:
# df.shape

In [7]:
np.random.seed(123)
rand_num = np.random.uniform(size=df.shape[0])
dev = rand_num < 0.5
val = (0.5 <= rand_num) & (rand_num < 0.75)
tst = 0.75 <= rand_num

In [8]:
X = df.drop(columns=["target", "name_contract_type", "code_gender"]).astype(np.float32)
y = df["target"].astype(np.float32)

In [9]:
xdev = xgb.DMatrix(X[dev], label=y[dev], missing=np.nan)
xtst = xgb.DMatrix(X[tst], label=y[tst], missing=np.nan)
xval = xgb.DMatrix(X[val], label=y[val], missing=np.nan)

### Training a Baseline Model
Let's train a simple model to use as a baseline for our testing.  
We are going to try to perform bayesian optimization to get some better parameter weights here. For our "base" parameters, we will use the following...
```python
params = dict(
    objective="binary:logitraw",
    eta=0.1,
    max_depth=0,
    min_child_weight=50,
    max_leaves=35,
    tree_method="hist",
    grow_policy="lossguide",
    subsample=0.8,
    colsample_bytree=0.50,
    eval_metric="auc",
    seed=0,
)
```

In [10]:
xgb_search_space = [
    Real(0.05, 0.2, name="eta"),
    Integer(50, 200, name="min_child_weight"),
    Integer(15, 35, name="max_leaves"),
    Real(0.5, 1.0, name="subsample"),
    Real(0.5, 1.0, name="colsample_bytree"),
]

In [11]:
# These are our constant parameters, that will remain unchanged through
# all of modeling.
base_params = dict(
    objective="binary:logitraw",
    max_depth=0,
    tree_method="hist",
    grow_policy="lossguide",
    eval_metric="auc",
    seed=0,
)


@use_named_args(xgb_search_space)
def xgb_obj_func(**points):
    mod_points = {**points, **base_params}
    # print(mod_points)
    watchlist = [(xdev, "dev"), (xtst, "test")]
    bayes_mod = xgb.train(
        params=mod_points,
        dtrain=xdev,
        num_boost_round=1_000,
        early_stopping_rounds=50,
        evals=watchlist,
        verbose_eval=False,
    )
    print(f"score: {bayes_mod.best_score}, iterations: {bayes_mod.best_iteration}")
    return -bayes_mod.best_score

In [12]:
start_params = params = dict(
    eta=0.1, min_child_weight=50, max_leaves=35, subsample=0.8, colsample_bytree=0.50,
)

In [13]:
np.random.seed(0)
bayes_params = gp_minimize(
    xgb_obj_func, xgb_search_space, n_calls=40, x0=[0.1, 50, 35, 0.8, 0.50,]
)

score: 0.742452, iterations: 82
score: 0.742312, iterations: 75
score: 0.743218, iterations: 114
score: 0.742947, iterations: 68
score: 0.741239, iterations: 46
score: 0.742735, iterations: 70
score: 0.743957, iterations: 94
score: 0.744133, iterations: 168
score: 0.743576, iterations: 160
score: 0.742585, iterations: 79
score: 0.742967, iterations: 166
score: 0.742406, iterations: 73
score: 0.742013, iterations: 77
score: 0.744938, iterations: 237
score: 0.742563, iterations: 78
score: 0.743334, iterations: 162
score: 0.742416, iterations: 99
score: 0.741713, iterations: 41
score: 0.740593, iterations: 55
score: 0.743631, iterations: 80
score: 0.739836, iterations: 50
score: 0.745093, iterations: 276
score: 0.745002, iterations: 264
score: 0.744763, iterations: 274
score: 0.745638, iterations: 338
score: 0.74351, iterations: 168
score: 0.741958, iterations: 49
score: 0.743775, iterations: 209
score: 0.740278, iterations: 43
score: 0.743642, iterations: 182
score: 0.744886, iterations:

Now we will take the top parameters, and use that in a model.

In [14]:
best_bayes_params = np.argmin(bayes_params.func_vals)
best_bayes_params = {
    s.name: p for s, p in zip(xgb_search_space, bayes_params.x_iters[best_bayes_params])
}

In [15]:
best_bayes_params = {**base_params, **best_bayes_params}
best_bayes_params

{'objective': 'binary:logitraw',
 'max_depth': 0,
 'tree_method': 'hist',
 'grow_policy': 'lossguide',
 'eval_metric': 'auc',
 'seed': 0,
 'eta': 0.05143687646997485,
 'min_child_weight': 194,
 'max_leaves': 35,
 'subsample': 0.5040451907112526,
 'colsample_bytree': 0.7373502951988324}

In [16]:
np.min(bayes_params.func_vals) * -1

0.745638

In [17]:
dir(bayes_params)

['fun',
 'func_vals',
 'models',
 'random_state',
 'space',
 'specs',
 'x',
 'x_iters']

In [18]:
watchlist = [(xdev, "dev"), (xtst, "test")]

In [19]:
base_mod = xgb.train(
    params=best_bayes_params,
    dtrain=xdev,
    num_boost_round=1_000,
    early_stopping_rounds=50,
    evals=watchlist,
    verbose_eval=100,
)

[0]	dev-auc:0.71538	test-auc:0.70429
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
[100]	dev-auc:0.76297	test-auc:0.73947
[200]	dev-auc:0.77615	test-auc:0.74429
[300]	dev-auc:0.78508	test-auc:0.74521
Stopping. Best iteration:
[338]	dev-auc:0.78807	test-auc:0.74564



Now let's see what the performance is like on our validation dataset, and performance for each of our subsets, created by the pclass variable.

In [20]:
base_preds = base_mod.predict(xval, ntree_limit=base_mod.best_ntree_limit)

In [21]:
df_val = df.loc[val].copy()
df_val["base_preds"] = base_preds

In [22]:
roc_auc_score(df_val["target"], df_val["base_preds"])

0.7525782050584848

In [23]:
df_val.groupby(["name_contract_type", "code_gender"]).apply(
    lambda x: roc_auc_score(x["target"], x["base_preds"])
)

name_contract_type  code_gender
Cash loans          F              0.753583
                    M              0.743710
Revolving loans     F              0.734121
                    M              0.728449
dtype: float64

### Optimizing Weights by Sample
What we will do now, is try to optimize the sample weights of the goods and bads by each level of pclass and goods and bads.

In [24]:
df_dev = df[dev]
df_tst = df[tst]
dev_sample_levels = list(
    zip(df_dev["name_contract_type"], df_dev["code_gender"], df_dev["target"])
)
# tst_sample_levels = list(
#     zip(df_tst["name_contract_type"], df_tst["code_gender"], df_tst["target"])
# )

In [25]:
# create model weights, we will initialize them as 1
# Originally I Was weighting by two fields, and good and bad, but let's just weight by bad.
model_samples = list(
    itertools.product(
        df["name_contract_type"].unique(), df["code_gender"].unique() , [0.0, 1.0]
    )
)
model_weights = OrderedDict()
for s in model_samples:
    model_weights[s] = 1.0

In [26]:
model_weights

OrderedDict([(('Cash loans', 'M', 0.0), 1.0),
             (('Cash loans', 'M', 1.0), 1.0),
             (('Cash loans', 'F', 0.0), 1.0),
             (('Cash loans', 'F', 1.0), 1.0),
             (('Revolving loans', 'M', 0.0), 1.0),
             (('Revolving loans', 'M', 1.0), 1.0),
             (('Revolving loans', 'F', 0.0), 1.0),
             (('Revolving loans', 'F', 1.0), 1.0)])

In [27]:
sample_space = [Real(0.2, 5) for _ in model_weights]

In [28]:
sample_space

[Real(low=0.2, high=5, prior='uniform', transform='identity'),
 Real(low=0.2, high=5, prior='uniform', transform='identity'),
 Real(low=0.2, high=5, prior='uniform', transform='identity'),
 Real(low=0.2, high=5, prior='uniform', transform='identity'),
 Real(low=0.2, high=5, prior='uniform', transform='identity'),
 Real(low=0.2, high=5, prior='uniform', transform='identity'),
 Real(low=0.2, high=5, prior='uniform', transform='identity'),
 Real(low=0.2, high=5, prior='uniform', transform='identity')]

In [29]:
xdev_w = xgb.DMatrix(X[dev], label=y[dev], missing=-1.0)
# xtst_w = xgb.DMatrix(X[tst], label=y[tst], missing=-1.0)
# xval_w = xgb.DMatrix(X[val], label=y[val], missing=-1.0)

df_tst_w = df_tst[["target", "name_contract_type", "code_gender"]].copy()

n_iter = [0]


def obj_func(points):
    # Update the weights for the model
    for p, k in zip(points, model_weights):
        model_weights[k] = p

    # Create dev and validation weights
    dev_weights = [model_weights.get(lev) for lev in dev_sample_levels]

    # Set weights for dev
    xdev_w.set_weight(np.array(dev_weights))

    # Create model
    watchlist = [(xdev_w, "dev"), (xtst, "test")]
    weight_model = xgb.train(
        params=best_bayes_params, dtrain=xdev_w, num_boost_round=150
    )
    # weight_model = xgb.train(
    #     params=best_bayes_params,
    #     dtrain=xdev_w,
    #     num_boost_round=2_000,
    #     early_stopping_rounds=50,
    #     evals=watchlist,
    #     verbose_eval=False,
    # )

    # Predict
    df_tst_w["w_preds"] = weight_model.predict(xtst)
    tst_performance = df_tst_w.groupby(["name_contract_type", "code_gender"]).apply(
        lambda x: roc_auc_score(x["target"], x["w_preds"])
    )

    # Calculate haromonic mean for the auc
    obj_auc = sc.stats.hmean(tst_performance)
    # obj_auc = np.sum(tst_performance)
    # obj_auc = np.mean(tst_performance)

    # We will try to weight it by the recipricol of base model auc of
    # each of the subsamples
    # obj_auc = np.average(tst_performance, weights=)
    print_list = [
        n_iter[0],
        round(obj_auc, 4),
        *tst_performance.round(4),
        weight_model.best_ntree_limit,
    ]
    for v in print_list:
        print(str(v).ljust(7), end="")
    print()
    n_iter[0] += 1
    # print(round(obj_auc, 4), *tst_performance.round(4))
    return -obj_auc

In [30]:
np.random.seed(0)
max_weights = gp_minimize(
    obj_func, sample_space, x0=[1.0 for _ in model_weights], n_calls=40
)

0      0.7399 0.7365 0.7388 0.7334 0.7513 150    
1      0.7336 0.737  0.7374 0.7304 0.7299 150    
2      0.7331 0.7355 0.7318 0.7385 0.7266 150    
3      0.7349 0.7325 0.7373 0.7269 0.743  150    
4      0.7382 0.7359 0.7383 0.7334 0.745  150    
5      0.7371 0.7354 0.7358 0.735  0.7424 150    
6      0.7301 0.7243 0.733  0.7152 0.7485 150    
7      0.7216 0.7076 0.7176 0.7088 0.7546 150    
8      0.7372 0.7372 0.7382 0.733  0.7404 150    
9      0.7244 0.7062 0.7238 0.715  0.7545 150    
10     0.722  0.7287 0.7344 0.7004 0.7253 150    
11     0.7272 0.7301 0.719  0.7402 0.7199 150    
12     0.7232 0.7265 0.7219 0.7129 0.7316 150    
13     0.7277 0.7234 0.7251 0.7336 0.7287 150    
14     0.7349 0.7323 0.737  0.7348 0.7357 150    
15     0.7143 0.6951 0.7123 0.7073 0.7442 150    
16     0.7123 0.7324 0.7323 0.6857 0.7009 150    
17     0.726  0.7273 0.719  0.7158 0.7424 150    
18     0.7135 0.7235 0.7098 0.7155 0.7055 150    
19     0.7391 0.7381 0.7367 0.7381 0.7436 150    


In [31]:
best_weight_round = np.argmin(max_weights.func_vals[1:]) + 1
best_weight_round

32

In [32]:
best_weights = max_weights.x_iters[best_weight_round]

In [33]:
[(l, w) for l, w in zip(model_weights, best_weights)]

[(('Cash loans', 'M', 0.0), 2.5901676776738416),
 (('Cash loans', 'M', 1.0), 1.3391165255458282),
 (('Cash loans', 'F', 0.0), 1.9632202368845033),
 (('Cash loans', 'F', 1.0), 1.3050192098764302),
 (('Revolving loans', 'M', 0.0), 0.32052742599689765),
 (('Revolving loans', 'M', 1.0), 4.6704010530751106),
 (('Revolving loans', 'F', 0.0), 1.2398942862551503),
 (('Revolving loans', 'F', 1.0), 0.5587798817832852)]

In [34]:
best_weights

[2.5901676776738416,
 1.3391165255458282,
 1.9632202368845033,
 1.3050192098764302,
 0.32052742599689765,
 4.6704010530751106,
 1.2398942862551503,
 0.5587798817832852]

In [35]:
for p, k in zip(best_weights, model_weights):
    model_weights[k] = p

In [36]:
dev_weights = [model_weights.get(lev) for lev in dev_sample_levels]

In [37]:
xdev_w.set_weight(np.array(dev_weights))

In [38]:
watchlist = [(xdev_w, "dev"), (xtst, "test")]
weight_model = xgb.train(
    params=best_bayes_params,
    dtrain=xdev_w,
    num_boost_round=2_000,
    early_stopping_rounds=50,
    evals=watchlist,
    verbose_eval=100,
)

[0]	dev-auc:0.71468	test-auc:0.69967
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
[100]	dev-auc:0.76833	test-auc:0.73673
[200]	dev-auc:0.78763	test-auc:0.73796
Stopping. Best iteration:
[168]	dev-auc:0.78257	test-auc:0.73864



In [39]:
weight_preds = weight_model.predict(xval, ntree_limit=base_mod.best_ntree_limit)

In [40]:
df_val["weight_preds"] = weight_preds

In [41]:
roc_auc_score(df_val["target"], df_val["weight_preds"])

0.7454123298549721

In [42]:
# 0.7661 0.7235 0.7503 0.6699 0.9863

In [43]:
df_val.groupby(["name_contract_type", "code_gender"]).apply(
    lambda x: roc_auc_score(x["target"], x["weight_preds"])
)

name_contract_type  code_gender
Cash loans          F              0.750153
                    M              0.741886
Revolving loans     F              0.742355
                    M              0.731648
dtype: float64

In [44]:
df_val.groupby(["name_contract_type", "code_gender"]).apply(
    lambda x: roc_auc_score(x["target"], x["base_preds"])
)

name_contract_type  code_gender
Cash loans          F              0.753583
                    M              0.743710
Revolving loans     F              0.734121
                    M              0.728449
dtype: float64

In [45]:
roc_auc_score(df_val["target"], df_val["base_preds"])

0.7525782050584848