In [1]:
# post votes problem from the stack exchange 
# dataset from Relbench
# https://relbench.stanford.edu

In [36]:
import pandas as pd
from torch_frame.utils import infer_df_stype
import catboost
from sklearn import metrics

In [70]:
# first with a cut date of 1/1/2020
train_path = 'https://kurve-customers.s3.amazonaws.com/3d95ca0a-b09a-4cc7-8bbf-9c0e98932af2/3/output/post_votes_train'

In [38]:
df = pd.read_parquet(train_path)

In [39]:
df.shape

(77375, 244)

In [40]:
df.head()

Unnamed: 0,parent_Id,parent_OwnerUserId,parent_LastEditorUserId,parent_PostTypeId,parent_AcceptedAnswerId,parent_ParentId,parent_OwnerDisplayName,parent_LastEditorDisplayName,parent_Title,parent_Tags,...,PostH_num_events_60d,PostH_num_events_90d,PostH_num_events_180d,PostH_num_events_365d,PostH_num_events_730d,PostH_30dv60_change,PostH_60dv90_change,PostH_90dv180_change,PostH_180dv365_change,PostH_365dv730_change
0,355959,166351,,2,,286842,,,,,...,0.0,0.0,0.0,0.0,1.0,,,,,0.0
1,442883,166526,,2,,442875,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,412341,167900,,2,,412145,,,,,...,0.0,0.0,0.0,1.0,1.0,,,,0.0,1.0
3,341540,166327,,2,,277953,,,,,...,0.0,0.0,0.0,0.0,3.0,,,,,0.0
4,440469,166526,,2,,440324,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [41]:
df.columns

Index(['parent_Id', 'parent_OwnerUserId', 'parent_LastEditorUserId',
       'parent_PostTypeId', 'parent_AcceptedAnswerId', 'parent_ParentId',
       'parent_OwnerDisplayName', 'parent_LastEditorDisplayName',
       'parent_Title', 'parent_Tags',
       ...
       'PostH_num_events_60d', 'PostH_num_events_90d', 'PostH_num_events_180d',
       'PostH_num_events_365d', 'PostH_num_events_730d', 'PostH_30dv60_change',
       'PostH_60dv90_change', 'PostH_90dv180_change', 'PostH_180dv365_change',
       'PostH_365dv730_change'],
      dtype='object', length=244)

In [42]:
[c for c in df.columns if 'label' in c]

['pvote_Id_label']

In [43]:
target = [c for c in df.columns if 'label' in c][0]

In [44]:
df[target]

0           1
1           1
2           1
3           1
4           1
         ... 
77370    <NA>
77371    <NA>
77372    <NA>
77373    <NA>
77374    <NA>
Name: pvote_Id_label, Length: 77375, dtype: Int64

In [45]:
df[target] = df[target].apply(lambda x: x if not pd.isnull(x) else 0)

In [46]:
df.shape

(77375, 244)

In [47]:
df = df[~df['parent_OwnerUserId'].isnull()]

In [48]:
df.shape

(77375, 244)

In [49]:
df.head()

Unnamed: 0,parent_Id,parent_OwnerUserId,parent_LastEditorUserId,parent_PostTypeId,parent_AcceptedAnswerId,parent_ParentId,parent_OwnerDisplayName,parent_LastEditorDisplayName,parent_Title,parent_Tags,...,PostH_num_events_60d,PostH_num_events_90d,PostH_num_events_180d,PostH_num_events_365d,PostH_num_events_730d,PostH_30dv60_change,PostH_60dv90_change,PostH_90dv180_change,PostH_180dv365_change,PostH_365dv730_change
0,355959,166351,,2,,286842,,,,,...,0.0,0.0,0.0,0.0,1.0,,,,,0.0
1,442883,166526,,2,,442875,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,412341,167900,,2,,412145,,,,,...,0.0,0.0,0.0,1.0,1.0,,,,0.0,1.0
3,341540,166327,,2,,277953,,,,,...,0.0,0.0,0.0,0.0,3.0,,,,,0.0
4,440469,166526,,2,,440324,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [69]:
# cut date of 1/1/2021
test = pd.read_parquet(train_path)

In [51]:
test.shape

(77375, 244)

In [52]:
test[target] = test[target].apply(lambda x: x if not pd.isnull(x) else 0)

In [53]:
test[target].mean()

0.06823909531502423

In [54]:
df[target].mean()

0.06823909531502423

In [55]:
stypes = infer_df_stype(df)

In [56]:
len(stypes)

243

In [57]:
df.head(2)

Unnamed: 0,parent_Id,parent_OwnerUserId,parent_LastEditorUserId,parent_PostTypeId,parent_AcceptedAnswerId,parent_ParentId,parent_OwnerDisplayName,parent_LastEditorDisplayName,parent_Title,parent_Tags,...,PostH_num_events_60d,PostH_num_events_90d,PostH_num_events_180d,PostH_num_events_365d,PostH_num_events_730d,PostH_30dv60_change,PostH_60dv90_change,PostH_90dv180_change,PostH_180dv365_change,PostH_365dv730_change
0,355959,166351,,2,,286842,,,,,...,0.0,0.0,0.0,0.0,1.0,,,,,0.0
1,442883,166526,,2,,442875,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [58]:
features = [
    k for k,v in stypes.items()
    if str(v) == 'numerical'
    and k not in ['parent_Id', 'parent_OwnerUserId', 'parent_LastEditorUserId',
                 'parent_PostTypeId','parent_AcceptedAnswerId','parent_ParentId',
                  'parent_OwnerDisplayName','parent_LastEditorDisplayName'
                 ]
    and 'label' not in k and k != 'had_engagement']

In [59]:
[f for f in features if f not in test.columns]

[]

In [65]:
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool

# --------------------------------------------
# 0) Split once into train_full / test
# --------------------------------------------
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df[features], df[target],
    test_size=0.20,
    random_state=42
)

# log targets (training/eval are done in log-space)
y_train_full_log = np.log1p(y_train_full)
y_test_log       = np.log1p(y_test)

# --------------------------------------------
# 1) K-Fold CV on train_full
# --------------------------------------------
k = 3
kf = KFold(n_splits=k, shuffle=True, random_state=42)

fold_maes = []
test_preds_log = np.zeros(len(X_test))      # ensemble predictions on test (log-space)
oof_preds_log  = np.zeros(len(X_train_full))  # optional OOF preds (log-space)
best_iters = []

cat_features = None  # replace with a list of categorical column indices if you have them

for fold, (idx_tr, idx_va) in enumerate(kf.split(X_train_full), 1):
    print(f"\n=== Fold {fold} ===")

    X_tr, X_va = X_train_full.iloc[idx_tr], X_train_full.iloc[idx_va]
    y_tr_log   = y_train_full_log.iloc[idx_tr]
    y_va_log   = y_train_full_log.iloc[idx_va]

    train_pool = Pool(X_tr, y_tr_log, cat_features=cat_features)
    valid_pool = Pool(X_va, y_va_log, cat_features=cat_features)

    mdl = CatBoostRegressor(
        # objective & metrics (L1 in log-space)
        loss_function="MAE",
        eval_metric="MAE",
        custom_metric=["MAE", "RMSE", "R2"],
        use_best_model=True,

        # capacity vs regularization
        iterations=2000,
        learning_rate=0.05,
        depth=5,
        l2_leaf_reg=10,

        # randomness / bagging / feature subsampling
        bootstrap_type="Bayesian",
        bagging_temperature=0.75,
        rsm=0.8,
        random_strength=0.5,

        # tree / leaves
        feature_border_type="GreedyLogSum",
        min_data_in_leaf=20,
        boosting_type="Plain",

        # early stopping
        od_type="Iter",
        od_wait=150,

        verbose=False
    )

    mdl.fit(train_pool, eval_set=valid_pool)
    best_iters.append(mdl.best_iteration_ if mdl.best_iteration_ is not None else mdl.tree_count_)

    # ---- Fold validation (MAE in log-space) ----
    val_pred_log = mdl.predict(valid_pool)
    val_mae = mean_absolute_error(y_va_log, val_pred_log)
    fold_maes.append(val_mae)
    print(f"Fold {fold} validation Log-MAE : {val_mae:.4f}")

    # ---- Accumulate ensemble predictions on the held-out test (log-space) ----
    test_preds_log += mdl.predict(Pool(X_test, cat_features=cat_features)) / k

    # (optional) OOF predictions
    oof_preds_log[idx_va] = val_pred_log

# --------------------------------------------
# 2) CV summary and final test metric (log-space)
# --------------------------------------------
print("\n=== CV Summary ===")
print(f"Mean CV Log-MAE : {np.mean(fold_maes):.4f} ± {np.std(fold_maes):.4f}")
print(f"Folds Log-MAE   : {[f'{a:.4f}' for a in fold_maes]}")

test_log_mae = mean_absolute_error(y_test_log, test_preds_log)
print(f"\nFinal Test Log-MAE (averaged over {k} folds): {test_log_mae:.4f}")

# If you want predictions back in original count space:
yhat_test = np.expm1(test_preds_log)

# --------------------------------------------
# 3) Refit on the full training data for deployment
# --------------------------------------------
final_iters = int(np.median(best_iters)) if len(best_iters) > 0 else 2000

final_mdl = CatBoostRegressor(
    loss_function="MAE",
    eval_metric="MAE",
    custom_metric=["MAE", "RMSE", "R2"],
    use_best_model=True,

    iterations=final_iters,
    learning_rate=0.05,
    depth=5,
    l2_leaf_reg=10,

    bootstrap_type="Bayesian",
    bagging_temperature=0.75,
    rsm=0.8,
    random_strength=0.5,

    feature_border_type="GreedyLogSum",
    min_data_in_leaf=20,
    boosting_type="Plain",

    od_type="Iter",
    od_wait=150,

    verbose=False
)

final_mdl.fit(
    Pool(X_train_full, y_train_full_log, cat_features=cat_features),
    eval_set=Pool(X_test, y_test_log, cat_features=cat_features)  # keep a sanity eval
)
# For deployment-time predictions:
# preds_log = final_mdl.predict(new_X)
# preds = np.expm1(preds_log)


=== Fold 1 ===
Fold 1 validation Log-MAE : 0.0437

=== Fold 2 ===
Fold 2 validation Log-MAE : 0.0410

=== Fold 3 ===
Fold 3 validation Log-MAE : 0.0423

=== CV Summary ===
Mean CV Log-MAE : 0.0423 ± 0.0011
Folds Log-MAE   : ['0.0437', '0.0410', '0.0423']

Final Test Log-MAE (averaged over 3 folds): 0.0418


<catboost.core.CatBoostRegressor at 0x45a117190>

In [66]:
test['target_log'] = np.log1p(test[target])

In [67]:
test['pred_log'] = final_mdl.predict(test[features])

In [68]:
metrics.mean_absolute_error(test['target_log'], test['pred_log'])

0.04202306539686002

In [None]:
# According to the Relational deep learning paper
# Data scientist 20+ hours: 0.065
# Kumo: 0.065
# Kumo fine tuned: 0.065