In [1]:
# Item LTV problem from the rel-amazon dataset
# on relbench: https://relbench.stanford.edu/datasets/rel-amazon/#item-ltv

# 1) https://demo.kurve.ai
# 2) create graph of rel-amazon data
# 3) assign user as parent node article with depth 4

In [2]:
import pandas as pd
from torch_frame.utils import infer_df_stype
import catboost
from sklearn import metrics

In [3]:
# train cut date of 10/1/15
train_path = 'https://kurve-customers.s3.amazonaws.com/4e1a245a-3065-4600-bb0e-a92e06ee835c/5/output/item_ltv_train'

In [4]:
df = pd.read_parquet(train_path)

In [5]:
df.shape

(506012, 41)

In [10]:
df.head()

Unnamed: 0,prod_product_id,prod_category,prod_brand,prod_title,prod_description,prod_price,revi_review_time_min,revi_review_time_max,revi_customer_id_count,revi_rating_avg,...,revi_3dv4_change,revi_4dv7_change,revi_7dv14_change,revi_14dv30_change,revi_30dv60_change,revi_60dv90_change,revi_90dv180_change,revi_180dv365_change,revi_365dv730_change,revi_price_label
0,451524,"[Books, History, Europe]",Warlord Games,Bolt Action: Armies of Italy and the Axis,Warlord Games is one of the world's leading pr...,13.5,2013-11-30,2015-09-21,5,4.8,...,,,0.0,1.0,1.0,1.0,1.0,0.333333,0.6,13.5
1,451920,"[Books, Children's Books, Fairy Tales, Folk Ta...",Visit Amazon's Hans Christian Andersen Page,An Illustrated Treasury of Hans Christian Ande...,,15.26,2015-01-07,2015-09-02,5,5.0,...,,,,0.0,1.0,0.333333,0.75,0.8,1.0,91.56
2,452468,"[Books, Literature &amp; Fiction, Genre Fiction]",Visit Amazon's Isabella Barclay Page,A Bachelor Establishment,,10.33,2015-06-27,2015-09-23,36,4.444444,...,,,0.0,0.375,0.380952,0.636364,0.916667,1.0,1.0,113.63
3,454146,"[Books, History, Military]",Visit Amazon's Christopher Webber Page,The Thracians 700 BCAD 46 (Men-at-Arms),"Packed with specially commissioned artwork, ma...",17.95,2011-02-22,2014-10-06,5,4.2,...,,,,,,,,0.0,0.5,17.95
4,454608,"[Books, Arts & Photography, Graphic Design]",Michael O'Mara Books,The Creative Colouring Book for Grown-Ups,"<DIV>Now in its 26th year, independent publish...",14.92,2014-07-12,2015-03-20,5,3.8,...,,,,,,,,0.0,0.4,29.84


In [11]:
target = [c for c in df.columns if 'label' in c][0]

In [12]:
target

'revi_price_label'

In [13]:
df[target].apply(lambda x: x if not pd.isnull(x) else 0)

0          13.50
1          91.56
2         113.63
3          17.95
4          29.84
           ...  
506007      0.00
506008      0.00
506009      0.00
506010      0.00
506011      0.00
Name: revi_price_label, Length: 506012, dtype: float64

In [14]:
df[target] = df[target].apply(lambda x: x if not pd.isnull(x) else 0)

In [16]:
# execute compute graph with
# cut date 1/1/2016
test = pd.read_parquet(train_path)

In [17]:
test[target] = test[target].apply(lambda x: x if not pd.isnull(x) else 0)

In [18]:
stypes = infer_df_stype(test.tail(1000))

In [20]:
features = [
    k for k,v in stypes.items()
    if str(v) == 'numerical'
    and 'label' not in k
    and k != 'prod_product_id'
]
features

['prod_price',
 'revi_customer_id_count',
 'revi_rating_avg',
 'revi_rating_sum',
 'revi_rating_min',
 'revi_rating_max',
 'revi_verified_sum',
 'revi_price_avg',
 'revi_price_sum',
 'revi_price_min',
 'revi_price_max',
 'revi_seconds_since_last',
 'revi_num_events_1d',
 'revi_num_events_3d',
 'revi_num_events_4d',
 'revi_num_events_7d',
 'revi_num_events_14d',
 'revi_num_events_30d',
 'revi_num_events_60d',
 'revi_num_events_90d',
 'revi_num_events_180d',
 'revi_num_events_365d',
 'revi_num_events_730d',
 'revi_1dv3_change',
 'revi_3dv4_change',
 'revi_7dv14_change',
 'revi_14dv30_change',
 'revi_30dv60_change',
 'revi_60dv90_change',
 'revi_90dv180_change',
 'revi_180dv365_change',
 'revi_365dv730_change']

In [21]:
import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype, is_categorical_dtype, is_bool_dtype
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool

# --------------------------------------------
# 0) Split once into train_full / test
# --------------------------------------------
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df[features], df[target],
    test_size=0.20,
    random_state=42
)

# --------------------------------------------
# Identify categorical columns (by name)
# --------------------------------------------
cat_features = [
    col for col in X_train_full.columns
    if is_object_dtype(X_train_full[col])
       or is_categorical_dtype(X_train_full[col])
       or is_bool_dtype(X_train_full[col])
]
print(f"Categorical features: {cat_features}")

# Optional (harmless): ensure 'category' dtype for object-y cats (speeds up a bit)
for c in cat_features:
    if not is_categorical_dtype(X_train_full[c]):
        X_train_full[c] = X_train_full[c].astype('category')
        X_test[c]       = X_test[c].astype('category')

# --------------------------------------------
# 1) K-Fold CV on train_full (no log transforms)
# --------------------------------------------
k = 3
kf = KFold(n_splits=k, shuffle=True, random_state=42)

fold_maes = []
test_preds = np.zeros(len(X_test))        # ensemble predictions on test (original scale)
oof_preds  = np.zeros(len(X_train_full))  # optional OOF preds (original scale)
best_iters = []

for fold, (idx_tr, idx_va) in enumerate(kf.split(X_train_full), 1):
    print(f"\n=== Fold {fold} ===")

    X_tr, X_va = X_train_full.iloc[idx_tr], X_train_full.iloc[idx_va]
    y_tr, y_va = y_train_full.iloc[idx_tr], y_train_full.iloc[idx_va]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_features)
    valid_pool = Pool(X_va, y_va, cat_features=cat_features)

    mdl = CatBoostRegressor(
        # objective & metrics (L1 on original scale)
        loss_function="MAE",
        eval_metric="MAE",
        custom_metric=["MAE", "RMSE", "R2"],
        use_best_model=True,

        # capacity vs regularization
        iterations=2000,
        learning_rate=0.03,
        depth=7,
        l2_leaf_reg=10,

        # randomness / bagging / feature subsampling
        bootstrap_type="Bayesian",
        bagging_temperature=0.75,
        rsm=0.8,
        random_strength=0.5,

        # tree / leaves
        feature_border_type="GreedyLogSum",
        min_data_in_leaf=20,
        boosting_type="Plain",

        # early stopping
        od_type="Iter",
        od_wait=150,

        verbose=False
    )

    mdl.fit(train_pool, eval_set=valid_pool)
    best_iters.append(mdl.best_iteration_ if mdl.best_iteration_ is not None else mdl.tree_count_)

    # ---- Fold validation (MAE on original scale) ----
    val_pred = mdl.predict(valid_pool)
    val_mae = mean_absolute_error(y_va, val_pred)
    fold_maes.append(val_mae)
    print(f"Fold {fold} validation MAE : {val_mae:.4f}")

    # ---- Accumulate ensemble predictions on the held-out test ----
    test_preds += mdl.predict(Pool(X_test, cat_features=cat_features)) / k

    # (optional) OOF predictions
    oof_preds[idx_va] = val_pred

# --------------------------------------------
# 2) CV summary and final test metric (original scale)
# --------------------------------------------
print("\n=== CV Summary ===")
print(f"Mean CV MAE : {np.mean(fold_maes):.4f} ± {np.std(fold_maes):.4f}")
print(f"Folds MAE   : {[f'{a:.4f}' for a in fold_maes]}")

test_mae = mean_absolute_error(y_test, test_preds)
print(f"\nFinal Test MAE (averaged over {k} folds): {test_mae:.4f}")

yhat_test = test_preds  # already in original target units

# --------------------------------------------
# 3) Refit on the full training data for deployment
# --------------------------------------------
final_iters = int(np.median(best_iters)) if len(best_iters) > 0 else 2000

final_mdl = CatBoostRegressor(
    loss_function="MAE",
    eval_metric="MAE",
    custom_metric=["MAE", "RMSE", "R2"],
    use_best_model=True,

    iterations=final_iters,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=10,

    bootstrap_type="Bayesian",
    bagging_temperature=0.75,
    rsm=0.8,
    random_strength=0.5,

    feature_border_type="GreedyLogSum",
    min_data_in_leaf=20,
    boosting_type="Plain",

    od_type="Iter",
    od_wait=150,

    verbose=False
)

final_mdl.fit(
    Pool(X_train_full, y_train_full, cat_features=cat_features),
    eval_set=Pool(X_test, y_test, cat_features=cat_features)  # sanity eval on original scale
)

  or is_categorical_dtype(X_train_full[col])


Categorical features: []

=== Fold 1 ===
Fold 1 validation MAE : 17.1844

=== Fold 2 ===
Fold 2 validation MAE : 16.3178

=== Fold 3 ===
Fold 3 validation MAE : 16.4742

=== CV Summary ===
Mean CV MAE : 16.6588 ± 0.3771
Folds MAE   : ['17.1844', '16.3178', '16.4742']

Final Test MAE (averaged over 3 folds): 16.2640


<catboost.core.CatBoostRegressor at 0x30646e080>

In [22]:
test['pred'] = final_mdl.predict(test[features])

In [27]:
metrics.mean_absolute_error(test[target], test['pred'])

20.13166992277577

In [24]:
list(reversed(sorted(zip(mdl.feature_names_, mdl.feature_importances_),key=lambda x: x[1])))

[('prod_price', 11.133643596750234),
 ('revi_num_events_14d', 10.268994783784157),
 ('revi_num_events_3d', 9.763864737125198),
 ('revi_price_max', 8.047206407310158),
 ('revi_num_events_7d', 7.4844552436557255),
 ('revi_num_events_4d', 6.427336286146067),
 ('revi_num_events_30d', 6.280774583418543),
 ('revi_price_sum', 6.207833954213011),
 ('revi_price_avg', 5.804397424286282),
 ('revi_num_events_90d', 4.876463570704342),
 ('revi_365dv730_change', 3.6305552351031802),
 ('revi_num_events_180d', 3.3231050732233225),
 ('revi_num_events_60d', 3.1635284909342185),
 ('revi_price_min', 2.724328735979323),
 ('revi_rating_avg', 2.301804424846099),
 ('revi_num_events_365d', 2.0105120038514297),
 ('revi_60dv90_change', 1.3644417853618602),
 ('revi_verified_sum', 1.1250172362907158),
 ('revi_180dv365_change', 0.8805658258739847),
 ('revi_90dv180_change', 0.740406254493432),
 ('revi_num_events_1d', 0.6873622320590069),
 ('revi_customer_id_count', 0.6847999356850558),
 ('revi_num_events_730d', 0.401