In [1]:
# Item churn problem from the rel-amazon dataset
# on relbench: https://relbench.stanford.edu/datasets/rel-amazon/#item-churn

# 1) https://demo.kurve.ai
# 2) create graph of rel-amazon data
# 3) assign product as parent node product with depth 4

In [49]:
import pandas as pd
from torch_frame.utils import infer_df_stype
import catboost
from sklearn import metrics

In [51]:
# train cut date of 10/1/15
train_path = 'https://kurve-customers.s3.amazonaws.com/4e1a245a-3065-4600-bb0e-a92e06ee835c/5/output/item_churn_train'

In [52]:
df = pd.read_parquet(train_path)
#df = pd.concat([df, pd.read_parquet(train_path)])

In [53]:
df.shape

(506012, 37)

In [54]:
df.head()

Unnamed: 0,prod_product_id,prod_category,prod_brand,prod_title,prod_description,prod_price,revi_review_time_min,revi_review_time_max,revi_customer_id_count,revi_rating_avg,...,revi_3dv4_change,revi_4dv7_change,revi_7dv14_change,revi_14dv30_change,revi_30dv60_change,revi_60dv90_change,revi_90dv180_change,revi_180dv365_change,revi_365dv730_change,revi_customer_id_label
0,460547,"[Books, Crafts, Hobbies & Home, Crafts & Hobbies]",Nancy Smith Lynda Milligan,HouseWarmers,,8.55,2012-12-28,2015-03-12,3,4.0,...,,,,,,,0.0,1.0,0.5,1
1,461394,"[Books, Health, Fitness &amp; Dieting, Exercis...",Visit Amazon's Muata Ashby Page,Egyptian Yoga II: The Supreme Wisdom of Enligh...,,20.97,2013-03-23,2015-03-29,3,5.0,...,,,,,,,0.0,1.0,0.5,1
2,461855,"[Books, Literature & Fiction, Short Stories & ...",Visit Amazon's Michael Collins Page,Fortune's World,Michael Collins won the Mystery Writers of Ame...,11.76,2008-07-11,2014-09-16,2,5.0,...,,,,,,,,0.0,1.0,1
3,462349,"[Books, Cookbooks, Food &amp; Wine, Entertaini...",Visit Amazon's Gooseberry Patch Page,Holidays at Home Cookbook (Seasonal Cookbook C...,<i><span>Enjoy this recipe from <b>Holidays at...,7.91,2009-05-13,2013-11-24,4,4.5,...,,,,,,,,,0.0,1
4,462937,"[Books, Religion &amp; Spirituality, New Age &...",Visit Amazon's Norman Friedman Page,Bridging Science and Spirit: Common Elements i...,"""This book will be a valuable research documen...",15.95,2013-11-21,2015-06-26,3,5.0,...,,,,,,0.0,0.5,1.0,0.666667,1


In [55]:
[c for c in df.columns if 'label' in c]

['revi_customer_id_label']

In [56]:
df.head()

Unnamed: 0,prod_product_id,prod_category,prod_brand,prod_title,prod_description,prod_price,revi_review_time_min,revi_review_time_max,revi_customer_id_count,revi_rating_avg,...,revi_3dv4_change,revi_4dv7_change,revi_7dv14_change,revi_14dv30_change,revi_30dv60_change,revi_60dv90_change,revi_90dv180_change,revi_180dv365_change,revi_365dv730_change,revi_customer_id_label
0,460547,"[Books, Crafts, Hobbies & Home, Crafts & Hobbies]",Nancy Smith Lynda Milligan,HouseWarmers,,8.55,2012-12-28,2015-03-12,3,4.0,...,,,,,,,0.0,1.0,0.5,1
1,461394,"[Books, Health, Fitness &amp; Dieting, Exercis...",Visit Amazon's Muata Ashby Page,Egyptian Yoga II: The Supreme Wisdom of Enligh...,,20.97,2013-03-23,2015-03-29,3,5.0,...,,,,,,,0.0,1.0,0.5,1
2,461855,"[Books, Literature & Fiction, Short Stories & ...",Visit Amazon's Michael Collins Page,Fortune's World,Michael Collins won the Mystery Writers of Ame...,11.76,2008-07-11,2014-09-16,2,5.0,...,,,,,,,,0.0,1.0,1
3,462349,"[Books, Cookbooks, Food &amp; Wine, Entertaini...",Visit Amazon's Gooseberry Patch Page,Holidays at Home Cookbook (Seasonal Cookbook C...,<i><span>Enjoy this recipe from <b>Holidays at...,7.91,2009-05-13,2013-11-24,4,4.5,...,,,,,,,,,0.0,1
4,462937,"[Books, Religion &amp; Spirituality, New Age &...",Visit Amazon's Norman Friedman Page,Bridging Science and Spirit: Common Elements i...,"""This book will be a valuable research documen...",15.95,2013-11-21,2015-06-26,3,5.0,...,,,,,,0.0,0.5,1.0,0.666667,1


In [40]:
target = [c for c in df.columns if 'label' in c][0]

In [41]:

df[target] = df[target].apply(
    lambda x: 1 if x == 0 or pd.isnull(x) else 0)

In [42]:
df[target].sum()/len(df)

0.6437564721785254

In [43]:
# execute compute graph with
# cut date 1/1/2016 and load data

In [18]:
test = pd.read_parquet(train_path)
test.shape

(506012, 37)

In [19]:

test[target] = test[target].apply(
    lambda x: 1 if x == 0 or pd.isnull(x) else 0)

In [20]:
test[target].sum()/len(test)

0.6732923329881505

In [23]:
stypes = infer_df_stype(test.tail(1000))
features = [
    k for k,v in stypes.items()
    if str(v) == 'numerical'
    and 'label' not in k
    and k != 'prod_product_id'
]

In [24]:
features

['prod_price',
 'revi_customer_id_count',
 'revi_rating_avg',
 'revi_rating_sum',
 'revi_rating_min',
 'revi_rating_max',
 'revi_verified_sum',
 'revi_seconds_since_last',
 'revi_num_events_1d',
 'revi_num_events_3d',
 'revi_num_events_4d',
 'revi_num_events_7d',
 'revi_num_events_14d',
 'revi_num_events_30d',
 'revi_num_events_60d',
 'revi_num_events_90d',
 'revi_num_events_180d',
 'revi_num_events_365d',
 'revi_num_events_730d',
 'revi_3dv4_change',
 'revi_7dv14_change',
 'revi_14dv30_change',
 'revi_30dv60_change',
 'revi_60dv90_change',
 'revi_90dv180_change',
 'revi_180dv365_change',
 'revi_365dv730_change']

In [44]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool

# -------------------------------------------------
# 1. Split off the *final* test set (once!)
# -------------------------------------------------
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df[features], df[target],
    test_size=0.20,          # 20 % held-out test
    stratify=df[target],
    random_state=42
)

# -------------------------------------------------
# 2. K-Fold CV on the remaining 80 %
# -------------------------------------------------
k = 3                                   # change to 10, etc.
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

fold_aucs = []
test_preds = np.zeros(len(X_test))      # out-of-fold predictions on *test*
oof_preds = np.zeros(len(X_train_full)) # optional: OOF on training data

# CatBoost pools (optional but faster)
train_pool = Pool(X_train_full, y_train_full)  # , cat_features=cat_features)

for fold, (idx_tr, idx_va) in enumerate(skf.split(X_train_full, y_train_full), 1):
    print(f"\n=== Fold {fold} ===")
    
    X_tr, X_va = X_train_full.iloc[idx_tr], X_train_full.iloc[idx_va]
    y_tr, y_va = y_train_full.iloc[idx_tr], y_train_full.iloc[idx_va]

    # -----------------------------------------------------------------
    # 3. Fit on the *training* split of this fold
    # -----------------------------------------------------------------
    mdl = catboost.CatBoostClassifier(
        # objective & metrics
        loss_function="Logloss",
        eval_metric="AUC",
        custom_metric=["AUC", "PRAUC", "F1", "Recall", "Precision", "Logloss"],
        use_best_model=True,
    
        # capacity vs regularization
        iterations=2000,
        learning_rate=0.05,
        depth=5,
        l2_leaf_reg=10,
    
        # randomness / bagging / feature subsampling
        bootstrap_type="Bayesian",
        bagging_temperature=0.75,
        rsm=0.8,
        random_strength=0.5,
    
        # class imbalance handling
        auto_class_weights="Balanced",
        #class_weights=[0.6,0.4],
    
        # borders / leaves
        feature_border_type="GreedyLogSum",
        min_data_in_leaf=20,
        boosting_type="Plain",
    
        # early stopping
        od_type="Iter",
        od_wait=150,
    
        verbose=200
    )
    mdl.fit(
        X_tr, y_tr,
        eval_set=(X_va, y_va),
        use_best_model=False,
        verbose=False
    )

    # -------------------------------------------------
    # 4. Validation AUC for this fold
    # -------------------------------------------------
    val_pred = mdl.predict_proba(X_va)[:, 1]
    val_auc  = roc_auc_score(y_va, val_pred)
    fold_aucs.append(val_auc)
    print(f"Fold {fold} validation AUC : {val_auc:.4f}")

    # -------------------------------------------------
    # 5. Accumulate predictions on the *final* test set
    # -------------------------------------------------
    test_preds += mdl.predict_proba(X_test)[:, 1] / k

    # (optional) OOF on training data
    oof_preds[idx_va] = val_pred

# -------------------------------------------------
# 6. Final metrics
# -------------------------------------------------
print("\n=== CV Summary ===")
print(f"Mean CV AUC : {np.mean(fold_aucs):.4f} ± {np.std(fold_aucs):.4f}")
print(f"Folds AUC   : {[f'{a:.4f}' for a in fold_aucs]}")

test_auc = roc_auc_score(y_test, test_preds)
print(f"\nFinal test AUC (averaged over {k} folds): {test_auc:.4f}")

# -------------------------------------------------
# 7. (Optional) Refit on the *whole* train_full for deployment
# -------------------------------------------------
final_mdl = catboost.CatBoostClassifier(
        # objective & metrics
        loss_function="Logloss",
        eval_metric="AUC",
        custom_metric=["AUC", "PRAUC", "F1", "Recall", "Precision", "Logloss"],
        #use_best_model=True,
    
        # capacity vs regularization
        #iterations=2000,
        iterations=int(mdl.best_iteration_ * 1.1),  # a bit more than best
        learning_rate=0.05,
        depth=5,
        l2_leaf_reg=10,
    
        # randomness / bagging / feature subsampling
        bootstrap_type="Bayesian",
        bagging_temperature=0.75,
        rsm=0.8,
        random_strength=0.5,
    
        # class imbalance handling
        auto_class_weights="Balanced",
        #class_weights=[0.6,0.4],
    
        # borders / leaves
        feature_border_type="GreedyLogSum",
        min_data_in_leaf=20,
        boosting_type="Plain",
    
        # early stopping
        od_type="Iter",
        od_wait=150,
        verbose=200
    )
final_mdl.fit(X_train_full, y_train_full)


=== Fold 1 ===
Fold 1 validation AUC : 0.8196

=== Fold 2 ===
Fold 2 validation AUC : 0.8197

=== Fold 3 ===
Fold 3 validation AUC : 0.8205

=== CV Summary ===
Mean CV AUC : 0.8199 ± 0.0004
Folds AUC   : ['0.8196', '0.8197', '0.8205']

Final test AUC (averaged over 3 folds): 0.8180
0:	total: 48.4ms	remaining: 1m 14s
200:	total: 20.9s	remaining: 2m 18s
400:	total: 43.8s	remaining: 2m 4s
600:	total: 1m 6s	remaining: 1m 44s
800:	total: 1m 30s	remaining: 1m 22s
1000:	total: 1m 53s	remaining: 1m
1200:	total: 2m 16s	remaining: 38.4s
1400:	total: 2m 40s	remaining: 15.7s
1537:	total: 2m 56s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x3a4fdae90>

In [47]:
test['prob'] = final_mdl.predict_proba(test[features])[:,1]

In [48]:
metrics.roc_auc_score(test[target], test['prob'])

0.8124921859557072