In [1]:
#!pip install pandas pytorch-frame catboost scikit-learn

In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import catboost
from torch_frame.utils import infer_df_stype

In [2]:
pd.set_option('display.max_columns', 1000)


In [10]:
train_path = 'https://kurve-customers.s3.amazonaws.com/3d95ca0a-b09a-4cc7-8bbf-9c0e98932af2/output/stackex_user_engagement_train'

In [71]:
# 1/1/2020 cut date
df = pd.read_parquet(train_path)


In [72]:
print(df.shape)

(77814, 102)


In [73]:
df['User_had_engagement'].sum()/len(df)

0.032102192407535916

In [19]:
# 1/1/2021 cut date
test = pd.read_parquet(train_path)
print(test.shape)

(87789, 102)


In [20]:
test['User_had_engagement'].sum()/len(test)

0.027338277005091755

In [24]:
print(test['User_had_engagement'].sum()/len(test))
print(df['User_had_engagement'].sum()/len(df))

0.027338277005091755
0.032102192407535916


In [74]:
stypes = infer_df_stype(test)

In [75]:
features = [
    k for k,v in stypes.items()
    if str(v) == 'numerical'
    and k not in ['User_Id', 'User_AccountId']
    and 'label' not in k and 'had_engagement' not in k]

In [76]:
len(features)

72

In [77]:
features = [c for c in features if c in df.columns]

In [98]:
target = "User_had_engagement"

import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool

# -------------------------------------------------
# 1. Split off the *final* test set (once!)
# -------------------------------------------------
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df[features], df[target],
    test_size=0.20,          # 20 % held-out test
    stratify=df[target],
    random_state=42
)

# -------------------------------------------------
# 2. K-Fold CV on the remaining 80 %
# -------------------------------------------------
k = 3                                   # change to 10, etc.
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

fold_aucs = []
test_preds = np.zeros(len(X_test))      # ensemble predictions on test
oof_preds = np.zeros(len(X_train_full)) # optional: OOF on training data

# CatBoost pools (optional but faster)
train_pool = Pool(X_train_full, y_train_full)  # , cat_features=cat_features)

for fold, (idx_tr, idx_va) in enumerate(skf.split(X_train_full, y_train_full), 1):
    print(f"\n=== Fold {fold} ===")
    
    X_tr, X_va = X_train_full.iloc[idx_tr], X_train_full.iloc[idx_va]
    y_tr, y_va = y_train_full.iloc[idx_tr], y_train_full.iloc[idx_va]

    # -----------------------------------------------------------------
    # 3. Fit on the *training* split of this fold
    # -----------------------------------------------------------------
    pos_weight = (len(y_train_full) - y_train_full.sum()) / y_train_full.sum()

    mdl = CatBoostClassifier(
        # objective & metrics
        loss_function="Logloss",
        eval_metric="AUC",
        custom_metric=["AUC","PRAUC","F1","Recall","Precision","Logloss"],
        use_best_model=True,
    
        # capacity vs regularization
        iterations=8000,
        learning_rate=0.02,
        depth=6,
        l2_leaf_reg=5.0,
        min_data_in_leaf=20,
        boosting_type="Ordered",
    
        # class imbalance handling
        auto_class_weights="Balanced",
        # auto_class_weights removed
    
        # randomness / subsampling
        bootstrap_type="Bayesian",
        bagging_temperature=0.5,
        random_strength=0.8,
        rsm=0.8,
    
        # borders
        feature_border_type="GreedyLogSum",
    
        # early stopping
        od_type="Iter",
        od_wait=250,
    
        verbose=200
    )

    mdl.fit(
        X_tr, y_tr,
        eval_set=(X_va, y_va),
        use_best_model=True,
        verbose=False
    )

    # -------------------------------------------------
    # 4. Validation AUC for this fold
    # -------------------------------------------------
    val_pred = mdl.predict_proba(X_va)[:, 1]
    val_auc  = roc_auc_score(y_va, val_pred)
    fold_aucs.append(val_auc)
    print(f"Fold {fold} validation AUC : {val_auc:.4f}")

    # -------------------------------------------------
    # 5. Accumulate predictions on the *final* test set
    # -------------------------------------------------
    test_preds += mdl.predict_proba(X_test)[:, 1] / k

    # (optional) OOF on training data
    oof_preds[idx_va] = val_pred

# -------------------------------------------------
# 6. Final metrics
# -------------------------------------------------
print("\n=== CV Summary ===")
print(f"Mean CV AUC : {np.mean(fold_aucs):.4f} ± {np.std(fold_aucs):.4f}")
print(f"Folds AUC   : {[f'{a:.4f}' for a in fold_aucs]}")

test_auc = roc_auc_score(y_test, test_preds)
print(f"\nFinal test AUC (averaged over {k} folds): {test_auc:.4f}")

# -------------------------------------------------
# 7. (Optional) Refit on the *whole* train_full for deployment
# -------------------------------------------------
# Note: use_best_model=False here because we are not passing an eval_set.
final_mdl = CatBoostClassifier(
        # objective & metrics
        loss_function="Logloss",
        eval_metric="AUC",
        custom_metric=["AUC","PRAUC","F1","Recall","Precision","Logloss"],    
        # capacity vs regularization
        iterations=int(mdl.best_iteration_ * 1.1),
        learning_rate=0.02,
        depth=6,
        l2_leaf_reg=5.0,
        min_data_in_leaf=20,
        boosting_type="Ordered",
    
        # class imbalance handling
    auto_class_weights="Balanced",
        # auto_class_weights removed
    
        # randomness / subsampling
        bootstrap_type="Bayesian",
        bagging_temperature=0.5,
        random_strength=0.8,
        rsm=0.8,
    
        # borders
        feature_border_type="GreedyLogSum",
    
        # early stopping
        od_type="Iter",
        od_wait=250,
    
        verbose=200
    )

final_mdl.fit(X_train_full, y_train_full)



=== Fold 1 ===
Fold 1 validation AUC : 0.8813

=== Fold 2 ===
Fold 2 validation AUC : 0.8740

=== Fold 3 ===
Fold 3 validation AUC : 0.8894

=== CV Summary ===
Mean CV AUC : 0.8816 ± 0.0063
Folds AUC   : ['0.8813', '0.8740', '0.8894']

Final test AUC (averaged over 3 folds): 0.9040
0:	total: 23ms	remaining: 18.9s
200:	total: 5.56s	remaining: 17.2s
400:	total: 10.8s	remaining: 11.4s
600:	total: 16.1s	remaining: 5.95s
800:	total: 21.3s	remaining: 585ms
822:	total: 21.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x342ab8a00>

In [99]:
test['pred'] = final_mdl.predict_proba(test[features])[:,1]


In [100]:
from sklearn import metrics# import roc_auc_score

In [103]:
print(round(metrics.roc_auc_score(test[target], test['pred']), 2))\

0.9


In [93]:
list(reversed(sorted(zip(final_mdl.feature_names_, final_mdl.feature_importances_), key=lambda x: x[1])))

[('PostH_seconds_since_last', 11.16630106593973),
 ('post_seconds_since_last', 7.850562503376391),
 ('comm_seconds_since_last', 5.402530825664172),
 ('post_Id_count', 4.888035160190467),
 ('PostH_Id_count', 4.553025615800313),
 ('comm_num_events_730d', 4.38954763900675),
 ('PostH_ContentLicense_count', 3.8318417430861973),
 ('Badg_Id_count', 3.6458802921017655),
 ('user_age_days', 3.529980366510347),
 ('post_ContentLicense_count', 3.179003792277052),
 ('Badg_Class_count', 2.982806629632114),
 ('comm_Id_count', 2.590137742420932),
 ('comm_ContentLicense_count', 2.0408701931770463),
 ('post_num_events_730d', 2.02417273594251),
 ('Badg_num_events_730d', 2.0135709138924383),
 ('PostH_90dv180_change', 1.9764563584704344),
 ('comm_num_events_365d', 1.9222580056458636),
 ('comm_365dv730_change', 1.8057720029975588),
 ('PostH_num_events_730d', 1.4946936161488396),
 ('PostH_30dv60_change', 1.458494905095234),
 ('Badg_seconds_since_last', 1.4404703955423368),
 ('comm_num_events_180d', 1.41323300