In [1]:
import pandas as pd
from torch_frame.utils import infer_df_stype
import catboost
from sklearn import metrics

In [34]:
# compute graph cut date 1/1/2020
train_path = 'https://kurve-customers.s3.amazonaws.com/3d95ca0a-b09a-4cc7-8bbf-9c0e98932af2/3/output/stack_badge_train'

In [7]:
df = pd.read_parquet(train_path)

In [8]:
df.shape

(220788, 159)

In [9]:
df.head()

Unnamed: 0,User_Id,User_AccountId,User_Reputation,User_Views,User_DownVotes,User_UpVotes,User_DisplayName,User_Location,User_ProfileImageUrl,User_WebsiteUrl,...,ph_num_events_60d,ph_num_events_90d,ph_num_events_180d,ph_num_events_365d,ph_num_events_730d,ph_30dv60_change,ph_60dv90_change,ph_90dv180_change,ph_180dv365_change,ph_365dv730_change
0,166286,4854026,11,0,0,0,starbuck,,,,...,0.0,0.0,0.0,0.0,0.0,,,,,
1,166293,11175242,11,4,0,0,jenny,,,,...,0.0,0.0,0.0,0.0,0.0,,,,,
2,166295,11174666,353,57,0,36,Sanket Agrawal,,,,...,0.0,0.0,0.0,30.0,37.0,,,,0.0,0.810811
3,166307,11175691,11,37,0,0,Arash,,,,...,0.0,0.0,0.0,0.0,0.0,,,,,
4,166313,11175755,31,9,0,0,David,,,,...,0.0,0.0,0.0,0.0,0.0,,,,,


In [10]:
[c for c in df.columns if 'label' in c]

['Badg_Id_label']

In [11]:
target = [c for c in df.columns if 'label' in c][0]

In [12]:
df[target]

0         <NA>
1         <NA>
2         <NA>
3         <NA>
4         <NA>
          ... 
220783       1
220784       1
220785       1
220786       1
220787       1
Name: Badg_Id_label, Length: 220788, dtype: Int32

In [13]:
df[target] = df[target].apply(lambda x: 1 if not pd.isnull(x) and x > 0 else 0)

In [15]:
df[target].unique()

array([0, 1])

In [16]:
df[target].sum()/len(df)

0.02877873797488994

In [35]:
# compute graph cut_date 1/1/2021
test = pd.read_parquet(train_path)

In [19]:
test[target] = test[target].apply(lambda x: 1 if not pd.isnull(x) and x > 0 else 0)

In [20]:
test[target].sum()/len(test)

0.02611998746867168

In [21]:
stypes = infer_df_stype(df)

In [25]:
features = [
    k for k,v in stypes.items()
    if str(v) == 'numerical'
    and k not in ['User_Id', 'User_AccountId', 'User_Reputation', 'User_Views', 'User_DownVotes', 'User_UpVotes']
    and 'label' not in k and k != 'had_engagement']

In [26]:
len(features)

123

In [27]:
features

['user_age_days',
 'vote_Id_count',
 'vote_seconds_since_last',
 'vote_num_events_30d',
 'vote_num_events_60d',
 'vote_num_events_90d',
 'vote_num_events_180d',
 'vote_num_events_365d',
 'vote_num_events_730d',
 'vote_30dv60_change',
 'vote_60dv90_change',
 'vote_90dv180_change',
 'vote_180dv365_change',
 'vote_365dv730_change',
 'post_Id_count',
 'post_ContentLicense_count',
 'subvote_Id_count_avg',
 'subvote_Id_count_sum',
 'subvote_Id_count_min',
 'subvote_Id_count_max',
 'subvote_seconds_since_last_avg',
 'subvote_seconds_since_last_sum',
 'subvote_seconds_since_last_min',
 'subvote_seconds_since_last_max',
 'subvote_num_events_30d_avg',
 'subvote_num_events_30d_sum',
 'subvote_num_events_30d_min',
 'subvote_num_events_30d_max',
 'subvote_num_events_60d_avg',
 'subvote_num_events_60d_sum',
 'subvote_num_events_60d_min',
 'subvote_num_events_60d_max',
 'subvote_num_events_90d_avg',
 'subvote_num_events_90d_sum',
 'subvote_num_events_90d_min',
 'subvote_num_events_90d_max',
 'subvote

In [28]:
[f for f in features if f not in test.columns]

[]

In [29]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool

# -------------------------------------------------
# 1. Split off the *final* test set (once!)
# -------------------------------------------------
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df[features], df[target],
    test_size=0.20,          # 20 % held-out test
    stratify=df[target],
    random_state=42
)

# -------------------------------------------------
# 2. K-Fold CV on the remaining 80 %
# -------------------------------------------------
k = 3                                   # change to 10, etc.
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

fold_aucs = []
test_preds = np.zeros(len(X_test))      # out-of-fold predictions on *test*
oof_preds = np.zeros(len(X_train_full)) # optional: OOF on training data

# CatBoost pools (optional but faster)
train_pool = Pool(X_train_full, y_train_full)  # , cat_features=cat_features)

for fold, (idx_tr, idx_va) in enumerate(skf.split(X_train_full, y_train_full), 1):
    print(f"\n=== Fold {fold} ===")
    
    X_tr, X_va = X_train_full.iloc[idx_tr], X_train_full.iloc[idx_va]
    y_tr, y_va = y_train_full.iloc[idx_tr], y_train_full.iloc[idx_va]

    # -----------------------------------------------------------------
    # 3. Fit on the *training* split of this fold
    # -----------------------------------------------------------------
    mdl = catboost.CatBoostClassifier(
        # objective & metrics
        loss_function="Logloss",
        eval_metric="AUC",
        custom_metric=["AUC", "PRAUC", "F1", "Recall", "Precision", "Logloss"],
        use_best_model=True,
    
        # capacity vs regularization
        iterations=2000,
        learning_rate=0.05,
        depth=5,
        l2_leaf_reg=10,
    
        # randomness / bagging / feature subsampling
        bootstrap_type="Bayesian",
        bagging_temperature=0.75,
        rsm=0.8,
        random_strength=0.5,
    
        # class imbalance handling
        auto_class_weights="Balanced",
        #class_weights=[1,36],
    
        # borders / leaves
        feature_border_type="GreedyLogSum",
        min_data_in_leaf=20,
        boosting_type="Plain",
    
        # early stopping
        od_type="Iter",
        od_wait=150,
    
        verbose=200
    )
    mdl.fit(
        X_tr, y_tr,
        eval_set=(X_va, y_va),
        use_best_model=False,
        verbose=False
    )

    # -------------------------------------------------
    # 4. Validation AUC for this fold
    # -------------------------------------------------
    val_pred = mdl.predict_proba(X_va)[:, 1]
    val_auc  = roc_auc_score(y_va, val_pred)
    fold_aucs.append(val_auc)
    print(f"Fold {fold} validation AUC : {val_auc:.4f}")

    # -------------------------------------------------
    # 5. Accumulate predictions on the *final* test set
    # -------------------------------------------------
    test_preds += mdl.predict_proba(X_test)[:, 1] / k

    # (optional) OOF on training data
    oof_preds[idx_va] = val_pred

# -------------------------------------------------
# 6. Final metrics
# -------------------------------------------------
print("\n=== CV Summary ===")
print(f"Mean CV AUC : {np.mean(fold_aucs):.4f} ± {np.std(fold_aucs):.4f}")
print(f"Folds AUC   : {[f'{a:.4f}' for a in fold_aucs]}")

test_auc = roc_auc_score(y_test, test_preds)
print(f"\nFinal test AUC (averaged over {k} folds): {test_auc:.4f}")

# -------------------------------------------------
# 7. (Optional) Refit on the *whole* train_full for deployment
# -------------------------------------------------
final_mdl = catboost.CatBoostClassifier(
        # objective & metrics
        loss_function="Logloss",
        eval_metric="AUC",
        custom_metric=["AUC", "PRAUC", "F1", "Recall", "Precision", "Logloss"],
        #use_best_model=True,
    
        # capacity vs regularization
        #iterations=2000,
        iterations=int(mdl.best_iteration_ * 1.1),  # a bit more than best
        learning_rate=0.05,
        depth=5,
        l2_leaf_reg=10,
    
        # randomness / bagging / feature subsampling
        bootstrap_type="Bayesian",
        bagging_temperature=0.75,
        rsm=0.8,
        random_strength=0.5,
    
        # class imbalance handling
        auto_class_weights="Balanced",
        #class_weights=[1,36],
    
        # borders / leaves
        feature_border_type="GreedyLogSum",
        min_data_in_leaf=20,
        boosting_type="Plain",
    
        # early stopping
        od_type="Iter",
        od_wait=150,
    
        verbose=200
    )
final_mdl.fit(X_train_full, y_train_full)


=== Fold 1 ===
Fold 1 validation AUC : 0.8363

=== Fold 2 ===
Fold 2 validation AUC : 0.8447

=== Fold 3 ===
Fold 3 validation AUC : 0.8515

=== CV Summary ===
Mean CV AUC : 0.8441 ± 0.0062
Folds AUC   : ['0.8363', '0.8447', '0.8515']

Final test AUC (averaged over 3 folds): 0.8478
0:	total: 20.6ms	remaining: 7.45s
200:	total: 5.53s	remaining: 4.46s
362:	total: 10.3s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x357ce4250>

In [30]:
test['pred'] = final_mdl.predict_proba(test[features])[:,1]

In [31]:
print(metrics.roc_auc_score(test[target], test['pred']))

0.8600202382824832


In [33]:
# relbench paper
# data scientist 20+ hours = 0.86
# LLM: 0.79
# Kumo: 0.80
# Kumo fine tuned: 0.89

In [32]:
list(reversed(sorted(zip(final_mdl.feature_names_, final_mdl.feature_importances_), key=lambda x: x[1])))

[('user_age_days', 10.871071714325769),
 ('Badg_seconds_since_last', 6.693912794917843),
 ('post_ContentLicense_count', 4.363706411361722),
 ('Badg_num_events_730d', 4.341232548731441),
 ('ph_ContentLicense_count', 4.121871765890861),
 ('ph_Id_count', 3.7911081551864885),
 ('ph_seconds_since_last', 3.6646928793255276),
 ('subvote_seconds_since_last_min', 3.373443557455643),
 ('post_Id_count', 3.1168698208493124),
 ('post_seconds_since_last', 2.756162039639991),
 ('ph_num_events_30d', 2.709478984742534),
 ('subvote_Id_count_sum', 2.035029734584143),
 ('Badg_Id_count', 1.9076802734147165),
 ('subvote_seconds_since_last_avg', 1.8738688189188872),
 ('post_num_events_60d', 1.8640642500818079),
 ('comm_seconds_since_last', 1.8373202494508791),
 ('Badg_180dv365_change', 1.8293191292598534),
 ('subvote_num_events_730d_sum', 1.7625564365742485),
 ('post_num_events_30d', 1.6172753589424547),
 ('comm_ContentLicense_count', 1.572874739051282),
 ('Badg_Class_count', 1.5664089591710153),
 ('Badg_365