In [2]:
# After you have built and executed a compute graph
# on https://demo.kurve.ai load the resultant
# dataset here.

In [3]:
# we'll be focused on the user badge problem first
# https://relbench.stanford.edu/datasets/rel-stack/#post-votes

In [125]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import catboost
from torch_frame.utils import infer_df_stype

In [126]:
# we'll be focused on the user badge problem first
# https://relbench.stanford.edu/datasets/rel-stack/#user-badge

In [154]:
df = pd.read_parquet('s3://kurve-customers/d6cfbf12-6265-4ffa-b741-84a7f767fbff/24/output/relbench_stackex_post_votes_train')

In [155]:
df.shape

(206263, 210)

In [156]:
df.head(2)

Unnamed: 0,post_Id,post_OwnerUserId,post_LastEditorUserId,post_PostTypeId,post_Title,post_Tags,post_Body,post_CreationDate,post_title_length,post_body_length,...,comm_comment_length_avg,comm_comment_length_sum,comm_comment_length_min,comm_comment_length_max,comm_num_events_30d,comm_num_events_60d,comm_num_events_90d,comm_num_events_180d,comm_num_events_365d,comm_num_events_730d
0,204745,29025,,1,Preventing overfitting of LSTM on small dataset,<deep-learning><regularization><overfitting><l...,<p>I'm modeling 15000 tweets for sentiment pre...,2016-03-31 13:46:08.497,47,839,...,233.0,233.0,233,233,0.0,0.0,0.0,0.0,0.0,0.0
1,483284,29107,29107.0,1,can the maximum likelihood estimator depend on...,<maximum-likelihood><mathematical-statistics>,"<p>That is, say I have a distribution with par...",2020-08-16 23:06:11.933,67,621,...,381.0,762.0,272,490,0.0,0.0,0.0,2.0,2.0,2.0


In [157]:
# get the label
[c for c in df.columns if 'label' in c]

['vote_Id_label']

In [158]:
label = [c for c in df.columns if 'label' in c][0]
df[label]


0            1
1            1
2            1
3            1
4            1
          ... 
206258       1
206259       1
206260       2
206261    <NA>
206262    <NA>
Name: vote_Id_label, Length: 206263, dtype: Int64

In [159]:
df[label] = df[label].apply(lambda x: x if not pd.isnull(x) else 0)

In [160]:
df[label].mean()

0.07681940047415194

In [161]:
df['post_CreationDate'].min()

Timestamp('2009-02-02 14:21:12.103000')

In [162]:
import datetime

In [163]:
cut = datetime.datetime(2021, 1, 1)


In [164]:
df[df['post_CreationDate']<cut].shape

(160903, 210)

In [165]:
df = df[df['post_CreationDate']<cut]

In [166]:
df['post_timesincepost'] = df.apply(lambda x: (cut-x['post_CreationDate']).total_seconds(), axis=1)

In [167]:
train, test = train_test_split(df)

In [168]:
train.shape

(120677, 211)

In [169]:
test.shape

(40226, 211)

In [170]:
stypes = infer_df_stype(df)

In [171]:
features = [
    k for k,v in stypes.items()
    if str(v) == 'numerical'
    and not k.startswith('User_')
    and 'label' not in k
    and k not in ['post_Id',
 'post_OwnerUserId',
 'post_LastEditorUserId',
 'post_body_length',
 'post_tag_length',
 'user_AccountId']
]

In [172]:
features

['badge_Id_count',
 'badge_UserId_count',
 'badge_Class_count',
 'badge_TagBased_sum',
 'badge_num_events_30d',
 'badge_num_events_60d',
 'badge_num_events_180d',
 'badge_num_events_365d',
 'badge_num_events_730d',
 'usercomm_Id_count',
 'usercomm_PostId_count',
 'usercomm_UserId_count',
 'usercomm_comment_len_avg',
 'usercomm_comment_len_sum',
 'usercomm_comment_len_min',
 'usercomm_num_events_30d',
 'usercomm_num_events_60d',
 'usercomm_num_events_90d',
 'usercomm_num_events_180d',
 'usercomm_num_events_365d',
 'usercomm_num_events_730d',
 'userpost_body_length_avg',
 'userpost_body_length_sum',
 'userpost_body_length_min',
 'userpost_body_length_max',
 'userpost_title_length_avg',
 'userpost_tag_length_avg',
 'userpost_tag_length_sum',
 'userpost_tag_length_min',
 'userpost_tag_length_max',
 'subcomm_Id_count_avg',
 'subcomm_Id_count_min',
 'subcomm_Id_count_max',
 'subcomm_PostId_count_avg',
 'subcomm_PostId_count_min',
 'subcomm_PostId_count_max',
 'subcomm_UserId_count_avg',
 'su

In [173]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=7,
    l2_leaf_reg=3,
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=42,
    early_stopping_rounds=10,
    verbose=100
)

In [174]:
model.fit(train[features], train[label])

0:	learn: 0.0660805	total: 19.9ms	remaining: 19.9s
100:	learn: 0.0620874	total: 1.62s	remaining: 14.5s
200:	learn: 0.0620278	total: 3.02s	remaining: 12s
300:	learn: 0.0620096	total: 4.45s	remaining: 10.3s
400:	learn: 0.0619923	total: 5.88s	remaining: 8.79s
500:	learn: 0.0619923	total: 7.32s	remaining: 7.29s
600:	learn: 0.0619923	total: 8.75s	remaining: 5.81s
700:	learn: 0.0619858	total: 10.1s	remaining: 4.33s
800:	learn: 0.0619789	total: 11.4s	remaining: 2.83s
900:	learn: 0.0619733	total: 12.6s	remaining: 1.39s
999:	learn: 0.0619732	total: 13.8s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x40e7beec0>

In [175]:
test['pred'] = model.predict(test[features])

In [176]:
print(metrics.mean_absolute_error(test[label],test['pred']))

0.06298511761771242
