In [9]:
import pandas as pd
from utils import Timer

def clean_df(df, label = 'reply'):
    df['tweet_timestamp'] = pd.to_datetime(df['tweet_timestamp'],unit='s')
    df['engaging_user_account_creation'] = pd.to_datetime(df['engaging_user_account_creation'],unit='s')
    df['engaged_with_user_account_creation'] = pd.to_datetime(df['engaged_with_user_account_creation'],unit='s')
    
    if label == 'reply':
        df['reply'] = df['reply_timestamp'].apply(lambda x: 1 if x>0 else 0).astype('int32')
    if label == 'retweet':
        df['retweet'] = df['retweet_timestamp'].apply(lambda x: 1 if x>0 else 0).astype('int32')
    if label == 'comment':
        df['comment'] = df['retweet_with_comment_timestamp'].apply(lambda x: 1 if x>0 else 0).astype('int32')
    if label == 'like':
        df['like'] = df['like_timestamp'].apply(lambda x: 1 if x>0 else 0).astype('int32')
    df = df.drop(columns=['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp','like_timestamp'])
    if 'tokens' in df.columns:
        df = df.drop(columns=['tokens'])
    return df

In [1]:
import pandas as pd
from utils import Timer

with Timer(f"Load train"):
    train_data = pd.read_parquet("twitterrecsys.train.parquet")
    train_data = train_data[:100000]
train_data = clean_df(train_data)
train_data.dtypes

Load train took 17.86636699922383 sec


In [None]:
train_data

In [5]:
from autogluon.tabular import TabularDataset, TabularPredictor

model = TabularPredictor(label="reply")
predictor = model.fit(train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20221201_215902/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20221201_215902/"
AutoGluon Version:  0.6.0
Python Version:     3.8.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Nov 8 23:39:32 UTC 2018
Train Data Rows:    100000
Train Data Columns: 20
Label Column: reply
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> la

# Evaluate

In [14]:
with Timer(f"Load train"):
    schema = ["text_tokens", "hashtags", "tweet_id", "present_media", "present_links", "present_domains", "tweet_type",
            "language", "tweet_timestamp", "engaged_with_user_id", "engaged_with_user_follower_count", "engaged_with_user_following_count",
            "engaged_with_user_is_verified", "engaged_with_user_account_creation", "engaging_user_id", "engaging_user_follower_count",
            "enaging_user_following_count", "enaging_user_is_verified", "engaging_user_account_creation", "engagee_follows_engager",
            "reply_timestamp", "retweet_timestamp", "retweet_with_comment_timestamp", "like_timestamp"]
    
    test_data = pd.read_csv("twitterrecsys.valid.csv", sep='\x01', names = schema, header=None, nrows=10000)
    
with Timer("clean data"):
    test_data = clean_df(test_data)
    
test_data

Load train took 0.0670201568864286 sec
clean data took 0.00727255130186677 sec


Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,...,engaged_with_user_following_count,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,engaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,engaging_user_account_creation,engagee_follows_engager,reply
0,101\t157\t39554\t117\t51747\t10479\t27874\t10107\t10531\t13170\t10124\t10472\t14289\t100\t45194\t100\t111\t10392\t10410\t132\t146\t32608\t10105\t26591\t13051\t26777\t109361\t111\t10392\t10410\t132\t27920\t104277\t10142\t11345\t119\t146\t44856\t11345\t11639\t68094\t37413\t10142\t23602\t36064\t10230\t10105\t18713\t10108\t10455\t12542\t38144\t111\t10392\t10410\t132\t10151\t10119\t53504\t60508\t10240\t174\t47195\t10142\t23602\t36064\t10230\t14729\t119\t216\t216\t23553\t10271\t15127\t10680\t12117\t119\t146\t16938\t100\t188\t11850\t17416\t10262\t16883\t119\t102,,0C8E7372269942BB173EED7C0A72DA09,,,,Quote,488B32D24BD4BB44172EB981C1BCA6FA,2021-02-28 14:49:42,8B7BB615A39DF112B0037E960C27F220,...,123,False,2020-08-02 17:13:46,3044AFDB7E977FB7F62D49C5C97794B7,1128,4028,False,2014-12-28 10:37:53,False,0
1,101\t14120\t131\t120\t120\t188\t119\t11170\t120\t178\t11369\t10129\t27128\t11530\t10759\t12022\t10686\t102,,858720854891DFED04A5B91758049833,Photo,,,TopLevel,313ECD3A1E5BB07406E4249475C2D6D6,2021-02-26 10:30:36,74B09D5BC3FCE5CC4DEE34BF049A0EE5,...,17,False,2020-02-13 18:36:45,23ACD97ACEF57BF6416670BE1133A664,72,115,False,2020-09-14 04:40:07,False,0
2,101\t56898\t137\t10279\t10908\t10138\t168\t39842\t131\t108\t8359\t4982\t1952\t111787\t124\t4460\t2762\t10060\t123\t120\t10306\t5712\t3175\t10061\t7082\t6192\t108\t2021\t111815\t111830\t11668\t10060\t108\t123\t96325\t10061\t108\t9581\t119342\t25486\t216\t216\t100\t5410\t2462\t2198\t1913\t4476\t7214\t1946\t54643\t111830\t25053\t2072\t7149\t1947\t28442\t30884\t18767\t216\t14120\t131\t120\t120\t188\t119\t11170\t120\t55541\t11403\t10116\t11565\t10929\t11273\t11542\t10410\t216\t216\t108\t2012\t11668\t54643\t111815\t111830\t11668\t108\t21644\t11211\t67403\t108\t71020\t55859\t14703\t69849\t108\t14...,90C52DDF506D1C98EE678E84C08C36AB\tCD8639BABE547D9B9BBBB8E962450E2B\t6F84D430D26397EF87A1DC2A7717E75F\t03176616480890152CF7583112DBBD87\t062B238B5042E31C6FD1468742975CDF\t78D6EF9ED489837B54DC07B2F7FFFB21\t16E89E56FDFA9CC96F1F39EFF822F0A3\t6DAEFDE99D17E5D7DF550AFE89432DBE\tE743B8FCBF73E688676EAB7DCA4AD5AB\tF6E3CABCCEAEFCC3030256CE4620875F,1988AA4069C46F57990B6983FBD427A6,,D249E94F6D37A7D5E37185E02C94CDFF,9EFF000CDB18B710CDDB43EE1D8C300B,Retweet,E7F038DE3EAD397AEC9193686C911677,2021-03-01 17:02:33,316C2BE47E445DA930E56C12F3AF31C5,...,23,False,2009-06-15 03:32:36,99F829F88A12BF8B92EDF11A10B6533C,130,638,False,2009-11-01 11:58:13,False,0
3,101\t56898\t137\t12275\t10738\t11534\t15417\t93227\t168\t45657\t131\t10067\t216\t108\t2023\t14750\t47157\t67221\t21612\t2209\t6667\t1923\t33499\t69395\t109047\t100\t216\t108\t128\t4348\t5755\t216\t10092\t216\t216\t1906\t26554\t28073\t57743\t11668\t22396\t12236\t108\t2023\t14750\t11662\t47157\t67221\t15103\t1912\t216\t2087\t5621\t1912\t4949\t5755\t10573\t33708\t67221\t88972\t11377\t2757\t5318\t100\t216\t16181\t1888\t1962\t51143\t71774\t2731\t3878\t1889\t1939\t3212\t7555\t1980\t7533\t1919\t22445\t100\t216\t1892\t108\t4375\t5618\t2023\t14750\t47157\t67221\t21612\t1893\t124\t120\t10270\t2452\t...,D704B4128E35F9BC995E701523676542\tCFD3AF9039C66BB22F7463A6C901225B\t7791383E39A388C0E5915549805FB527\t1C795E14677C7DBF412738236634F599,89C86150124016236B7D4A286B041210,,,,Retweet,E7F038DE3EAD397AEC9193686C911677,2021-03-02 19:55:32,C19FC40FBF0AA0BAC4797BD47483349B,...,1273,False,2015-11-14 05:15:25,A3A7B6D928FAD730BA9967B53EA7CC28,234,640,False,2020-09-10 08:10:09,False,0
4,101\t109821\t24093\t14703\t30118\t11259\t160\t37611\t12396\t17443\t73522\t32992\t13034\t11274\t11403\t14703\t21275\t27128\t10731\t52428\t32992\t11537\t67292\t11273\t40214\t147\t32612\t11090\t14120\t131\t120\t120\t188\t119\t11170\t120\t157\t12675\t10858\t10575\t59880\t11273\t10112\t10729\t102,,971197A8E595128D8BAF8F3F9D20CFF3,Photo\tPhoto,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,2021-03-01 00:38:33,46846BF25E56F8BA11C0AE36ED31B28F,...,976,False,2019-09-19 22:02:19,BA94D75FC03B42F88CAE61485E075960,8,208,False,2018-07-29 15:59:42,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,101\t10882\t169\t12928\t10894\t13028\t16863\t14125\t10105\t10315\t11544\t10345\t10105\t17441\t11074\t136\t100\t102,,2227FE4B3B1D59BE04F11FBCE146E96B,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,2021-02-28 22:11:16,2969768AB934329E3F77C6BF9389D2EC,...,6355,False,2020-05-25 22:12:02,5DAD7F04EF963C9AA67A2301A8B82D81,359,413,False,2020-06-01 11:52:22,True,0
9996,101\t56898\t137\t23152\t10350\t16039\t10815\t131\t99843\t15417\t14293\t11152\t22282\t10108\t11844\t21849\t171\t10157\t12212\t76557\t19132\t10667\t21454\t119\t13252\t76557\t10107\t10301\t15910\t10978\t31128\t119\t14120\t100\t102,,4CD4E99F32B693D7127185B99BBC16EC,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,2021-03-03 01:44:43,7C5CA5A790B34D58C92A6B1D66A5C29C,...,2360,False,2007-07-08 01:47:26,01449F871D9C25CD15A99A5BA7080B32,1036,2195,False,2011-03-12 16:31:57,False,0
9997,101\t56898\t137\t12322\t35713\t168\t162\t62146\t88592\t35999\t131\t1963\t60907\t25204\t11588\t37311\t106\t106\t100\t216\t216\t2187\t4348\t1966\t10898\t18628\t25417\t38352\t67946\t111756\t216\t216\t1910\t11588\t21612\t39004\t10059\t1906\t15221\t67946\t3765\t14813\t53106\t106\t106\t14120\t131\t120\t120\t188\t119\t11170\t120\t162\t11166\t11779\t10107\t10858\t11011\t46776\t10858\t10112\t102,,076F276223FEE8B65A8394000DDE1FF8,Photo,,,Retweet,E7F038DE3EAD397AEC9193686C911677,2021-03-02 21:00:48,5BF5EB3CB78D855670FE5E823FD09C19,...,17598,False,2017-10-28 13:06:19,604AE4955F348F51DC45B3186DB5AE6E,259,913,False,2019-05-04 12:53:41,True,0
9998,101\t11835\t46090\t10327\t11503\t24499\t78578\t758\t42744\t11326\t63374\t10278\t55532\t10327\t766\t98273\t75699\t80493\t41865\t10658\t119\t216\t108\t15896\t88325\t216\t108\t766\t98273\t31520\t168\t758\t42744\t11326\t168\t63374\t10278\t39900\t21069\t168\t54179\t168\t35324\t216\t108\t103535\t168\t10383\t168\t38672\t11711\t168\t82936\t168\t15896\t88325\t102,5DF4827AA8F4AA4168CAC78DC7821AA4\tEA2DBD95359DD6F403EE42CDAAFEE1E9\t6C9C78DC9BD194777258B17B8EDA3AF6,E48DD62AF5F4970DFC1D8F9080DCC55C,,,,TopLevel,5B6973BEB05212E396F3F2DC6A31B71C,2021-03-03 11:37:57,8BBCDE5904602037AB3863910C5E724B,...,8654,False,2020-03-02 22:16:41,C094354FC31B6759C0FCC2EA886A0A11,342,370,False,2020-05-11 19:44:37,True,0


In [15]:
predictor.feature_importance(test_data)

Computing feature importance via permutation shuffling for 20 features using 5000 rows with 5 shuffle sets...
	182.3s	= Expected runtime (36.46s per shuffle set)
	193.15s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
engagee_follows_engager,8e-05,0.000303,0.293525,5,0.000705,-0.000545
hashtags,0.0,0.0,0.5,5,0.0,0.0
engaging_user_account_creation,0.0,0.0,0.5,5,0.0,0.0
enaging_user_is_verified,0.0,0.0,0.5,5,0.0,0.0
enaging_user_following_count,0.0,0.0,0.5,5,0.0,0.0
engaging_user_follower_count,0.0,0.0,0.5,5,0.0,0.0
engaging_user_id,0.0,0.0,0.5,5,0.0,0.0
engaged_with_user_account_creation,0.0,0.0,0.5,5,0.0,0.0
engaged_with_user_is_verified,0.0,0.0,0.5,5,0.0,0.0
engaged_with_user_following_count,0.0,0.0,0.5,5,0.0,0.0


In [16]:
predictor.features('transformed')

['engaged_with_user_follower_count',
 'engaged_with_user_following_count',
 'engaged_with_user_is_verified',
 'engaging_user_follower_count',
 'enaging_user_following_count',
 'enaging_user_is_verified',
 'engagee_follows_engager',
 'text_tokens',
 'hashtags',
 'tweet_id',
 'present_media',
 'present_links',
 'present_domains',
 'tweet_type',
 'language',
 'engaged_with_user_id',
 'engaging_user_id',
 'tweet_timestamp',
 'tweet_timestamp.day',
 'tweet_timestamp.dayofweek',
 'engaged_with_user_account_creation',
 'engaged_with_user_account_creation.year',
 'engaged_with_user_account_creation.month',
 'engaged_with_user_account_creation.day',
 'engaged_with_user_account_creation.dayofweek',
 'engaging_user_account_creation',
 'engaging_user_account_creation.year',
 'engaging_user_account_creation.month',
 'engaging_user_account_creation.day',
 'engaging_user_account_creation.dayofweek',
 'text_tokens.char_count',
 'text_tokens.word_count',
 'text_tokens.digit_ratio',
 'text_tokens.specia

In [24]:
from sklearn.metrics import log_loss, average_precision_score
import numpy as np
def compute_AP(pred, gt):
    return average_precision_score(gt, pred)
def compute_rce_fast(pred, gt):
    cross_entropy = log_loss(gt, pred)
    yt = np.mean(gt)     
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

In [17]:
pred = predictor.predict(test_data)

In [22]:
compute_AP(pred = pred.to_numpy(), gt=test_data['reply'].to_numpy())

0.022330232558139533

In [25]:
compute_rce_fast(pred = pred.to_numpy(), gt=test_data['reply'].to_numpy())

-625.2358277947136