In [6]:
import pandas as pd
from utils import Timer
import os

def clean_df(df, label = 'reply'):
    df['tweet_timestamp'] = pd.to_datetime(df['tweet_timestamp'],unit='s')
    df['engaging_user_account_creation'] = pd.to_datetime(df['engaging_user_account_creation'],unit='s')
    df['engaged_with_user_account_creation'] = pd.to_datetime(df['engaged_with_user_account_creation'],unit='s')
    
    if label == 'reply':
        df['reply'] = df['reply_timestamp'].apply(lambda x: 1 if x>0 else 0).astype('int32')
    if label == 'retweet':
        df['retweet'] = df['retweet_timestamp'].apply(lambda x: 1 if x>0 else 0).astype('int32')
    if label == 'comment':
        df['comment'] = df['retweet_with_comment_timestamp'].apply(lambda x: 1 if x>0 else 0).astype('int32')
    if label == 'like':
        df['like'] = df['like_timestamp'].apply(lambda x: 1 if x>0 else 0).astype('int32')
    df = df.drop(columns=['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp','like_timestamp'])
    if 'tokens' in df.columns:
        df = df.drop(columns=['tokens'])
    return df

def apply_bert_decode(df):
    from transformers import BertTokenizer
    os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
    df["tweet"] = df["text_tokens"].progress_apply(lambda x: tokenizer.decode([int(n) for n in x.split('\t')]))
    df = df.drop(columns=['text_tokens'])
    return df
    

In [7]:
import pandas as pd
from utils import Timer
from tqdm import tqdm

tqdm.pandas()

with Timer(f"Load train"):
    train_data = pd.read_parquet("twitterrecsys.train.parquet")
    train_data = train_data[:100000]
with Timer(f"clean data"):
    train_data = clean_df(train_data)
with Timer(f"apply_bert_decode"):
    train_data = apply_bert_decode(train_data)
train_data.dtypes

Load train took 18.520480221137404 sec
clean data took 0.7230387460440397 sec


100%|██████████| 100000/100000 [00:09<00:00, 10884.84it/s]


apply_bert_decode took 29.55636406969279 sec


hashtags                                      object
tweet_id                                      object
present_media                                 object
present_links                                 object
present_domains                               object
tweet_type                                    object
language                                      object
tweet_timestamp                       datetime64[ns]
engaged_with_user_id                          object
engaged_with_user_follower_count               int64
engaged_with_user_following_count              int64
engaged_with_user_is_verified                   bool
engaged_with_user_account_creation    datetime64[ns]
engaging_user_id                              object
engaging_user_follower_count                   int64
enaging_user_following_count                   int64
enaging_user_is_verified                        bool
engaging_user_account_creation        datetime64[ns]
engagee_follows_engager                       

In [8]:
train_data

Unnamed: 0,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,...,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,engaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,engaging_user_account_creation,engagee_follows_engager,reply,tweet
0,,E6D175543122CE8F508E17015E6610B2,,,,Quote,488B32D24BD4BB44172EB981C1BCA6FA,2021-02-12 08:17:52,010E1103060789CF8CB5B95427B44964,95017,...,False,2016-01-28 12:27:19,D6074FA4F0C1AC715199E4F1426599F4,940,1066,False,2010-11-14 16:50:00,False,0,[CLS] Straight from the horse mouth. [SEP]
1,B6EC77A29C34939B81DCC38ECA00DB32,E35CD173274FE6434E7477F182C2DA39,,B412A1F69290A59CB5FE8C8F70B5F6BD,743332A23E1F15FA1F221F77D44DF7AC,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,2021-02-07 04:37:11,9F68E2B5E896BF8E1A2B06E1985329B2,3529,...,False,2015-03-03 13:28:45,C0E8BA6661C2E8CEA7C64B769D597C04,24,350,False,2011-01-12 02:15:49,False,0,"[CLS] We can, we must, # DefundThePolice ¶ ¶ h..."
2,,7BB9A31FDE6868C1A1E17EA678FBF6C5,GIF,EDC6B9477A034FA8DF1D38701E90F6B7,37F6EF8038F7A0B9A07DF964E16EB09A,Retweet,E7F038DE3EAD397AEC9193686C911677,2021-02-24 05:33:58,0CAEDCD7503E40E1E5BFF2AD112863F4,3247,...,False,2018-09-02 07:10:55,DF19EEA536FF691AC78603CCE666AF2E,1575,369,False,2019-11-12 01:36:57,True,0,[CLS] RT @ ixkaito : イラレのブレンドかな 。 https : / / ...
3,,D1989E07D803476B6025D13C8BCB6DD3,,,,TopLevel,D922D8FEA3EFAD3200455120B75BCEB8,2021-02-11 00:45:03,557051DEF71B797E0C6BB4DC7D04D037,1775,...,False,2010-12-11 16:10:50,912421CD187259420CDA7E1484C34044,3812,4629,False,2010-01-27 20:00:13,True,0,[CLS] @ lzcrowder Lololololol we die [SEP]
4,,B1ACDCB66DF98D4B2A94ED433FFC3E07,,E0C58892A48BE04BE8E7DD1088503156,8463A92AE38304444558059B409BCCF2,TopLevel,7D11A7AA105DAB4D6799AF863369DB9C,2021-02-16 06:18:21,83A7E60363FD6E4B3F8E17534A259C39,30163,...,False,2010-03-11 20:32:17,DB7670068F282465150E04547AADA78B,587,229,False,2019-06-09 14:55:47,False,0,[CLS] Eindelijk komt de discussie over de neve...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,,4A281F8D5D7048CC2DE676B8DF16A938,,,,Retweet,E7F038DE3EAD397AEC9193686C911677,2021-02-15 04:57:20,A86315F66180AC40AD58AD56187B06C0,1310,...,False,2013-01-09 01:59:11,D8A3518F18025BCB10E9A7663638E25E,330,364,False,2014-02-03 13:35:36,True,0,[CLS] RT @ oscarnoyukue : 殺 人 鬼 フレディとジェイソンがデュエ...
99996,43B37225C841C6DB6E7D340EAFBA569C\t25A2367EE2D9...,7323EC68C992E122956D453E38D06365,Photo,,,Retweet,E7F038DE3EAD397AEC9193686C911677,2021-02-15 09:13:19,561EEECDC817A9A3D63B083D028AC20B,267,...,False,2018-04-25 21:25:01,B39F029D04D00B8CC9C8883E66D73442,849,789,False,2018-12-25 17:20:35,True,0,[CLS] RT @ hikakin : BTSの 握 手 会 [UNK] ¶ ¶ # BT...
99997,,7B69BB8CE8DE95C772F3BB7F4417124E,,,,TopLevel,9A78FC330083E72BE0DD1EA92656F3B5,2021-02-06 16:18:13,BA747E2DB84357A0872A197BB7C753CC,364,...,False,2020-11-26 15:48:25,705B940153E60523DC0AD9DAED5A906B,319,336,False,2020-03-09 20:13:27,True,0,[CLS] الى كل من يتابعني................... الم...
99998,,C1683A7DD8F440B00C4BE51FF3969FDD,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,2021-02-19 02:38:21,9C9B5E585778D1D3063B994E35376CEE,761283,...,False,2010-01-06 22:39:39,1D6A7DB39B73727120B10EF7027659FE,212,126,False,2017-08-25 12:44:17,False,0,"[CLS] RT @ SheriffClarke : Yeah, why not? A DC..."


In [9]:
from autogluon.tabular import TabularDataset, TabularPredictor

model = TabularPredictor(label="reply")
predictor = model.fit(train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20221201_233617/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20221201_233617/"
AutoGluon Version:  0.6.0
Python Version:     3.8.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Nov 8 23:39:32 UTC 2018
Train Data Rows:    100000
Train Data Columns: 20
Label Column: reply
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> la

In [10]:
with Timer(f"Load train"):
    schema = ["text_tokens", "hashtags", "tweet_id", "present_media", "present_links", "present_domains", "tweet_type",
            "language", "tweet_timestamp", "engaged_with_user_id", "engaged_with_user_follower_count", "engaged_with_user_following_count",
            "engaged_with_user_is_verified", "engaged_with_user_account_creation", "engaging_user_id", "engaging_user_follower_count",
            "enaging_user_following_count", "enaging_user_is_verified", "engaging_user_account_creation", "engagee_follows_engager",
            "reply_timestamp", "retweet_timestamp", "retweet_with_comment_timestamp", "like_timestamp"]
    
    test_data = pd.read_csv("twitterrecsys.valid.csv", sep='\x01', names = schema, header=None, nrows=10000)
    
with Timer("clean data"):
    test_data = clean_df(test_data)
    
with Timer(f"apply_bert_decode"):
    test_data = apply_bert_decode(test_data)
    
test_data

Load train took 0.05262972880154848 sec
clean data took 0.0075564198195934296 sec


100%|██████████| 10000/10000 [00:00<00:00, 10211.38it/s]

apply_bert_decode took 21.34463684493676 sec





Unnamed: 0,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,engaged_with_user_follower_count,...,engaged_with_user_is_verified,engaged_with_user_account_creation,engaging_user_id,engaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,engaging_user_account_creation,engagee_follows_engager,reply,tweet
0,,0C8E7372269942BB173EED7C0A72DA09,,,,Quote,488B32D24BD4BB44172EB981C1BCA6FA,2021-02-28 14:49:42,8B7BB615A39DF112B0037E960C27F220,956,...,False,2020-08-02 17:13:46,3044AFDB7E977FB7F62D49C5C97794B7,1128,4028,False,2014-12-28 10:37:53,False,0,"[CLS] Tbh, anyone who thinks this way is not considered [UNK] progressive [UNK] & amp ; I reserve the deepest contempt & amp ; disgust for them. I consider them regressive for ignoring the evidence of their own eyes & amp ; an unthinking fool for ignoring science. ¶ ¶ Call it my red line. I don [UNK] t like male cheats. [SEP]"
1,,858720854891DFED04A5B91758049833,Photo,,,TopLevel,313ECD3A1E5BB07406E4249475C2D6D6,2021-02-26 10:30:36,74B09D5BC3FCE5CC4DEE34BF049A0EE5,8723,...,False,2020-02-13 18:36:45,23ACD97ACEF57BF6416670BE1133A664,72,115,False,2020-09-14 04:40:07,False,0,[CLS] https : / / t. co / jLrINby1Ux [SEP]
2,90C52DDF506D1C98EE678E84C08C36AB\tCD8639BABE547D9B9BBBB8E962450E2B\t6F84D430D26397EF87A1DC2A7717E75F\t03176616480890152CF7583112DBBD87\t062B238B5042E31C6FD1468742975CDF\t78D6EF9ED489837B54DC07B2F7FFFB21\t16E89E56FDFA9CC96F1F39EFF822F0A3\t6DAEFDE99D17E5D7DF550AFE89432DBE\tE743B8FCBF73E688676EAB7DCA4AD5AB\tF6E3CABCCEAEFCC3030256CE4620875F,1988AA4069C46F57990B6983FBD427A6,,D249E94F6D37A7D5E37185E02C94CDFF,9EFF000CDB18B710CDDB43EE1D8C300B,Retweet,E7F038DE3EAD397AEC9193686C911677,2021-03-01 17:02:33,316C2BE47E445DA930E56C12F3AF31C5,86709,...,False,2009-06-15 03:32:36,99F829F88A12BF8B92EDF11A10B6533C,130,638,False,2009-11-01 11:58:13,False,0,[CLS] RT @ hanryu _ pia : # 韓 流 ぴあ 3 月 号 （ 2 / 22 発 売 ） 表 紙 # テギョン （ # 2PM ） # 옥택연 ¶ ¶ [UNK] 特 典 付 き 本 誌 のショップ 一 覧 はこちら ¶ https : / / t. co / DDOiF0Raip ¶ ¶ # ソンシギョン # AB6IX # SOOHYUN # HOON （ # UKISS ） # ON [UNK] [SEP]
3,D704B4128E35F9BC995E701523676542\tCFD3AF9039C66BB22F7463A6C901225B\t7791383E39A388C0E5915549805FB527\t1C795E14677C7DBF412738236634F599,89C86150124016236B7D4A286B041210,,,,Retweet,E7F038DE3EAD397AEC9193686C911677,2021-03-02 19:55:32,C19FC40FBF0AA0BAC4797BD47483349B,2445,...,False,2015-11-14 05:15:25,A3A7B6D928FAD730BA9967B53EA7CC28,234,640,False,2020-09-10 08:10:09,False,0,[CLS] RT @ TomAndJerry _ JP : ／ ¶ # トムジェリ 仲 良 しチャレンジ [UNK] ¶ # 7 日 目 ¶ ＼ ¶ ¶ いつものケンカで # トムとジェリー が ¶ 世 界 が 注 目 するウェディングを 台 無 [UNK] ¶ でも 《 まさかの 友 情 》 で 奇 跡 を 起 こす [UNK] ¶ 『 # 映 画 トムジェリ 』 3 / 19 公 開 [UNK] ¶ ¶ [UNK] ¶ [UNK] [UNK] [SEP]
4,,971197A8E595128D8BAF8F3F9D20CFF3,Photo\tPhoto,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,2021-03-01 00:38:33,46846BF25E56F8BA11C0AE36ED31B28F,20672,...,False,2019-09-19 22:02:19,BA94D75FC03B42F88CAE61485E075960,8,208,False,2018-07-29 15:59:42,False,0,[CLS] EVERYONE WITH THEIR COWBOY SKINS AND CONNOR IS JUST https : / / t. co / TpiCfdzRe2 [SEP]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,2227FE4B3B1D59BE04F11FBCE146E96B,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,2021-02-28 22:11:16,2969768AB934329E3F77C6BF9389D2EC,6682,...,False,2020-05-25 22:12:02,5DAD7F04EF963C9AA67A2301A8B82D81,359,413,False,2020-06-01 11:52:22,True,0,[CLS] As a player would you rather win the World Cup or the Champions League? [UNK] [SEP]
9996,,4CD4E99F32B693D7127185B99BBC16EC,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,2021-03-03 01:44:43,7C5CA5A790B34D58C92A6B1D66A5C29C,53055,...,False,2007-07-08 01:47:26,01449F871D9C25CD15A99A5BA7080B32,1036,2195,False,2011-03-12 16:31:57,False,0,[CLS] RT @ ericgeller : DOJ making some kind of national security cyber announcement tomorrow. These announcements are usually about charges. https [UNK] [SEP]
9997,,076F276223FEE8B65A8394000DDE1FF8,Photo,,,Retweet,E7F038DE3EAD397AEC9193686C911677,2021-03-02 21:00:48,5BF5EB3CB78D855670FE5E823FD09C19,18422,...,False,2017-10-28 13:06:19,604AE4955F348F51DC45B3186DB5AE6E,259,913,False,2019-05-04 12:53:41,True,0,[CLS] RT @ ikirito _ YOTSUBA : みんなおはよう!! [UNK] ¶ ¶ 今 日 もがんばろね♪ ¶ ¶ おはリプ ＆ いいね 待 ってます!! https : / / t. co / Y5VsC4xyCe [SEP]
9998,5DF4827AA8F4AA4168CAC78DC7821AA4\tEA2DBD95359DD6F403EE42CDAAFEE1E9\t6C9C78DC9BD194777258B17B8EDA3AF6,E48DD62AF5F4970DFC1D8F9080DCC55C,,,,TopLevel,5B6973BEB05212E396F3F2DC6A31B71C,2021-03-03 11:37:57,8BBCDE5904602037AB3863910C5E724B,8644,...,False,2020-03-02 22:16:41,C094354FC31B6759C0FCC2EA886A0A11,342,370,False,2020-05-11 19:44:37,True,0,[CLS] برای کمک به کادر درمان آزمون دکتری رو به تعویق بندازید. ¶ # کرونا ¶ # تعويق _ آزمون _ دکتری1400 _ وزارت _ علوم ¶ # جلوگیری _ از _ پیک _ چهارم _ کرونا [SEP]


In [11]:
from sklearn.metrics import log_loss, average_precision_score
import numpy as np
def compute_AP(pred, gt):
    return average_precision_score(gt, pred)
def compute_rce_fast(pred, gt):
    cross_entropy = log_loss(gt, pred)
    yt = np.mean(gt)     
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

pred = predictor.predict(test_data)

In [12]:
compute_AP(pred = pred.to_numpy(), gt=test_data['reply'].to_numpy())

0.0215

In [13]:
compute_rce_fast(pred = pred.to_numpy(), gt=test_data['reply'].to_numpy())

-618.5820598652679

In [14]:
predictor.features('transformed')

['engaged_with_user_follower_count',
 'engaged_with_user_following_count',
 'engaged_with_user_is_verified',
 'engaging_user_follower_count',
 'enaging_user_following_count',
 'enaging_user_is_verified',
 'engagee_follows_engager',
 'hashtags',
 'tweet_id',
 'present_media',
 'present_links',
 'present_domains',
 'tweet_type',
 'language',
 'engaged_with_user_id',
 'engaging_user_id',
 'tweet',
 'tweet_timestamp',
 'tweet_timestamp.day',
 'tweet_timestamp.dayofweek',
 'engaged_with_user_account_creation',
 'engaged_with_user_account_creation.year',
 'engaged_with_user_account_creation.month',
 'engaged_with_user_account_creation.day',
 'engaged_with_user_account_creation.dayofweek',
 'engaging_user_account_creation',
 'engaging_user_account_creation.year',
 'engaging_user_account_creation.month',
 'engaging_user_account_creation.day',
 'engaging_user_account_creation.dayofweek',
 'tweet.char_count',
 'tweet.word_count',
 'tweet.capital_ratio',
 'tweet.lower_ratio',
 'tweet.digit_ratio'

In [15]:
predictor.feature_importance(test_data, feature_stage='transformed')

Computing feature importance via permutation shuffling for 6451 features using 5000 rows with 5 shuffle sets...
	5124.57s	= Expected runtime (1024.91s per shuffle set)
	5657.87s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
engagee_follows_engager,0.00008,0.000110,0.088904,5,0.000306,-0.000146
engaged_with_user_follower_count,0.00000,0.000000,0.500000,5,0.000000,0.000000
__nlp__.solo,0.00000,0.000000,0.500000,5,0.000000,0.000000
__nlp__.sometimes,0.00000,0.000000,0.500000,5,0.000000,0.000000
__nlp__.something,0.00000,0.000000,0.500000,5,0.000000,0.000000
...,...,...,...,...,...,...
tweet.symbol_count.:,-0.00004,0.000089,0.813050,5,0.000144,-0.000224
tweet_type,-0.00004,0.000089,0.813050,5,0.000144,-0.000224
engaged_with_user_id,-0.00008,0.000110,0.911096,5,0.000146,-0.000306
engaging_user_account_creation.day,-0.00012,0.000110,0.964758,5,0.000106,-0.000346
