In [1]:
import os, time
# General-purpose wall-clock timestamp taken when the notebook starts running.
start = time.time()
# Notebook-wide reference point; the last cell subtracts it from the current
# time to report the total runtime in minutes.
very_start = time.time()

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
# Raise both pandas display limits to 500 so wide/long frames are not truncated
# when rendered in the notebook.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 500)

In [3]:
import dask
from dask.distributed import Client, wait, LocalCluster
import dask.dataframe as dd

In [4]:

# Dask settings must be in place BEFORE the cluster is started: the worker picks
# up its configuration (including the scratch/spill directory) when it is
# created, so calling dask.config.set(...) after Client(...) does not reach the
# worker that is already running.
dask.config.set(shuffle='disk')
dask.config.set({'temporary_directory': '/path/to/dask_tmp'})

# Single worker process with a large thread pool; Client(...) implicitly starts
# a local cluster with these settings.
client = Client(n_workers=1,
                threads_per_worker=96,
                memory_limit='480GB',ip='10.1.0.xxx')

<dask.config.set at 0x7f54785723d0>

# Load Train

In [5]:
%%time
path = '/path/to/bin/tmp/'

# Open the raw train / validation parquet shards as dask DataFrames.
# NOTE(review): `path` is not referenced by the two reads below.
train = dd.read_parquet('/path/to/dask_tmp/train_10_raw/*.parquet')
valid = dd.read_parquet('/path/to/dask_tmp/valid_10_raw/*.parquet')

CPU times: user 100 ms, sys: 23.2 ms, total: 123 ms
Wall time: 112 ms


In [6]:
# Remove the listed columns from both splits in one pass.
cols_drop = ['tweet','tweet_nortsign']
train, valid = (frame.drop(cols_drop, axis=1) for frame in (train, valid))

In [7]:
# Cast the count columns to int32 in both splits using a single
# column -> dtype mapping.
dtype_names = ['count_words', 'count_char', 'count_ats']
int32_casts = {column: 'int32' for column in dtype_names}
train = train.astype(int32_casts)
valid = valid.astype(int32_casts)

In [8]:
# The four target columns; one model is trained per entry further down.
label_names = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

# Columns excluded from the feature list. Each name appears exactly once
# (the two *_account_creation columns used to be listed twice).
DONT_USE = ['tweet_id','tweet_timestamp','engaged_with_user_account_creation','enaging_user_account_creation','engage_time',
            'fold','enaging_user_id','engaged_with_user_id', 'dt_dow', 'elapsed_time',
            'present_links','present_domains']
# The targets themselves must never be used as inputs.
DONT_USE += label_names

# Every remaining column of the training frame is a candidate feature.
features = [c for c in train.columns if c not in DONT_USE]

# Train Model Validate
We train on a random 10% sample of the first 5 days and validate on the last 2 days.

In [9]:
%%time

# Fraction of the loaded training rows to keep, and the seed that drives the draw.
SAMPLE_RATIO = 0.5
SEED = 42  # passed to sample() below so the seed is defined in exactly one place

if SAMPLE_RATIO < 1.0:
    print(len(train))                      # row count before sampling
    train = train.sample(frac=SAMPLE_RATIO,random_state=SEED)
    train, = dask.persist(train)           # keep the sampled partitions on the cluster
    print(len(train))                      # row count after sampling

# Pull the dask frame into a single in-memory DataFrame, then split the
# targets (Y_train) from the inputs (train).
train = train.compute()
Y_train = train[label_names]
train = train.drop(label_names,axis=1)

# Recompute the feature list against the final column set and display it.
features = [c for c in train.columns if c not in DONT_USE]
print('Using %i features:'%(len(features)),train.shape[1])
np.asarray(features)

21749425
10874713
Using 121 features: 121
CPU times: user 11.6 s, sys: 12.9 s, total: 24.6 s
Wall time: 2min 40s


array(['hashtags', 'present_media', 'tweet_type', 'language',
       'engaged_with_user_follower_count',
       'engaged_with_user_following_count',
       'engaged_with_user_is_verified', 'enaging_user_follower_count',
       'enaging_user_following_count', 'enaging_user_is_verified',
       'engagee_follows_engager', 'len_hashtags', 'len_domains',
       'len_links', 'dt_hour', 'dt_minute', 'dt_second', 'count_words',
       'count_char', 'tw_uhash', 'tw_hash', 'count_ats', 'hash0', 'hash1',
       'tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word',
       'tw_llast_word', 'tw_len', 'TE_present_media_reply_timestamp',
       'TE_tweet_type_reply_timestamp', 'TE_language_reply_timestamp',
       'TE_engaged_with_user_id_reply_timestamp',
       'TE_enaging_user_id_reply_timestamp',
       'TE_present_media_retweet_timestamp',
       'TE_tweet_type_retweet_timestamp', 'TE_language_retweet_timestamp',
       'TE_engaged_with_user_id_retweet_timestamp',
       'TE_enaging_

In [10]:
SAMPLE_RATIO = 0.052 # VAL SET NOW SIZE OF TEST SET
SEED = 42  # passed to sample() below so the seed is defined in exactly one place

if SAMPLE_RATIO < 1.0:
    print(len(valid))                      # row count before sampling
    valid = valid.sample(frac=SAMPLE_RATIO,random_state=SEED)
    valid, = dask.persist(valid)           # keep the sampled partitions on the cluster
    print(len(valid))                      # row count after sampling

# Pull the dask frame into a single in-memory DataFrame, then split the
# targets (Y_valid) from the inputs (valid).
valid = valid.compute()
Y_valid = valid[label_names]
valid = valid.drop(label_names,axis=1)

8585006
446420


In [21]:
import xgboost as xgb
print('XGB Version',xgb.__version__)

# Base XGBoost parameters shared by all target models.
xgb_parms = dict(
    max_depth=8,
    learning_rate=0.1,            # overridden per target by the training cell
    subsample=0.8,
    colsample_bytree=0.3,
    eval_metric='logloss',
    objective='binary:logistic',  # every target is a binary outcome
    nthread=94,
    tree_method='hist',
    #predictor='gpu_predictor',
)


XGB Version 1.0.2


In [22]:
# Guard: abort if any column label occurs more than once in the training frame,
# then report the shapes of both splits.
duplicated_mask = train.columns.duplicated()
if duplicated_mask.any():
    raise Exception(f'duplicated!: { train.columns[duplicated_mask] }')
print('no dup :) ')
print(f'X_train.shape {train.shape}')
print(f'X_valid.shape {valid.shape}')

no dup :) 
X_train.shape (10874713, 121)
X_valid.shape (446420, 121)


In [23]:
%%time

# Columns are picked by the dtype they have in `train`; the same columns are
# then converted to int8 in both `train` and `valid`.
bool_columns = [column for column in train.columns if train[column].dtype == 'bool']
for frame in (train, valid):
    for bool_col in bool_columns:
        frame[bool_col] = frame[bool_col].astype('int8')

CPU times: user 11.4 ms, sys: 3 ms, total: 14.4 ms
Wall time: 13.8 ms


In [24]:
# Preview the first rows of the training inputs (the label columns were split
# off into Y_train in an earlier cell).
train.head()

Unnamed: 0,hashtags,present_media,tweet_type,language,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,enaging_user_follower_count,enaging_user_following_count,enaging_user_is_verified,engagee_follows_engager,len_hashtags,len_domains,len_links,dt_hour,dt_minute,dt_second,count_words,count_char,tw_uhash,tw_hash,count_ats,hash0,hash1,tw_freq_hash,tw_first_word,tw_second_word,tw_last_word,tw_llast_word,tw_len,TE_present_media_reply_timestamp,TE_tweet_type_reply_timestamp,TE_language_reply_timestamp,TE_engaged_with_user_id_reply_timestamp,TE_enaging_user_id_reply_timestamp,TE_present_media_retweet_timestamp,TE_tweet_type_retweet_timestamp,TE_language_retweet_timestamp,TE_engaged_with_user_id_retweet_timestamp,TE_enaging_user_id_retweet_timestamp,TE_present_media_retweet_with_comment_timestamp,TE_tweet_type_retweet_with_comment_timestamp,TE_language_retweet_with_comment_timestamp,TE_engaged_with_user_id_retweet_with_comment_timestamp,TE_enaging_user_id_retweet_with_comment_timestamp,TE_present_media_like_timestamp,TE_tweet_type_like_timestamp,TE_language_like_timestamp,TE_engaged_with_user_id_like_timestamp,TE_enaging_user_id_like_timestamp,TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp,TE_engaged_with_user_id_tweet_type_language_reply_timestamp,TE_tw_first_word_tweet_type_language_reply_timestamp,TE_tw_last_word_tweet_type_language_reply_timestamp,TE_hash0_tweet_type_language_reply_timestamp,TE_hash1_tweet_type_language_reply_timestamp,TE_tw_uhash_tweet_type_language_reply_timestamp,TE_tw_hash_reply_timestamp,TE_tw_freq_hash_reply_timestamp,TE_present_media_tweet_type_language_engaged_with_user_is_verified_enaging_user_is_verified_engagee_follows_engager_reply_timestamp,TE_present_domains_present_media_tweet_type_language_reply_timestamp,TE_present_links_present_media_tweet_type_language_reply_timestamp,TE_hashtags_present_media_tweet_type_language
_reply_timestamp,TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_timestamp,TE_engaged_with_user_id_tweet_type_language_retweet_timestamp,TE_tw_first_word_tweet_type_language_retweet_timestamp,TE_tw_last_word_tweet_type_language_retweet_timestamp,TE_hash0_tweet_type_language_retweet_timestamp,TE_hash1_tweet_type_language_retweet_timestamp,TE_tw_uhash_tweet_type_language_retweet_timestamp,TE_tw_hash_retweet_timestamp,TE_tw_freq_hash_retweet_timestamp,TE_present_media_tweet_type_language_engaged_with_user_is_verified_enaging_user_is_verified_engagee_follows_engager_retweet_timestamp,TE_present_domains_present_media_tweet_type_language_retweet_timestamp,TE_present_links_present_media_tweet_type_language_retweet_timestamp,TE_hashtags_present_media_tweet_type_language_retweet_timestamp,TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp,TE_engaged_with_user_id_tweet_type_language_retweet_with_comment_timestamp,TE_tw_first_word_tweet_type_language_retweet_with_comment_timestamp,TE_tw_last_word_tweet_type_language_retweet_with_comment_timestamp,TE_hash0_tweet_type_language_retweet_with_comment_timestamp,TE_hash1_tweet_type_language_retweet_with_comment_timestamp,TE_tw_uhash_tweet_type_language_retweet_with_comment_timestamp,TE_tw_hash_retweet_with_comment_timestamp,TE_tw_freq_hash_retweet_with_comment_timestamp,TE_present_media_tweet_type_language_engaged_with_user_is_verified_enaging_user_is_verified_engagee_follows_engager_retweet_with_comment_timestamp,TE_present_domains_present_media_tweet_type_language_retweet_with_comment_timestamp,TE_present_links_present_media_tweet_type_language_retweet_with_comment_timestamp,TE_hashtags_present_media_tweet_type_language_retweet_with_comment_timestamp,TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_like_timestamp,TE_engaged
_with_user_id_tweet_type_language_like_timestamp,TE_tw_first_word_tweet_type_language_like_timestamp,TE_tw_last_word_tweet_type_language_like_timestamp,TE_hash0_tweet_type_language_like_timestamp,TE_hash1_tweet_type_language_like_timestamp,TE_tw_uhash_tweet_type_language_like_timestamp,TE_tw_hash_like_timestamp,TE_tw_freq_hash_like_timestamp,TE_present_media_tweet_type_language_engaged_with_user_is_verified_enaging_user_is_verified_engagee_follows_engager_like_timestamp,TE_present_domains_present_media_tweet_type_language_like_timestamp,TE_present_links_present_media_tweet_type_language_like_timestamp,TE_hashtags_present_media_tweet_type_language_like_timestamp,TE_present_media_elapsed_time,TE_tweet_type_elapsed_time,TE_language_elapsed_time,CE_present_media,CE_tweet_type,CE_language,CE_engaged_with_user_id,CE_enaging_user_id,CE_present_media_norm,CE_tweet_type_norm,CE_language_norm,CE_engaged_with_user_id_norm,CE_enaging_user_id_norm,DE_enaging_user_id_enaging_user_following_count_1,DE_enaging_user_id_enaging_user_following_count_-1,DE_enaging_user_id_language_1,DE_enaging_user_id_language_-1,a_ff_rate,b_ff_rate
1971849,0,0.0,1,0,334126,18414,0,33,39,0,0,0,0,0,16,31,11,33,170,1799,569692180,1,1799,0,569692180,924731,791917,1090,1090,14,0.032645,0.005752,0.028732,0.027621,0.0295,0.07928,0.112284,0.079864,0.10848,0.086867,0.006447,0.006206,0.007803,0.000457,0.00711,0.364952,0.246367,0.379281,0.343649,0.393745,0.006945,0.014197,0.0295,0.007291,0.010157,0.00657,0.010157,0.0295,0.0295,0.006809,0.007119,0.007119,0.007318,0.082877,0.015512,0.086867,0.091097,0.191127,0.094193,0.191127,0.086867,0.086867,0.080529,0.086885,0.086885,0.084484,0.004806,0.00127,0.00711,0.006127,0.004479,0.006772,0.004479,0.00711,0.00711,0.004662,0.005856,0.005856,0.005944,0.244193,0.168526,0.393745,0.226908,0.234804,0.279081,0.234804,0.393745,0.393745,0.236954,0.236088,0.236088,0.242079,1.2e-05,7e-06,9e-06,18765501.0,9965234,10957387,434.0,1.0,0.620148,0.329896,0.365925,1.526477e-05,4.597823e-08,-0.0,-0.0,-0.0,-0.0,0.055111,0.846154
1571286,0,5.0,1,4,490,529,0,75,378,0,1,0,0,0,8,51,18,7,56,6791,-279645608,1,6791,0,-279645608,1007816,155146,94416,94416,3,0.028661,0.005752,0.019377,0.026819,0.028096,0.087643,0.112284,0.082539,0.07897,0.082731,0.007457,0.006206,0.006672,0.006464,0.006771,0.449325,0.246367,0.415885,0.35795,0.374995,0.004904,0.0295,0.0295,0.004876,0.004917,0.004257,0.004917,0.0295,0.0295,0.004934,0.004565,0.004565,0.004755,0.109182,0.086867,0.086867,0.080474,0.081145,0.09917,0.081145,0.086867,0.086867,0.109122,0.09602,0.09602,0.095936,0.006926,0.00711,0.00711,0.00944,0.009518,0.006076,0.009518,0.00711,0.00711,0.006878,0.005263,0.005263,0.005506,0.335224,0.393745,0.393745,0.362603,0.365624,0.311002,0.365624,0.393745,0.393745,0.334552,0.324611,0.324611,0.330727,1.2e-05,7e-06,9e-06,6037524.0,9965234,1949730,4.0,3.0,0.19891,0.329896,0.063731,1.839129e-07,1.379347e-07,0.0,0.0,0.0,0.0,1.079592,0.198413
2062484,0,7.0,2,4,243,253,0,794,1002,0,1,0,0,0,18,8,50,2,35,0,-506757606,0,0,0,-506757606,-1,-1,-1,-1,0,0.020536,0.04233,0.019377,0.0295,0.028096,0.115903,0.077514,0.082539,0.086867,0.082731,0.008462,0.007917,0.006672,0.00711,0.006771,0.492777,0.482271,0.415885,0.393745,0.374995,0.05174,0.0295,0.03232,0.03232,0.025486,0.02722,0.025486,0.0295,0.0295,0.051742,0.016732,0.016732,0.017758,0.096642,0.086867,0.073602,0.073602,0.081918,0.081671,0.081918,0.086867,0.086867,0.097437,0.108256,0.108256,0.099966,0.013168,0.00711,0.007603,0.007603,0.007703,0.007572,0.007703,0.00711,0.00711,0.01321,0.008756,0.008756,0.009055,0.656567,0.393745,0.50578,0.50578,0.517565,0.523603,0.517565,0.393745,0.393745,0.655696,0.598832,0.598832,0.608145,1.1e-05,1.3e-05,9e-06,1705608.0,17904537,1949730,2.0,2.0,0.055562,0.587231,0.063731,9.195645e-08,9.195645e-08,0.0,0.0,0.0,-0.0,1.041152,0.792415
1419726,0,0.0,1,0,2207,572,0,2171,1363,0,1,0,0,0,6,19,5,17,107,2507,-214581439,1,2507,0,-214581439,796311,543910,93917,93917,5,0.032645,0.005752,0.028732,0.019033,0.026819,0.07928,0.112284,0.079864,0.185076,0.124425,0.006447,0.006206,0.007803,0.004587,0.006464,0.364952,0.246367,0.379281,0.479836,0.35795,0.008158,0.020345,0.0295,0.026819,0.004758,0.00657,0.004758,0.0295,0.0295,0.008161,0.007119,0.007119,0.007318,0.117667,0.163357,0.086867,0.07897,0.159172,0.094193,0.159172,0.086867,0.086867,0.116344,0.086885,0.086885,0.084484,0.008168,0.004903,0.00711,0.006464,0.017276,0.006772,0.017276,0.00711,0.00711,0.008109,0.005856,0.005856,0.005944,0.285034,0.478445,0.393745,0.35795,0.329636,0.279081,0.329636,0.393745,0.393745,0.280565,0.236088,0.236088,0.242079,1.2e-05,7e-06,9e-06,18765501.0,9965234,10957387,20.0,4.0,0.620148,0.329896,0.365925,7.356516e-07,1.379347e-07,0.0,0.0,-0.0,0.0,0.259175,1.59281
1416932,0,5.0,1,1,201,392,0,14,33,0,1,0,0,0,6,16,27,13,75,51238,448048590,1,51238,0,448048590,640219,633158,1080,1080,7,0.028661,0.005752,0.026728,0.026819,0.028096,0.087643,0.112284,0.055023,0.07897,0.082731,0.007457,0.006206,0.003553,0.006464,0.006771,0.449325,0.246367,0.419006,0.448859,0.422614,0.004478,0.028096,0.0295,0.005209,0.0295,0.003646,0.0295,0.0295,0.0295,0.004331,0.003759,0.003759,0.00406,0.089306,0.082731,0.086867,0.07331,0.086867,0.080555,0.086867,0.086867,0.086867,0.08892,0.082284,0.082284,0.087803,0.003817,0.006771,0.00711,0.001693,0.00711,0.003964,0.00711,0.00711,0.00711,0.003954,0.003115,0.003115,0.003566,0.328778,0.422614,0.393745,0.272628,0.393745,0.257394,0.393745,0.393745,0.393745,0.321286,0.333343,0.333343,0.339689,1.2e-05,7e-06,1.9e-05,6037524.0,9965234,5205178,4.0,2.0,0.19891,0.329896,0.169873,1.379347e-07,9.195645e-08,-0.0,-0.0,-0.0,-0.0,1.950249,0.424242


In [25]:
#label_names = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

In [26]:
%%time
# TRAIN AND VALIDATE
#
# Fits one XGBoost model per entry of `label_names` on `train` and stores the
# predictions for `valid` in the matching column of `oof`.
from contextlib import contextmanager

VERBOSE_EVAL = 50

# Per-target training schedule; entry k applies to label_names[k].
LR = [0.1,0.03,0.07,0.01]
NROUNDS = [300, 300, 500, 500]
assert len(LR) == len(NROUNDS) == len(label_names), 'need one LR / NROUNDS entry per label'


@contextmanager
def _timed(stage):
    """Print `stage`, run the with-body, then print the elapsed wall-clock seconds."""
    t0 = time.time()
    print(stage)
    yield
    print('Took %.1f seconds'%(time.time()-t0))


# oof[:, k] holds the validation predictions for label_names[k].
oof = np.zeros((len(valid),len(label_names)))

for i, name in enumerate(label_names):
    print('#'*25);print('###',name);print('#'*25)

    # Work on a copy so the shared xgb_parms dict is not left modified after
    # this cell has run.
    target_parms = dict(xgb_parms, learning_rate=LR[i])

    with _timed('Creating DMatrix...'):
        dtrain = xgb.DMatrix(data=train,label=Y_train[name])
        dvalid = xgb.DMatrix(data=valid,label=Y_valid[name])

    with _timed('Training...'):
        # No `evals` list is passed, so there is no per-round evaluation output
        # and no early stopping: every model runs its full NROUNDS[i] rounds.
        model = xgb.train(target_parms,
                          dtrain=dtrain,
                          num_boost_round=NROUNDS[i],
                          verbose_eval=VERBOSE_EVAL)

    with _timed('Predicting...'):
        oof[:, i] = model.predict(dvalid)

    # Drop the references before the next target's matrices are built.
    del model, dtrain, dvalid

#########################
### reply_timestamp
#########################
Creating DMatrix...
Took 29.6 seconds
Training...
Took 335.4 seconds
Predicting...
Took 0.2 seconds
#########################
### retweet_timestamp
#########################
Creating DMatrix...
Took 29.1 seconds
Training...
Took 354.4 seconds
Predicting...
Took 0.2 seconds
#########################
### retweet_with_comment_timestamp
#########################
Creating DMatrix...
Took 28.9 seconds
Training...
Took 519.4 seconds
Predicting...
Took 0.3 seconds
#########################
### like_timestamp
#########################
Creating DMatrix...
Took 28.7 seconds
Training...
Took 499.1 seconds
Predicting...
Took 0.4 seconds
CPU times: user 1d 13h 47min 10s, sys: 2min 10s, total: 1d 13h 49min 20s
Wall time: 30min 25s


In [27]:
# Validation targets as a plain ndarray, columns ordered as in label_names.
yvalid = Y_valid[label_names].to_numpy()

# Compute Validation Metrics

In [28]:
from sklearn.metrics import precision_recall_curve, auc, log_loss
from sklearn.metrics import average_precision_score

def compute_prauc(pred, gt):
    """Return the area under the precision-recall curve of `pred` against `gt`."""
    precision, recall, _ = precision_recall_curve(gt, pred)
    return auc(recall, precision)

def compute_AP(pred, gt):
    """Return the average-precision score of `pred` against the labels `gt`."""
    ap_score = average_precision_score(gt, pred)
    return ap_score

def calculate_ctr(gt):
    """Return the fraction of entries in `gt` that compare equal to 1."""
    n_positive = sum(1 for label in gt if label == 1)
    return n_positive / float(len(gt))

def compute_rce(pred, gt):
    """Return the relative cross-entropy of `pred` against `gt`, in percent.

    The model's log loss is compared with the log loss of a constant
    prediction equal to the positive rate of `gt`; the result is
    (1 - model / constant) * 100.
    """
    cross_entropy = log_loss(gt, pred)
    # Constant baseline: predict the observed positive rate for every row.
    baseline_pred = [calculate_ctr(gt)] * len(gt)
    strawman_cross_entropy = log_loss(gt, baseline_pred)
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

# FAST METRIC FROM GIBA
def compute_rce_fast(pred, gt):
    """Return the relative cross-entropy of `pred` against `gt`, in percent.

    Same formula as (1 - model / baseline) * 100, but the baseline entropy is
    evaluated in closed form from the mean of `gt` instead of calling
    log_loss on a constant prediction vector.
    """
    model_ce = log_loss(gt, pred)
    pos_rate = np.mean(gt)
    baseline_ce = -(pos_rate*np.log(pos_rate) + (1 - pos_rate)*np.log(1 - pos_rate))
    return (1.0 - model_ce/baseline_ce)*100.0

In [29]:
%%time
# Score each target's validation predictions and print one summary line per
# target; `txt` collects the same lines, each terminated by a newline.
report_lines = []
for idx, label in enumerate(label_names):
    ap = compute_AP(oof[:, idx], yvalid[:, idx])
    rce = compute_rce_fast(oof[:, idx], yvalid[:, idx])
    summary = f"{label:20} AP:{ap:.5f} RCE:{rce:.5f}"
    print(summary)
    report_lines.append(summary)
txt = ''.join(entry + '\n' for entry in report_lines)

reply_timestamp      AP:0.21047 RCE:20.12967
retweet_timestamp    AP:0.41480 RCE:22.02740
retweet_with_comment_timestamp AP:0.04506 RCE:10.30338
like_timestamp       AP:0.71223 RCE:18.40327
CPU times: user 800 ms, sys: 2.99 ms, total: 803 ms
Wall time: 777 ms


In [30]:
# Total wall-clock time since `very_start` was recorded in the first cell.
elapsed_minutes = (time.time()-very_start)/60.
print('This notebook took %.1f minutes'%elapsed_minutes)

This notebook took 71.4 minutes
