Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

### New Notebook

#### portting to use original datasets
### <font color = red> With Re-generated the features </font>

In [88]:
import os, time
start = time.time()
very_start = time.time()

In [89]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [90]:
import dask
from dask.distributed import Client, wait, LocalCluster
import dask.dataframe as dd

In [91]:

client = Client(n_workers=8, 
                       threads_per_worker=1,
                       memory_limit='200GB',ip='10.1.0.131')
#client = Client(ip='10.2.48.253',memory_limit='100GB')
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 41565 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://10.1.0.131:33490  Dashboard: http://10.1.0.131:41565/status,Cluster  Workers: 8  Cores: 8  Memory: 1.60 TB


# Load Train

In [92]:
%%time
path = '/mnt/DP_disk3/Recsys/nv'
### train = dd.read_parquet(f'{path}/train-preproc-fold-*.parquet')#,dtypes=dtypes)
### Notes: revious scripts cannot generate this dataset. 
### original tested was downloaded from here:  
###  https://github.com/rapidsai/dask-cuda/issues/337 

train = dd.read_parquet('/mnt/DP_disk3/Recsys/train-1.parquet')
test0 = dd.read_parquet('/mnt/DP_disk3/Recsys/test-0.parquet')
test1 = dd.read_parquet('/mnt/DP_disk3/Recsys/test-1.parquet')

CPU times: user 67.5 ms, sys: 12.6 ms, total: 80.1 ms
Wall time: 63.8 ms


In [93]:
train.shape, test0.shape, test1.shape

((Delayed('int-35d9ff05-960d-4e37-9268-1148d97ac824'), 27),
 (Delayed('int-30340cf5-66fb-4c47-bc2e-dd0ed0d3c9a0'), 27),
 (Delayed('int-7f367d98-eef2-416f-81b7-a0f426689237'), 27))

In [94]:
#train.dtypes

hashtags              object
tweet_id              uint32
media                  uint8
links                 object
domains               object
tweet_type             uint8
language               uint8
timestamp             uint32
a_user_id             uint32
a_follower_count      uint32
a_following_count     uint32
a_is_verified           bool
a_account_creation    uint32
b_user_id             uint32
b_follower_count      uint32
b_following_count     uint32
b_is_verified           bool
b_account_creation    uint32
b_follows_a             bool
reply                 uint32
retweet               uint32
retweet_comment       uint32
like                  uint32
id                    uint32
len_hashtags           int64
len_domains            int64
len_links              int64
dtype: object

In [95]:
%%time
# DROP UNUSED COLUMNS
#cols_drop = ['links','hashtags0', 'hashtags1', 'fold']
cols_drop = ['links','hashtags']

train = train.drop(cols_drop,axis=1)

CPU times: user 9.66 ms, sys: 3.12 ms, total: 12.8 ms
Wall time: 10.5 ms


In [96]:
%%time
train, = dask.persist(train)
print(type(train), train.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-bae199d2-0555-4ad4-93da-45cfbf8f9c93'), 25)
CPU times: user 3.36 ms, sys: 2.75 ms, total: 6.11 ms
Wall time: 4.61 ms


In [97]:
%%time
train = train.repartition(npartitions=8)
train, = dask.persist(train)
print(type(train), train.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-5e3d5035-c840-47b5-8fe8-3962ba4d6e9a'), 25)
CPU times: user 12.3 ms, sys: 3.2 ms, total: 15.5 ms
Wall time: 13.2 ms


In [98]:
#for i,p in enumerate(train.partitions):
#    print(i,len(p))

In [99]:
label_names = ['reply', 'retweet', 'retweet_comment', 'like']
for col in train.columns:
    if col in label_names:
        train[col] = train[col].astype('int64')
    elif train[col].dtype=='int64':
        train[col] = train[col].astype('int32')
    elif train[col].dtype=='int16':
        train[col] = train[col].astype('int8')

In [100]:
#train.dtypes

tweet_id              uint32
media                  uint8
domains               object
tweet_type             uint8
language               uint8
timestamp             uint32
a_user_id             uint32
a_follower_count      uint32
a_following_count     uint32
a_is_verified           bool
a_account_creation    uint32
b_user_id             uint32
b_follower_count      uint32
b_following_count     uint32
b_is_verified           bool
b_account_creation    uint32
b_follows_a             bool
reply                  int64
retweet                int64
retweet_comment        int64
like                   int64
id                    uint32
len_hashtags           int32
len_domains            int32
len_links              int32
dtype: object

gap: int32 -> uint32; tweet_id: unit32, links uint32, domains uint32; timestamp: datetime64[ns]
len_* -> uint8

In [101]:
%%time
train = train.reset_index(drop=True)

CPU times: user 21.7 ms, sys: 255 µs, total: 22 ms
Wall time: 19 ms


In [102]:
%%time
train, = dask.persist(train)
print(train.shape)

(Delayed('int-2156b0de-2a12-4e00-b150-dc32bb719629'), 25)
CPU times: user 22.3 ms, sys: 2.03 ms, total: 24.3 ms
Wall time: 22.1 ms


# <font color = red> Newly add code </font>

## Caculate Engage time & Elapsed Time

In [103]:
#train.head()

Unnamed: 0,tweet_id,media,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links
0,0,0,DBB05735DF475AB5DABEE791299AE3D1,2,54,1581131751,0,314265,139,True,1369082795,11153676,2,29,False,1568107028,False,0,0,0,1581156487,0,1,1,1
1,1,5,,2,9,1581527100,1,43089,2553,True,1244928766,11153677,26,161,False,1513935572,False,0,0,0,0,1,0,0,0
2,2,0,,2,38,1580979604,2,60258,773,False,1306719127,11153678,4,72,False,1573996260,False,0,0,0,1580992068,2,0,0,0
3,3,9,,1,38,1580993048,3,119321,220,False,1283447472,11153679,22,251,False,1439637842,False,0,0,0,1580993386,3,0,0,0
4,4,7,,1,47,1581143484,4,3106126,3740,False,1234718202,11153679,22,251,False,1439637842,False,0,0,0,0,4,0,0,0


In [105]:
%%time 

## New Add, convert timestamp to timestamp64 for split_time to run 

label_names = ["reply", "retweet", "retweet_comment", "like"]

train['timestamp'] = dd.to_datetime(train['timestamp'], unit="s")


CPU times: user 13.6 ms, sys: 1.11 ms, total: 14.7 ms
Wall time: 12.7 ms


In [106]:
#train.dtypes

tweet_id                      uint32
media                          uint8
domains                       object
tweet_type                     uint8
language                       uint8
timestamp             datetime64[ns]
a_user_id                     uint32
a_follower_count              uint32
a_following_count             uint32
a_is_verified                   bool
a_account_creation            uint32
b_user_id                     uint32
b_follower_count              uint32
b_following_count             uint32
b_is_verified                   bool
b_account_creation            uint32
b_follows_a                     bool
reply                          int64
retweet                        int64
retweet_comment                int64
like                           int64
id                            uint32
len_hashtags                   int32
len_domains                    int32
len_links                      int32
dtype: object

In [107]:
#train.head()['timestamp']

0   2020-02-08 03:15:51
1   2020-02-12 17:05:00
2   2020-02-06 09:00:04
3   2020-02-06 12:44:08
4   2020-02-08 06:31:24
Name: timestamp, dtype: datetime64[ns]

In [108]:
%%time 
# TIME FEATURES
# RAPIDS does this 5x faster than Pandas CPU
# If we didn't need to copy CPU to GPU to CPU, then 1300x faster!
def split_time(df):
    #gf = cudf.from_pandas(df[['timestamp']])
    df['dt_dow']  = df['timestamp'].dt.weekday#.to_array() 
    df['dt_hour'] = df['timestamp'].dt.hour#.to_array()
    df['dt_minute'] = df['timestamp'].dt.minute#.to_array()
    df['dt_second'] = df['timestamp'].dt.second#.to_array()
    return df

train = split_time(train)

CPU times: user 51.6 ms, sys: 4.65 ms, total: 56.2 ms
Wall time: 49.2 ms


In [112]:
train["timestamp"] = train["timestamp"].astype("int64") / 1e9
train["reply"] = train.reply.mask(train["reply"] == 0.0, np.nan)
train["retweet"] = train.retweet.mask(train['retweet'] == 0.0, np.nan)
train["retweet_comment"] = train.retweet_comment.mask(train['retweet_comment'] == 0.0, np.nan)
train["like"] = train.like.mask(train['like'] == 0.0, np.nan)

In [113]:
#train.head(1000)[['reply','retweet','retweet_comment','like']]

Unnamed: 0,reply,retweet,retweet_comment,like
0,,,,1.581156e+09
1,,,,
2,,,,1.580992e+09
3,,,,1.580993e+09
4,,,,
...,...,...,...,...
995,,,,1.581224e+09
996,,,,
997,,1.580958e+09,1.580958e+09,1.580958e+09
998,,,,


In [114]:
train["engage_time"] = train[label_names].min(1)

In [115]:
#train.head()[["engage_time","timestamp"]]

Unnamed: 0,engage_time,timestamp
0,1581156000.0,1581132000.0
1,,1581527000.0
2,1580992000.0,1580980000.0
3,1580993000.0,1580993000.0
4,,1581143000.0


In [116]:
train["elapsed_time"] = train["engage_time"] - train["timestamp"]


In [117]:
#train.head()[['elapsed_time','engage_time','timestamp']]

Unnamed: 0,elapsed_time,engage_time,timestamp
0,24736.0,1581156000.0,1581132000.0
1,,,1581527000.0
2,12464.0,1580992000.0,1580980000.0
3,338.0,1580993000.0,1580993000.0
4,,,1581143000.0


In [118]:
# Check this... NaN -> False * 1 -> 0, others -> Ture * 1 -> 1
train[label_names] = (train[label_names] > 0) * 1

In [121]:
#train.head()[['reply','retweet','retweet_comment','like']]

Unnamed: 0,reply,retweet,retweet_comment,like
0,0,0,0,1
1,0,0,0,0
2,0,0,0,1
3,0,0,0,1
4,0,0,0,0


In [123]:
#train['a_account_creation'] = dd.to_datetime(train['a_account_creation'], unit="s")
#train['b_account_creation'] = dd.to_datetime(train['b_account_creation'], unit="s")


In [124]:
#train.dtypes

tweet_id               uint32
media                   uint8
domains                object
tweet_type              uint8
language                uint8
timestamp             float64
a_user_id              uint32
a_follower_count       uint32
a_following_count      uint32
a_is_verified            bool
a_account_creation     uint32
b_user_id              uint32
b_follower_count       uint32
b_following_count      uint32
b_is_verified            bool
b_account_creation     uint32
b_follows_a              bool
reply                   int64
retweet                 int64
retweet_comment         int64
like                    int64
id                     uint32
len_hashtags            int32
len_domains             int32
len_links               int32
dt_dow                  int64
dt_hour                 int64
dt_minute               int64
dt_second               int64
engage_time           float64
elapsed_time          float64
dtype: object

def set_nan(ds):
    mask = ds == 0
    ds.loc[mask] = np.nan
    return ds
train['engage_time'] = train['engage_time'].map_partitions(set_nan)

In [125]:
#train['elapsed_time'] = train['engage_time'] - train['timestamp']
train['elapsed_time'] = train.elapsed_time.astype('float64')

In [126]:
print(train['elapsed_time'].min().compute(),train['elapsed_time'].max().compute())
print(train['elapsed_time'].mean().compute())

2.0 602391.0
16617.73330803138


In [127]:
#train.dtypes

tweet_id               uint32
media                   uint8
domains                object
tweet_type              uint8
language                uint8
timestamp             float64
a_user_id              uint32
a_follower_count       uint32
a_following_count      uint32
a_is_verified            bool
a_account_creation     uint32
b_user_id              uint32
b_follower_count       uint32
b_following_count      uint32
b_is_verified            bool
b_account_creation     uint32
b_follows_a              bool
reply                   int64
retweet                 int64
retweet_comment         int64
like                    int64
id                     uint32
len_hashtags            int32
len_domains             int32
len_links               int32
dt_dow                  int64
dt_hour                 int64
dt_minute               int64
dt_second               int64
engage_time           float64
elapsed_time          float64
dtype: object

# Feature Engineering 

In [128]:
%%time
# TRAIN FIRST 5 DAYS. VALIDATE LAST 2 DAYS
VALID_DOW = [1, 2]# order is [3, 4, 5, 6, 0, 1, 2]
valid = train[train['dt_dow'].isin(VALID_DOW)].reset_index(drop=True)
train = train[~train['dt_dow'].isin(VALID_DOW)].reset_index(drop=True)

CPU times: user 25.1 ms, sys: 1.83 ms, total: 26.9 ms
Wall time: 24.1 ms


In [129]:
%%time
train,valid = dask.persist(train,valid)
print(type(train), train.shape, valid.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-48462355-2086-4b34-9ef7-ffa95d400ced'), 31) (Delayed('int-e7cc8bd9-1423-43e0-a818-a30c6a4c3e1b'), 31)
CPU times: user 25.9 ms, sys: 2.96 ms, total: 28.8 ms
Wall time: 26.9 ms


In [130]:
%%time
train = train.set_index('timestamp')
valid = valid.set_index('timestamp')
train,valid = dask.persist(train,valid)
#train.head()

CPU times: user 26.9 s, sys: 3.79 s, total: 30.6 s
Wall time: 2min 13s


Unnamed: 0_level_0,tweet_id,media,domains,tweet_type,language,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links,dt_dow,dt_hour,dt_minute,dt_second,engage_time,elapsed_time
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
1580947000.0,1330909,9,3896E26D12C903F0A00B6B1BE9A9BEA3,2,59,128636,190873,1828,True,1386291082,19634226,1,69,False,1575657678,False,0,0,0,0,22371531,0,1,1,3,0,0,0,,
1580947000.0,4266140,0,,2,54,117672,267411,53195,True,1228701492,20615716,236,368,False,1498245483,False,0,0,0,0,32316490,0,0,0,3,0,0,0,,
1580947000.0,75708,5,,2,54,41258,9794149,90,True,1321282652,23079942,1,124,False,1575520904,False,0,0,0,0,92180073,0,0,0,3,0,0,0,,
1580947000.0,753219,0,4FC207F77ACC1A9B2D92E3BEFE6B4A04,2,54,4319,1370976,1945,True,1228241097,16304780,77,181,False,1247064687,False,0,0,0,0,66646539,0,1,1,3,0,0,0,,
1580947000.0,75708,5,,2,54,41258,9794149,90,True,1321282652,18611739,13,57,False,1544065409,False,0,0,0,0,38514688,0,0,0,3,0,0,0,,


In [131]:
%%time
train = train.reset_index()
valid = valid.reset_index()
train,valid = dask.persist(train,valid)
#train.head()

CPU times: user 276 ms, sys: 45.7 ms, total: 322 ms
Wall time: 1.28 s


Unnamed: 0,timestamp,tweet_id,media,domains,tweet_type,language,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links,dt_dow,dt_hour,dt_minute,dt_second,engage_time,elapsed_time
0,1580947000.0,1330909,9,3896E26D12C903F0A00B6B1BE9A9BEA3,2,59,128636,190873,1828,True,1386291082,19634226,1,69,False,1575657678,False,0,0,0,0,22371531,0,1,1,3,0,0,0,,
1,1580947000.0,4266140,0,,2,54,117672,267411,53195,True,1228701492,20615716,236,368,False,1498245483,False,0,0,0,0,32316490,0,0,0,3,0,0,0,,
2,1580947000.0,75708,5,,2,54,41258,9794149,90,True,1321282652,23079942,1,124,False,1575520904,False,0,0,0,0,92180073,0,0,0,3,0,0,0,,
3,1580947000.0,753219,0,4FC207F77ACC1A9B2D92E3BEFE6B4A04,2,54,4319,1370976,1945,True,1228241097,16304780,77,181,False,1247064687,False,0,0,0,0,66646539,0,1,1,3,0,0,0,,
4,1580947000.0,75708,5,,2,54,41258,9794149,90,True,1321282652,18611739,13,57,False,1544065409,False,0,0,0,0,38514688,0,0,0,3,0,0,0,,


In [132]:
#for i,p in enumerate(train.partitions):
#    print(i,len(p))

In [133]:
#for i,p in enumerate(valid.partitions):
#    print(i,len(p))

### Target Encode

In [134]:
class MTE_one_shot:
    
    def __init__(self, folds, smooth, seed=42):
        self.folds = folds
        self.seed = seed
        self.smooth = smooth
        
    def fit_transform(self, train, x_col, y_col, y_mean=None, out_col = None, out_dtype=None):
        
        self.y_col = y_col
        np.random.seed(self.seed)
        
        if 'fold' not in train.columns:
            fsize = len(train)//self.folds
            train['fold'] = 1
            train['fold'] = train['fold'].cumsum()
            train['fold'] = train['fold']//fsize
            train['fold'] = train['fold']%self.folds
        
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'TE_{tag}_{self.y_col}'
        
        if y_mean is None:
            y_mean = train[y_col].mean()#.compute().astype('float32')
        self.mean = y_mean
        
        cols = ['fold',x_col] if isinstance(x_col,str) else ['fold']+x_col
        
        agg_each_fold = train.groupby(cols).agg({y_col:['count','sum']}).reset_index()
        agg_each_fold.columns = cols + ['count_y','sum_y']
        
        agg_all = agg_each_fold.groupby(x_col).agg({'count_y':'sum','sum_y':'sum'}).reset_index()
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all.columns = cols + ['count_y_all','sum_y_all']
        
        agg_each_fold = agg_each_fold.merge(agg_all,on=x_col,how='left')
        agg_each_fold['count_y_all'] = agg_each_fold['count_y_all'] - agg_each_fold['count_y']
        agg_each_fold['sum_y_all'] = agg_each_fold['sum_y_all'] - agg_each_fold['sum_y']
        agg_each_fold[out_col] = (agg_each_fold['sum_y_all']+self.smooth*self.mean)/(agg_each_fold['count_y_all']+self.smooth)
        agg_each_fold = agg_each_fold.drop(['count_y_all','count_y','sum_y_all','sum_y'],axis=1)
        
        agg_all[out_col] = (agg_all['sum_y_all']+self.smooth*self.mean)/(agg_all['count_y_all']+self.smooth)
        agg_all = agg_all.drop(['count_y_all','sum_y_all'],axis=1)
        self.agg_all = agg_all
        
        train.columns
        cols = ['fold',x_col] if isinstance(x_col,str) else ['fold']+x_col
        train = train.merge(agg_each_fold,on=cols,how='left')
        del agg_each_fold
        #self.agg_each_fold = agg_each_fold
        #train[out_col] = train.map_partitions(lambda cudf_df: cudf_df[out_col].nans_to_nulls())
        train[out_col] = train[out_col].fillna(self.mean)
        
        if out_dtype is not None:
            train[out_col] = train[out_col].astype(out_dtype)
        return train
    
    def transform(self, test, x_col, out_col = None, out_dtype=None):
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'TE_{tag}_{self.y_col}'
        test = test.merge(self.agg_all,on=x_col,how='left')
        test[out_col] = test[out_col].fillna(self.mean)
        if out_dtype is not None:
            test[out_col] = test[out_col].astype(out_dtype)
        return test
 

TE_media_reply 17.8 seconds<br>
TE_tweet_type_reply 27.1 seconds<br>
TE_language_reply 52.5 seconds<br>
TE_a_user_id_reply 180.0 seconds<br>

In [135]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
idx = 0; cols = []
start = time.time()
for t in ['reply', 'retweet', 'retweet_comment', 'like']:
    start = time.time()
    for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id']:
        out_col = f'TE_{c}_{t}'
        encoder = MTE_one_shot(folds=5,smooth=20)
        train = encoder.fit_transform(train, c, t, out_col=out_col, out_dtype='float32')
        valid = encoder.transform(valid, c, out_col=out_col, out_dtype='float32')
        cols.append(out_col)
        train,valid = dask.persist(train,valid)
        del encoder
        #train.head()
        wait(train)
        wait(valid)
        print(out_col,"%.1f seconds"%(time.time()-start))

TE_media_reply 31.8 seconds
TE_tweet_type_reply 53.6 seconds
TE_language_reply 75.8 seconds
TE_a_user_id_reply 143.4 seconds
TE_b_user_id_reply 277.6 seconds
TE_media_retweet 23.1 seconds
TE_tweet_type_retweet 41.5 seconds
TE_language_retweet 61.7 seconds
TE_a_user_id_retweet 121.4 seconds
TE_b_user_id_retweet 262.7 seconds
TE_media_retweet_comment 26.3 seconds
TE_tweet_type_retweet_comment 53.2 seconds
TE_language_retweet_comment 81.3 seconds
TE_a_user_id_retweet_comment 152.4 seconds
TE_b_user_id_retweet_comment 300.9 seconds
TE_media_like 24.1 seconds
TE_tweet_type_like 48.8 seconds
TE_language_like 69.8 seconds
TE_a_user_id_like 132.2 seconds
TE_b_user_id_like 264.4 seconds
CPU times: user 3min 49s, sys: 33.3 s, total: 4min 23s
Wall time: 18min 25s


In [136]:
train['fold'].value_counts().compute()

0    13979362
4    13979361
3    13979361
2    13979361
1    13979361
Name: fold, dtype: int64

### Multiple Column Target Encode

In [137]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
idx = 0; cols=[]
c = ['domains','language','b_follows_a','tweet_type','media','a_is_verified']
for t in ['reply', 'retweet', 'retweet_comment', 'like']:
    out_col = f'TE_multi_{t}'
    encoder = MTE_one_shot(folds=5,smooth=20)
    train = encoder.fit_transform(train, c, t, out_col=out_col, out_dtype='float32')
    valid = encoder.transform(valid, c, out_col=out_col, out_dtype='float32')
    cols.append(out_col)
    del encoder

CPU times: user 791 ms, sys: 107 ms, total: 898 ms
Wall time: 777 ms


In [138]:
%%time
train,valid = dask.persist(train,valid)
wait(train)
wait(valid)

CPU times: user 27 s, sys: 3.93 s, total: 30.9 s
Wall time: 2min 11s


DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-67d05b9063caaa9b4cdbe24a9a727d29', 0)>, <Future: finished, type: pandas.DataFrame, key: ('assign-67d05b9063caaa9b4cdbe24a9a727d29', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-67d05b9063caaa9b4cdbe24a9a727d29', 6)>, <Future: finished, type: pandas.DataFrame, key: ('assign-67d05b9063caaa9b4cdbe24a9a727d29', 7)>, <Future: finished, type: pandas.DataFrame, key: ('assign-67d05b9063caaa9b4cdbe24a9a727d29', 1)>, <Future: finished, type: pandas.DataFrame, key: ('assign-67d05b9063caaa9b4cdbe24a9a727d29', 4)>, <Future: finished, type: pandas.DataFrame, key: ('assign-67d05b9063caaa9b4cdbe24a9a727d29', 3)>, <Future: finished, type: pandas.DataFrame, key: ('assign-67d05b9063caaa9b4cdbe24a9a727d29', 2)>}, not_done=set())

### Elapsed Time Target Encode

In [139]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
start = time.time()
idx = 0; cols = []
for c in ['media', 'tweet_type', 'language']:#, 'a_user_id', 'b_user_id']:
    for t in ['elapsed_time']:
        out_col = f'TE_{c}_{t}'
        encoder = MTE_one_shot(folds=5,smooth=20)
        train = encoder.fit_transform(train, c, t, out_col=out_col)
        out_dtype='float32' #if 'user_id' in c else None
        valid = encoder.transform(valid, c, out_col=out_col, out_dtype=out_dtype)
        cols.append(out_col)
        print(out_col,"%.1f seconds"%(time.time()-start))
        #del encoder

TE_media_elapsed_time 0.2 seconds
TE_tweet_type_elapsed_time 0.3 seconds
TE_language_elapsed_time 0.5 seconds
CPU times: user 477 ms, sys: 62 ms, total: 539 ms
Wall time: 467 ms


In [140]:
%%time
train,valid = dask.persist(train,valid)
wait(train)
wait(valid)

CPU times: user 13.1 s, sys: 1.98 s, total: 15.1 s
Wall time: 1min 1s


DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-28c0465bee32ef41209e9dd6b689e021', 4)>, <Future: finished, type: pandas.DataFrame, key: ('assign-28c0465bee32ef41209e9dd6b689e021', 0)>, <Future: finished, type: pandas.DataFrame, key: ('assign-28c0465bee32ef41209e9dd6b689e021', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-28c0465bee32ef41209e9dd6b689e021', 6)>, <Future: finished, type: pandas.DataFrame, key: ('assign-28c0465bee32ef41209e9dd6b689e021', 2)>, <Future: finished, type: pandas.DataFrame, key: ('assign-28c0465bee32ef41209e9dd6b689e021', 7)>, <Future: finished, type: pandas.DataFrame, key: ('assign-28c0465bee32ef41209e9dd6b689e021', 3)>, <Future: finished, type: pandas.DataFrame, key: ('assign-28c0465bee32ef41209e9dd6b689e021', 1)>}, not_done=set())

### Count Encode

In [141]:
class FrequencyEncoder:
    
    def __init__(self, seed=42):
        self.seed = seed
        
    def fit_transform(self, train, x_col, c_col=None, out_col = None):
        np.random.seed(self.seed)
        if c_col is None or c_col not in train.columns:
            c_col = 'dummy'
            train[c_col] = 1
            drop = True
        else:
            drop = False
            
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'CE_{tag}_norm'
            
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all = train.groupby(cols).agg({c_col:'count'}).reset_index()
        if drop:
            train = train.drop(c_col,axis=1)
        agg_all.columns = cols + [out_col]
        agg_all[out_col] = agg_all[out_col].astype('int32')
        agg_all[out_col] = agg_all[out_col]*1.0/len(train)
        agg_all[out_col] = agg_all[out_col].astype('float32')
    
        train = train.merge(agg_all,on=cols,how='left')
        del agg_all
        #print(train.columns)
        #train[out_col] = train.map_partitions(lambda cudf_df: cudf_df[out_col].nans_to_nulls())
        return train
    
    def transform(self, test, x_col, c_col=None, out_col = None):
        return self.fit_transform(test, x_col, c_col, out_col)
 

In [142]:
class CountEncoder:
    
    def __init__(self, seed=42):
        self.seed = seed
        
    def fit_transform(self, train, test, x_col, out_col = None):
        np.random.seed(self.seed)
        
        common_cols = [i for i in train.columns if i in test.columns and i!=x_col]

        if len(common_cols):
            c_col = common_cols[0]
            drop = False
        else:
            c_col = 'dummy'
            train[c_col] = 1
            test[c_col]=1
            drop = True
            
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'CE_{tag}_norm'
            
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all = train.groupby(cols).agg({c_col:'count'}).reset_index()
        agg_all.columns = cols + [out_col]
        
        agg_test = test.groupby(cols).agg({c_col:'count'}).reset_index()
        agg_test.columns = cols + [out_col+'_test']
        agg_all = agg_all.merge(agg_test,on=cols,how='left')
        agg_all[out_col+'_test'] = agg_all[out_col+'_test'].fillna(0)
        agg_all[out_col] = agg_all[out_col] + agg_all[out_col+'_test']
        agg_all = agg_all.drop(out_col+'_test', axis=1)
        del agg_test
            
        if drop:
            train = train.drop(c_col,axis=1)
            test = test.drop(c_col,axis=1)
        train = train.merge(agg_all,on=cols,how='left')
        test = test.merge(agg_all,on=cols,how='left')
        del agg_all
        return train,test

In [143]:
%%time
# cuDF CE ENCODING IS SUPER FAST!!
start = time.time()
idx = 0; cols = []
for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id']:
    encoder = CountEncoder()
    out_col = f'CE_{c}'
    train,valid = encoder.fit_transform(train, valid, c, out_col=out_col)
    print
    del encoder
    train,valid = dask.persist(train,valid)
    wait(train)
    wait(valid)
    print(out_col,"%.1f seconds"%(time.time()-start))

CE_media 21.1 seconds
CE_tweet_type 42.9 seconds
CE_language 67.3 seconds
CE_a_user_id 110.9 seconds
CE_b_user_id 185.0 seconds
CPU times: user 37.8 s, sys: 5.6 s, total: 43.4 s
Wall time: 3min 5s


In [144]:
%%time
# cuDF CE ENCODING IS SUPER FAST!!
idx = 0; cols = []
start = time.time()
for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id']:
    encoder = FrequencyEncoder()
    out_col = f'CE_{c}_norm'
    train = encoder.fit_transform(train, c, c_col='tweet_id', out_col=out_col)
    valid = encoder.transform(valid, c, c_col='tweet_id', out_col=out_col)
    cols.append(out_col)
    del encoder
    train,valid = dask.persist(train,valid)
    wait(train)
    wait(valid)
    print(out_col,"%.1f seconds"%(time.time()-start))

CE_media_norm 29.2 seconds
CE_tweet_type_norm 55.1 seconds
CE_language_norm 80.0 seconds
CE_a_user_id_norm 114.6 seconds
CE_b_user_id_norm 166.6 seconds
CPU times: user 35 s, sys: 5.2 s, total: 40.2 s
Wall time: 2min 46s


### Difference Encode (Lag Features)

In [145]:
def diff_encode_cudf_v1(train,col,tar,sft=1):
    train[col+'_sft'] = train[col].shift(sft)
    train[tar+'_sft'] = train[tar].shift(sft)
    out_col = f'DE_{col}_{tar}_{sft}'
    train[out_col] = train[tar]-train[tar+'_sft']
    mask = '__MASK__'
    train[mask] = train[col] == train[col+'_sft']
    train = train.drop([col+'_sft',tar+'_sft'],axis=1)
    train[out_col] = train[out_col]*train[mask]
    train = train.drop(mask,axis=1)
    return train

In [146]:
%%time
start = time.time()
# cuDF DE ENCODING IS FAST!!
idx = 0; cols = []; sc = 'timestamp'
for c in ['b_user_id']:
    for t in ['b_follower_count','b_following_count','language']:
        for s in [1,-1]:
            start = time.time()
            train = diff_encode_cudf_v1(train, col=c, tar=t, sft=s)
            valid = diff_encode_cudf_v1(valid, col=c, tar=t, sft=s)
            train,valid = dask.persist(train,valid)
            wait(train)
            wait(valid)
            end = time.time(); idx += 1
            print('DE',c,t,s,'%.1f seconds'%(end-start))

DE b_user_id b_follower_count 1 18.1 seconds
DE b_user_id b_follower_count -1 15.1 seconds
DE b_user_id b_following_count 1 16.0 seconds
DE b_user_id b_following_count -1 19.6 seconds
DE b_user_id language 1 17.6 seconds
DE b_user_id language -1 16.1 seconds
CPU times: user 21.7 s, sys: 3.36 s, total: 25.1 s
Wall time: 1min 42s


### Diff Language

In [147]:
train_lang = train[['a_user_id', 'language', 'tweet_id']].drop_duplicates()
valid_lang = valid[['a_user_id', 'language', 'tweet_id']].drop_duplicates()
train_lang_count = train_lang.groupby(['a_user_id', 'language']).agg({'tweet_id':'count'}).reset_index()
valid_lang_count = valid_lang.groupby(['a_user_id', 'language']).agg({'tweet_id':'count'}).reset_index()
train_lang_count,valid_lang_count = dask.persist(train_lang_count,valid_lang_count)
train_lang_count.head()
del train_lang,valid_lang

In [148]:
%%time
train_lang_count = train_lang_count.merge(valid_lang_count,on=['a_user_id', 'language'],how='left')
train_lang_count['tweet_id_y'] = train_lang_count['tweet_id_y'].fillna(0)
train_lang_count['tweet_id_x'] = train_lang_count['tweet_id_x'] + train_lang_count['tweet_id_y']
train_lang_count = train_lang_count.drop('tweet_id_y',axis=1)
train_lang_count.columns = ['a_user_id', 'top_language', 'language_count']
train_lang_count, = dask.persist(train_lang_count)
train_lang_count.head()

CPU times: user 1.09 s, sys: 163 ms, total: 1.25 s
Wall time: 5.14 s


Unnamed: 0,a_user_id,top_language,language_count
0,0,54,51.0
1,1,9,75.0
2,1,47,3.0
3,1,54,4.0
4,1,61,1.0


In [149]:
%%time

train_lang_count = train_lang_count.sort_values(['a_user_id', 'language_count'])
train_lang_count['a_user_shifted'] = train_lang_count['a_user_id'].shift(1)
train_lang_count = train_lang_count[train_lang_count['a_user_id']!=train_lang_count['a_user_shifted']]
train_lang_count = train_lang_count.drop(['a_user_shifted','language_count'],axis=1)
train_lang_count.columns = ['a_user_id','top_language']
train_lang_count, = dask.persist(train_lang_count)
train_lang_count.head()

AttributeError: 'DataFrame' object has no attribute 'sort_values'

In [150]:
def diff_language(df,df_lang_count):
    df = df.merge(df_lang_count,how='left', left_on='b_user_id', right_on='a_user_id')
    df['nan_language'] = df['top_language'].isnull()
    df['same_language'] = df['language'] == df['top_language']
    df['diff_language'] = df['language'] != df['top_language']
    df['same_language'] = df['same_language']*(1-df['nan_language'])
    df['diff_language'] = df['diff_language']*(1-df['nan_language'])
    df = df.drop('top_language',axis=1)
    return df

In [151]:
#%%time
#train = diff_language(train,train_lang_count)
#valid = diff_language(valid,train_lang_count)
#train,valid = dask.persist(train,valid)
#train.head()

## Rate feature

In [152]:
%%time
# follow rate feature
train['a_ff_rate'] = (train['a_following_count'] / train['a_follower_count']).astype('float32')
train['b_ff_rate'] = (train['b_follower_count']  / train['b_following_count']).astype('float32')
valid['a_ff_rate']  = (valid['a_following_count'] / valid['a_follower_count']).astype('float32')
valid['b_ff_rate']  = (valid['b_follower_count']  / valid['b_following_count']).astype('float32')

CPU times: user 69.5 ms, sys: 13.4 ms, total: 82.9 ms
Wall time: 71.5 ms


In [153]:
train,valid = dask.persist(train,valid)

In [154]:
wait(train)
wait(valid)

DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-d0e2021b2ba699b069bd361197565a2a', 2)>, <Future: finished, type: pandas.DataFrame, key: ('assign-d0e2021b2ba699b069bd361197565a2a', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-d0e2021b2ba699b069bd361197565a2a', 1)>, <Future: finished, type: pandas.DataFrame, key: ('assign-d0e2021b2ba699b069bd361197565a2a', 3)>, <Future: finished, type: pandas.DataFrame, key: ('assign-d0e2021b2ba699b069bd361197565a2a', 7)>, <Future: finished, type: pandas.DataFrame, key: ('assign-d0e2021b2ba699b069bd361197565a2a', 4)>, <Future: finished, type: pandas.DataFrame, key: ('assign-d0e2021b2ba699b069bd361197565a2a', 0)>, <Future: finished, type: pandas.DataFrame, key: ('assign-d0e2021b2ba699b069bd361197565a2a', 6)>}, not_done=set())

# Summarize Features

In [155]:
%%time

label_names = ['reply', 'retweet', 'retweet_comment', 'like']
#DONT_USE = ['tweet_id','timestamp','a_account_creation','b_account_creation','engage_time',
#            'fold','b_user_id','a_user_id', 'dt_dow',
#            'a_account_creation', 'b_account_creation', 
#             'links','domains','hashtags0','hashtags1']
DONT_USE = ['tweet_id','timestamp','a_account_creation','b_account_creation',
            'fold','b_user_id','a_user_id', 'dt_dow',
            'a_account_creation', 'b_account_creation', 
             'domains']
DONT_USE += label_names
features = [c for c in train.columns if c not in DONT_USE]

RMV = [c for c in DONT_USE if c in train.columns and c not in label_names]
RMV

CPU times: user 178 µs, sys: 0 ns, total: 178 µs
Wall time: 186 µs


['tweet_id',
 'timestamp',
 'a_account_creation',
 'b_account_creation',
 'fold',
 'b_user_id',
 'a_user_id',
 'dt_dow',
 'a_account_creation',
 'b_account_creation',
 'domains']

In [156]:
%%time

for col in RMV:
    #print(col, col in train.columns)
    if col in train.columns:
        train = train.drop(col,axis=1)
        train, = dask.persist(train)
        train.head()
        

CPU times: user 4.46 s, sys: 564 ms, total: 5.03 s
Wall time: 19.4 s


In [157]:
%%time

for col in RMV:
    #print(col, col in valid.columns)
    if col in valid.columns:
        valid = valid.drop(col,axis=1)
        valid, = dask.persist(valid,)
        valid.head()
        

CPU times: user 3.05 s, sys: 412 ms, total: 3.46 s
Wall time: 13.5 s


In [158]:
#train.dtypes

media                                  uint8
tweet_type                             uint8
language                               uint8
a_follower_count                      uint32
a_following_count                     uint32
a_is_verified                         object
b_follower_count                      uint32
b_following_count                     uint32
b_is_verified                           bool
b_follows_a                           object
reply                                  int64
retweet                                int64
retweet_comment                        int64
like                                   int64
id                                    uint32
len_hashtags                           int32
len_domains                            int32
len_links                              int32
dt_hour                                int64
dt_minute                              int64
dt_second                              int64
engage_time                          float64
elapsed_ti

# Train Model Validate
We will train on random 10% of first 5 days and validation on last 2 days

In [159]:
%%time

SAMPLE_RATIO = 0.1
SEED = 1

if SAMPLE_RATIO < 1.0:
    print(len(train))
    train = train.sample(frac=SAMPLE_RATIO,random_state=42)
    train, = dask.persist(train)
    train.head()
    print(len(train))

train = train.compute()
Y_train = train[label_names]
train = train.drop(label_names,axis=1)

features = [c for c in train.columns if c not in DONT_USE]
print('Using %i features:'%(len(features)),train.shape[1])
np.asarray(features)

69896806
6989680
Using 64 features: 64
CPU times: user 1min 6s, sys: 18.8 s, total: 1min 25s
Wall time: 12.2 s


array(['media', 'tweet_type', 'language', 'a_follower_count',
       'a_following_count', 'a_is_verified', 'b_follower_count',
       'b_following_count', 'b_is_verified', 'b_follows_a', 'id',
       'len_hashtags', 'len_domains', 'len_links', 'dt_hour', 'dt_minute',
       'dt_second', 'engage_time', 'elapsed_time', 'TE_media_reply',
       'TE_tweet_type_reply', 'TE_language_reply', 'TE_a_user_id_reply',
       'TE_b_user_id_reply', 'TE_media_retweet', 'TE_tweet_type_retweet',
       'TE_language_retweet', 'TE_a_user_id_retweet',
       'TE_b_user_id_retweet', 'TE_media_retweet_comment',
       'TE_tweet_type_retweet_comment', 'TE_language_retweet_comment',
       'TE_a_user_id_retweet_comment', 'TE_b_user_id_retweet_comment',
       'TE_media_like', 'TE_tweet_type_like', 'TE_language_like',
       'TE_a_user_id_like', 'TE_b_user_id_like', 'TE_multi_reply',
       'TE_multi_retweet', 'TE_multi_retweet_comment', 'TE_multi_like',
       'TE_media_elapsed_time', 'TE_tweet_type_elapsed_t

In [160]:
SAMPLE_RATIO = 0.35 # VAL SET NOW SIZE OF TEST SET
SEED = 1
if SAMPLE_RATIO < 1.0:
    print(len(valid))
    valid = valid.sample(frac=SAMPLE_RATIO,random_state=42)
    valid, = dask.persist(valid)
    valid.head()
    print(len(valid))
    
valid = valid.compute()
Y_valid = valid[label_names]
valid = valid.drop(label_names,axis=1)

26703585
9346254


In [161]:
#train.head()
#valid.head()
#valid.dtypes



media                                  uint8
tweet_type                             uint8
language                               uint8
a_follower_count                       int64
a_following_count                      int64
a_is_verified                           bool
b_follower_count                       int64
b_following_count                      int64
b_is_verified                           bool
b_follows_a                             bool
id                                     int64
len_hashtags                           int32
len_domains                            int32
len_links                              int32
dt_hour                                int64
dt_minute                              int64
dt_second                              int64
engage_time                          float64
elapsed_time                         float64
TE_media_reply                       float32
TE_tweet_type_reply                  float32
TE_language_reply                    float32
TE_a_user_

In [162]:
#train.head()

Unnamed: 0,media,tweet_type,language,a_follower_count,a_following_count,a_is_verified,b_follower_count,b_following_count,b_is_verified,b_follows_a,id,len_hashtags,len_domains,len_links,dt_hour,dt_minute,dt_second,engage_time,elapsed_time,TE_media_reply,TE_tweet_type_reply,TE_language_reply,TE_a_user_id_reply,TE_b_user_id_reply,TE_media_retweet,TE_tweet_type_retweet,TE_language_retweet,TE_a_user_id_retweet,TE_b_user_id_retweet,TE_media_retweet_comment,TE_tweet_type_retweet_comment,TE_language_retweet_comment,TE_a_user_id_retweet_comment,TE_b_user_id_retweet_comment,TE_media_like,TE_tweet_type_like,TE_language_like,TE_a_user_id_like,TE_b_user_id_like,TE_multi_reply,TE_multi_retweet,TE_multi_retweet_comment,TE_multi_like,TE_media_elapsed_time,TE_tweet_type_elapsed_time,TE_language_elapsed_time,CE_media,CE_tweet_type,CE_language,CE_a_user_id,CE_b_user_id,CE_media_norm,CE_tweet_type_norm,CE_language_norm,CE_a_user_id_norm,CE_b_user_id_norm,DE_b_user_id_b_follower_count_1,DE_b_user_id_b_follower_count_-1,DE_b_user_id_b_following_count_1,DE_b_user_id_b_following_count_-1,DE_b_user_id_language_1,DE_b_user_id_language_-1,a_ff_rate,b_ff_rate
6300707,0,2,22,5024,5535,False,478,120,False,True,79512587,0,0,0,12,38,39,1581013000.0,20591.0,0.02631,0.031931,0.017247,0.044936,0.02195,0.102205,0.098213,0.058201,0.064094,0.096142,0.006495,0.007319,0.003518,0.00405,0.006074,0.407764,0.515156,0.498548,0.537495,0.397152,0.024145,0.105756,0.006682,0.436867,16767.637441,19599.645658,20622.761078,59710620.0,57135959,2717747,40.0,4.0,0.616467,0.592586,0.028073,4.148974e-07,5.722722e-08,0.0,0.0,-0.0,-0.0,-0.0,-0.0,1.101712,3.983333
848427,0,0,54,4336,1681,False,117,212,False,True,89637884,0,1,1,1,28,42,1580959000.0,6941.0,0.02631,0.031916,0.022051,0.075239,0.02195,0.102205,0.094236,0.097156,0.064094,0.096142,0.006495,0.00726,0.007454,0.00405,0.006074,0.407764,0.497789,0.433831,0.507192,0.397152,0.024145,0.105756,0.006682,0.436867,16767.637441,17594.849751,16782.726253,59710620.0,8617943,42245460,19.0,5.0,0.616467,0.088347,0.437819,2.146021e-07,5.722722e-08,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.387684,0.551887
9748437,5,2,47,1487367,496,True,79,494,False,False,38181493,0,0,0,17,46,24,1581019000.0,8090.0,0.023223,0.031931,0.018596,0.017805,0.067405,0.093026,0.098213,0.09202,0.13954,0.096142,0.006239,0.007319,0.00623,0.003189,0.006074,0.481159,0.515156,0.474556,0.752948,0.397152,0.024145,0.105756,0.006682,0.436867,18569.632329,19599.645658,18953.320645,18701994.0,57135959,4837584,9077.0,9.0,0.194344,0.592586,0.05037,8.936031e-05,5.722722e-08,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.000333,0.159919
327248,0,1,3,68,250,False,118,101,False,True,76480241,0,0,0,0,33,21,1580973000.0,24210.0,0.02631,0.007328,0.025706,0.022995,0.02195,0.102205,0.125494,0.117096,0.10072,0.141596,0.006495,0.005672,0.008113,0.006364,0.006074,0.407764,0.285715,0.394001,0.416064,0.397152,0.024145,0.105756,0.006682,0.436867,16767.637441,11875.342533,20594.046798,59710620.0,30846489,9548294,2.0,8.0,0.616467,0.319067,0.099489,2.861361e-08,1.001476e-07,-0.0,0.0,-0.0,0.0,-0.0,0.0,3.676471,1.168317
4109083,7,2,36,16447503,231,True,0,24,False,False,38882861,0,0,0,8,15,28,,,0.018,0.031931,0.025559,0.053032,0.020121,0.133054,0.098213,0.17741,0.139,0.08813,0.007127,0.007319,0.007634,0.014368,0.005568,0.548119,0.515156,0.569845,0.572032,0.405723,0.024145,0.105756,0.006682,0.436867,18925.012391,19599.645658,23142.533537,6999036.0,57135959,1046354,23829.0,7.0,0.072724,0.592586,0.010675,0.0002323711,7.153402e-08,-0.0,-0.0,-0.0,-0.0,0.0,0.0,1.4e-05,0.0


In [164]:
#train.dtypes

media                                  uint8
tweet_type                             uint8
language                               uint8
a_follower_count                       int64
a_following_count                      int64
a_is_verified                           bool
b_follower_count                       int64
b_following_count                      int64
b_is_verified                           bool
b_follows_a                             bool
id                                     int64
len_hashtags                           int32
len_domains                            int32
len_links                              int32
dt_hour                                int64
dt_minute                              int64
dt_second                              int64
engage_time                          float64
elapsed_time                         float64
TE_media_reply                       float32
TE_tweet_type_reply                  float32
TE_language_reply                    float32
TE_a_user_

In [165]:
#valid.dtypes

media                                  uint8
tweet_type                             uint8
language                               uint8
a_follower_count                       int64
a_following_count                      int64
a_is_verified                           bool
b_follower_count                       int64
b_following_count                      int64
b_is_verified                           bool
b_follows_a                             bool
id                                     int64
len_hashtags                           int32
len_domains                            int32
len_links                              int32
dt_hour                                int64
dt_minute                              int64
dt_second                              int64
engage_time                          float64
elapsed_time                         float64
TE_media_reply                       float32
TE_tweet_type_reply                  float32
TE_language_reply                    float32
TE_a_user_

In [166]:
import xgboost as xgb
print('XGB Version',xgb.__version__)

xgb_parms = { 
    'max_depth':8, 
    'learning_rate':0.1, 
    'subsample':0.8,
    'colsample_bytree':0.3, 
    'eval_metric':'logloss',
    'objective':'binary:logistic',
    'nthread':40,
    'tree_method':'hist',
    #'predictor' : 'gpu_predictor'
}


XGB Version 1.2.1


In [167]:
if train.columns.duplicated().sum()>0:
    raise Exception(f'duplicated!: { train.columns[train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {train.shape}')
print(f'X_valid.shape {valid.shape}')

no dup :) 
X_train.shape (6989680, 64)
X_valid.shape (9346254, 64)


In [168]:
%%time

for col in train.columns:
    if train[col].dtype=='bool':
        train[col] = train[col].astype('int8')
        valid[col] = valid[col].astype('int8')

CPU times: user 1.22 s, sys: 98.1 ms, total: 1.32 s
Wall time: 36.6 ms


In [169]:
#Y_train[['reply','retweet','retweet_comment','like']]

Unnamed: 0,reply,retweet,retweet_comment,like
6300707,0,0,0,1
848427,0,0,0,1
9748437,0,0,0,1
327248,0,1,0,0
4109083,0,0,0,0
...,...,...,...,...
1041334,0,0,0,0
8616999,0,0,0,0
4155209,0,0,0,0
2983073,0,0,0,0


In [170]:
%%time
# TRAIN AND VALIDATE

NROUND = 300
VERBOSE_EVAL = 50
#ESR = 50
    
oof = np.zeros((len(valid),len(label_names)))
preds = []
for i in range(4):

    name = label_names[i]
    print('#'*25);print('###',name);print('#'*25)
       
    start = time.time(); print('Creating DMatrix...')
        
    dtrain = xgb.DMatrix(data=train,label=Y_train.iloc[:, i])
    dvalid = xgb.DMatrix(data=valid,label=Y_valid.iloc[:, i])
    print('Took %.1f seconds'%(time.time()-start))
             
    start = time.time(); print('Training...')
    model = xgb.train(xgb_parms, 
                           dtrain=dtrain,
                           #evals=[(dtrain,'train'),(dvalid,'valid')],
                           num_boost_round=NROUND,
                           #early_stopping_rounds=ESR,
                           verbose_eval=VERBOSE_EVAL) 
    print('Took %.1f seconds'%(time.time()-start))
        
    start = time.time(); print('Predicting...')
    #Y_valid[f'pred_{name}'] = xgb.dask.predict(client,model,valid)
    oof[:, i] += model.predict(dvalid)
    #preds.append(xgb.dask.predict(client,model,valid))
    print('Took %.1f seconds'%(time.time()-start))
        
    del model, dtrain, dvalid

#########################
### reply
#########################
Creating DMatrix...
Took 24.8 seconds
Training...
Took 89.4 seconds
Predicting...
Took 5.5 seconds
#########################
### retweet
#########################
Creating DMatrix...
Took 24.8 seconds
Training...
Took 90.7 seconds
Predicting...
Took 5.3 seconds
#########################
### retweet_comment
#########################
Creating DMatrix...
Took 25.1 seconds
Training...
Took 90.6 seconds
Predicting...
Took 5.3 seconds
#########################
### like
#########################
Creating DMatrix...
Took 24.8 seconds
Training...
Took 91.5 seconds
Predicting...
Took 5.4 seconds
CPU times: user 4h 7min 6s, sys: 5min 18s, total: 4h 12min 24s
Wall time: 8min 3s


In [171]:
yvalid = Y_valid[label_names].values

# Compute Validation Metrics

In [172]:
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(pred, gt):
  prec, recall, thresh = precision_recall_curve(gt, pred)
  prauc = auc(recall, prec)
  return prauc

def calculate_ctr(gt):
  positive = len([x for x in gt if x == 1])
  ctr = positive/float(len(gt))
  return ctr

def compute_rce(pred, gt):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

# FAST METRIC FROM GIBA
def compute_rce_fast(pred, gt):
    cross_entropy = log_loss(gt, pred)
    yt = np.mean(gt)     
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

In [173]:
%%time
txt = ''
for i in range(4):
    prauc = compute_prauc(oof[:,i], yvalid[:, i])
    rce   = compute_rce_fast(oof[:,i], yvalid[:, i])
    txt_ = f"{label_names[i]:20} PRAUC:{prauc:.5f} RCE:{rce:.5f}"
    print(txt_)
    txt += txt_ + '\n'

reply                PRAUC:0.24053 RCE:31.54185
retweet              PRAUC:0.70858 RCE:50.85064
retweet_comment      PRAUC:0.07924 RCE:22.78097
like                 PRAUC:0.97406 RCE:82.05389
CPU times: user 10min 10s, sys: 38.3 s, total: 10min 48s
Wall time: 18.1 s


In [174]:
print('This notebook took %.1f minutes'%((time.time()-very_start)/60.))

This notebook took 48.1 minutes
