In [1]:
import os, time
start = time.time()
very_start = time.time()

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
import dask
from dask.distributed import Client, wait, LocalCluster
import dask.dataframe as dd

In [4]:
client = Client(n_workers=8, 
                       threads_per_worker=10,
                       memory_limit='45GB',ip='10.1.0.xxx)
dask.config.set(shuffle='disk')
dask.config.set({'temporary_directory': '/path/to/dask_tmp'})

<dask.config.set at 0x7feb716b0c10>

# Load Train

In [5]:
%%time
path = '/path/to/all_data_10_raw_with_text_fe/'
files = os.listdir(path)
train = dd.read_parquet(f'{path}/*.parquet')#,dtypes=dtypes)

107
CPU times: user 68.3 ms, sys: 12.7 ms, total: 81 ms
Wall time: 72.6 ms


In [6]:
train.columns

Index(['tweet_id', 'text_tokens', 'hashtags', 'present_media', 'present_links',
       'present_domains', 'tweet_type', 'language', 'tweet_timestamp',
       'engaged_with_user_id', 'engaged_with_user_follower_count',
       'engaged_with_user_following_count', 'engaged_with_user_is_verified',
       'engaged_with_user_account_creation', 'enaging_user_id',
       'enaging_user_follower_count', 'enaging_user_following_count',
       'enaging_user_is_verified', 'enaging_user_account_creation',
       'engagee_follows_engager', 'reply_timestamp', 'retweet_timestamp',
       'retweet_with_comment_timestamp', 'like_timestamp', 'len_hashtags',
       'len_domains', 'len_links', 'engage_time', 'dt_dow', 'dt_hour',
       'dt_minute', 'dt_second', 'tweet', 'tweet_nortsign', 'count_words',
       'count_char', 'tw_uhash', 'tw_hash', 'count_ats', 'hash0', 'hash1',
       'tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word',
       'tw_llast_word', 'tw_len'],
      dtype='object')

In [7]:
%%time
# DROP UNUSED COLUMNS
cols_drop = ['text_tokens']
train = train.drop(cols_drop,axis=1)

CPU times: user 11.4 ms, sys: 2.63 ms, total: 14 ms
Wall time: 12.3 ms


In [8]:
%%time
train, = dask.persist(train)
print(type(train), train.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-3337961c-2a03-4468-a256-e7dd4edd7ce3'), 46)
CPU times: user 27.4 ms, sys: 2.67 ms, total: 30.1 ms
Wall time: 28.1 ms


In [9]:
%%time
train = train.repartition(npartitions=8)
train, = dask.persist(train)
print(type(train), train.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-1a16b6af-8434-49df-b5ae-5c70d049125e'), 46)
CPU times: user 18.6 ms, sys: 9.17 ms, total: 27.7 ms
Wall time: 18.6 ms


In [10]:
train.dtypes

tweet_id                               int32
hashtags                               int32
present_media                          int32
present_links                          int32
present_domains                        int32
tweet_type                             int32
language                               int32
tweet_timestamp                        int32
engaged_with_user_id                   int32
engaged_with_user_follower_count       int32
engaged_with_user_following_count      int32
engaged_with_user_is_verified           bool
engaged_with_user_account_creation     int32
enaging_user_id                        int32
enaging_user_follower_count            int32
enaging_user_following_count           int32
enaging_user_is_verified                bool
enaging_user_account_creation          int32
engagee_follows_engager                 bool
reply_timestamp                        int32
retweet_timestamp                      int32
retweet_with_comment_timestamp         int32
like_times

In [11]:
#for i,p in enumerate(train.partitions):
#    print(i,len(p))

In [12]:
label_names = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']
for col in train.columns:
    if col in label_names:
        train[col] = train[col].astype('float32')
    elif train[col].dtype=='int64':
        train[col] = train[col].astype('int32')
    elif train[col].dtype=='int16':
        train[col] = train[col].astype('int8')

In [13]:
%%time
train = train.reset_index(drop=True)

CPU times: user 11.7 ms, sys: 1.96 ms, total: 13.6 ms
Wall time: 10.2 ms


In [14]:
%%time
train, = dask.persist(train)
print(train.shape)

(Delayed('int-dbea8e3d-ad2f-456c-8938-3482c35515f6'), 46)
CPU times: user 6.46 ms, sys: 2.08 ms, total: 8.54 ms
Wall time: 7.14 ms


In [15]:
%%time
# ELAPSED TIME
for col in ['engage_time','tweet_timestamp']:
    train[col] = train[col].astype('int64')/1e9

CPU times: user 20.4 ms, sys: 4.65 ms, total: 25.1 ms
Wall time: 18.8 ms


In [16]:
%%time
train, = dask.persist(train)
print(type(train), train.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-bdfe6cca-d846-45a3-b54a-624c1491b16e'), 46)
CPU times: user 4.9 ms, sys: 2.38 ms, total: 7.28 ms
Wall time: 5.52 ms


In [17]:
def set_nan(ds):
    mask = ds == 0
    ds.loc[mask] = np.nan
    return ds
train['engage_time'] = train['engage_time'].map_partitions(set_nan)

In [18]:
train['elapsed_time'] = train['engage_time'] - train['tweet_timestamp']
train['elapsed_time'] = train.elapsed_time.astype('float64')

In [19]:
%%time
# TRAIN FIRST 5 DAYS. VALIDATE LAST 2 DAYS
VALID_DOW = [1, 2]# order is [3, 4, 5, 6, 0, 1, 2]
valid = train[train['dt_dow'].isin(VALID_DOW)].reset_index(drop=True)
train = train[~train['dt_dow'].isin(VALID_DOW)].reset_index(drop=True)

CPU times: user 21.5 ms, sys: 3.92 ms, total: 25.4 ms
Wall time: 19.9 ms


In [20]:
%%time
train,valid = dask.persist(train,valid)
print(type(train), train.shape, valid.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-8fe6a423-9353-44a7-9626-0844b8ee4852'), 47) (Delayed('int-39dc6880-d328-4882-aa91-8495cfb909eb'), 47)
CPU times: user 34.6 ms, sys: 3.33 ms, total: 37.9 ms
Wall time: 34.5 ms


In [21]:
%%time
train = train.set_index('tweet_timestamp')
valid = valid.set_index('tweet_timestamp')
train,valid = dask.persist(train,valid)
# train.head()

CPU times: user 6.99 s, sys: 873 ms, total: 7.87 s
Wall time: 1min 14s


In [22]:
%%time
train = train.reset_index()
valid = valid.reset_index()
train,valid = dask.persist(train,valid)
# train.head()

CPU times: user 34.6 ms, sys: 4.79 ms, total: 39.4 ms
Wall time: 31.2 ms


In [23]:
#for i,p in enumerate(train.partitions):
#    print(i,len(p))

In [24]:
#for i,p in enumerate(valid.partitions):
#    print(i,len(p))

### Target Encode

In [25]:
class MTE_one_shot:
    
    def __init__(self, folds, smooth, seed=42):
        self.folds = folds
        self.seed = seed
        self.smooth = smooth
        
    def fit_transform(self, train, x_col, y_col, y_mean=None, out_col = None, out_dtype=None):
        
        self.y_col = y_col
        np.random.seed(self.seed)
        
        if 'fold' not in train.columns:
            fsize = len(train)//self.folds
            train['fold'] = 1
            train['fold'] = train['fold'].cumsum()
            train['fold'] = train['fold']//fsize
            train['fold'] = train['fold']%self.folds
        
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'TE_{tag}_{self.y_col}'
        
        if y_mean is None:
            y_mean = train[y_col].mean()#.compute().astype('float32')
        self.mean = y_mean
        
        cols = ['fold',x_col] if isinstance(x_col,str) else ['fold']+x_col
        
        agg_each_fold = train.groupby(cols).agg({y_col:['count','sum']}).reset_index()
        agg_each_fold.columns = cols + ['count_y','sum_y']
        
        agg_all = agg_each_fold.groupby(x_col).agg({'count_y':'sum','sum_y':'sum'}).reset_index()
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all.columns = cols + ['count_y_all','sum_y_all']
        
        agg_each_fold = agg_each_fold.merge(agg_all,on=x_col,how='left')
        agg_each_fold['count_y_all'] = agg_each_fold['count_y_all'] - agg_each_fold['count_y']
        agg_each_fold['sum_y_all'] = agg_each_fold['sum_y_all'] - agg_each_fold['sum_y']
        agg_each_fold[out_col] = (agg_each_fold['sum_y_all']+self.smooth*self.mean)/(agg_each_fold['count_y_all']+self.smooth)
        agg_each_fold = agg_each_fold.drop(['count_y_all','count_y','sum_y_all','sum_y'],axis=1)
        
        agg_all[out_col] = (agg_all['sum_y_all']+self.smooth*self.mean)/(agg_all['count_y_all']+self.smooth)
        agg_all = agg_all.drop(['count_y_all','sum_y_all'],axis=1)
        self.agg_all = agg_all
        
        train.columns
        cols = ['fold',x_col] if isinstance(x_col,str) else ['fold']+x_col
        train = train.merge(agg_each_fold,on=cols,how='left')
        del agg_each_fold
        #self.agg_each_fold = agg_each_fold
        #train[out_col] = train.map_partitions(lambda cudf_df: cudf_df[out_col].nans_to_nulls())
        train[out_col] = train[out_col].fillna(self.mean)
        
        if out_dtype is not None:
            train[out_col] = train[out_col].astype(out_dtype)
        return train
    
    def transform(self, test, x_col, out_col = None, out_dtype=None):
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'TE_{tag}_{self.y_col}'
        test = test.merge(self.agg_all,on=x_col,how='left')
        test[out_col] = test[out_col].fillna(self.mean)
        if out_dtype is not None:
            test[out_col] = test[out_col].astype(out_dtype)
        return test
 

In [26]:
train.columns

Index(['tweet_timestamp', 'tweet_id', 'hashtags', 'present_media',
       'present_links', 'present_domains', 'tweet_type', 'language',
       'engaged_with_user_id', 'engaged_with_user_follower_count',
       'engaged_with_user_following_count', 'engaged_with_user_is_verified',
       'engaged_with_user_account_creation', 'enaging_user_id',
       'enaging_user_follower_count', 'enaging_user_following_count',
       'enaging_user_is_verified', 'enaging_user_account_creation',
       'engagee_follows_engager', 'reply_timestamp', 'retweet_timestamp',
       'retweet_with_comment_timestamp', 'like_timestamp', 'len_hashtags',
       'len_domains', 'len_links', 'engage_time', 'dt_dow', 'dt_hour',
       'dt_minute', 'dt_second', 'tweet', 'tweet_nortsign', 'count_words',
       'count_char', 'tw_uhash', 'tw_hash', 'count_ats', 'hash0', 'hash1',
       'tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word',
       'tw_llast_word', 'tw_len', 'elapsed_time'],
      dtype='object')

TE_media_reply 17.8 seconds<br>
TE_tweet_type_reply 27.1 seconds<br>
TE_language_reply 52.5 seconds<br>
TE_a_user_id_reply 180.0 seconds<br>

In [27]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
idx = 0; cols = []
start = time.time()
for t in ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']:
    start = time.time()
    for c in ['present_media', 'tweet_type', 'language', 'engaged_with_user_id', 'enaging_user_id']:
        out_col = f'TE_{c}_{t}'
        encoder = MTE_one_shot(folds=5,smooth=20)
        train = encoder.fit_transform(train, c, t, out_col=out_col, out_dtype='float32')
        valid = encoder.transform(valid, c, out_col=out_col, out_dtype='float32')
        cols.append(out_col)
        train,valid = dask.persist(train,valid)
        del encoder
        #train.head()
        wait(train)
        wait(valid)
        print(out_col,"%.1f seconds"%(time.time()-start))

TE_present_media_reply_timestamp 98.6 seconds
TE_tweet_type_reply_timestamp 109.2 seconds
TE_language_reply_timestamp 120.6 seconds
TE_engaged_with_user_id_reply_timestamp 148.4 seconds
TE_enaging_user_id_reply_timestamp 185.5 seconds
TE_present_media_retweet_timestamp 12.2 seconds
TE_tweet_type_retweet_timestamp 23.6 seconds
TE_language_retweet_timestamp 35.0 seconds
TE_engaged_with_user_id_retweet_timestamp 62.7 seconds
TE_enaging_user_id_retweet_timestamp 100.3 seconds
TE_present_media_retweet_with_comment_timestamp 12.3 seconds
TE_tweet_type_retweet_with_comment_timestamp 23.6 seconds
TE_language_retweet_with_comment_timestamp 34.6 seconds
TE_engaged_with_user_id_retweet_with_comment_timestamp 63.1 seconds
TE_enaging_user_id_retweet_with_comment_timestamp 101.5 seconds
TE_present_media_like_timestamp 12.8 seconds
TE_tweet_type_like_timestamp 24.5 seconds
TE_language_like_timestamp 35.7 seconds
TE_engaged_with_user_id_like_timestamp 65.1 seconds
TE_enaging_user_id_like_timestamp 102

In [28]:
train.columns

Index(['tweet_timestamp', 'tweet_id', 'hashtags', 'present_media',
       'present_links', 'present_domains', 'tweet_type', 'language',
       'engaged_with_user_id', 'engaged_with_user_follower_count',
       'engaged_with_user_following_count', 'engaged_with_user_is_verified',
       'engaged_with_user_account_creation', 'enaging_user_id',
       'enaging_user_follower_count', 'enaging_user_following_count',
       'enaging_user_is_verified', 'enaging_user_account_creation',
       'engagee_follows_engager', 'reply_timestamp', 'retweet_timestamp',
       'retweet_with_comment_timestamp', 'like_timestamp', 'len_hashtags',
       'len_domains', 'len_links', 'engage_time', 'dt_dow', 'dt_hour',
       'dt_minute', 'dt_second', 'tweet', 'tweet_nortsign', 'count_words',
       'count_char', 'tw_uhash', 'tw_hash', 'count_ats', 'hash0', 'hash1',
       'tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word',
       'tw_llast_word', 'tw_len', 'elapsed_time', 'fold',
       'TE_prese

### Multiple Column Target Encode

In [29]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
idx = 0; cols=[]

for t in ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']:
    for c in [
        ['present_domains','language','engagee_follows_engager','tweet_type','present_media','engaged_with_user_is_verified'],
        ['engaged_with_user_id','tweet_type','language'],
        ['tw_first_word','tweet_type','language'],
        ['tw_last_word','tweet_type','language'],
        ['hash0','tweet_type','language'],
        ['hash1','tweet_type','language'],
        ['tw_uhash','tweet_type','language'],
        ['tw_hash'],
        ['tw_freq_hash'],
        ['present_media','tweet_type','language','engaged_with_user_is_verified','enaging_user_is_verified','engagee_follows_engager'],
        ['present_domains','present_media','tweet_type','language'],
        ['present_links','present_media','tweet_type','language'],
        ['hashtags','present_media','tweet_type','language'],
        ]:
        start = time.time()
        out_col = 'TE_'+'_'.join(c)+'_'+t
        print(out_col)
        encoder = MTE_one_shot(folds=5,smooth=20)
        train = encoder.fit_transform(train, c, t, out_col=out_col, out_dtype='float32')
        valid = encoder.transform(valid, c, out_col=out_col, out_dtype='float32')
        cols.append(out_col)
        train,valid = dask.persist(train,valid)
        del encoder
        wait(train)
        wait(valid)
        print(out_col,"%.1f seconds"%(time.time()-start))

TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp
TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp 15.1 seconds
TE_engaged_with_user_id_tweet_type_language_reply_timestamp
TE_engaged_with_user_id_tweet_type_language_reply_timestamp 39.7 seconds
TE_tw_first_word_tweet_type_language_reply_timestamp
TE_tw_first_word_tweet_type_language_reply_timestamp 34.3 seconds
TE_tw_last_word_tweet_type_language_reply_timestamp
TE_tw_last_word_tweet_type_language_reply_timestamp 14.5 seconds
TE_hash0_tweet_type_language_reply_timestamp
TE_hash0_tweet_type_language_reply_timestamp 20.0 seconds
TE_hash1_tweet_type_language_reply_timestamp
TE_hash1_tweet_type_language_reply_timestamp 14.3 seconds
TE_tw_uhash_tweet_type_language_reply_timestamp
TE_tw_uhash_tweet_type_language_reply_timestamp 20.5 seconds
TE_tw_hash_reply_timestamp
TE_tw_hash_reply_timestamp 49.

In [30]:
%%time
train,valid = dask.persist(train,valid)
wait(train)
wait(valid)

CPU times: user 20.6 ms, sys: 3.54 ms, total: 24.2 ms
Wall time: 17.9 ms


DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-b1ec71164db1528c812475c2c3c8d4e9', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-b1ec71164db1528c812475c2c3c8d4e9', 2)>, <Future: finished, type: pandas.DataFrame, key: ('assign-b1ec71164db1528c812475c2c3c8d4e9', 6)>, <Future: finished, type: pandas.DataFrame, key: ('assign-b1ec71164db1528c812475c2c3c8d4e9', 7)>, <Future: finished, type: pandas.DataFrame, key: ('assign-b1ec71164db1528c812475c2c3c8d4e9', 3)>, <Future: finished, type: pandas.DataFrame, key: ('assign-b1ec71164db1528c812475c2c3c8d4e9', 1)>, <Future: finished, type: pandas.DataFrame, key: ('assign-b1ec71164db1528c812475c2c3c8d4e9', 0)>, <Future: finished, type: pandas.DataFrame, key: ('assign-b1ec71164db1528c812475c2c3c8d4e9', 4)>}, not_done=set())

### Elapsed Time Target Encode

In [31]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
start = time.time()
idx = 0; cols = []
for c in ['present_media', 'tweet_type', 'language']:#, 'a_user_id', 'b_user_id']:
    for t in ['elapsed_time']:
        out_col = f'TE_{c}_{t}'
        encoder = MTE_one_shot(folds=5,smooth=20)
        train = encoder.fit_transform(train, c, t, out_col=out_col)
        out_dtype='float32' #if 'user_id' in c else None
        valid = encoder.transform(valid, c, out_col=out_col, out_dtype=out_dtype)
        cols.append(out_col)
        print(out_col,"%.1f seconds"%(time.time()-start))
        #del encoder

TE_present_media_elapsed_time 0.2 seconds
TE_tweet_type_elapsed_time 0.4 seconds
TE_language_elapsed_time 0.5 seconds
CPU times: user 573 ms, sys: 58 ms, total: 631 ms
Wall time: 513 ms


In [32]:
%%time
train,valid = dask.persist(train,valid)
wait(train)
wait(valid)

CPU times: user 4.31 s, sys: 447 ms, total: 4.76 s
Wall time: 40.1 s


DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-ea2d4510f3129f596f86efc66236f544', 4)>, <Future: finished, type: pandas.DataFrame, key: ('assign-ea2d4510f3129f596f86efc66236f544', 1)>, <Future: finished, type: pandas.DataFrame, key: ('assign-ea2d4510f3129f596f86efc66236f544', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-ea2d4510f3129f596f86efc66236f544', 2)>, <Future: finished, type: pandas.DataFrame, key: ('assign-ea2d4510f3129f596f86efc66236f544', 0)>, <Future: finished, type: pandas.DataFrame, key: ('assign-ea2d4510f3129f596f86efc66236f544', 7)>, <Future: finished, type: pandas.DataFrame, key: ('assign-ea2d4510f3129f596f86efc66236f544', 3)>, <Future: finished, type: pandas.DataFrame, key: ('assign-ea2d4510f3129f596f86efc66236f544', 6)>}, not_done=set())

### Count Encode

In [33]:
class FrequencyEncoder:
    
    def __init__(self, seed=42):
        self.seed = seed
        
    def fit_transform(self, train, x_col, c_col=None, out_col = None):
        np.random.seed(self.seed)
        if c_col is None or c_col not in train.columns:
            c_col = 'dummy'
            train[c_col] = 1
            drop = True
        else:
            drop = False
            
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'CE_{tag}_norm'
            
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all = train.groupby(cols).agg({c_col:'count'}).reset_index()
        if drop:
            train = train.drop(c_col,axis=1)
        agg_all.columns = cols + [out_col]
        agg_all[out_col] = agg_all[out_col].astype('int32')
        agg_all[out_col] = agg_all[out_col]*1.0/len(train)
        agg_all[out_col] = agg_all[out_col].astype('float32')
    
        train = train.merge(agg_all,on=cols,how='left')
        del agg_all
        #print(train.columns)
        #train[out_col] = train.map_partitions(lambda cudf_df: cudf_df[out_col].nans_to_nulls())
        return train
    
    def transform(self, test, x_col, c_col=None, out_col = None):
        return self.fit_transform(test, x_col, c_col, out_col)
 

In [34]:
class CountEncoder:
    
    def __init__(self, seed=42):
        self.seed = seed
        
    def fit_transform(self, train, test, x_col, out_col = None):
        np.random.seed(self.seed)
        
        common_cols = [i for i in train.columns if i in test.columns and i!=x_col]

        if len(common_cols):
            c_col = common_cols[0]
            drop = False
        else:
            c_col = 'dummy'
            train[c_col] = 1
            test[c_col]=1
            drop = True
            
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'CE_{tag}_norm'
            
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all = train.groupby(cols).agg({c_col:'count'}).reset_index()
        agg_all.columns = cols + [out_col]
        
        agg_test = test.groupby(cols).agg({c_col:'count'}).reset_index()
        agg_test.columns = cols + [out_col+'_test']
        agg_all = agg_all.merge(agg_test,on=cols,how='left')
        agg_all[out_col+'_test'] = agg_all[out_col+'_test'].fillna(0)
        agg_all[out_col] = agg_all[out_col] + agg_all[out_col+'_test']
        agg_all = agg_all.drop(out_col+'_test', axis=1)
        del agg_test
            
        if drop:
            train = train.drop(c_col,axis=1)
            test = test.drop(c_col,axis=1)
        train = train.merge(agg_all,on=cols,how='left')
        test = test.merge(agg_all,on=cols,how='left')
        del agg_all
        return train,test

In [35]:
%%time
# cuDF CE ENCODING IS SUPER FAST!!
start = time.time()
idx = 0; cols = []
for c in ['present_media', 'tweet_type', 'language', 'engaged_with_user_id', 'enaging_user_id']:
    encoder = CountEncoder()
    out_col = f'CE_{c}'
    train,valid = encoder.fit_transform(train, valid, c, out_col=out_col)
    print
    del encoder
    train,valid = dask.persist(train,valid)
    wait(train)
    wait(valid)
    print(out_col,"%.1f seconds"%(time.time()-start))

CE_present_media 11.1 seconds
CE_tweet_type 23.5 seconds
CE_language 35.7 seconds
CE_engaged_with_user_id 54.1 seconds
CE_enaging_user_id 75.2 seconds
CPU times: user 7.6 s, sys: 898 ms, total: 8.5 s
Wall time: 1min 15s


In [36]:
%%time
# cuDF CE ENCODING IS SUPER FAST!!
idx = 0; cols = []
start = time.time()
for c in ['present_media', 'tweet_type', 'language', 'engaged_with_user_id', 'enaging_user_id']:
    encoder = FrequencyEncoder()
    out_col = f'CE_{c}_norm'
    train = encoder.fit_transform(train, c, c_col='tweet_id', out_col=out_col)
    valid = encoder.transform(valid, c, c_col='tweet_id', out_col=out_col)
    cols.append(out_col)
    del encoder
    train,valid = dask.persist(train,valid)
    wait(train)
    wait(valid)
    print(out_col,"%.1f seconds"%(time.time()-start))

CE_present_media_norm 15.0 seconds
CE_tweet_type_norm 29.6 seconds
CE_language_norm 44.5 seconds
CE_engaged_with_user_id_norm 62.1 seconds
CE_enaging_user_id_norm 82.0 seconds
CPU times: user 9.78 s, sys: 1.13 s, total: 10.9 s
Wall time: 1min 22s


### Difference Encode (Lag Features)

In [37]:
def diff_encode_cudf_v1(train,col,tar,sft=1):
    train[col+'_sft'] = train[col].shift(sft)
    train[tar+'_sft'] = traiclass FrequencyEncoder:
    
    def __init__(self, seed=42):
        self.seed = seed
        
    def fit_transform(self, train, x_col, c_col=None, out_col = None):
        np.random.seed(self.seed)
        if c_col is None or c_col not in train.columns:
            c_col = 'dummy'
            train[c_col] = 1
            drop = True
        else:
            drop = False
            
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'CE_{tag}_norm'
            
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all = train.groupby(cols).agg({c_col:'count'}).reset_index()
        if drop:
            train = train.drop(c_col,axis=1)
        agg_all.columns = cols + [out_col]
        agg_all[out_col] = agg_all[out_col].astype('int32')
        agg_all[out_col] = agg_all[out_col]*1.0/len(train)
        agg_all[out_col] = agg_all[out_col].astype('float32')
    
        train = train.merge(agg_all,on=cols,how='left')
        del agg_all
        #print(train.columns)
        #train[out_col] = train.map_partitions(lambda cudf_df: cudf_df[out_col].nans_to_nulls())
        return train
    
    def transform(self, test, x_col, c_col=None, out_col = None):
        return self.fit_transform(test, x_col, c_col, out_col)
 n[tar].shift(sft)
    out_col = f'DE_{col}_{tar}_{sft}'
    train[out_col] = train[tar]-train[tar+'_sft']
    mask = '__MASK__'
    train[mask] = train[col] == train[col+'_sft']
    train = train.drop([col+'_sft',tar+'_sft'],axis=1)
    train[out_col] = train[out_col]*train[mask]
    train = train.drop(mask,axis=1)
    return train

In [38]:
%%time
start = time.time()
# cuDF DE ENCODING IS FAST!!
idx = 0; cols = []; sc = 'tweet_timestamp'
for c in ['enaging_user_id']:
    for t in ['enaging_user_following_count','enaging_user_following_count','language']:
        for s in [1,-1]:
            start = time.time()
            train = diff_encode_cudf_v1(train, col=c, tar=t, sft=s)
            valid = diff_encode_cudf_v1(valid, col=c, tar=t, sft=s)
            train,valid = dask.persist(train,valid)
            wait(train)
            wait(valid)
            end = time.time(); idx += 1
            print('DE',c,t,s,'%.1f seconds'%(end-start))

DE enaging_user_id enaging_user_following_count 1 7.6 seconds
DE enaging_user_id enaging_user_following_count -1 6.0 seconds
DE enaging_user_id enaging_user_following_count 1 6.4 seconds
DE enaging_user_id enaging_user_following_count -1 7.1 seconds
DE enaging_user_id language 1 6.5 seconds
DE enaging_user_id language -1 6.1 seconds
CPU times: user 6.56 s, sys: 570 ms, total: 7.13 s
Wall time: 39.7 s


### Diff Language

In [39]:
train_lang = train[['engaged_with_user_id', 'language', 'tweet_id']].drop_duplicates()
valid_lang = valid[['engaged_with_user_id', 'language', 'tweet_id']].drop_duplicates()
train_lang_count = train_lang.groupby(['engaged_with_user_id', 'language']).agg({'tweet_id':'count'}).reset_index()
valid_lang_count = valid_lang.groupby(['engaged_with_user_id', 'language']).agg({'tweet_id':'count'}).reset_index()
train_lang_count,valid_lang_count = dask.persist(train_lang_count,valid_lang_count)
# train_lang_count.head()
del train_lang,valid_lang

In [40]:
%%time
train_lang_count = train_lang_count.merge(valid_lang_count,on=['engaged_with_user_id', 'language'],how='left')
train_lang_count['tweet_id_y'] = train_lang_count['tweet_id_y'].fillna(0)
train_lang_count['tweet_id_x'] = train_lang_count['tweet_id_x'] + train_lang_count['tweet_id_y']
train_lang_count = train_lang_count.drop('tweet_id_y',axis=1)
train_lang_count.columns = ['engaged_with_user_id', 'top_language', 'language_count']
train_lang_count, = dask.persist(train_lang_count)
# train_lang_count.head()

CPU times: user 58.2 ms, sys: 4.56 ms, total: 62.7 ms
Wall time: 51.1 ms


In [53]:
%%time

#train_lang_count = train_lang_count.sort_values(['engaged_with_user_id', 'language_count'])
train_lang_count['engaged_with_user_shifted'] = train_lang_count['engaged_with_user_id'].shift(1)
train_lang_count = train_lang_count[train_lang_count['engaged_with_user_id']!=train_lang_count['engaged_with_user_shifted']]
train_lang_count = train_lang_count.drop(['engaged_with_user_shifted','language_count'],axis=1)
train_lang_count.columns = ['engaged_with_user_id','top_language']
train_lang_count, = dask.persist(train_lang_count)
# train_lang_count.head()

CPU times: user 31.6 ms, sys: 7.31 ms, total: 38.9 ms
Wall time: 33.9 ms


In [54]:
def diff_language(df,df_lang_count):
    df = df.merge(df_lang_count,how='left', left_on='enaging_user_id', right_on='engaged_with_user_id')
    df['nan_language'] = df['top_language'].isnull()
    df['same_language'] = df['language'] == df['top_language']
    df['diff_language'] = df['language'] != df['top_language']
    df['same_language'] = df['same_language']*(1-df['nan_language'])
    df['diff_language'] = df['diff_language']*(1-df['nan_language'])
    df = df.drop('top_language',axis=1)
    return df

In [55]:
#%%time
#train = diff_language(train,train_lang_count)
#valid = diff_language(valid,train_lang_count)
#train,valid = dask.persist(train,valid)
#train.head()

## Rate feature

In [56]:
%%time
# follow rate feature
train['a_ff_rate'] = (train['engaged_with_user_following_count'] / train['engaged_with_user_follower_count']).astype('float32')
train['b_ff_rate'] = (train['enaging_user_follower_count']  / train['enaging_user_following_count']).astype('float32')
valid['a_ff_rate']  = (valid['engaged_with_user_following_count'] / valid['engaged_with_user_follower_count']).astype('float32')
valid['b_ff_rate']  = (valid['enaging_user_follower_count']  / valid['enaging_user_following_count']).astype('float32')

CPU times: user 117 ms, sys: 11.3 ms, total: 128 ms
Wall time: 118 ms


In [57]:
train,valid = dask.persist(train,valid)

In [58]:
wait(train)
wait(valid)

DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-0a5c015311656606ded3cf17af343870', 0)>, <Future: finished, type: pandas.DataFrame, key: ('assign-0a5c015311656606ded3cf17af343870', 1)>, <Future: finished, type: pandas.DataFrame, key: ('assign-0a5c015311656606ded3cf17af343870', 2)>, <Future: finished, type: pandas.DataFrame, key: ('assign-0a5c015311656606ded3cf17af343870', 6)>, <Future: finished, type: pandas.DataFrame, key: ('assign-0a5c015311656606ded3cf17af343870', 4)>, <Future: finished, type: pandas.DataFrame, key: ('assign-0a5c015311656606ded3cf17af343870', 3)>, <Future: finished, type: pandas.DataFrame, key: ('assign-0a5c015311656606ded3cf17af343870', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-0a5c015311656606ded3cf17af343870', 7)>}, not_done=set())

In [59]:
train.columns

Index(['hashtags', 'present_media', 'tweet_type', 'language',
       'engaged_with_user_follower_count', 'engaged_with_user_following_count',
       'engaged_with_user_is_verified', 'enaging_user_follower_count',
       'enaging_user_following_count', 'enaging_user_is_verified',
       ...
       'CE_tweet_type_norm', 'CE_language_norm',
       'CE_engaged_with_user_id_norm', 'CE_enaging_user_id_norm',
       'DE_enaging_user_id_enaging_user_following_count_1',
       'DE_enaging_user_id_enaging_user_following_count_-1',
       'DE_enaging_user_id_language_1', 'DE_enaging_user_id_language_-1',
       'a_ff_rate', 'b_ff_rate'],
      dtype='object', length=127)

# Summarize Features

In [60]:
%%time

label_names = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']
DONT_USE = ['tweet_id','tweet_timestamp','engaged_with_user_account_creation','enaging_user_account_creation','engage_time',
            'fold','enaging_user_id','engaged_with_user_id', 'dt_dow',
            'engaged_with_user_account_creation', 'enaging_user_account_creation', 'elapsed_time',
             'present_links','present_domains']
DONT_USE += label_names
features = [c for c in train.columns if c not in DONT_USE]

RMV = [c for c in DONT_USE if c in train.columns and c not in label_names]
RMV

CPU times: user 347 µs, sys: 0 ns, total: 347 µs
Wall time: 347 µs


[]

In [61]:
%%time

for col in RMV:
    #print(col, col in train.columns)
    if col in train.columns:
        train = train.drop(col,axis=1)
        train, = dask.persist(train)
#         train.head()
        

CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 14.5 µs


In [62]:
%%time

for col in RMV:
    #print(col, col in valid.columns)
    if col in valid.columns:
        valid = valid.drop(col,axis=1)
        valid, = dask.persist(valid,)
#         valid.head()
        

CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 14.8 µs


In [63]:
train.to_parquet('/mnt/DP_disk3/bin/dask_tmp/train_10_raw')
valid.to_parquet('/mnt/DP_disk3/bin/dask_tmp/valid_10_raw')

In [52]:
very_end = time.time()
print(F"Total time:{very_end - very_start}")

Total time:2200.7121801376343
