Copyright (c) 2020, NVIDIA CORPORATION.

Modifications copyright Intel. 

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

### Added missing featrues to use dataset from pre-process 

In [80]:
import os, time
start = time.time()
very_start = time.time()

In [81]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [82]:
import dask
from dask.distributed import Client, wait, LocalCluster
import dask.dataframe as dd

In [83]:

client = Client(n_workers=8, 
                       threads_per_worker=5,
                       memory_limit='200GB',ip='10.1.0.131')
#client = Client(ip='10.2.48.253',memory_limit='100GB')
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 40571 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://10.1.0.131:35366  Dashboard: http://10.1.0.131:40571/status,Cluster  Workers: 8  Cores: 40  Memory: 1.60 TB


# Load Train

In [84]:
%%time
path = '/mnt/DP_disk3/Recsys/nv'
### train = dd.read_parquet(f'{path}/train-preproc-fold-*.parquet')#,dtypes=dtypes)
### Notes: revious scripts cannot generate this dataset. 
### original tested was downloaded from here:  
###  https://github.com/rapidsai/dask-cuda/issues/337 

train = dd.read_parquet('/mnt/DP_disk3/Recsys/train-1.parquet')
test0 = dd.read_parquet('/mnt/DP_disk3/Recsys/test-0.parquet')
test1 = dd.read_parquet('/mnt/DP_disk3/Recsys/test-1.parquet')

CPU times: user 64.6 ms, sys: 23.1 ms, total: 87.7 ms
Wall time: 74 ms


In [85]:
train.shape, test0.shape, test1.shape

((Delayed('int-a90eb658-b2d7-4168-bdaf-76220f5dde41'), 27),
 (Delayed('int-cb398baf-8b19-4a9e-9e7d-294f54be7df3'), 27),
 (Delayed('int-472ad2f0-1e79-4159-bebc-eb8a042a3fe4'), 27))

In [86]:
#train.dtypes

In [87]:
%%time
# DROP UNUSED COLUMNS
#cols_drop = ['links','hashtags0', 'hashtags1', 'fold']
cols_drop = ['links','hashtags']

train = train.drop(cols_drop,axis=1)

CPU times: user 10.1 ms, sys: 2.08 ms, total: 12.2 ms
Wall time: 9.69 ms


In [88]:
%%time
train, = dask.persist(train)
print(type(train), train.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-714f9e31-4fcf-4246-a16d-c7d2c655ef9e'), 25)
CPU times: user 3.96 ms, sys: 0 ns, total: 3.96 ms
Wall time: 3.51 ms


In [89]:
%%time
train = train.repartition(npartitions=8)
train, = dask.persist(train)
print(type(train), train.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-22c99e2d-881e-4dbb-a480-2041fa6fcb8e'), 25)
CPU times: user 3.89 ms, sys: 2.95 ms, total: 6.84 ms
Wall time: 5.19 ms


In [90]:
#for i,p in enumerate(train.partitions):
#    print(i,len(p))

In [91]:
label_names = ['reply', 'retweet', 'retweet_comment', 'like']
for col in train.columns:
    if col in label_names:
        train[col] = train[col].astype('int64')
    elif train[col].dtype=='int64':
        train[col] = train[col].astype('int32')
    elif train[col].dtype=='int16':
        train[col] = train[col].astype('int8')

In [92]:
#train.dtypes

gap: int32 -> uint32; tweet_id: unit32, links uint32, domains uint32; timestamp: datetime64[ns]
len_* -> uint8

In [93]:
%%time
train = train.reset_index(drop=True)

CPU times: user 5.02 ms, sys: 5.97 ms, total: 11 ms
Wall time: 7.46 ms


In [94]:
%%time
train, = dask.persist(train)
print(train.shape)

(Delayed('int-e45f1bb5-bba2-4233-a89e-f012f06341af'), 25)
CPU times: user 10.9 ms, sys: 102 µs, total: 11 ms
Wall time: 8.59 ms


# <font color = red> Newly add code </font>

## Caculate Engage time & Elapsed Time

In [95]:
#train.head()

In [96]:
%%time 

## New Add, convert timestamp to timestamp64 for split_time to run 

label_names = ["reply", "retweet", "retweet_comment", "like"]

train['timestamp'] = dd.to_datetime(train['timestamp'], unit="s")


CPU times: user 13.6 ms, sys: 39 µs, total: 13.6 ms
Wall time: 10.7 ms


In [97]:
#train.dtypes

In [98]:
#train.head()['timestamp']

In [99]:
%%time 
# TIME FEATURES
# RAPIDS does this 5x faster than Pandas CPU
# If we didn't need to copy CPU to GPU to CPU, then 1300x faster!
def split_time(df):
    #gf = cudf.from_pandas(df[['timestamp']])
    df['dt_dow']  = df['timestamp'].dt.weekday#.to_array() 
    df['dt_hour'] = df['timestamp'].dt.hour#.to_array()
    df['dt_minute'] = df['timestamp'].dt.minute#.to_array()
    df['dt_second'] = df['timestamp'].dt.second#.to_array()
    return df

train = split_time(train)

CPU times: user 41.3 ms, sys: 6.95 ms, total: 48.3 ms
Wall time: 39.7 ms


In [100]:
train["timestamp"] = train["timestamp"].astype("int64") / 1e9
train["reply"] = train.reply.mask(train["reply"] == 0.0, np.nan)
train["retweet"] = train.retweet.mask(train['retweet'] == 0.0, np.nan)
train["retweet_comment"] = train.retweet_comment.mask(train['retweet_comment'] == 0.0, np.nan)
train["like"] = train.like.mask(train['like'] == 0.0, np.nan)

In [101]:
#train.head(1000)[['reply','retweet','retweet_comment','like']]

In [102]:
train["engage_time"] = train[label_names].min(1)

In [103]:
#train.head()[["engage_time","timestamp"]]

In [104]:
train["elapsed_time"] = train["engage_time"] - train["timestamp"]


In [105]:
#train.head()[['elapsed_time','engage_time','timestamp']]

In [106]:
# Check this... NaN -> False * 1 -> 0, others -> Ture * 1 -> 1
train[label_names] = (train[label_names] > 0) * 1

In [107]:
#train.head()[['reply','retweet','retweet_comment','like']]

In [108]:
#train['a_account_creation'] = dd.to_datetime(train['a_account_creation'], unit="s")
#train['b_account_creation'] = dd.to_datetime(train['b_account_creation'], unit="s")


In [109]:
#train.dtypes

def set_nan(ds):
    mask = ds == 0
    ds.loc[mask] = np.nan
    return ds
train['engage_time'] = train['engage_time'].map_partitions(set_nan)

In [110]:
#train['elapsed_time'] = train['engage_time'] - train['timestamp']
train['elapsed_time'] = train.elapsed_time.astype('float64')

In [111]:
print(train['elapsed_time'].min().compute(),train['elapsed_time'].max().compute())
print(train['elapsed_time'].mean().compute())

2.0 602391.0
16617.73330803138


In [112]:
#train.dtypes

# Feature Engineering 

In [113]:
%%time
# TRAIN FIRST 5 DAYS. VALIDATE LAST 2 DAYS
VALID_DOW = [1, 2]# order is [3, 4, 5, 6, 0, 1, 2]
valid = train[train['dt_dow'].isin(VALID_DOW)].reset_index(drop=True)
train = train[~train['dt_dow'].isin(VALID_DOW)].reset_index(drop=True)

CPU times: user 26.6 ms, sys: 26.2 ms, total: 52.7 ms
Wall time: 46.2 ms


In [114]:
%%time
train,valid = dask.persist(train,valid)
print(type(train), train.shape, valid.shape)

<class 'dask.dataframe.core.DataFrame'> (Delayed('int-769b7b3c-363f-4af5-a399-a5f9a31d729b'), 31) (Delayed('int-418bdee4-3396-45c4-8dcc-c4924758dfe5'), 31)
CPU times: user 26.4 ms, sys: 2.05 ms, total: 28.5 ms
Wall time: 25.3 ms


In [115]:
%%time
train = train.set_index('timestamp')
valid = valid.set_index('timestamp')
train,valid = dask.persist(train,valid)
#train.head()

CPU times: user 14.3 s, sys: 2.18 s, total: 16.5 s
Wall time: 1min 16s


In [116]:
%%time
train = train.reset_index()
valid = valid.reset_index()
train,valid = dask.persist(train,valid)
#train.head()

CPU times: user 39.3 ms, sys: 11 ms, total: 50.2 ms
Wall time: 42 ms


In [117]:
#for i,p in enumerate(train.partitions):
#    print(i,len(p))

In [118]:
#for i,p in enumerate(valid.partitions):
#    print(i,len(p))

### Target Encode

In [119]:
class MTE_one_shot:
    
    def __init__(self, folds, smooth, seed=42):
        self.folds = folds
        self.seed = seed
        self.smooth = smooth
        
    def fit_transform(self, train, x_col, y_col, y_mean=None, out_col = None, out_dtype=None):
        
        self.y_col = y_col
        np.random.seed(self.seed)
        
        if 'fold' not in train.columns:
            fsize = len(train)//self.folds
            train['fold'] = 1
            train['fold'] = train['fold'].cumsum()
            train['fold'] = train['fold']//fsize
            train['fold'] = train['fold']%self.folds
        
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'TE_{tag}_{self.y_col}'
        
        if y_mean is None:
            y_mean = train[y_col].mean()#.compute().astype('float32')
        self.mean = y_mean
        
        cols = ['fold',x_col] if isinstance(x_col,str) else ['fold']+x_col
        
        agg_each_fold = train.groupby(cols).agg({y_col:['count','sum']}).reset_index()
        agg_each_fold.columns = cols + ['count_y','sum_y']
        
        agg_all = agg_each_fold.groupby(x_col).agg({'count_y':'sum','sum_y':'sum'}).reset_index()
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all.columns = cols + ['count_y_all','sum_y_all']
        
        agg_each_fold = agg_each_fold.merge(agg_all,on=x_col,how='left')
        agg_each_fold['count_y_all'] = agg_each_fold['count_y_all'] - agg_each_fold['count_y']
        agg_each_fold['sum_y_all'] = agg_each_fold['sum_y_all'] - agg_each_fold['sum_y']
        agg_each_fold[out_col] = (agg_each_fold['sum_y_all']+self.smooth*self.mean)/(agg_each_fold['count_y_all']+self.smooth)
        agg_each_fold = agg_each_fold.drop(['count_y_all','count_y','sum_y_all','sum_y'],axis=1)
        
        agg_all[out_col] = (agg_all['sum_y_all']+self.smooth*self.mean)/(agg_all['count_y_all']+self.smooth)
        agg_all = agg_all.drop(['count_y_all','sum_y_all'],axis=1)
        self.agg_all = agg_all
        
        train.columns
        cols = ['fold',x_col] if isinstance(x_col,str) else ['fold']+x_col
        train = train.merge(agg_each_fold,on=cols,how='left')
        del agg_each_fold
        #self.agg_each_fold = agg_each_fold
        #train[out_col] = train.map_partitions(lambda cudf_df: cudf_df[out_col].nans_to_nulls())
        train[out_col] = train[out_col].fillna(self.mean)
        
        if out_dtype is not None:
            train[out_col] = train[out_col].astype(out_dtype)
        return train
    
    def transform(self, test, x_col, out_col = None, out_dtype=None):
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'TE_{tag}_{self.y_col}'
        test = test.merge(self.agg_all,on=x_col,how='left')
        test[out_col] = test[out_col].fillna(self.mean)
        if out_dtype is not None:
            test[out_col] = test[out_col].astype(out_dtype)
        return test
 

TE_media_reply 17.8 seconds<br>
TE_tweet_type_reply 27.1 seconds<br>
TE_language_reply 52.5 seconds<br>
TE_a_user_id_reply 180.0 seconds<br>

In [120]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
idx = 0; cols = []
start = time.time()
for t in ['reply', 'retweet', 'retweet_comment', 'like']:
    start = time.time()
    for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id']:
        out_col = f'TE_{c}_{t}'
        encoder = MTE_one_shot(folds=5,smooth=20)
        train = encoder.fit_transform(train, c, t, out_col=out_col, out_dtype='float32')
        valid = encoder.transform(valid, c, out_col=out_col, out_dtype='float32')
        cols.append(out_col)
        train,valid = dask.persist(train,valid)
        del encoder
        #train.head()
        wait(train)
        wait(valid)
        print(out_col,"%.1f seconds"%(time.time()-start))

TE_media_reply 88.8 seconds
TE_tweet_type_reply 115.7 seconds
TE_language_reply 143.4 seconds
TE_a_user_id_reply 205.2 seconds
TE_b_user_id_reply 331.9 seconds
TE_media_retweet 29.8 seconds
TE_tweet_type_retweet 59.3 seconds
TE_language_retweet 87.5 seconds
TE_a_user_id_retweet 150.9 seconds
TE_b_user_id_retweet 277.5 seconds
TE_media_retweet_comment 28.9 seconds
TE_tweet_type_retweet_comment 58.6 seconds
TE_language_retweet_comment 87.4 seconds
TE_a_user_id_retweet_comment 150.6 seconds
TE_b_user_id_retweet_comment 283.0 seconds
TE_media_like 30.2 seconds
TE_tweet_type_like 60.2 seconds
TE_language_like 90.1 seconds
TE_a_user_id_like 155.5 seconds
TE_b_user_id_like 282.4 seconds
CPU times: user 4min 9s, sys: 35.9 s, total: 4min 45s
Wall time: 19min 34s


In [121]:
train['fold'].value_counts().compute()

0    13979362
4    13979361
3    13979361
2    13979361
1    13979361
Name: fold, dtype: int64

### Multiple Column Target Encode

In [122]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
idx = 0; cols=[]
c = ['domains','language','b_follows_a','tweet_type','media','a_is_verified']
for t in ['reply', 'retweet', 'retweet_comment', 'like']:
    out_col = f'TE_multi_{t}'
    encoder = MTE_one_shot(folds=5,smooth=20)
    train = encoder.fit_transform(train, c, t, out_col=out_col, out_dtype='float32')
    valid = encoder.transform(valid, c, out_col=out_col, out_dtype='float32')
    cols.append(out_col)
    del encoder

CPU times: user 785 ms, sys: 118 ms, total: 903 ms
Wall time: 773 ms


In [123]:
%%time
train,valid = dask.persist(train,valid)
wait(train)
wait(valid)

CPU times: user 30.9 s, sys: 4.63 s, total: 35.5 s
Wall time: 2min 29s


DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-002601277f4712d771c72c38e9d5dd5e', 4)>, <Future: finished, type: pandas.DataFrame, key: ('assign-002601277f4712d771c72c38e9d5dd5e', 2)>, <Future: finished, type: pandas.DataFrame, key: ('assign-002601277f4712d771c72c38e9d5dd5e', 0)>, <Future: finished, type: pandas.DataFrame, key: ('assign-002601277f4712d771c72c38e9d5dd5e', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-002601277f4712d771c72c38e9d5dd5e', 7)>, <Future: finished, type: pandas.DataFrame, key: ('assign-002601277f4712d771c72c38e9d5dd5e', 1)>, <Future: finished, type: pandas.DataFrame, key: ('assign-002601277f4712d771c72c38e9d5dd5e', 6)>, <Future: finished, type: pandas.DataFrame, key: ('assign-002601277f4712d771c72c38e9d5dd5e', 3)>}, not_done=set())

### Elapsed Time Target Encode

In [124]:
%%time
# cuDF TE ENCODING IS SUPER FAST!!
start = time.time()
idx = 0; cols = []
for c in ['media', 'tweet_type', 'language']:#, 'a_user_id', 'b_user_id']:
    for t in ['elapsed_time']:
        out_col = f'TE_{c}_{t}'
        encoder = MTE_one_shot(folds=5,smooth=20)
        train = encoder.fit_transform(train, c, t, out_col=out_col)
        out_dtype='float32' #if 'user_id' in c else None
        valid = encoder.transform(valid, c, out_col=out_col, out_dtype=out_dtype)
        cols.append(out_col)
        print(out_col,"%.1f seconds"%(time.time()-start))
        #del encoder

TE_media_elapsed_time 0.2 seconds
TE_tweet_type_elapsed_time 0.3 seconds
TE_language_elapsed_time 0.5 seconds
CPU times: user 474 ms, sys: 60.5 ms, total: 534 ms
Wall time: 476 ms


In [125]:
%%time
train,valid = dask.persist(train,valid)
wait(train)
wait(valid)

CPU times: user 23.6 s, sys: 3.49 s, total: 27.1 s
Wall time: 2min 10s


DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-6bbb7039e55b979f93eb27b3935fb335', 3)>, <Future: finished, type: pandas.DataFrame, key: ('assign-6bbb7039e55b979f93eb27b3935fb335', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-6bbb7039e55b979f93eb27b3935fb335', 0)>, <Future: finished, type: pandas.DataFrame, key: ('assign-6bbb7039e55b979f93eb27b3935fb335', 4)>, <Future: finished, type: pandas.DataFrame, key: ('assign-6bbb7039e55b979f93eb27b3935fb335', 1)>, <Future: finished, type: pandas.DataFrame, key: ('assign-6bbb7039e55b979f93eb27b3935fb335', 2)>, <Future: finished, type: pandas.DataFrame, key: ('assign-6bbb7039e55b979f93eb27b3935fb335', 7)>, <Future: finished, type: pandas.DataFrame, key: ('assign-6bbb7039e55b979f93eb27b3935fb335', 6)>}, not_done=set())

### Count Encode

In [126]:
class FrequencyEncoder:
    
    def __init__(self, seed=42):
        self.seed = seed
        
    def fit_transform(self, train, x_col, c_col=None, out_col = None):
        np.random.seed(self.seed)
        if c_col is None or c_col not in train.columns:
            c_col = 'dummy'
            train[c_col] = 1
            drop = True
        else:
            drop = False
            
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'CE_{tag}_norm'
            
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all = train.groupby(cols).agg({c_col:'count'}).reset_index()
        if drop:
            train = train.drop(c_col,axis=1)
        agg_all.columns = cols + [out_col]
        agg_all[out_col] = agg_all[out_col].astype('int32')
        agg_all[out_col] = agg_all[out_col]*1.0/len(train)
        agg_all[out_col] = agg_all[out_col].astype('float32')
    
        train = train.merge(agg_all,on=cols,how='left')
        del agg_all
        #print(train.columns)
        #train[out_col] = train.map_partitions(lambda cudf_df: cudf_df[out_col].nans_to_nulls())
        return train
    
    def transform(self, test, x_col, c_col=None, out_col = None):
        return self.fit_transform(test, x_col, c_col, out_col)
 

In [127]:
class CountEncoder:
    
    def __init__(self, seed=42):
        self.seed = seed
        
    def fit_transform(self, train, test, x_col, out_col = None):
        np.random.seed(self.seed)
        
        common_cols = [i for i in train.columns if i in test.columns and i!=x_col]

        if len(common_cols):
            c_col = common_cols[0]
            drop = False
        else:
            c_col = 'dummy'
            train[c_col] = 1
            test[c_col]=1
            drop = True
            
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'CE_{tag}_norm'
            
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all = train.groupby(cols).agg({c_col:'count'}).reset_index()
        agg_all.columns = cols + [out_col]
        
        agg_test = test.groupby(cols).agg({c_col:'count'}).reset_index()
        agg_test.columns = cols + [out_col+'_test']
        agg_all = agg_all.merge(agg_test,on=cols,how='left')
        agg_all[out_col+'_test'] = agg_all[out_col+'_test'].fillna(0)
        agg_all[out_col] = agg_all[out_col] + agg_all[out_col+'_test']
        agg_all = agg_all.drop(out_col+'_test', axis=1)
        del agg_test
            
        if drop:
            train = train.drop(c_col,axis=1)
            test = test.drop(c_col,axis=1)
        train = train.merge(agg_all,on=cols,how='left')
        test = test.merge(agg_all,on=cols,how='left')
        del agg_all
        return train,test

In [128]:
%%time
# cuDF CE ENCODING IS SUPER FAST!!
start = time.time()
idx = 0; cols = []
for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id']:
    encoder = CountEncoder()
    out_col = f'CE_{c}'
    train,valid = encoder.fit_transform(train, valid, c, out_col=out_col)
    print
    del encoder
    train,valid = dask.persist(train,valid)
    wait(train)
    wait(valid)
    print(out_col,"%.1f seconds"%(time.time()-start))

CE_media 31.1 seconds
CE_tweet_type 67.0 seconds
CE_language 107.1 seconds
CE_a_user_id 164.3 seconds
CE_b_user_id 245.4 seconds
CPU times: user 50.1 s, sys: 6.86 s, total: 56.9 s
Wall time: 4min 5s


In [129]:
%%time
# cuDF CE ENCODING IS SUPER FAST!!
idx = 0; cols = []
start = time.time()
for c in ['media', 'tweet_type', 'language', 'a_user_id', 'b_user_id']:
    encoder = FrequencyEncoder()
    out_col = f'CE_{c}_norm'
    train = encoder.fit_transform(train, c, c_col='tweet_id', out_col=out_col)
    valid = encoder.transform(valid, c, c_col='tweet_id', out_col=out_col)
    cols.append(out_col)
    del encoder
    train,valid = dask.persist(train,valid)
    wait(train)
    wait(valid)
    print(out_col,"%.1f seconds"%(time.time()-start))

CE_media_norm 54.5 seconds
CE_tweet_type_norm 99.9 seconds
CE_language_norm 147.9 seconds
CE_a_user_id_norm 204.6 seconds
CE_b_user_id_norm 272.4 seconds
CPU times: user 55.1 s, sys: 7.71 s, total: 1min 2s
Wall time: 4min 32s


### Difference Encode (Lag Features)

In [130]:
def diff_encode_cudf_v1(train,col,tar,sft=1):
    train[col+'_sft'] = train[col].shift(sft)
    train[tar+'_sft'] = train[tar].shift(sft)
    out_col = f'DE_{col}_{tar}_{sft}'
    train[out_col] = train[tar]-train[tar+'_sft']
    mask = '__MASK__'
    train[mask] = train[col] == train[col+'_sft']
    train = train.drop([col+'_sft',tar+'_sft'],axis=1)
    train[out_col] = train[out_col]*train[mask]
    train = train.drop(mask,axis=1)
    return train

In [131]:
%%time
start = time.time()
# cuDF DE ENCODING IS FAST!!
idx = 0; cols = []; sc = 'timestamp'
for c in ['b_user_id']:
    for t in ['b_follower_count','b_following_count','language']:
        for s in [1,-1]:
            start = time.time()
            train = diff_encode_cudf_v1(train, col=c, tar=t, sft=s)
            valid = diff_encode_cudf_v1(valid, col=c, tar=t, sft=s)
            train,valid = dask.persist(train,valid)
            wait(train)
            wait(valid)
            end = time.time(); idx += 1
            print('DE',c,t,s,'%.1f seconds'%(end-start))

DE b_user_id b_follower_count 1 31.1 seconds
DE b_user_id b_follower_count -1 24.9 seconds
DE b_user_id b_following_count 1 25.9 seconds
DE b_user_id b_following_count -1 26.1 seconds
DE b_user_id language 1 26.7 seconds
DE b_user_id language -1 29.7 seconds
CPU times: user 29 s, sys: 4.42 s, total: 33.5 s
Wall time: 2min 44s


### Diff Language

In [132]:
train_lang = train[['a_user_id', 'language', 'tweet_id']].drop_duplicates()
valid_lang = valid[['a_user_id', 'language', 'tweet_id']].drop_duplicates()
train_lang_count = train_lang.groupby(['a_user_id', 'language']).agg({'tweet_id':'count'}).reset_index()
valid_lang_count = valid_lang.groupby(['a_user_id', 'language']).agg({'tweet_id':'count'}).reset_index()
train_lang_count,valid_lang_count = dask.persist(train_lang_count,valid_lang_count)
train_lang_count.head()
del train_lang,valid_lang

In [133]:
%%time
train_lang_count = train_lang_count.merge(valid_lang_count,on=['a_user_id', 'language'],how='left')
train_lang_count['tweet_id_y'] = train_lang_count['tweet_id_y'].fillna(0)
train_lang_count['tweet_id_x'] = train_lang_count['tweet_id_x'] + train_lang_count['tweet_id_y']
train_lang_count = train_lang_count.drop('tweet_id_y',axis=1)
train_lang_count.columns = ['a_user_id', 'top_language', 'language_count']
train_lang_count, = dask.persist(train_lang_count)
train_lang_count.head()

CPU times: user 1.1 s, sys: 143 ms, total: 1.24 s
Wall time: 5.01 s


Unnamed: 0,a_user_id,top_language,language_count
0,0,54,51.0
1,1,9,75.0
2,1,47,3.0
3,1,54,4.0
4,1,61,1.0


In [134]:
%%time

train_lang_count = train_lang_count.sort_values(['a_user_id', 'language_count'])
train_lang_count['a_user_shifted'] = train_lang_count['a_user_id'].shift(1)
train_lang_count = train_lang_count[train_lang_count['a_user_id']!=train_lang_count['a_user_shifted']]
train_lang_count = train_lang_count.drop(['a_user_shifted','language_count'],axis=1)
train_lang_count.columns = ['a_user_id','top_language']
train_lang_count, = dask.persist(train_lang_count)
train_lang_count.head()

AttributeError: 'DataFrame' object has no attribute 'sort_values'

In [135]:
def diff_language(df,df_lang_count):
    df = df.merge(df_lang_count,how='left', left_on='b_user_id', right_on='a_user_id')
    df['nan_language'] = df['top_language'].isnull()
    df['same_language'] = df['language'] == df['top_language']
    df['diff_language'] = df['language'] != df['top_language']
    df['same_language'] = df['same_language']*(1-df['nan_language'])
    df['diff_language'] = df['diff_language']*(1-df['nan_language'])
    df = df.drop('top_language',axis=1)
    return df

In [136]:
#%%time
#train = diff_language(train,train_lang_count)
#valid = diff_language(valid,train_lang_count)
#train,valid = dask.persist(train,valid)
#train.head()

## Rate feature

In [137]:
%%time
# follow rate feature
train['a_ff_rate'] = (train['a_following_count'] / train['a_follower_count']).astype('float32')
train['b_ff_rate'] = (train['b_follower_count']  / train['b_following_count']).astype('float32')
valid['a_ff_rate']  = (valid['a_following_count'] / valid['a_follower_count']).astype('float32')
valid['b_ff_rate']  = (valid['b_follower_count']  / valid['b_following_count']).astype('float32')

CPU times: user 59.4 ms, sys: 10.1 ms, total: 69.5 ms
Wall time: 58.2 ms


In [138]:
train,valid = dask.persist(train,valid)

In [139]:
wait(train)
wait(valid)

DoneAndNotDoneFutures(done={<Future: finished, type: pandas.DataFrame, key: ('assign-3bbab3c613472f3dee4e0ba4d8c15292', 4)>, <Future: finished, type: pandas.DataFrame, key: ('assign-3bbab3c613472f3dee4e0ba4d8c15292', 2)>, <Future: finished, type: pandas.DataFrame, key: ('assign-3bbab3c613472f3dee4e0ba4d8c15292', 3)>, <Future: finished, type: pandas.DataFrame, key: ('assign-3bbab3c613472f3dee4e0ba4d8c15292', 6)>, <Future: finished, type: pandas.DataFrame, key: ('assign-3bbab3c613472f3dee4e0ba4d8c15292', 7)>, <Future: finished, type: pandas.DataFrame, key: ('assign-3bbab3c613472f3dee4e0ba4d8c15292', 5)>, <Future: finished, type: pandas.DataFrame, key: ('assign-3bbab3c613472f3dee4e0ba4d8c15292', 0)>, <Future: finished, type: pandas.DataFrame, key: ('assign-3bbab3c613472f3dee4e0ba4d8c15292', 1)>}, not_done=set())

# Summarize Features

In [140]:
%%time

label_names = ['reply', 'retweet', 'retweet_comment', 'like']
#DONT_USE = ['tweet_id','timestamp','a_account_creation','b_account_creation','engage_time',
#            'fold','b_user_id','a_user_id', 'dt_dow',
#            'a_account_creation', 'b_account_creation', 
#             'links','domains','hashtags0','hashtags1']
DONT_USE = ['tweet_id','timestamp','a_account_creation','b_account_creation',
            'fold','b_user_id','a_user_id', 'dt_dow',
            'a_account_creation', 'b_account_creation', 
             'domains']
DONT_USE += label_names
features = [c for c in train.columns if c not in DONT_USE]

RMV = [c for c in DONT_USE if c in train.columns and c not in label_names]
RMV

CPU times: user 272 µs, sys: 11 µs, total: 283 µs
Wall time: 294 µs


['tweet_id',
 'timestamp',
 'a_account_creation',
 'b_account_creation',
 'fold',
 'b_user_id',
 'a_user_id',
 'dt_dow',
 'a_account_creation',
 'b_account_creation',
 'domains']

In [141]:
%%time

for col in RMV:
    #print(col, col in train.columns)
    if col in train.columns:
        train = train.drop(col,axis=1)
        train, = dask.persist(train)
        train.head()
        

CPU times: user 10.5 s, sys: 1.44 s, total: 11.9 s
Wall time: 49.3 s


In [142]:
%%time

for col in RMV:
    #print(col, col in valid.columns)
    if col in valid.columns:
        valid = valid.drop(col,axis=1)
        valid, = dask.persist(valid,)
        valid.head()
        

CPU times: user 3.85 s, sys: 585 ms, total: 4.43 s
Wall time: 18.3 s


In [143]:
#train.dtypes

# Train Model Validate
We will train on random 10% of first 5 days and validation on last 2 days

In [144]:
%%time

SAMPLE_RATIO = 0.1
SEED = 1

if SAMPLE_RATIO < 1.0:
    print(len(train))
    train = train.sample(frac=SAMPLE_RATIO,random_state=42)
    train, = dask.persist(train)
    train.head()
    print(len(train))

train = train.compute()
Y_train = train[label_names]
train = train.drop(label_names,axis=1)

features = [c for c in train.columns if c not in DONT_USE]
print('Using %i features:'%(len(features)),train.shape[1])
np.asarray(features)

69896806
6989680
Using 64 features: 64
CPU times: user 1min 17s, sys: 18 s, total: 1min 35s
Wall time: 14.6 s


array(['media', 'tweet_type', 'language', 'a_follower_count',
       'a_following_count', 'a_is_verified', 'b_follower_count',
       'b_following_count', 'b_is_verified', 'b_follows_a', 'id',
       'len_hashtags', 'len_domains', 'len_links', 'dt_hour', 'dt_minute',
       'dt_second', 'engage_time', 'elapsed_time', 'TE_media_reply',
       'TE_tweet_type_reply', 'TE_language_reply', 'TE_a_user_id_reply',
       'TE_b_user_id_reply', 'TE_media_retweet', 'TE_tweet_type_retweet',
       'TE_language_retweet', 'TE_a_user_id_retweet',
       'TE_b_user_id_retweet', 'TE_media_retweet_comment',
       'TE_tweet_type_retweet_comment', 'TE_language_retweet_comment',
       'TE_a_user_id_retweet_comment', 'TE_b_user_id_retweet_comment',
       'TE_media_like', 'TE_tweet_type_like', 'TE_language_like',
       'TE_a_user_id_like', 'TE_b_user_id_like', 'TE_multi_reply',
       'TE_multi_retweet', 'TE_multi_retweet_comment', 'TE_multi_like',
       'TE_media_elapsed_time', 'TE_tweet_type_elapsed_t

In [145]:
SAMPLE_RATIO = 0.35 # VAL SET NOW SIZE OF TEST SET
SEED = 1
if SAMPLE_RATIO < 1.0:
    print(len(valid))
    valid = valid.sample(frac=SAMPLE_RATIO,random_state=42)
    valid, = dask.persist(valid)
    valid.head()
    print(len(valid))
    
valid = valid.compute()
Y_valid = valid[label_names]
valid = valid.drop(label_names,axis=1)

26703585
9346255


In [146]:
#train.head()
#valid.head()
#valid.dtypes



In [147]:
#train.head()

In [148]:
#train.dtypes

In [149]:
#valid.dtypes

In [150]:
import xgboost as xgb
print('XGB Version',xgb.__version__)

xgb_parms = { 
    'max_depth':8, 
    'learning_rate':0.1, 
    'subsample':0.8,
    'colsample_bytree':0.3, 
    'eval_metric':'logloss',
    'objective':'binary:logistic',
    'nthread':40,
    'tree_method':'hist',
    #'predictor' : 'gpu_predictor'
}


XGB Version 1.2.1


In [151]:
if train.columns.duplicated().sum()>0:
    raise Exception(f'duplicated!: { train.columns[train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {train.shape}')
print(f'X_valid.shape {valid.shape}')

no dup :) 
X_train.shape (6989680, 64)
X_valid.shape (9346255, 64)


In [152]:
%%time

for col in train.columns:
    if train[col].dtype=='bool':
        train[col] = train[col].astype('int8')
        valid[col] = valid[col].astype('int8')

CPU times: user 1.51 s, sys: 278 ms, total: 1.79 s
Wall time: 47.3 ms


In [153]:
#Y_train[['reply','retweet','retweet_comment','like']]

In [154]:
%%time
# TRAIN AND VALIDATE

NROUND = 300
VERBOSE_EVAL = 50
#ESR = 50
    
oof = np.zeros((len(valid),len(label_names)))
preds = []
for i in range(4):

    name = label_names[i]
    print('#'*25);print('###',name);print('#'*25)
       
    start = time.time(); print('Creating DMatrix...')
        
    dtrain = xgb.DMatrix(data=train,label=Y_train.iloc[:, i])
    dvalid = xgb.DMatrix(data=valid,label=Y_valid.iloc[:, i])
    print('Took %.1f seconds'%(time.time()-start))
             
    start = time.time(); print('Training...')
    model = xgb.train(xgb_parms, 
                           dtrain=dtrain,
                           #evals=[(dtrain,'train'),(dvalid,'valid')],
                           num_boost_round=NROUND,
                           #early_stopping_rounds=ESR,
                           verbose_eval=VERBOSE_EVAL) 
    print('Took %.1f seconds'%(time.time()-start))
        
    start = time.time(); print('Predicting...')
    #Y_valid[f'pred_{name}'] = xgb.dask.predict(client,model,valid)
    oof[:, i] += model.predict(dvalid)
    #preds.append(xgb.dask.predict(client,model,valid))
    print('Took %.1f seconds'%(time.time()-start))
        
    del model, dtrain, dvalid

#########################
### reply
#########################
Creating DMatrix...
Took 33.4 seconds
Training...
Took 99.4 seconds
Predicting...
Took 5.3 seconds
#########################
### retweet
#########################
Creating DMatrix...
Took 32.0 seconds
Training...
Took 100.1 seconds
Predicting...
Took 5.2 seconds
#########################
### retweet_comment
#########################
Creating DMatrix...
Took 32.9 seconds
Training...
Took 97.7 seconds
Predicting...
Took 5.2 seconds
#########################
### like
#########################
Creating DMatrix...
Took 32.9 seconds
Training...
Took 93.0 seconds
Predicting...
Took 5.4 seconds
CPU times: user 4h 12min 57s, sys: 13min 23s, total: 4h 26min 20s
Wall time: 9min 4s


In [155]:
yvalid = Y_valid[label_names].values

# Compute Validation Metrics

In [156]:
from sklearn.metrics import precision_recall_curve, auc, log_loss

def compute_prauc(pred, gt):
  prec, recall, thresh = precision_recall_curve(gt, pred)
  prauc = auc(recall, prec)
  return prauc

def calculate_ctr(gt):
  positive = len([x for x in gt if x == 1])
  ctr = positive/float(len(gt))
  return ctr

def compute_rce(pred, gt):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

# FAST METRIC FROM GIBA
def compute_rce_fast(pred, gt):
    cross_entropy = log_loss(gt, pred)
    yt = np.mean(gt)     
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

In [157]:
%%time
txt = ''
for i in range(4):
    prauc = compute_prauc(oof[:,i], yvalid[:, i])
    rce   = compute_rce_fast(oof[:,i], yvalid[:, i])
    txt_ = f"{label_names[i]:20} PRAUC:{prauc:.5f} RCE:{rce:.5f}"
    print(txt_)
    txt += txt_ + '\n'

reply                PRAUC:0.23910 RCE:31.28032
retweet              PRAUC:0.70965 RCE:50.94263
retweet_comment      PRAUC:0.08023 RCE:22.98754
like                 PRAUC:0.97411 RCE:82.06254
CPU times: user 10min 21s, sys: 30.8 s, total: 10min 51s
Wall time: 18.2 s


In [158]:
print('This notebook took %.1f minutes'%((time.time()-very_start)/60.))

This notebook took 52.4 minutes
