In [38]:
spark.stop()

In [21]:
import init

import findspark
findspark.init()

import os
import pandas as pd
import numpy as np
from pyspark.sql import *
from pyspark import *
import pyspark.sql.functions as f
from pyspark.sql.types import *
import time
from pyrecdp.data_processor import *
from pyrecdp.encoder import *
from pyrecdp.utils import *

spark = SparkSession.builder.master('yarn')\
        .appName("Recsys2021_data_process")\
        .getOrCreate()
current_path = "/recsys2021_0608_processed/sample_0_3_20days/"
path_prefix = "hdfs://"
dicts_folder = "recsys_dicts/"

In [16]:
train_data_path= "%s/tweet_text_processed_20days_with_fold_0_3_original_word" % current_path
train = spark.read.parquet("%s/" % (train_data_path))

### Step1: Perform encoding individually, each encoding will be saved as two tables(train/valid)

#### Notice:
* Path is current_path + "/train" + "/TE_xxx" or current_path + "/valid" + "/TE_xxx"
* train output format

| fold_id | categorified_input_col_1 | categorified_input_col_2 | ... | TE_xxx |


In [3]:
from collections import *

proc = DataProcessor(spark, path_prefix,
                     current_path=current_path, dicts_path=dicts_folder)

label_names = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

te_dicts_train = OrderedDict() # value should be tuple, (input_cols, output_col, y_mean)
te_dicts_valid = OrderedDict() # value should be tuple, (input_cols, output_col, y_mean)
encoding_columns = []

#############################################################################################
### 1. Target Encoding ###
# begin = time.time()
# y_mean_all = []
# y_mean_all_dict = {}
# for t in ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']:
#     y_mean = np.array(train.groupBy().mean(t).collect())[0][0]
#     y_mean_all.append(y_mean)
#     y_mean_all_dict[t] = y_mean
#     for c in ['present_media', 'tweet_type', 'language', 'engaged_with_user_id', 'engaging_user_id']:
#         start = time.time()
#         out_col = f'TE_{c}_{t}'
#         encoder = TargetEncoder(proc, c, t, out_col, y_mean, out_dtype=FloatType())
#         te_dicts_train[out_col], te_dicts_valid[out_col] = encoder.transform(train, valid, train_only=True)      
#         print(out_col," y_mean is %f, %.1f seconds"%(y_mean, time.time()-start))
# print(F"Target encoding #1 total time:{time.time()-begin}")
# #############################################################################################
### 2. Target Encoding for multi columns ###
begin = time.time()
k=0
for t in ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']:
    y_mean = y_mean_all_dict[t]
    for c in [
        ['present_domains','language','engagee_follows_engager','tweet_type','present_media','engaged_with_user_is_verified'],
        ['engaged_with_user_id','tweet_type','language'],
        ['tw_first_word','tweet_type','language'],
        ['tw_last_word','tweet_type','language'],
        ['tw_hash0','tweet_type','language'],
        ['tw_hash1','tweet_type','language'],
        ['tw_uhash','tweet_type','language'],
        ['tw_hash'],
        ['present_media','tweet_type','language','engaged_with_user_is_verified','engaging_user_is_verified','engagee_follows_engager'],
        ['present_domains','present_media','tweet_type','language'],
        ['present_links','present_media','tweet_type','language'],
        ['hashtags','present_media','tweet_type','language'],
        ]:
        start = time.time()
        out_col = 'TE_'+'_'.join(c)+'_'+t
        encoder = TargetEncoder(proc, c, t, out_col, y_mean, out_dtype=FloatType())
        te_dicts_train[out_col], te_dicts_valid[out_col] = encoder.transform(train, valid, train_only=True)
        print(out_col," y_mean is %f, %.1f seconds"%(y_mean, time.time()-start))
print(F"Target encoding #2 total time:{time.time()-begin}")
# #############################################################################################
# # 3. Target Encoding for elapse_time ###
# begin = time.time()
# y_mean = np.array(train.groupBy().mean('elapsed_time').collect())[0][0]
# y_mean_all_dict['elapsed_time'] = y_mean
# for c in ['present_media', 'tweet_type', 'language']:#, 'a_user_id', 'b_user_id']:
#     start = time.time()
#     for t in ['elapsed_time']:
#         out_col = f'TE_{c}_{t}'
#         encoder = TargetEncoder(proc, c, t, out_col, y_mean, out_dtype=FloatType())
#         te_dicts_train[out_col], te_dicts_valid[out_col] = encoder.transform(train, valid, train_only=True)
#         print(out_col," y_mean is %f, %.1f seconds"%(y_mean, time.time()-start))
# print(F"Target encoding #3 total time:{time.time()-begin}")
# 
# #############################################################################################
# ### 4. Count Encoding ###
# begin = time.time()
# for c in ['present_media', 'tweet_type', 'language', 'engaged_with_user_id', 'engaging_user_id']:
#     start = time.time()
#     out_col = f'CE_{c}'
#     encoder = CountEncoder(proc, c, out_col)
#     te_dicts_train[out_col], te_dicts_valid[out_col] = encoder.transform(train, valid, train_only=True)
#     print(out_col,"%.1f seconds"%(time.time()-start))
# print(F"Count Encoding total time:{time.time() - begin}")
#############################################################################################
### 5. Frequency Encoding ###
begin = time.time()
for c in ['present_media', 'tweet_type', 'language', 'engaged_with_user_id', 'engaging_user_id']:
    start = time.time()
    out_col = f'CE_{c}_norm'
    encoder = FrequencyEncoder(proc, c, out_col)
    te_dicts_train[out_col], te_dicts_valid[out_col] = encoder.transform(train, valid, train_only=True)
    print(out_col,"%.1f seconds"%(time.time()-start))
print(F"Frequency encoding total time:{time.time()-begin}")
#############################################################################################
# print(y_mean_all_dict)

CE_present_media_norm 16.9 seconds
CE_tweet_type_norm 6.2 seconds
CE_language_norm 6.4 seconds
CE_engaged_with_user_id_norm 12.8 seconds
CE_engaging_user_id_norm 9.6 seconds
Frequency encoding total time:51.94622468948364


NameError: name 'y_mean_all_dict' is not defined

### Step2: Once we have individual encoded columns, combine them with original input

In [7]:
# run it!
# Load schema for merge
from collections import *
# 55 features
encoding_columns = [    
    ('TE_engaging_user_id_reply_timestamp',                    ['engaging_user_id']),    
    ('TE_engaging_user_id_retweet_timestamp',                  ['engaging_user_id']),    
    ('TE_engaging_user_id_retweet_with_comment_timestamp',     ['engaging_user_id']),
    ('TE_engaging_user_id_like_timestamp',                     ['engaging_user_id']),
    ('CE_engaging_user_id', ['engaging_user_id']),
    ('CE_engaging_user_id_norm', ['engaging_user_id']),    
    ('TE_engaged_with_user_id_reply_timestamp',                ['engaged_with_user_id']),
    ('TE_engaged_with_user_id_retweet_timestamp',              ['engaged_with_user_id']),
    ('TE_engaged_with_user_id_retweet_with_comment_timestamp', ['engaged_with_user_id']),
    ('TE_engaged_with_user_id_like_timestamp',                 ['engaged_with_user_id']),    
    ('CE_engaged_with_user_id', ['engaged_with_user_id']),    
    ('CE_engaged_with_user_id_norm', ['engaged_with_user_id']),
    ('TE_present_links_present_media_tweet_type_language_reply_timestamp', ['present_links', 'present_media', 'tweet_type', 'language']),
    ('TE_tw_hash0_tweet_type_language_reply_timestamp', ['tw_hash0', 'tweet_type', 'language']),
    ('TE_tw_hash0_tweet_type_language_retweet_timestamp', ['tw_hash0', 'tweet_type', 'language']),
    ('TE_tw_hash0_tweet_type_language_retweet_with_comment_timestamp', ['tw_hash0', 'tweet_type', 'language']),  
    ('TE_tw_hash0_tweet_type_language_like_timestamp', ['tw_hash0', 'tweet_type', 'language']),
    ('TE_tw_hash1_tweet_type_language_retweet_with_comment_timestamp', ['tw_hash1', 'tweet_type', 'language']),
    ('TE_tw_uhash_tweet_type_language_retweet_with_comment_timestamp', ['tw_uhash', 'tweet_type', 'language']),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_reply_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager']),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager']),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_with_comment_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager']),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_like_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager']),
    ('TE_tw_first_word_tweet_type_language_reply_timestamp', ['tw_first_word', 'tweet_type', 'language']),
    ('TE_tw_first_word_tweet_type_language_retweet_timestamp', ['tw_first_word', 'tweet_type', 'language']),
    ('TE_tw_first_word_tweet_type_language_retweet_with_comment_timestamp', ['tw_first_word', 'tweet_type', 'language']),
    ('TE_tw_first_word_tweet_type_language_like_timestamp', ['tw_first_word', 'tweet_type', 'language']),
    ('TE_tw_last_word_tweet_type_language_reply_timestamp', ['tw_last_word', 'tweet_type', 'language']),
    ('TE_tw_last_word_tweet_type_language_retweet_timestamp', ['tw_last_word', 'tweet_type', 'language']),
    ('TE_tw_last_word_tweet_type_language_retweet_with_comment_timestamp', ['tw_last_word', 'tweet_type', 'language']),
    ('TE_tw_last_word_tweet_type_language_like_timestamp', ['tw_last_word', 'tweet_type', 'language']),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified']),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified']),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified']),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_like_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified']),
    ('TE_present_domains_present_media_tweet_type_language_reply_timestamp', ['present_domains', 'present_media', 'tweet_type', 'language']),
    ('TE_present_domains_present_media_tweet_type_language_like_timestamp', ['present_domains', 'present_media', 'tweet_type', 'language']),
    ('TE_engaged_with_user_id_tweet_type_language_reply_timestamp', ['engaged_with_user_id', 'tweet_type', 'language']),
    ('TE_engaged_with_user_id_tweet_type_language_retweet_timestamp', ['engaged_with_user_id', 'tweet_type', 'language']),
    ('TE_engaged_with_user_id_tweet_type_language_retweet_with_comment_timestamp', ['engaged_with_user_id', 'tweet_type', 'language']),
    ('TE_engaged_with_user_id_tweet_type_language_like_timestamp', ['engaged_with_user_id', 'tweet_type', 'language']),
    ('TE_hashtags_present_media_tweet_type_language_reply_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language']),
    ('TE_hashtags_present_media_tweet_type_language_retweet_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language']),
    ('TE_hashtags_present_media_tweet_type_language_retweet_with_comment_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language']),
    ('TE_hashtags_present_media_tweet_type_language_like_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language'])
]

original_map = {
    'hashtags': 'original_hashtags',
    'tweet_id': 'original_tweet_id',
    'present_media': 'original_present_media',
    'present_links': 'original_present_links',
    'present_domains': 'original_present_domains',
    'tweet_type': 'original_tweet_type',
    'language': 'original_language',
    'engaged_with_user_id': 'original_engaged_with_user_id',
    'engaged_with_user_follower_count': 'engaged_with_user_follower_count',
    'engaged_with_user_following_count': 'engaged_with_user_following_count',
    'engaged_with_user_is_verified': 'engaged_with_user_is_verified',
    'engaged_with_user_account_creation': 'engaged_with_user_account_creation',
    'engaging_user_id': 'original_engaging_user_id',
    'engaging_user_follower_count': 'engaging_user_follower_count',
    'engaging_user_following_count': 'engaging_user_following_count',
    'engaging_user_is_verified': 'engaging_user_is_verified',
    'engaging_user_account_creation': 'engaging_user_account_creation',
    'engagee_follows_engager': 'engagee_follows_engager',
    'tweet_nortsign': 'text_tokens',
    'count_words': 'text_tokens',
    'count_char': 'text_tokens',
    'tw_uhash': 'text_tokens',
    'tw_hash': 'text_tokens',
    'count_ats': 'text_tokens',
    'tw_hash0': 'text_tokens',
    'tw_hash1': 'text_tokens',
    'tw_first_word': 'tw_first_word',
    'tw_last_word': 'tw_last_word',
    'tw_len': 'text_tokens',
    'original_tweet': 'text_tokens',
    'tweet': 'text_tokens',
    'fold': 'fold'
}

skip_list = [#"['original_present_media']",
             #"['original_tweet_type']",
             #"['original_language']",
             #"['original_engaged_with_user_id']",
             #"['original_engaging_user_id']",
             #"['original_present_domains', 'original_language', 'engagee_follows_engager', 'original_tweet_type', 'original_present_media', 'engaged_with_user_is_verified']",
             #"['original_engaged_with_user_id', 'original_tweet_type', 'original_language']",
             #"['tw_first_word', 'original_tweet_type', 'original_language']",
             #"['tw_last_word', 'original_tweet_type', 'original_language']",
             #"['text_tokens', 'original_tweet_type', 'original_language']",
             #"['text_tokens']",
             #"['original_present_media', 'original_tweet_type', 'original_language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager']",
             #"['original_present_domains', 'original_present_media', 'original_tweet_type', 'original_language']",
             #"['original_present_links', 'original_present_media', 'original_tweet_type', 'original_language']",
             #"['original_hashtags', 'original_present_media', 'original_tweet_type', 'original_language']"
            ]

def merge_indicator_by_input_cols(proc, df, input_dicts, output_list, is_train=True, skip_list=[]):
    begin = time.time()
    for origin_cols, te in input_dicts.items():
        if origin_cols in skip_list:
            continue
        start = time.time()
        input_cols = eval(origin_cols)
        intermediate_columns = []
        output_cols = []
        # op_fillna_list = []
        renamed_input_cols = []

        for out_col, i_dict in te.items():
            for i in i_dict[0]:
                if i not in intermediate_columns:
                    intermediate_columns.append(i)
            # op_fillna_list.append(FillNA([out_col], i_dict[2]))
            output_cols.append(out_col)
        to_select = input_cols + intermediate_columns
        
        for c in input_cols:
            if 'original' in c:
                renamed_input_cols.append((c, c.replace('original_', '')))
            else: 
                renamed_input_cols.append(c)
        
        op_merge = ModelMerge(te)
        if is_train:
            op_select = SelectFeature(renamed_input_cols + ['fold'] + output_cols)
        else:
            op_select = SelectFeature(renamed_input_cols + output_cols)
        distinct = Distinct()      
        
        proc.reset_ops([op_merge] + [op_select, distinct])
        output_list.append(proc.transform(df.select(*to_select), name="lookup_table_%s" % ('_'.join(input_cols))))
        if is_train:
            print("Train Merge models for %s took: %.3f secs" % ("lookup_table_%s" % ('_'.join(input_cols)), time.time()-start))
        else:
            print("Valid Merge models for %s took: %.3f secs" % ("lookup_table_%s" % ('_'.join(input_cols)), time.time()-start))


train_res_dfs = OrderedDict()
# valid_res_dfs = OrderedDict()

for out_col, in_cols in encoding_columns:
    in_cols_str = str([original_map[i] for i in in_cols])
    y_mean = 0
    if in_cols_str not in train_res_dfs:
        train_res_dfs[in_cols_str] = {}
        valid_res_dfs[in_cols_str] = {}
    if 'TE_' in out_col:
        train_res_dfs[in_cols_str][out_col] = (['fold'] + in_cols, spark.read.parquet("%s/train/%s" % (current_path, out_col)), y_mean)
    else:
        train_res_dfs[in_cols_str][out_col] = (in_cols, spark.read.parquet("%s/train/%s" % (current_path, out_col)), y_mean)   
    # valid_res_dfs[in_cols_str][out_col] = (in_cols, spark.read.parquet("%s/valid/%s" % (current_path, out_col)), y_mean)

train_output_path = "%s/train_lookup" % (current_path)
# valid_output_path = "%s/valid_lookup" % (current_path)

############## Train join #############
proc = DataProcessor(spark, path_prefix,
                     current_path=train_output_path, dicts_path=dicts_folder)
train_out_dfs = []
merge_indicator_by_input_cols(proc, train, train_res_dfs, train_out_dfs, is_train=True, skip_list=skip_list)

# ############## Valid join #############
# del proc
# proc = DataProcessor(spark, path_prefix,
#                      current_path=valid_output_path, dicts_path=dicts_folder)
# valid_out_dfs = []
# merge_indicator_by_input_cols(proc, valid, valid_res_dfs, valid_out_dfs, is_train=False)
# 
# #######################################


Train Merge models for lookup_table_original_engaging_user_id took: 43.860 secs
Train Merge models for lookup_table_original_engaged_with_user_id took: 31.886 secs
Train Merge models for lookup_table_original_present_links_original_present_media_original_tweet_type_original_language took: 15.588 secs
Train Merge models for lookup_table_text_tokens_original_tweet_type_original_language took: 146.255 secs
Train Merge models for lookup_table_original_present_media_original_tweet_type_original_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager took: 12.830 secs
Train Merge models for lookup_table_tw_first_word_original_tweet_type_original_language took: 10.379 secs
Train Merge models for lookup_table_tw_last_word_original_tweet_type_original_language took: 41.017 secs
Train Merge models for lookup_table_original_present_domains_original_language_engagee_follows_engager_original_tweet_type_original_present_media_engaged_with_user_is_verified took: 29.72

In [4]:
# run it
# we need to create some additional lookup table
# merge tw_first_word and tw_last_word in same lookup table
train_output_path = "%s/train_lookup" % (current_path)
valid_output_path = "%s/valid_lookup" % (current_path)
tw_first_word_table_name = "lookup_table_tw_first_word_original_tweet_type_original_language"
tw_last_word_table_name = "lookup_table_tw_last_word_original_tweet_type_original_language"
output_name = "lookup_table_word_tweet_type_language"

## Merge for train
f_df = spark.read.parquet("%s/%s" % (train_output_path, tw_first_word_table_name))
l_df = spark.read.parquet("%s/%s" % (train_output_path, tw_last_word_table_name))

df = f_df.withColumnRenamed('tw_first_word', 'word')\
         .join(l_df.withColumnRenamed('tw_last_word', 'word'), ['fold', 'word', 'language', 'tweet_type'], "outer")
df.write.format('parquet').mode('overwrite').save("%s/%s" % (train_output_path, output_name))

## Merge for valid
f_df = spark.read.parquet("%s/%s" % (valid_output_path, tw_first_word_table_name))
l_df = spark.read.parquet("%s/%s" % (valid_output_path, tw_last_word_table_name))

df = f_df.withColumnRenamed('tw_first_word', 'word')\
         .join(l_df.withColumnRenamed('tw_last_word', 'word'), ['word', 'language', 'tweet_type'], "outer")
df.write.format('parquet').mode('overwrite').save("%s/%s" % (valid_output_path, output_name))


In [10]:
# run it!
# This is used to generate valid_lookup from train_lookup
input_cols = [
    #(['present_media'], "lookup_table_original_present_media"),
    #(['tweet_type'], "lookup_table_original_tweet_type"),
    #(['language'], "lookup_table_original_language"),
    ## (['engaged_with_user_id'], "lookup_table_original_engaged_with_user_id"),
    ## (['engaging_user_id'], "lookup_table_original_engaging_user_id"),
    ## (['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified'],
    ##  "lookup_table_original_present_domains_original_language_engagee_follows_engager_original_tweet_type_original_present_media_engaged_with_user_is_verified"),
    ## (['engaged_with_user_id', 'tweet_type', 'language'], "lookup_table_original_engaged_with_user_id_original_tweet_type_original_language"),
    ## (['tw_first_word', 'tweet_type', 'language'], "lookup_table_tw_first_word_original_tweet_type_original_language"),
    ## (['tw_last_word', 'tweet_type', 'language'], "lookup_table_tw_last_word_original_tweet_type_original_language"),
    ## (['text_tokens', 'tweet_type', 'language'], "lookup_table_text_tokens_original_tweet_type_original_language"),
    #(['text_tokens'], "lookup_table_text_tokens"),
    (['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager'],
     "lookup_table_original_present_media_original_tweet_type_original_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager"), 
    (['present_domains', 'present_media', 'tweet_type', 'language'],
     "lookup_table_original_present_domains_original_present_media_original_tweet_type_original_language"),
    (['present_links', 'present_media', 'tweet_type', 'language'],
     "lookup_table_original_present_links_original_present_media_original_tweet_type_original_language"),    
    (['hashtags', 'present_media', 'tweet_type', 'language'],
     "lookup_table_original_hashtags_original_present_media_original_tweet_type_original_language")
]

train_res_dfs = {}
for in_cols, dict_name in input_cols:
    start = time.time()
    df = spark.read.parquet("%s/train_lookup/%s" % (current_path, dict_name))
    encoding_indicators_df = df.drop('fold')
    for c in encoding_indicators_df.columns:
        if c in in_cols:
            encoding_indicators_df = encoding_indicators_df.drop(c)
    df = df.drop('fold').groupby(in_cols).agg(*[f.mean(c).alias(c) for c in encoding_indicators_df.columns])
    df.write.format("parquet").mode('overwrite').save("%s/valid_lookup/%s" % (current_path, dict_name))
    print("Generate valid_lookup for %s took %.3f secs" % (dict_name, (time.time() - start))) 


Generate valid_lookup for lookup_table_original_present_media_original_tweet_type_original_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager took 1.159 secs
Generate valid_lookup for lookup_table_original_present_domains_original_present_media_original_tweet_type_original_language took 2.021 secs
Generate valid_lookup for lookup_table_original_present_links_original_present_media_original_tweet_type_original_language took 3.484 secs
Generate valid_lookup for lookup_table_original_hashtags_original_present_media_original_tweet_type_original_language took 4.432 secs


In [10]:
# remove some features in lookup table
input_cols = [    
    #(['engaged_with_user_id', 
    #  'TE_engaged_with_user_id_reply_timestamp', 
    #  'TE_engaged_with_user_id_retweet_timestamp',
    #  'TE_engaged_with_user_id_retweet_with_comment_timestamp', 
    #  'TE_engaged_with_user_id_like_timestamp', 
    #  'CE_engaged_with_user_id', 
    #  'CE_engaged_with_user_id_norm'],
    # "lookup_table_original_engaged_with_user_id"),
    #
    #(['engaging_user_id', 
    #  'TE_engaging_user_id_reply_timestamp', 
    #  'TE_engaging_user_id_retweet_timestamp', 
    #  'TE_engaging_user_id_retweet_with_comment_timestamp', 
    #  'TE_engaging_user_id_like_timestamp', 
    #  'CE_engaging_user_id', 
    #  'CE_engaging_user_id_norm'],
    # "lookup_table_original_engaging_user_id"),
    #
    #(['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified', 
    #  'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp', 
    #  'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_timestamp', 
    #  'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp', 
    #  'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_like_timestamp'],
    # "lookup_table_original_present_domains_original_language_engagee_follows_engager_original_tweet_type_original_present_media_engaged_with_user_is_verified"),
    #
    #(['engaged_with_user_id', 'tweet_type', 'language', 
    #  'TE_engaged_with_user_id_tweet_type_language_reply_timestamp', 
    #  'TE_engaged_with_user_id_tweet_type_language_retweet_timestamp', 
    #  'TE_engaged_with_user_id_tweet_type_language_retweet_with_comment_timestamp', 
    #  'TE_engaged_with_user_id_tweet_type_language_like_timestamp'],
    # "lookup_table_original_engaged_with_user_id_original_tweet_type_original_language"),
    #
    #(['tw_first_word', 'tweet_type', 'language', 
    #  'TE_tw_first_word_tweet_type_language_reply_timestamp', 
    #  'TE_tw_first_word_tweet_type_language_retweet_timestamp', 
    #  'TE_tw_first_word_tweet_type_language_retweet_with_comment_timestamp', 
    #  'TE_tw_first_word_tweet_type_language_like_timestamp'],
    # "lookup_table_tw_first_word_original_tweet_type_original_language"),
    #
    #(['tw_last_word', 'tweet_type', 'language', 
    #  'TE_tw_last_word_tweet_type_language_reply_timestamp', 
    #  'TE_tw_last_word_tweet_type_language_retweet_timestamp', 
    #  'TE_tw_last_word_tweet_type_language_retweet_with_comment_timestamp', 
    #  'TE_tw_last_word_tweet_type_language_like_timestamp'],
    # "lookup_table_tw_last_word_original_tweet_type_original_language"),
   #
    #(['text_tokens', 'tweet_type', 'language', 
    #  'TE_tw_hash0_tweet_type_language_reply_timestamp',  
    #  'TE_tw_hash0_tweet_type_language_retweet_timestamp', 
    #  'TE_tw_hash0_tweet_type_language_retweet_with_comment_timestamp', 
    #  'TE_tw_hash1_tweet_type_language_retweet_with_comment_timestamp', 
    #  'TE_tw_uhash_tweet_type_language_retweet_with_comment_timestamp', 
    #  'TE_tw_hash0_tweet_type_language_like_timestamp'],
    # "lookup_table_text_tokens_original_tweet_type_original_language"),
    #
    #(['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager', 
    #  'TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_reply_timestamp', 
    #  'TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_timestamp', 
    #  'TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_with_comment_timestamp', 
    #  'TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_like_timestamp'],
    # "lookup_table_original_present_media_original_tweet_type_original_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager"), 
    #
    #(['present_domains', 'present_media', 'tweet_type', 'language', 
    #  'TE_present_domains_present_media_tweet_type_language_reply_timestamp',  
    #  'TE_present_domains_present_media_tweet_type_language_like_timestamp'],
    # "lookup_table_original_present_domains_original_present_media_original_tweet_type_original_language"),
    #
    #(['present_links', 'present_media', 'tweet_type', 'language', 
    #  'TE_present_links_present_media_tweet_type_language_reply_timestamp'],
    # "lookup_table_original_present_links_original_present_media_original_tweet_type_original_language"),    
    #
    #(['hashtags', 'present_media', 'tweet_type', 'language', 
    #  'TE_hashtags_present_media_tweet_type_language_reply_timestamp', 
    #  'TE_hashtags_present_media_tweet_type_language_retweet_timestamp', 
    #  'TE_hashtags_present_media_tweet_type_language_retweet_with_comment_timestamp', 
    #  'TE_hashtags_present_media_tweet_type_language_like_timestamp'],
    # "lookup_table_original_hashtags_original_present_media_original_tweet_type_original_language"),
    
    (['word', 'tweet_type', 'language', 
      'TE_tw_first_word_tweet_type_language_reply_timestamp', 
      'TE_tw_first_word_tweet_type_language_retweet_timestamp', 
      'TE_tw_first_word_tweet_type_language_retweet_with_comment_timestamp', 
      'TE_tw_first_word_tweet_type_language_like_timestamp',
      'TE_tw_last_word_tweet_type_language_reply_timestamp', 
      'TE_tw_last_word_tweet_type_language_retweet_timestamp', 
      'TE_tw_last_word_tweet_type_language_retweet_with_comment_timestamp', 
      'TE_tw_last_word_tweet_type_language_like_timestamp'],
     "lookup_table_word_tweet_type_language"),
]

train_res_dfs = {}
for in_cols, dict_name in input_cols:
    start = time.time()
    df = spark.read.parquet("%s/valid_lookup/%s" % (current_path, dict_name))
    for c in df.columns:
        if c not in in_cols:
            df = df.drop(c)
    df.write.format("parquet").mode('overwrite').save("%s/valid_lookup_64features/%s" % (current_path, dict_name))
    print("Generate valid_lookup for %s took %.3f secs" % (dict_name, (time.time() - start))) 


Generate valid_lookup for lookup_table_word_tweet_type_language took 1.684 secs


### On Demand, merge all features back to train

In [17]:
# run on demand
## For Xinyao's test, merge all feature back to train

feature_list = [
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaging_user_follower_count",
    "engaging_user_following_count",
    "dt_hour",
    "dt_minute",
    "dt_second",
    "tw_len",
    "TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp",
    "TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_timestamp",
    "TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp",
    "TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_like_timestamp",
    "TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_reply_timestamp",
    "TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_timestamp",
    "TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_with_comment_timestamp",
    "TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_like_timestamp",
    "TE_present_domains_present_media_tweet_type_language_reply_timestamp",
    "TE_present_domains_present_media_tweet_type_language_like_timestamp",
    "TE_present_links_present_media_tweet_type_language_reply_timestamp",
    "TE_hashtags_present_media_tweet_type_language_reply_timestamp",
    "TE_hashtags_present_media_tweet_type_language_retweet_timestamp",
    "TE_hashtags_present_media_tweet_type_language_retweet_with_comment_timestamp",
    "TE_hashtags_present_media_tweet_type_language_like_timestamp",
    "TE_engaged_with_user_id_reply_timestamp",
    "TE_engaged_with_user_id_retweet_timestamp",
    "TE_engaged_with_user_id_retweet_with_comment_timestamp",
    "TE_engaged_with_user_id_like_timestamp",
    "CE_engaged_with_user_id",
    "CE_engaged_with_user_id_norm",
    "TE_engaging_user_id_reply_timestamp",
    "TE_engaging_user_id_retweet_timestamp",
    "TE_engaging_user_id_retweet_with_comment_timestamp",
    "TE_engaging_user_id_like_timestamp",
    "CE_engaging_user_id",
    "CE_engaging_user_id_norm",
    "TE_engaged_with_user_id_tweet_type_language_reply_timestamp",
    "TE_engaged_with_user_id_tweet_type_language_retweet_timestamp",
    "TE_engaged_with_user_id_tweet_type_language_retweet_with_comment_timestamp",
    "TE_engaged_with_user_id_tweet_type_language_like_timestamp",
    "TE_tw_hash0_tweet_type_language_reply_timestamp",
    "TE_tw_hash0_tweet_type_language_retweet_timestamp",
    "TE_tw_hash0_tweet_type_language_retweet_with_comment_timestamp",
    "TE_tw_hash1_tweet_type_language_retweet_with_comment_timestamp",
    "TE_tw_uhash_tweet_type_language_retweet_with_comment_timestamp",
    "TE_tw_hash0_tweet_type_language_like_timestamp",
    "TE_tw_first_word_tweet_type_language_reply_timestamp",
    "TE_tw_first_word_tweet_type_language_retweet_timestamp",
    "TE_tw_first_word_tweet_type_language_retweet_with_comment_timestamp",
    "TE_tw_first_word_tweet_type_language_like_timestamp",
    "TE_tw_last_word_tweet_type_language_reply_timestamp",
    "TE_tw_last_word_tweet_type_language_retweet_timestamp",
    "TE_tw_last_word_tweet_type_language_retweet_with_comment_timestamp",
    "TE_tw_last_word_tweet_type_language_like_timestamp"
]

train_feature_output_path = "%s" % (current_path)
valid_feature_output_path = "%s" % (current_path)

def merge_indicator_by_input_cols(proc, df, te, is_train=True):
    to_select = df.columns
    for c in feature_list:
        if c not in to_select:
            to_select.append(c)
    start = time.time()
    op_merge = ModelMerge(te)
    op_select = SelectFeature(to_select)
    proc.reset_ops([op_merge, op_select])
    proc.transform(df, name="train_with_features")
    if is_train:
        print("Train Merge models took: %.3f secs" % (time.time()-start))
    else:
        print("Valid Merge models took: %.3f secs" % (time.time()-start))


input_cols = [
    #(['present_media'], "lookup_table_original_present_media"),
    #(['tweet_type'], "lookup_table_original_tweet_type"),
    #(['language'], "lookup_table_original_language"),
    (['engaged_with_user_id'], "lookup_table_original_engaged_with_user_id"),
    (['engaging_user_id'], "lookup_table_original_engaging_user_id"),
    (['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified'],
     "lookup_table_original_present_domains_original_language_engagee_follows_engager_original_tweet_type_original_present_media_engaged_with_user_is_verified"),
    (['engaged_with_user_id', 'tweet_type', 'language'], "lookup_table_original_engaged_with_user_id_original_tweet_type_original_language"),
    (['tw_first_word', 'tweet_type', 'language'], "lookup_table_tw_first_word_original_tweet_type_original_language"),
    (['tw_last_word', 'tweet_type', 'language'], "lookup_table_tw_last_word_original_tweet_type_original_language"),
    (['text_tokens', 'tweet_type', 'language'], "lookup_table_text_tokens_original_tweet_type_original_language"),
    #(['text_tokens'], "lookup_table_text_tokens"),
    (['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager'],
     "lookup_table_original_present_media_original_tweet_type_original_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager"), 
    (['present_domains', 'present_media', 'tweet_type', 'language'],
     "lookup_table_original_present_domains_original_present_media_original_tweet_type_original_language"),
    (['present_links', 'present_media', 'tweet_type', 'language'],
     "lookup_table_original_present_links_original_present_media_original_tweet_type_original_language"),    
    (['hashtags', 'present_media', 'tweet_type', 'language'],
     "lookup_table_original_hashtags_original_present_media_original_tweet_type_original_language")
]

train_res_dfs = {}
############## Train join #############
for in_cols, dict_name in input_cols:
    train_res_dfs[dict_name] = (['fold'] + in_cols, spark.read.parquet("%s/train_lookup/%s" % (current_path, dict_name)), 0)

proc = DataProcessor(spark, path_prefix,
                     current_path=train_feature_output_path, dicts_path=dicts_folder)

train_out_dfs = []
df = train.withColumnRenamed('hashtags', 'categorified_hashtags').\
    withColumnRenamed('tweet_id', 'categorified_tweet_id').\
    withColumnRenamed('present_media', 'categorified_present_media').\
    withColumnRenamed('present_links', 'categorified_present_links').\
    withColumnRenamed('present_domains', 'categorified_present_domains').\
    withColumnRenamed('tweet_type', 'categorified_tweet_type').\
    withColumnRenamed('language', 'categorified_language').\
    withColumnRenamed('engaged_with_user_id', 'categorified_engaged_with_user_id').\
    withColumnRenamed('engaging_user_id', 'categorified_engaging_user_id').\
    withColumnRenamed('original_hashtags', 'hashtags').\
    withColumnRenamed('original_tweet_id', 'tweet_id').\
    withColumnRenamed('original_present_media', 'present_media').\
    withColumnRenamed('original_present_links', 'present_links').\
    withColumnRenamed('original_present_domains', 'present_domains').\
    withColumnRenamed('original_tweet_type', 'tweet_type').\
    withColumnRenamed('original_language', 'language',).\
    withColumnRenamed('original_engaged_with_user_id', 'engaged_with_user_id').\
    withColumnRenamed('original_engaging_user_id', 'engaging_user_id')

merge_indicator_by_input_cols(proc, df, train_res_dfs, is_train=True)


Train Merge models took: 9437.672 secs


In [18]:
# run it, generate valid_with_feature
def merge_indicator_by_input_cols(proc, df, te, name):
    start = time.time()
    op_merge = ModelMerge(te)
    proc.reset_ops([op_merge])
    proc.transform(df, name=name)
    print("Valid Merge models took: %.3f secs" % (time.time()-start))

input_cols = [
    #(['present_media'], "lookup_table_original_present_media"),
    #(['tweet_type'], "lookup_table_original_tweet_type"),
    #(['language'], "lookup_table_original_language"),
    (['engaged_with_user_id'], "lookup_table_original_engaged_with_user_id"),
    (['engaging_user_id'], "lookup_table_original_engaging_user_id"),
    (['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified'],
     "lookup_table_original_present_domains_original_language_engagee_follows_engager_original_tweet_type_original_present_media_engaged_with_user_is_verified"),
    (['engaged_with_user_id', 'tweet_type', 'language'], "lookup_table_original_engaged_with_user_id_original_tweet_type_original_language"),
    (['tw_first_word', 'tweet_type', 'language'], "lookup_table_tw_first_word_original_tweet_type_original_language"),
    (['tw_last_word', 'tweet_type', 'language'], "lookup_table_tw_last_word_original_tweet_type_original_language"),
    (['text_tokens', 'tweet_type', 'language'], "lookup_table_text_tokens_original_tweet_type_original_language"),
    #(['text_tokens'], "lookup_table_text_tokens"),
    (['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager'],
     "lookup_table_original_present_media_original_tweet_type_original_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager"), 
    (['present_domains', 'present_media', 'tweet_type', 'language'],
     "lookup_table_original_present_domains_original_present_media_original_tweet_type_original_language"),
    (['present_links', 'present_media', 'tweet_type', 'language'],
     "lookup_table_original_present_links_original_present_media_original_tweet_type_original_language"),    
    (['hashtags', 'present_media', 'tweet_type', 'language'],
     "lookup_table_original_hashtags_original_present_media_original_tweet_type_original_language")
]
categorify_input_cols = [
    (['hashtags'], 'hashtags_categorified_hashtags'),
    (['tweet_id'], 'tweet_id_categorified_tweet_id'),
    (['present_media'], 'present_media_categorified_present_media'),
    (['present_links'], 'present_links_categorified_present_links'),
    (['present_domains'], 'present_domains_categorified_present_domains'),
    (['tweet_type'], 'tweet_type_categorified_tweet_type'),
    (['language'], 'language_categorified_language'),
    (['engaged_with_user_id'], 'engaged_with_user_id_categorified_engaged_with_user_id'),
    (['engaging_user_id'], 'engaging_user_id_categorified_engaging_user_id'),
    (['tw_first_word'], 'tw_first_word_categorified_tw_first_word'),
    (['tw_last_word'], 'tw_last_word_categorified_tw_last_word'),
    (['text_tokens'], 'text_tokens_tw_hash')
]

valid = spark.read.parquet("/recsys2021_0608_processed/sample_0_3/validate_decoded_with_word")

############## Train join #############
valid_res_dfs = {}
#for in_cols, dict_name in categorify_input_cols:
#    valid_res_dfs[dict_name] = (in_cols, spark.read.parquet("%s/train_categorified_lookup/%s" % (current_path, dict_name)), 0)

for in_cols, dict_name in input_cols:
    valid_res_dfs[dict_name] = (in_cols, spark.read.parquet("%s/valid_lookup/%s" % (current_path, dict_name)), 0)

proc = DataProcessor(spark, path_prefix,
                     current_path=current_path, dicts_path=dicts_folder)

df = valid
res = merge_indicator_by_input_cols(proc, df, valid_res_dfs, "validate_with_features")

Valid Merge models took: 135.173 secs


In [13]:
# run it!
# since we didn't save original_tw_first_word and original_tw_last_word in original train dataset
# we have to re-add them for later valid dataset lookup
#train_processed = spark.read.parquet("%s/train_with_features" % current_path)

df = spark.read.parquet("/recsys2021_0608_processed/recsys_dicts/tweet")
first_df = df.withColumnRenamed('dict_col', 'original_tw_first_word').withColumnRenamed('dict_col_id', 'tw_first_word').select('original_tw_first_word', 'tw_first_word')
last_df = df.withColumnRenamed('dict_col', 'original_tw_last_word').withColumnRenamed('dict_col_id', 'tw_last_word').select('original_tw_last_word', 'tw_last_word')
df = train\
  .join(first_df, 'tw_first_word', 'left')\
  .join(last_df, 'tw_last_word', 'left')
  #.withColumnRenamed('tw_first_word', 'categorified_tw_first_word')\
  #.withColumnRenamed('tw_last_word', 'categorified_tw_last_word')\
  #.withColumnRenamed('original_tw_first_word', 'tw_first_word')\
  #.withColumnRenamed('original_tw_last_word', 'tw_last_word')
# train_processed.show()
start = time.time()
df.write.format('parquet').mode('overwrite').save("%s/tweet_text_processed_20days_with_fold_0_3_original_word" % (current_path))
print("Add original_tw_word took %.3f secs" % (time.time() - start))

Add original_tw_word took 574.029 secs


In [39]:
df = spark.read.parquet("%s/table_with_features" % (current_path))
after_df = spark.read.parquet("%s/train_with_features_with_original_word" % (current_path))
print("origin is %d, now id %d" % (df.count(), after_df.count()))
schema = df.columns
after_schema = after_df.columns
for c in schema:
    if c not in after_schema:
        print("%s is not exists in after" % c)

for c in after_schema:
    if c not in schema:
        print("%s is not exists in before" % c)

origin is 142255442, now id 142255442
categorified_tw_last_word is not exists in before
categorified_tw_first_word is not exists in before


In [35]:
# generate categorfied tables from train_with_features as categorified_lookup
train_categofied_output_path = "%s/train_categorified_lookup/" % (current_path)
categorified_table_name = [
    ['hashtags', 'categorified_hashtags'],
    ['tweet_id', 'categorified_tweet_id'],
    ['present_media', 'categorified_present_media'],
    ['present_links', 'categorified_present_links'],
    ['present_domains', 'categorified_present_domains'],
    ['tweet_type', 'categorified_tweet_type'],
    ['language', 'categorified_language'],
    ['engaged_with_user_id', 'categorified_engaged_with_user_id'],
    ['engaging_user_id', 'categorified_engaging_user_id'],
    ['tw_first_word', 'categorified_tw_first_word'],
    ['tw_last_word', 'categorified_tw_last_word'],
    ['text_tokens', 'tweet'],
    ['text_tokens', 'tw_hash']
]
train_processed = spark.read.parquet("%s/train_with_features_with_original_word" % (current_path))
for to_select in categorified_table_name:
    start = time.time()
    cat_df = train_processed.select(to_select).distinct()
    cat_df.write.format('parquet').mode('overwrite').save("%s/%s" % (train_categofied_output_path, "_".join(to_select)))
    print("Created categorified table for %s, took %.3f seconds" % ("_".join(to_select), (time.time() - start)))

Created categorified table for hashtags_categorified_hashtags, took 3.174 seconds
Created categorified table for tweet_id_categorified_tweet_id, took 7.548 seconds
Created categorified table for present_media_categorified_present_media, took 1.893 seconds
Created categorified table for present_links_categorified_present_links, took 2.818 seconds
Created categorified table for present_domains_categorified_present_domains, took 2.606 seconds
Created categorified table for tweet_type_categorified_tweet_type, took 1.912 seconds
Created categorified table for language_categorified_language, took 2.181 seconds
Created categorified table for engaged_with_user_id_categorified_engaged_with_user_id, took 5.602 seconds
Created categorified table for engaging_user_id_categorified_engaging_user_id, took 8.041 seconds
Created categorified table for tw_first_word_categorified_tw_first_word, took 2.014 seconds
Created categorified table for tw_last_word_categorified_tw_last_word, took 4.840 seconds
Cr

In [14]:
for dict_name, v in valid_res_dfs.items():
    in_cols, df, default = v
    df.show(vertical=True, truncate=False)

-RECORD 0------------------------------------------------------------------
 present_media                                   | Video	Photo	Photo	Video 
 TE_present_media_reply_timestamp                | 0.019245570525527       
 TE_present_media_retweet_timestamp              | 0.26492688059806824     
 TE_present_media_retweet_with_comment_timestamp | 7.306360639631748E-4    
 TE_present_media_like_timestamp                 | 0.45630601048469543     
 TE_present_media_elapsed_time                   | -1.613212544E9          
 CE_present_media                                | 223.0                   
 CE_present_media_norm                           | 1.4832472743364633E-6   
-RECORD 1------------------------------------------------------------------
 present_media                                   | Video	Photo	Photo	Photo 
 TE_present_media_reply_timestamp                | 0.019954183759788673    
 TE_present_media_retweet_timestamp              | 0.24289379020531973     
 TE_present_

-RECORD 0----------------------------------------------------------------------
 language                                   | 2F548E5BE0D7F678E72DDE31DFBEF8E7 
 TE_language_reply_timestamp                | 0.035949740558862686             
 TE_language_retweet_timestamp              | 0.06307207917173703              
 TE_language_retweet_with_comment_timestamp | 0.006806539759660761             
 TE_language_like_timestamp                 | 0.444216916958491                
 TE_language_elapsed_time                   | -1.6132449066666667E9            
 CE_language                                | 1130851.0                        
 CE_language_norm                           | 0.007134777959436178             
-RECORD 1----------------------------------------------------------------------
 language                                   | F33767F7D7080003F403FDAB34FEB755 
 TE_language_reply_timestamp                | 0.03594622512658437              
 TE_language_retweet_timestamp          

-RECORD 0------------------------------------------------------------------------------
 engaging_user_id                                   | BDA344DCD384677B471BFB36F9685E51 
 TE_engaging_user_id_reply_timestamp                | 0.0019587956679364047            
 TE_engaging_user_id_retweet_timestamp              | 0.006321542353058855             
 TE_engaging_user_id_retweet_with_comment_timestamp | 4.8658895927170914E-4            
 TE_engaging_user_id_like_timestamp                 | 0.7224165002504984               
 CE_engaging_user_id                                | 349.0                            
 CE_engaging_user_id_norm                           | 2.17214892472839E-6              
-RECORD 1------------------------------------------------------------------------------
 engaging_user_id                                   | 349C91578ECD25A987A8F4D61185F8DE 
 TE_engaging_user_id_reply_timestamp                | 0.0854070174197356               
 TE_engaging_user_id_retweet_tim

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 present_domains                                                                                                                           |                                                                   
 language                                                                                                                                  | 8729EBF694C3DAF61208A209C2A542C8                                  
 engagee_follows_engager                                                                                                                   | true                                                              
 tweet_type                                                                                                                                | Retweet                    

-RECORD 0-----------------------------------------------------------------------------------------------
 tw_first_word                                                       | 1                                
 tweet_type                                                          | Quote                            
 language                                                            | 105008E45831ADE8AF1DB888319F422A 
 TE_tw_first_word_tweet_type_language_reply_timestamp                | 0.011433950159698725             
 TE_tw_first_word_tweet_type_language_retweet_timestamp              | 0.1891856367389361               
 TE_tw_first_word_tweet_type_language_retweet_with_comment_timestamp | 0.03355295645693938              
 TE_tw_first_word_tweet_type_language_like_timestamp                 | 0.284892275929451                
-RECORD 1-----------------------------------------------------------------------------------------------
 tw_first_word                                         

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 text_tokens                                                    | 101	56898	137	40094	93005	10627	10884	131	137	10685	74052	20498	11273	10308	12577	10308	137	81732	11537	27561	61590	108	81732	11537	27561	61590	216	108	39759	19180	11490	32080	20872	216	108	163	21

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 text_tokens                               | 101	5760	2154	2016	61980	51695	5713	7986	216	7146	6490	5713	7986	48096	18825	28224	51143	15221	216	10233	4388	8137	2204	2468	1943	60933	18628	16838	111806	18825	10055	10055	102                                                                                                                               

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 present_domains                                                                     | BAED74433A3AC529FBEDC13AEA9ABB9F                                                                   
 present_media                                                                       |                                                                                                    
 tweet_type                                                                          | TopLevel                                                                                           
 language                                                                            | E7F038DE3EAD397AEC9193686C911677                                                                   
 TE_present_domains_present_media_tweet_type_language_reply_times

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 hashtags                                                                     | 1A5BBAA651361E613ED2A71626C97198                                                                                                                                                                                                       
 present_media                                                                | Photo	Photo	Photo	Photo                                                                                                                                                                                                                
 tweet_type                                                     

### Print Schema

In [11]:
# Print out All TEs for train and valid
# No need to run

from prettytable import PrettyTable
from IPython.core.display import display, HTML

def generate_ascii_table(title, data):
    mycss = """
    table {
        text-align: left;        
    }
    table.td {
        max-width: 200px;
    }
    """
    x = PrettyTable()    
    x.field_names = title
    for row in data:
        x.add_row(row)
    display(HTML(x.get_html_string().format(mycss)))
    return x


def print_encoding_input(res_dfs):
    for origin_cols, te in res_dfs.items():
        output = []
        input_cols = eval(origin_cols)
        for out_col, i_dict in te.items():
            output.append([str(i_dict[0]), out_col, i_dict[2]])
        print('Valid for ' + "lookup_table_%s" % ('_'.join(input_cols)))
        generate_ascii_table(['input_cols', 'TE_Name', 'Default_value'], output)


def print_merged_schema(input_names, out_dfs):
    output = []
    for origin_cols, (table_name, spark_df) in zip(input_names, out_dfs):
        input_cols = eval(origin_cols)
        output.append([origin_cols, table_name, str(spark_df.columns)])
    generate_ascii_table(["input", "table_name", "lookup_table_schema"], output)


# print_encoding_input(train_res_dfs)
# print_encoding_input(valid_res_dfs)
input_cols = [
    #"['present_media']",
    #"['tweet_type']",
    #"['language']",
    "['engaged_with_user_id']",
    "['engaging_user_id']",
    "['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified']",
    "['engaged_with_user_id', 'tweet_type', 'language']",
    "['word', 'tweet_type', 'language']",
    "['text_tokens', 'tweet_type', 'language']",
    #"['text_tokens']",
    "['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager']",
    "['present_domains', 'present_media', 'tweet_type', 'language']",
    "['present_links', 'present_media', 'tweet_type', 'language']",
    "['hashtags', 'present_media', 'tweet_type', 'language']"
]
out_dfs_names = [                 
    #"lookup_table_original_present_media",
    #"lookup_table_original_tweet_type",
    #"lookup_table_original_language",
    "lookup_table_original_engaged_with_user_id",
    "lookup_table_original_engaging_user_id",
    "lookup_table_original_present_domains_original_language_engagee_follows_engager_original_tweet_type_original_present_media_engaged_with_user_is_verified",
    "lookup_table_original_engaged_with_user_id_original_tweet_type_original_language",
    "lookup_table_word_tweet_type_language",
    "lookup_table_text_tokens_original_tweet_type_original_language",
    #"lookup_table_text_tokens",
    "lookup_table_original_present_media_original_tweet_type_original_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager",
    "lookup_table_original_present_domains_original_present_media_original_tweet_type_original_language",
    "lookup_table_original_present_links_original_present_media_original_tweet_type_original_language",
    "lookup_table_original_hashtags_original_present_media_original_tweet_type_original_language",
]
train_output_path = "%s/train_lookup" % (current_path)
train_out_dfs = [(table_name, spark.read.parquet("%s/%s" % (train_output_path, table_name))) for table_name in out_dfs_names]
valid_output_path = "%s/valid_lookup_64features" % (current_path)
valid_out_dfs = [(table_name, spark.read.parquet("%s/%s" % (valid_output_path, table_name))) for table_name in out_dfs_names]
# print("Train lookup tables")
# print_merged_schema(input_cols, train_out_dfs)
print("Valid lookup tables")
print_merged_schema(input_cols, valid_out_dfs)


Valid lookup tables


input,table_name,lookup_table_schema
['engaged_with_user_id'],lookup_table_original_engaged_with_user_id,"['engaged_with_user_id', 'TE_engaged_with_user_id_reply_timestamp', 'TE_engaged_with_user_id_retweet_timestamp', 'TE_engaged_with_user_id_retweet_with_comment_timestamp', 'TE_engaged_with_user_id_like_timestamp', 'CE_engaged_with_user_id', 'CE_engaged_with_user_id_norm']"
['engaging_user_id'],lookup_table_original_engaging_user_id,"['engaging_user_id', 'TE_engaging_user_id_reply_timestamp', 'TE_engaging_user_id_retweet_timestamp', 'TE_engaging_user_id_retweet_with_comment_timestamp', 'TE_engaging_user_id_like_timestamp', 'CE_engaging_user_id', 'CE_engaging_user_id_norm']"
"['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified']",lookup_table_original_present_domains_original_language_engagee_follows_engager_original_tweet_type_original_present_media_engaged_with_user_is_verified,"['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified', 'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp', 'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_timestamp', 'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp', 'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_like_timestamp']"
"['engaged_with_user_id', 'tweet_type', 'language']",lookup_table_original_engaged_with_user_id_original_tweet_type_original_language,"['engaged_with_user_id', 'tweet_type', 'language', 'TE_engaged_with_user_id_tweet_type_language_reply_timestamp', 'TE_engaged_with_user_id_tweet_type_language_retweet_timestamp', 'TE_engaged_with_user_id_tweet_type_language_retweet_with_comment_timestamp', 'TE_engaged_with_user_id_tweet_type_language_like_timestamp']"
"['word', 'tweet_type', 'language']",lookup_table_word_tweet_type_language,"['word', 'language', 'tweet_type', 'TE_tw_first_word_tweet_type_language_reply_timestamp', 'TE_tw_first_word_tweet_type_language_retweet_timestamp', 'TE_tw_first_word_tweet_type_language_retweet_with_comment_timestamp', 'TE_tw_first_word_tweet_type_language_like_timestamp', 'TE_tw_last_word_tweet_type_language_reply_timestamp', 'TE_tw_last_word_tweet_type_language_retweet_timestamp', 'TE_tw_last_word_tweet_type_language_retweet_with_comment_timestamp', 'TE_tw_last_word_tweet_type_language_like_timestamp']"
"['text_tokens', 'tweet_type', 'language']",lookup_table_text_tokens_original_tweet_type_original_language,"['text_tokens', 'tweet_type', 'language', 'TE_tw_hash0_tweet_type_language_reply_timestamp', 'TE_tw_hash0_tweet_type_language_retweet_timestamp', 'TE_tw_hash0_tweet_type_language_retweet_with_comment_timestamp', 'TE_tw_hash1_tweet_type_language_retweet_with_comment_timestamp', 'TE_tw_uhash_tweet_type_language_retweet_with_comment_timestamp', 'TE_tw_hash0_tweet_type_language_like_timestamp']"
"['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager']",lookup_table_original_present_media_original_tweet_type_original_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager,"['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager', 'TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_reply_timestamp', 'TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_timestamp', 'TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_with_comment_timestamp', 'TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_like_timestamp']"
"['present_domains', 'present_media', 'tweet_type', 'language']",lookup_table_original_present_domains_original_present_media_original_tweet_type_original_language,"['present_domains', 'present_media', 'tweet_type', 'language', 'TE_present_domains_present_media_tweet_type_language_reply_timestamp', 'TE_present_domains_present_media_tweet_type_language_like_timestamp']"
"['present_links', 'present_media', 'tweet_type', 'language']",lookup_table_original_present_links_original_present_media_original_tweet_type_original_language,"['present_links', 'present_media', 'tweet_type', 'language', 'TE_present_links_present_media_tweet_type_language_reply_timestamp']"
"['hashtags', 'present_media', 'tweet_type', 'language']",lookup_table_original_hashtags_original_present_media_original_tweet_type_original_language,"['hashtags', 'present_media', 'tweet_type', 'language', 'TE_hashtags_present_media_tweet_type_language_reply_timestamp', 'TE_hashtags_present_media_tweet_type_language_retweet_timestamp', 'TE_hashtags_present_media_tweet_type_language_retweet_with_comment_timestamp', 'TE_hashtags_present_media_tweet_type_language_like_timestamp']"


### For Data preparation ###

In [None]:
# optional: split input data to valid, train and test

categorified_with_text_df = spark.read.parquet("/recsys2021_0608_processed/tweet_text_processed_fixed/")
categorified_with_text_df = categorified_with_text_df.drop('tokens')
# 1.1 get timestamp range
import datetime
min_timestamp = categorified_with_text_df.select('tweet_timestamp').agg({'tweet_timestamp': 'min'}).collect()[0]['min(tweet_timestamp)']
max_timestamp = categorified_with_text_df.select('tweet_timestamp').agg({'tweet_timestamp': 'max'}).collect()[0]['max(tweet_timestamp)']
seconds_in_day = 3600 * 24

print(
    "min_timestamp is %s, max_timestamp is %s, 20 days max is %s" % (
        datetime.datetime.fromtimestamp(min_timestamp).strftime('%Y-%m-%d %H:%M:%S'),
        datetime.datetime.fromtimestamp(max_timestamp).strftime('%Y-%m-%d %H:%M:%S'),
        datetime.datetime.fromtimestamp(min_timestamp + 20 * seconds_in_day).strftime('%Y-%m-%d %H:%M:%S')
    ))

time_range_split = {
    '20days': (min_timestamp, seconds_in_day * 20 + min_timestamp),
    'validate': (min_timestamp, seconds_in_day * 2 + min_timestamp),
    'train': (seconds_in_day * 2 + min_timestamp, seconds_in_day * 18 + min_timestamp),
    'test': (seconds_in_day * 18 + min_timestamp, seconds_in_day * 20 + min_timestamp)
}

print(time_range_split)

# 1.2 save ranged data for train
# filtering out train range data and save
train_start, train_end = time_range_split['train']
valid_start, valid_end = time_range_split['validate']
test_start, test_end = time_range_split['test']

train_data_processed = "/recsys2021_0608_processed/tweet_text_processed_train"
valid_data_processed = "/recsys2021_0608_processed/tweet_text_processed_valid"
test_data_processed = "/recsys2021_0608_processed/tweet_text_processed_test"

df = categorified_with_text_df.filter(
    (f.col('tweet_timestamp') >= f.lit(train_start)) & (f.col('tweet_timestamp') < f.lit(train_end)))
df.write.format('parquet').mode('overwrite').save(path_prefix + train_data_processed)

df = categorified_with_text_df.filter(
    (f.col('tweet_timestamp') >= f.lit(valid_start)) & (f.col('tweet_timestamp') < f.lit(valid_end)))
df.write.format('parquet').mode('overwrite').save(path_prefix + valid_data_processed)

df = categorified_with_text_df.filter(
    (f.col('tweet_timestamp') >= f.lit(test_start)) & (f.col('tweet_timestamp') < f.lit(test_end)))
df.write.format('parquet').mode('overwrite').save(path_prefix + test_data_processed)

#df = spark.read.parquet(processed_input).sample(False, 0.5, 3)
#valid, train, test = split_data(df)

min_timestamp is 2021-02-04 08:00:00, max_timestamp is 2021-02-24 07:59:59, 20 days max is 2021-02-24 08:00:00
{'20days': (1612396800, 1614124800), 'validate': (1612396800, 1612569600), 'train': (1612569600, 1613952000), 'test': (1613952000, 1614124800)}


In [37]:
from pyspark.sql.functions import row_number,lit
from pyspark.sql.window import Window
fold = 5
# Add fold to train data
train_data_path= "/recsys2021_0608_processed/sample_0_3_20days/tweet_text_processed_20days_with_fold_0_3_original_word"
train = spark.read.parquet("%s/" % (train_data_path)).sample(0.3)
train = train.withColumn('elapsed_time',(f.col('engage_time')-f.col('tweet_timestamp')).cast(DoubleType()))
train = train.drop("tw_last_word").drop("tw_first_word").drop("tw_second_word").drop("tw_llast_word").drop('fold').drop('tweet').withColumnRenamed('original_tweet', 'tweet')
tweet_hash_dict = spark.read.parquet('/recsys2021_0608_processed/recsys_dicts/tw_hash').withColumnRenamed('dict_col_id','tw_hash').withColumnRenamed('dict_col','original_tw_hash')
train = train.join(tweet_hash_dict, 'tw_hash', 'left')

w = Window().partitionBy(lit('A'))
train = train.withColumn("fold", (row_number().over(w))%lit(5))
train.write.format('parquet').mode('overwrite').save("file:///mnt/nvme2/chendi/BlueWhale/sample_0_3/stage4_20days/train_processed")

# let's re-gen tweet word
#current_path = "/recsys2021_0608_processed/"
#path_prefix = "hdfs://"
#dicts_path = "recsys_dicts"
#tweet_dict_df = spark.read.parquet("%s/%s/%s/%s" % (path_prefix, current_path, dicts_path, "tweet"))
#tweet_dict_df.orderBy('dict_col_id').show(100)
#proc = DataProcessor(spark, path_prefix,
#                     current_path=current_path, dicts_path=dict_path)
#
#train.printSchema()
#
#folds=5
##
#if 'fold' not in train.columns:
#    train = train.withColumn("fold", f.round(f.rand(seed=42)*folds))
#train.write.format('parquet').mode('overwrite').save("/recsys2021_0608_processed/tweet_text_processed_20days_with_fold_0_3")

AnalysisException: Window function row_number() requires window to be ordered, please add ORDER BY clause. For example SELECT row_number()(value_expr) OVER (PARTITION BY window_partition ORDER BY window_ordering) from table

### Backup

In [15]:
data_path= "/recsys2021_0608_fe_sample0.1/dask_fe_processed/train_fe_processed/"
tgt_train = spark.read.parquet("%s/" % (data_path))

data_path= "/recsys2021_0608_fe_sample0.1/dask_fe_processed/valid_fe_processed/"
tgt_valid = spark.read.parquet("%s/" % (data_path))

train = spark.read.parquet("%s/%s" % (current_path, "train_fe_op"))
cmp = train.select('fold', 'tweet_id', 'enaging_user_id', 'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp')\
           .join(tgt_train.select('tweet_id', 'enaging_user_id', 'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp'), ['tweet_id', 'enaging_user_id'], 'left')\
           .show(vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------
 tweet_id                                                                                                                                  | 135799937    
 enaging_user_id                                                                                                                           | 297264       
 fold                                                                                                                                      | 3            
 TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp | 5.036111E-4  
 TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp | 0.009985318  
-RECORD 1-------------------------------------------------------------

In [2]:
# Below are codes to verify current TE is same as what dask did
train_data_path= "/recsys2021_0608_fe_sample0.1/train_sample_0_1/part.0.parquet"
valid_data_path= "/recsys2021_0608_fe_sample0.1/test/part-00000-7b97dc02-ed27-4171-9961-320ef9cb9236-c000.snappy.parquet"
valid, train = (spark.read.parquet("%s/" % (valid_data_path)), spark.read.parquet("%s/" % (train_data_path)))
current_path = "/recsys2021_0608_processed/"
data_path= "/recsys2021_0608_fe_sample0.1/dask_fe_processed/train_fe_processed/"
tgt_train = spark.read.parquet("%s/" % (data_path))
data_path= "/recsys2021_0608_fe_sample0.1/dask_fe_processed/valid_fe_processed/"
tgt_valid = spark.read.parquet("%s/" % (data_path))
train = train.join(tgt_train.select('tweet_id', 'enaging_user_id', 'fold'), ['tweet_id', 'enaging_user_id'], 'left')
train = train.withColumnRenamed('enaging_user_id', 'engaging_user_id')\
             .withColumnRenamed('enaging_user_account_creation', 'engaging_user_account_creation')\
             .withColumnRenamed('enaging_user_follower_count', 'engaging_user_follower_count')\
             .withColumnRenamed('enaging_user_following_count', 'engaging_user_following_count')\
             .withColumnRenamed('enaging_user_is_verified', 'engaging_user_is_verified')
valid = valid.withColumnRenamed('enaging_user_id', 'engaging_user_id')\
             .withColumnRenamed('enaging_user_account_creation', 'engaging_user_account_creation')\
             .withColumnRenamed('enaging_user_follower_count', 'engaging_user_follower_count')\
             .withColumnRenamed('enaging_user_following_count', 'engaging_user_following_count')\
             .withColumnRenamed('enaging_user_is_verified', 'engaging_user_is_verified')

from collections import *

proc = DataProcessor(spark, path_prefix,
                     current_path=current_path, dicts_path=dicts_folder)

label_names = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

te_dicts_train = OrderedDict() # value should be tuple, (input_cols, output_col, y_mean)
te_dicts_valid = OrderedDict() # value should be tuple, (input_cols, output_col, y_mean)
encoding_columns = []

#############################################################################################
### 1. Target Encoding ###
# begin = time.time()
# y_mean_all = []
# y_mean_all_dict = {}
# for t in ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']:
#     y_mean = np.array(train.groupBy().mean(t).collect())[0][0]
#     y_mean_all.append(y_mean)
#     y_mean_all_dict[t] = y_mean
#     for c in ['present_media', 'tweet_type', 'language', 'engaged_with_user_id', 'engaging_user_id']:
#         start = time.time()
#         out_col = f'TE_{c}_{t}'
#         encoder = TargetEncoder(proc, c, t, out_col, y_mean, out_dtype=FloatType())
#         te_dicts_train[out_col], te_dicts_valid[out_col] = encoder.transform(train, valid)      
#         print(out_col," y_mean is %f, %.1f seconds"%(y_mean, time.time()-start))
# print(F"Target encoding #1 total time:{time.time()-begin}")
# #############################################################################################
# ### 2. Target Encoding for multi columns ###
# begin = time.time()
# k=0
# for t in ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']:
#     y_mean = y_mean_all_dict[t]
#     for c in [
#         ['present_domains','language','engagee_follows_engager','tweet_type','present_media','engaged_with_user_is_verified'],
#         ['engaged_with_user_id','tweet_type','language'],
#         ['tw_first_word','tweet_type','language'],
#         ['tw_last_word','tweet_type','language'],
#         ['tw_hash0','tweet_type','language'],
#         ['tw_hash1','tweet_type','language'],
#         ['tw_uhash','tweet_type','language'],
#         ['tw_hash'],
#         ['present_media','tweet_type','language','engaged_with_user_is_verified','engaging_user_is_verified','engagee_follows_engager'],
#         ['present_domains','present_media','tweet_type','language'],
#         ['present_links','present_media','tweet_type','language'],
#         ['hashtags','present_media','tweet_type','language'],
#         ]:
#         start = time.time()
#         out_col = 'TE_'+'_'.join(c)+'_'+t
#         encoder = TargetEncoder(proc, c, t, out_col, y_mean, out_dtype=FloatType())
#         te_dicts_train[out_col], te_dicts_valid[out_col] = encoder.transform(train, valid)
#         print(out_col," y_mean is %f, %.1f seconds"%(y_mean, time.time()-start))
# print(F"Target encoding #2 total time:{time.time()-begin}")
# #############################################################################################
# # 3. Target Encoding for elapse_time ###
# begin = time.time()
# y_mean = np.array(train.groupBy().mean('elapsed_time').collect())[0][0]
# y_mean_all_dict['elapsed_time'] = y_mean
# for c in ['present_media', 'tweet_type', 'language']:#, 'a_user_id', 'b_user_id']:
#     start = time.time()
#     for t in ['elapsed_time']:
#         out_col = f'TE_{c}_{t}'
#         encoder = TargetEncoder(proc, c, t, out_col, y_mean, out_dtype=FloatType())
#         te_dicts_train[out_col], te_dicts_valid[out_col] = encoder.transform(train, valid)
#         print(out_col," y_mean is %f, %.1f seconds"%(y_mean, time.time()-start))
# print(F"Target encoding #3 total time:{time.time()-begin}")
# 
# #############################################################################################
### 4. Count Encoding ###
# begin = time.time()
# for c in ['present_media', 'tweet_type', 'language', 'engaged_with_user_id', 'engaging_user_id']:
#     start = time.time()
#     out_col = f'CE_{c}'
#     encoder = CountEncoder(proc, c, out_col)
#     te_dicts_train[out_col], te_dicts_valid[out_col] = encoder.transform(train, valid)
#     print(out_col,"%.1f seconds"%(time.time()-start))
# print(F"Count Encoding total time:{time.time() - begin}")
#############################################################################################
### 5. Frequency Encoding ###
begin = time.time()
for c in ['present_media', 'tweet_type', 'language', 'engaged_with_user_id', 'engaging_user_id']:
    start = time.time()
    out_col = f'CE_{c}_norm'
    encoder = FrequencyEncoder(proc, c, out_col)
    te_dicts_train[out_col], te_dicts_valid[out_col] = encoder.transform(train, valid)
    print(out_col,"%.1f seconds"%(time.time()-start))
print(F"Frequency encoding total time:{time.time()-begin}")
#############################################################################################
# print(y_mean_all_dict)

encoding_columns = [
    ('TE_present_media_reply_timestamp',                       ['present_media']),
    ('TE_tweet_type_reply_timestamp',                          ['tweet_type']),
    ('TE_language_reply_timestamp',                            ['language']),
    ('TE_tw_first_word_tweet_type_language_reply_timestamp', ['tw_first_word', 'tweet_type', 'language']),
    ('TE_tw_last_word_tweet_type_language_reply_timestamp', ['tw_last_word', 'tweet_type', 'language']),
    ('TE_tw_hash0_tweet_type_language_reply_timestamp', ['tw_hash0', 'tweet_type', 'language']),
    ('TE_tw_hash1_tweet_type_language_reply_timestamp', ['tw_hash1', 'tweet_type', 'language']),
    ('TE_tw_uhash_tweet_type_language_reply_timestamp', ['tw_uhash', 'tweet_type', 'language']),
    ('TE_tw_hash_reply_timestamp', ['tw_hash']),
    ('TE_present_media_elapsed_time', ['present_media']),
    ('TE_tweet_type_elapsed_time', ['tweet_type']),
    ('TE_language_elapsed_time', ['language']),
    ('CE_present_media', ['present_media']),
    ('CE_tweet_type', ['tweet_type']),
    ('CE_language', ['language']),
    ('CE_present_media_norm', ['present_media']),
    ('CE_tweet_type_norm', ['tweet_type']),
    ('CE_language_norm', ['language'])
]
for table_name, input_cols in encoding_columns:
    df = spark.read.parquet("%s/train/%s" % (current_path, table_name))
    condition = ' or '.join(["(%s is not null)" % c for c in input_cols])
    if 'TE' in table_name:
        df = tgt_train.filter(condition).select(['fold', table_name] + input_cols)\
                      .join(df, ['fold'] + input_cols, 'left')\
                      .filter(f.abs(df[table_name] - tgt_train[table_name]) > 0.0000001)\
                      .withColumn("src_%s" % table_name, df[table_name])
    else:
        df = tgt_train.filter(condition).select([table_name] + input_cols)\
                      .join(df, input_cols, 'left')\
                      .filter(f.abs(df[table_name] - tgt_train[table_name]) > 0.0000001)\
                      .withColumn("src_%s" % table_name, df[table_name])   
    diff_num_rows = df.count()
    print("train %s has %d different rows" % (table_name, diff_num_rows))
    if diff_num_rows > 0:
        df.show()
    
for table_name, input_cols in encoding_columns:
    df = spark.read.parquet("%s/valid/%s" % (current_path, table_name))
    condition = ' or '.join(["(%s is not null)" % c for c in input_cols])
    df = tgt_valid.filter(condition).select([table_name] + input_cols).join(df, input_cols, 'left').filter(f.abs(df[table_name] - tgt_valid[table_name]) > 0.0000001)
    diff_num_rows = df.count()
    print("valid %s has %d different rows" % (table_name, diff_num_rows))
    if diff_num_rows > 0:
        df.show()

CE_present_media_norm 15.4 seconds
CE_tweet_type_norm 3.9 seconds
CE_language_norm 3.4 seconds
CE_engaged_with_user_id_norm 3.8 seconds
CE_engaging_user_id_norm 3.8 seconds
Frequency encoding total time:30.210938215255737
train TE_present_media_reply_timestamp has 0 different rows
train TE_tweet_type_reply_timestamp has 0 different rows
train TE_language_reply_timestamp has 0 different rows
train TE_tw_first_word_tweet_type_language_reply_timestamp has 0 different rows
train TE_tw_last_word_tweet_type_language_reply_timestamp has 0 different rows
train TE_tw_hash0_tweet_type_language_reply_timestamp has 0 different rows
train TE_tw_hash1_tweet_type_language_reply_timestamp has 0 different rows
train TE_tw_uhash_tweet_type_language_reply_timestamp has 0 different rows
train TE_tw_hash_reply_timestamp has 0 different rows
train TE_present_media_elapsed_time has 0 different rows
train TE_tweet_type_elapsed_time has 0 different rows
train TE_language_elapsed_time has 0 different rows
train

In [None]:
train = train.withColumn('engaged_following_follower_rate',(col('engaged_with_user_following_count')/col('engaged_with_user_follower_count')).cast(FloatType()))
train = train.withColumn('enaging_follower_following_rate',(col('enaging_user_follower_count')/col('enaging_user_following_count')).cast(FloatType()))
valid = valid.withColumn('engaged_following_follower_rate',(col('engaged_with_user_following_count')/col('engaged_with_user_follower_count')).cast(FloatType()))
valid = valid.withColumn('enaging_follower_following_rate',(col('enaging_user_follower_count')/col('enaging_user_following_count')).cast(FloatType()))



train.write.format('parquet').mode('overwrite').save("/recsys2021/output/fe_processed_train")

valid.write.format('parquet').mode('overwrite').save("/recsys2021/output/fe_processed_valid")

In [None]:
table_name_map = [
    ('TE_present_media_reply_timestamp',                       ['present_media']),
    ('TE_tweet_type_reply_timestamp',                          ['tweet_type']),
    ('TE_language_reply_timestamp',                            ['language']),
    ('TE_engaged_with_user_id_reply_timestamp',                ['engaged_with_user_id']),
    ('TE_engaging_user_id_reply_timestamp',                    ['engaging_user_id'],        48  ),
    ('TE_present_media_retweet_timestamp',                     ['present_media'],           5),
    ('TE_tweet_type_retweet_timestamp',                        ['tweet_type'],              5),
    ('TE_language_retweet_timestamp',                          ['language'],                5),
    ('TE_engaged_with_user_id_retweet_timestamp',              ['engaged_with_user_id'],    5),
    ('TE_engaging_user_id_retweet_timestamp',                  ['engaging_user_id'] ,       95),
    ('TE_present_media_retweet_with_comment_timestamp',        ['present_media'],           85),
    ('TE_tweet_type_retweet_with_comment_timestamp',           ['tweet_type'],              85),
    ('TE_language_retweet_with_comment_timestamp',             ['language'],                85),
    ('TE_engaged_with_user_id_retweet_with_comment_timestamp', ['engaged_with_user_id'],    85),
    ('TE_engaging_user_id_retweet_with_comment_timestamp',     ['engaging_user_id'],        585),
    ('TE_present_media_like_timestamp',                        ['present_media'],           ),
    ('TE_tweet_type_like_timestamp',                           ['tweet_type'],              ),
    ('TE_language_like_timestamp',                             ['language']  ,              ),
    ('TE_engaged_with_user_id_like_timestamp',                 ['engaged_with_user_id'],    ),
    ('TE_engaging_user_id_like_timestamp',                     ['engaging_user_id'] ,       4),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified'], 0.027071024316871948),
    ('TE_engaged_with_user_id_tweet_type_language_reply_timestamp', ['engaged_with_user_id', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_tw_first_word_tweet_type_language_reply_timestamp', ['tw_first_word', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_tw_last_word_tweet_type_language_reply_timestamp', ['tw_last_word', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_tw_hash0_tweet_type_language_reply_timestamp', ['tw_hash0', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_tw_hash1_tweet_type_language_reply_timestamp', ['tw_hash1', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_tw_uhash_tweet_type_language_reply_timestamp', ['tw_uhash', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_tw_hash_reply_timestamp', ['tw_hash'], 0.027071024316871948),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_reply_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager'], 0.027071024316871948),
    ('TE_present_domains_present_media_tweet_type_language_reply_timestamp', ['present_domains', 'present_media', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_present_links_present_media_tweet_type_language_reply_timestamp', ['present_links', 'present_media', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_hashtags_present_media_tweet_type_language_reply_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified'], 0.087306165472513950),
    ('TE_engaged_with_user_id_tweet_type_language_retweet_timestamp', ['engaged_with_user_id', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_tw_first_word_tweet_type_language_retweet_timestamp', ['tw_first_word', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_tw_last_word_tweet_type_language_retweet_timestamp', ['tw_last_word', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_tw_hash0_tweet_type_language_retweet_timestamp', ['tw_hash0', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_tw_hash1_tweet_type_language_retweet_timestamp', ['tw_hash1', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_tw_uhash_tweet_type_language_retweet_timestamp', ['tw_uhash', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_tw_hash_retweet_timestamp', ['tw_hash'], 0.087306165472513950),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager'], 0.087306165472513950),
    ('TE_present_domains_present_media_tweet_type_language_retweet_timestamp', ['present_domains', 'present_media', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_present_links_present_media_tweet_type_language_retweet_timestamp', ['present_links', 'present_media', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_hashtags_present_media_tweet_type_language_retweet_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified'], 0.006721722044055585),
    ('TE_engaged_with_user_id_tweet_type_language_retweet_with_comment_timestamp', ['engaged_with_user_id', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_tw_first_word_tweet_type_language_retweet_with_comment_timestamp', ['tw_first_word', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_tw_last_word_tweet_type_language_retweet_with_comment_timestamp', ['tw_last_word', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_tw_hash0_tweet_type_language_retweet_with_comment_timestamp', ['tw_hash0', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_tw_hash1_tweet_type_language_retweet_with_comment_timestamp', ['tw_hash1', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_tw_uhash_tweet_type_language_retweet_with_comment_timestamp', ['tw_uhash', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_tw_hash_retweet_with_comment_timestamp', ['tw_hash'], 0.006721722044055585),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_with_comment_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager'], 0.006721722044055585),
    ('TE_present_domains_present_media_tweet_type_language_retweet_with_comment_timestamp', ['present_domains', 'present_media', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_present_links_present_media_tweet_type_language_retweet_with_comment_timestamp', ['present_links', 'present_media', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_hashtags_present_media_tweet_type_language_retweet_with_comment_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_like_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified'], 0.398006794503377392),
    ('TE_engaged_with_user_id_tweet_type_language_like_timestamp', ['engaged_with_user_id', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_tw_first_word_tweet_type_language_like_timestamp', ['tw_first_word', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_tw_last_word_tweet_type_language_like_timestamp', ['tw_last_word', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_tw_hash0_tweet_type_language_like_timestamp', ['tw_hash0', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_tw_hash1_tweet_type_language_like_timestamp', ['tw_hash1', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_tw_uhash_tweet_type_language_like_timestamp', ['tw_uhash', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_tw_hash_like_timestamp', ['tw_hash'], 0.398006794503377392),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_like_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager'], 0.398006794503377392),
    ('TE_present_domains_present_media_tweet_type_language_like_timestamp', ['present_domains', 'present_media', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_present_links_present_media_tweet_type_language_like_timestamp', ['present_links', 'present_media', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_hashtags_present_media_tweet_type_language_like_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_present_media_elapsed_time', ['present_media'], -1613246812.928331136703491211),
    ('TE_tweet_type_elapsed_time', ['tweet_type'], -1613246812.928331136703491211),
    ('TE_language_elapsed_time', ['language'], -1613246812.928331136703491211),
    ('CE_present_media', ['present_media'], 0),
    ('CE_tweet_type', ['tweet_type'], 0),
    ('CE_language', ['language'], 0),
    ('CE_engaged_with_user_id', ['engaged_with_user_id'], 0),
    ('CE_engaging_user_id', ['engaging_user_id'], 0),
    ('CE_present_media_norm', ['present_media'], 0),
    ('CE_tweet_type_norm', ['tweet_type'], 0),
    ('CE_language_norm', ['language'], 0),
    ('CE_engaged_with_user_id_norm', ['engaged_with_user_id'], 0),
    ('CE_engaging_user_id_norm', ['engaging_user_id'], 0)
]


In [None]:
# 55 features
table_name_map = [    
    ('TE_engaging_user_id_reply_timestamp',                    ['engaging_user_id'],        48  ),    
    ('TE_engaging_user_id_retweet_timestamp',                  ['engaging_user_id'] ,       95),    
    ('TE_engaging_user_id_retweet_with_comment_timestamp',     ['engaging_user_id'],        585),
    ('TE_engaging_user_id_like_timestamp',                     ['engaging_user_id'] ,       4),
    ('CE_engaging_user_id', ['engaging_user_id'], 0),
    ('CE_engaging_user_id_norm', ['engaging_user_id'], 0),    
    ('TE_engaged_with_user_id_reply_timestamp',                ['engaged_with_user_id']),
    ('TE_engaged_with_user_id_retweet_timestamp',              ['engaged_with_user_id'],    5),
    ('TE_engaged_with_user_id_retweet_with_comment_timestamp', ['engaged_with_user_id'],    85),
    ('TE_engaged_with_user_id_like_timestamp',                 ['engaged_with_user_id'],    ),    
    ('CE_engaged_with_user_id', ['engaged_with_user_id'], 0),    
    ('CE_engaged_with_user_id_norm', ['engaged_with_user_id'], 0),
    ('TE_present_links_present_media_tweet_type_language_reply_timestamp', ['present_links', 'present_media', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_tw_hash0_tweet_type_language_reply_timestamp', ['tw_hash0', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_tw_hash0_tweet_type_language_retweet_timestamp', ['tw_hash0', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_tw_hash0_tweet_type_language_retweet_with_comment_timestamp', ['tw_hash0', 'tweet_type', 'language'], 0.006721722044055585),  
    ('TE_tw_hash0_tweet_type_language_like_timestamp', ['tw_hash0', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_tw_hash1_tweet_type_language_retweet_with_comment_timestamp', ['tw_hash1', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_tw_uhash_tweet_type_language_retweet_with_comment_timestamp', ['tw_uhash', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_reply_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager'], 0.027071024316871948),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager'], 0.087306165472513950),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_retweet_with_comment_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager'], 0.006721722044055585),
    ('TE_present_media_tweet_type_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager_like_timestamp', ['present_media', 'tweet_type', 'language', 'engaged_with_user_is_verified', 'engaging_user_is_verified', 'engagee_follows_engager'], 0.398006794503377392),
    ('TE_tw_first_word_tweet_type_language_reply_timestamp', ['tw_first_word', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_tw_first_word_tweet_type_language_retweet_timestamp', ['tw_first_word', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_tw_first_word_tweet_type_language_retweet_with_comment_timestamp', ['tw_first_word', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_tw_first_word_tweet_type_language_like_timestamp', ['tw_first_word', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_tw_last_word_tweet_type_language_reply_timestamp', ['tw_last_word', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_tw_last_word_tweet_type_language_retweet_timestamp', ['tw_last_word', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_tw_last_word_tweet_type_language_retweet_with_comment_timestamp', ['tw_last_word', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_tw_last_word_tweet_type_language_like_timestamp', ['tw_last_word', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified'], 0.027071024316871948),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified'], 0.087306165472513950),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_retweet_with_comment_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified'], 0.006721722044055585),
    ('TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_like_timestamp', ['present_domains', 'language', 'engagee_follows_engager', 'tweet_type', 'present_media', 'engaged_with_user_is_verified'], 0.398006794503377392),
    ('TE_present_domains_present_media_tweet_type_language_reply_timestamp', ['present_domains', 'present_media', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_present_domains_present_media_tweet_type_language_like_timestamp', ['present_domains', 'present_media', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_engaged_with_user_id_tweet_type_language_reply_timestamp', ['engaged_with_user_id', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_engaged_with_user_id_tweet_type_language_retweet_timestamp', ['engaged_with_user_id', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_engaged_with_user_id_tweet_type_language_retweet_with_comment_timestamp', ['engaged_with_user_id', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_engaged_with_user_id_tweet_type_language_like_timestamp', ['engaged_with_user_id', 'tweet_type', 'language'], 0.398006794503377392),
    ('TE_hashtags_present_media_tweet_type_language_reply_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language'], 0.027071024316871948),
    ('TE_hashtags_present_media_tweet_type_language_retweet_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language'], 0.087306165472513950),
    ('TE_hashtags_present_media_tweet_type_language_retweet_with_comment_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language'], 0.006721722044055585),
    ('TE_hashtags_present_media_tweet_type_language_like_timestamp', ['hashtags', 'present_media', 'tweet_type', 'language'], 0.398006794503377392)
]
