In [None]:
#!/env/bin/python

import init

import findspark
findspark.init()

import os
import pandas as pd
import numpy as np
from pyspark.sql import *
from pyspark import *
import pyspark.sql.functions as f
import pyspark.sql.types as t
from timeit import default_timer as timer
import logging
from RecsysSchema import RecsysSchema
from pyrecdp.data_processor import *
from pyrecdp.encoder import *
from pyrecdp.utils import *
import hashlib

# Names of the four engagement target columns. They are rewritten in place to
# 0/1 labels by decodeBertTokenizerAndExtractFeatures and are selected ahead
# of the features via SelectFeature(target_list + final_feature_list).
target_list = [
 'reply_timestamp',
 'retweet_timestamp',
 'retweet_with_comment_timestamp',
 'like_timestamp'
]

# Feature columns kept in the final train/test outputs; passed together with
# target_list to SelectFeature in encodingFeatures and mergeFeaturesToTest.
# Entries prefixed 'TE_' / 'GTE_' follow the names built in encodingFeatures:
# 'TE_<column>_<target>' for single columns and
# 'GTE_<col1>_<col2>_<target>' for column groups.
final_feature_list = [
 # columns read as-is from the input data
 'engaged_with_user_follower_count',
 'engaged_with_user_following_count',
 'engaged_with_user_is_verified',
 'engaging_user_follower_count',
 'engaging_user_following_count',
 'engaging_user_is_verified',
 # columns added in decodeBertTokenizerAndExtractFeatures
 'has_photo',
 'has_video',
 'has_gif',
 'a_ff_rate',
 'b_ff_rate', 
 'dt_hour',
 'dt_dow',
 'has_mention',  
 # columns added by the categorify steps
 'mentioned_bucket_id',    
 'mentioned_count',    
 'most_used_word_bucket_id',
 'second_used_word_bucket_id',
 # single-column encodings (TE_<column>_<target>)
 'TE_tweet_type_reply_timestamp',
 'TE_tweet_type_retweet_timestamp',
 'TE_dt_dow_retweet_timestamp',
 'TE_most_used_word_bucket_id_reply_timestamp',
 'TE_most_used_word_bucket_id_retweet_timestamp',
 'TE_most_used_word_bucket_id_retweet_with_comment_timestamp',
 'TE_most_used_word_bucket_id_like_timestamp',
 'TE_second_used_word_bucket_id_reply_timestamp',
 'TE_second_used_word_bucket_id_retweet_timestamp',
 'TE_second_used_word_bucket_id_retweet_with_comment_timestamp',
 'TE_second_used_word_bucket_id_like_timestamp',
 'TE_mentioned_bucket_id_retweet_timestamp',
 'TE_mentioned_bucket_id_retweet_with_comment_timestamp',
 'TE_mentioned_bucket_id_like_timestamp',
 'TE_mentioned_bucket_id_reply_timestamp',
 'TE_language_reply_timestamp',
 'TE_language_retweet_timestamp',
 'TE_language_retweet_with_comment_timestamp',
 'TE_language_like_timestamp',
 'TE_mentioned_count_reply_timestamp',
 'TE_mentioned_count_retweet_timestamp',
 'TE_mentioned_count_retweet_with_comment_timestamp',
 'TE_mentioned_count_like_timestamp',
 'TE_engaged_with_user_id_reply_timestamp',
 'TE_engaged_with_user_id_retweet_timestamp',
 'TE_engaged_with_user_id_retweet_with_comment_timestamp',
 'TE_engaged_with_user_id_like_timestamp',
 # column-group encodings (GTE_<col1>_<col2>_<target>)
 'GTE_language_engaged_with_user_id_reply_timestamp',
 'GTE_language_engaged_with_user_id_retweet_timestamp',
 'GTE_language_engaged_with_user_id_retweet_with_comment_timestamp',
 'GTE_language_engaged_with_user_id_like_timestamp',
 'GTE_tweet_type_engaged_with_user_id_reply_timestamp',
 'GTE_tweet_type_engaged_with_user_id_retweet_timestamp',
 'GTE_tweet_type_engaged_with_user_id_retweet_with_comment_timestamp',
 'GTE_tweet_type_engaged_with_user_id_like_timestamp',
 'GTE_has_mention_engaging_user_id_reply_timestamp',
 'GTE_has_mention_engaging_user_id_retweet_timestamp',
 'GTE_has_mention_engaging_user_id_retweet_with_comment_timestamp',
 'GTE_has_mention_engaging_user_id_like_timestamp',
 'GTE_mentioned_bucket_id_engaging_user_id_reply_timestamp',
 'GTE_mentioned_bucket_id_engaging_user_id_retweet_timestamp',
 'GTE_mentioned_bucket_id_engaging_user_id_retweet_with_comment_timestamp',
 'GTE_mentioned_bucket_id_engaging_user_id_like_timestamp',
 'GTE_language_engaging_user_id_reply_timestamp',
 'GTE_language_engaging_user_id_retweet_timestamp',
 'GTE_language_engaging_user_id_retweet_with_comment_timestamp',
 'GTE_language_engaging_user_id_like_timestamp',
 'GTE_tweet_type_engaging_user_id_reply_timestamp',
 'GTE_tweet_type_engaging_user_id_retweet_timestamp',
 'GTE_tweet_type_engaging_user_id_retweet_with_comment_timestamp',
 'GTE_tweet_type_engaging_user_id_like_timestamp',
 'GTE_dt_dow_engaged_with_user_id_reply_timestamp',
 'GTE_dt_dow_engaged_with_user_id_retweet_timestamp',
 'GTE_dt_dow_engaged_with_user_id_retweet_with_comment_timestamp',
 'GTE_dt_dow_engaged_with_user_id_like_timestamp',
 'GTE_mentioned_count_engaging_user_id_reply_timestamp',
 'GTE_mentioned_count_engaging_user_id_retweet_timestamp',
 'GTE_mentioned_count_engaging_user_id_retweet_with_comment_timestamp',
 'GTE_mentioned_count_engaging_user_id_like_timestamp',
 'GTE_dt_hour_engaged_with_user_id_reply_timestamp',
 'GTE_dt_hour_engaged_with_user_id_retweet_timestamp',
 'GTE_dt_hour_engaged_with_user_id_retweet_with_comment_timestamp',
 'GTE_dt_hour_engaged_with_user_id_like_timestamp',
 'GTE_dt_dow_engaging_user_id_reply_timestamp',
 'GTE_dt_dow_engaging_user_id_retweet_timestamp',
 'GTE_dt_dow_engaging_user_id_retweet_with_comment_timestamp',
 'GTE_dt_dow_engaging_user_id_like_timestamp',
 'GTE_dt_hour_engaging_user_id_reply_timestamp',
 'GTE_dt_hour_engaging_user_id_retweet_timestamp',
 'GTE_dt_hour_engaging_user_id_retweet_with_comment_timestamp',
 'GTE_dt_hour_engaging_user_id_like_timestamp'
]

def decodeBertTokenizerAndExtractFeatures(df, proc, output_name):
    """Decode tweet tokens to text, binarize the targets and add base features.

    Four pyrecdp operations are registered on ``proc`` and executed with a
    single ``proc.transform`` call:

    1. decode the tab-separated token ids in ``text_tokens`` with the
       ``bert-base-multilingual-cased`` tokenizer (stored under ``tweet``),
    2. repair the spacing the tokenizer puts into ``https://t.co/`` links and
       after ``@``,
    3. replace each of the four ``*_timestamp`` target columns by
       ``1`` when the timestamp is > 0 and ``0`` otherwise,
    4. add ``has_photo``, ``has_video``, ``has_gif``, ``a_ff_rate``,
       ``b_ff_rate``, ``dt_dow``, ``dt_hour``, ``mention`` and ``has_mention``.

    Args:
        df: input Spark DataFrame; must provide the columns referenced by the
            expressions below (``text_tokens``, ``present_media``,
            ``tweet_timestamp``, the follower/following counts and the four
            target timestamps).
        proc: pyrecdp ``DataProcessor`` used to run the operations.
        output_name: name handed to ``proc.transform`` for the result.

    Returns:
        The DataFrame returned by ``proc.transform``.
    """
    # Imported lazily so the module can be loaded without transformers.
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-multilingual-cased', do_lower_case=False)

    # UDFs: token ids -> text, then undo tokenizer spacing in links/mentions.
    tokenizer_decode = f.udf(lambda x: tokenizer.decode(
        [int(n) for n in x.split('\t')]))
    format_url = f.udf(lambda x: x.replace(
        'https : / / t. co / ', 'https://t.co/').replace('@ ', '@'))

    # Decode operations.
    op_feature_modification_tokenizer_decode = FeatureAdd(
        cols={'tweet': 'text_tokens'}, udfImpl=tokenizer_decode)
    op_feature_modification_format_url = FeatureModification(
        cols=['tweet'], udfImpl=format_url)

    # Engagement timestamps -> binary labels.
    op_feature_target_classify = FeatureModification(cols={
        "reply_timestamp": "f.when(f.col('reply_timestamp') > 0, 1).otherwise(0)",
        "retweet_timestamp": "f.when(f.col('retweet_timestamp') > 0, 1).otherwise(0)",
        "retweet_with_comment_timestamp": "f.when(f.col('retweet_with_comment_timestamp') > 0, 1).otherwise(0)",
        "like_timestamp": "f.when(f.col('like_timestamp') > 0, 1).otherwise(0)"}, op='inline')

    # New features. The values are expression strings evaluated by pyrecdp
    # (op='inline').
    op_feature_from_original = FeatureAdd(
        cols={"has_photo": "f.col('present_media').contains('Photo').cast(t.IntegerType())",
              # Fixed: the media type was misspelled 'Vedio', which never
              # matched and left has_video at 0 for every row.
              "has_video": "f.col('present_media').contains('Video').cast(t.IntegerType())",
              "has_gif": "f.col('present_media').contains('GIF').cast(t.IntegerType())",
              "a_ff_rate": "f.col('engaged_with_user_following_count')/f.col('engaged_with_user_follower_count')",
              "b_ff_rate": "f.col('engaging_user_following_count') /f.col('engaging_user_follower_count')",
              "dt_dow": "f.dayofweek(f.from_unixtime(f.col('tweet_timestamp'))).cast(t.IntegerType())",
              "dt_hour": "f.hour(f.from_unixtime(f.col('tweet_timestamp'))).cast(t.IntegerType())",
              # Raw outer literal: same string value, but '\s' / '\S' are no
              # longer invalid escape sequences in this source file.
              # NOTE(review): '[^RT]' is a character class (one char that is
              # neither 'R' nor 'T'), not "not preceded by 'RT'" - confirm
              # this is the intended way to skip retweet prefixes.
              "mention": r"f.regexp_extract(f.col('tweet'), r'[^RT]\s@(\S+)', 1)",
              "has_mention": "(f.col('mention')!= '').cast(t.IntegerType())"
        }, op='inline')

    # Execute all four operations in order.
    proc.reset_ops([op_feature_modification_tokenizer_decode,
                    op_feature_modification_format_url,
                    op_feature_target_classify,
                    op_feature_from_original])
    t1 = timer()
    df = proc.transform(df, name=output_name)
    t2 = timer()
    print("BertTokenizer decode and feature extacting took %.3f" % (t2 - t1))

    return df

def categorifyFeatures(df, proc, output_name, gen_dict, sampleRatio=1):
    """Categorify tweet words and mentions and pick the two top word buckets.

    Args:
        df: Spark DataFrame providing the ``tweet`` and ``mention`` columns.
        proc: pyrecdp ``DataProcessor`` that runs the operations.
        output_name: name handed to ``proc.transform`` for the result.
        gen_dict: when true, the ``tweet`` / ``mention`` dictionaries are
            generated from ``df``; otherwise they are read as parquet from
            ``<path_prefix>/<current_path>/<dicts_path>/<name>``.
        sampleRatio: when not equal to 1, ``df`` is sampled with this
            fraction before the transform.

    Returns:
        ``(df, dict_dfs)``: the transformed DataFrame and the dictionary
        list, including the two extra ``mentioned_*`` entries added here.
    """
    # 1. obtain the dictionaries
    if gen_dict:
        # dictionaries are not prepared yet: build them from df
        dictionary_ops = [
            GenerateDictionary(['tweet'], doSplit=True, sep=' ', bucketSize=100),
            GenerateDictionary(['mention'], bucketSize=100),
        ]
        proc.reset_ops(dictionary_ops)
        started = timer()
        dict_dfs = proc.generate_dicts(df)
        print("Generate Dictionary took %.3f" % (timer() - started))
    else:
        dicts_root = "%s/%s/%s" % (proc.path_prefix, proc.current_path, proc.dicts_path)
        dict_dfs = []
        for name in ['tweet', 'mention']:
            loaded = proc.spark.read.parquet("%s/%s" % (dicts_root, name))
            dict_dfs.append({'col_name': name, 'dict': loaded})

    # 2. both mentioned_bucket_id and mentioned_count are looked up from the
    #    mention dictionary, so register it under those two names as well
    mention_dicts = [entry['dict'] for entry in dict_dfs if entry['col_name'] == 'mention']
    for mention_dict in mention_dicts:
        dict_dfs.append({'col_name': 'mentioned_bucket_id', 'dict': mention_dict})
        # for mentioned_count the 'count' column takes the place of the id
        count_as_id = mention_dict.drop('dict_col_id').withColumnRenamed('count', 'dict_col_id')
        dict_dfs.append({'col_name': 'mentioned_count', 'dict': count_as_id})
    copy_mention = FeatureAdd(
        {"mentioned_bucket_id": "f.col('mention')", "mentioned_count": "f.col('mention')"},
        op='inline')

    # 3. categorify
    categorify_words = Categorify(
        [{'bucketized_tweet_word': 'tweet'}], dict_dfs=dict_dfs, doSplit=True, sep=' ')
    categorify_mentions = Categorify(
        ['mentioned_bucket_id', 'mentioned_count'], dict_dfs=dict_dfs)
    proc.reset_ops([copy_mention, categorify_words, categorify_mentions])

    # 4. most and second most used bucketized_tweet_word
    sort_by_frequency = FeatureAdd(
        cols={'sorted_bucketized_tweet_word': "f.expr('sortIntArrayByFrequency(bucketized_tweet_word)')"},
        op='inline')
    top_two_words = FeatureAdd(
        cols={
            'most_used_word_bucket_id': "f.when(f.size(f.col('sorted_bucketized_tweet_word'))>0, f.col('sorted_bucketized_tweet_word').getItem(0)).otherwise(np.nan)",
            'second_used_word_bucket_id': "f.when(f.size(f.col('sorted_bucketized_tweet_word'))>1, f.col('sorted_bucketized_tweet_word').getItem(1)).otherwise(np.nan)",
        },
        op='inline')
    proc.append_ops([sort_by_frequency, top_two_words])

    # 5. transform (the timing includes building the optional sample)
    started = timer()
    if sampleRatio != 1:
        df = df.sample(sampleRatio)
    df = proc.transform(df, name=output_name)
    print("categorify and getMostAndSecondUsedWordBucketId took %.3f" % (timer() - started))
    return (df, dict_dfs)


def encodingFeatures(df, proc, output_name, gen_dict, sampleRatio=1):
    """Build (or load) target-encoding tables and merge them into ``df``.

    Args:
        df: Spark DataFrame holding the four target columns and the feature
            columns listed in ``features`` below.
        proc: pyrecdp ``DataProcessor``; also supplies the Spark session and
            the path components of the dictionary folder.
        output_name: name handed to ``proc.transform`` for the result.
        gen_dict: when true, the target means are computed and saved and a
            ``TargetEncoder`` is run per feature; otherwise the previously
            saved tables are read from the dictionary folder.
        sampleRatio: when not equal to 1, ``df`` is sampled with this
            fraction before the final transform.

    Returns:
        ``(df, te_train_dfs, te_test_dfs, y_mean_all_df)``.
    """
    targets = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']
    dicts_root = "%s/%s/%s" % (proc.path_prefix, proc.current_path, proc.dicts_path)
    targets_mean_path = "%s/targets_mean" % dicts_root
    y_mean_all = []

    t1 = timer()
    if gen_dict:
        # overall mean of every target, one aggregation per target
        for tgt in targets:
            mean_row = df.groupBy().mean(tgt).collect()[0]
            y_mean_all.append(mean_row[f"avg({tgt})"])
        mean_fields = [t.StructField(tgt, t.FloatType(), True) for tgt in targets]
        y_mean_all_df = proc.spark.createDataFrame([tuple(y_mean_all)], t.StructType(mean_fields))
        y_mean_all_df.write.format("parquet").mode("overwrite").save(targets_mean_path)
    # the means are always (re-)read from storage
    y_mean_all_df = proc.spark.read.parquet(targets_mean_path)

    # plain strings are encoded on their own, lists as a column group
    features = [
        'engaged_with_user_id',
        'language',
        'dt_dow',
        'tweet_type',
        'most_used_word_bucket_id',
        'second_used_word_bucket_id',
        'mentioned_count',
        'mentioned_bucket_id',
        ['has_mention', 'engaging_user_id'],
        ['mentioned_count', 'engaging_user_id'],
        ['mentioned_bucket_id', 'engaging_user_id'],
        ['language', 'engaged_with_user_id'],
        ['language', 'engaging_user_id'],
        ['dt_dow', 'engaged_with_user_id'],
        ['dt_dow', 'engaging_user_id'],
        ['dt_hour', 'engaged_with_user_id'],
        ['dt_hour', 'engaging_user_id'],
        ['tweet_type', 'engaged_with_user_id'],
        ['tweet_type', 'engaging_user_id'],
    ]
    # targets that are skipped for the given single-column feature
    excludes = {
        'dt_dow': ['reply_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'],
        'tweet_type': ['like_timestamp', 'retweet_with_comment_timestamp'],
    }

    te_train_dfs = []
    te_test_dfs = []
    for c in features:
        is_group = isinstance(c, list)
        key_cols = c if is_group else [c]
        if str(c) in excludes:
            target_tmp = [tgt for tgt in targets if tgt not in excludes[c]]
        else:
            target_tmp = targets
        prefix = 'GTE_' + '_'.join(c) if is_group else f'TE_{c}'
        out_col_list = [prefix + '_' + tgt for tgt in target_tmp]
        # the name stays empty when no target is left for this feature
        out_name = prefix if target_tmp else ""

        if gen_dict:
            start = timer()
            encoder = TargetEncoder(proc, c, target_tmp, out_col_list, out_name,
                                    out_dtype=t.FloatType(), y_mean_list=y_mean_all)
            te_train_df, te_test_df = encoder.transform(df)
            print(f"generating target encoding for %s upon %s took %.1f seconds"%(str(c), str(target_tmp), timer()-start))
        else:
            te_train_df = proc.spark.read.parquet("%s/train/%s" % (dicts_root, out_name))
            te_test_df = proc.spark.read.parquet("%s/test/%s" % (dicts_root, out_name))
        te_train_dfs.append({'col_name': ['fold'] + key_cols, 'dict': te_train_df})
        te_test_dfs.append({'col_name': c, 'dict': te_test_df})
    print("Generate encoding feature totally took %.3f" % (timer() - t1))

    # merge the train tables into the data, then keep targets + final features
    proc.reset_ops([ModelMerge(te_train_dfs)])
    proc.append_ops([SelectFeature(target_list + final_feature_list)])

    t1 = timer()
    if sampleRatio != 1:
        df = df.sample(sampleRatio)
    df = proc.transform(df, name=output_name)
    print("encodingFeatures took %.3f" % (timer() - t1))

    return (df, te_train_dfs, te_test_dfs, y_mean_all_df)


def splitByDate(df, proc, train_output, test_output, numFolds=5):
    """Split ``df`` by ``tweet_timestamp`` into train and test and save both.

    Train covers the first 18 days starting at the minimum timestamp
    (``[min, min + 18 days)``); test covers everything from there up to and
    including the maximum timestamp. Train rows additionally get a random
    ``fold`` id in ``0 .. numFolds - 1``. Both parts are written as parquet
    to ``proc.path_prefix + proc.current_path + <output>`` and read back.

    Args:
        df: Spark DataFrame with a numeric ``tweet_timestamp`` column
            (seconds since the epoch).
        proc: pyrecdp ``DataProcessor`` supplying ``path_prefix``,
            ``current_path`` and the Spark session.
        train_output: folder name for the train split.
        test_output: folder name for the test split.
        numFolds: number of folds assigned to the train rows.

    Returns:
        ``(train_df, test_df)`` re-read from the written parquet folders.

    Raises:
        ValueError: if ``df`` has no non-null ``tweet_timestamp``.
    """
    # 1.1 get timestamp range (one aggregation instead of two separate jobs)
    import datetime
    bounds = df.agg(f.min('tweet_timestamp').alias('min_ts'),
                    f.max('tweet_timestamp').alias('max_ts')).collect()[0]
    min_timestamp = bounds['min_ts']
    max_timestamp = bounds['max_ts']
    if min_timestamp is None or max_timestamp is None:
        raise ValueError("splitByDate: no tweet_timestamp values found in df")
    seconds_in_day = 3600 * 24

    print(
        "min_timestamp is %s, max_timestamp is %s, 20 days max is %s" % (
            datetime.datetime.fromtimestamp(min_timestamp).strftime('%Y-%m-%d %H:%M:%S'),
            datetime.datetime.fromtimestamp(max_timestamp).strftime('%Y-%m-%d %H:%M:%S'),
            datetime.datetime.fromtimestamp(min_timestamp + 20 * seconds_in_day).strftime('%Y-%m-%d %H:%M:%S')
        ))

    time_range_split = {
        'train': (min_timestamp, seconds_in_day * 18 + min_timestamp),
        'test': (seconds_in_day * 18 + min_timestamp, max_timestamp)
    }

    print(time_range_split)

    train_start, train_end = time_range_split['train']
    test_start, test_end = time_range_split['test']
    train_path = proc.path_prefix + proc.current_path + train_output
    test_path = proc.path_prefix + proc.current_path + test_output

    # 1.2 filter the train range (end exclusive), assign folds and save
    t1 = timer()
    train_df = df.filter(
        (f.col('tweet_timestamp') >= f.lit(train_start)) & (f.col('tweet_timestamp') < f.lit(train_end)))
    # floor(rand * numFolds) gives every fold 0..numFolds-1 the same
    # probability; round(rand * (numFolds - 1)) gave the first and last fold
    # only half the share of the others.
    train_df = train_df.withColumn("fold", f.floor(f.rand(seed=42) * numFolds).cast("int"))
    train_df.write.format('parquet').mode('overwrite').save(train_path)
    t2 = timer()
    print("split to train took %.3f" % (t2 - t1))

    # 1.3 filter the test range and save; the upper bound is inclusive
    # because test_end is the maximum timestamp itself, so '<' would drop
    # the rows carrying it.
    t1 = timer()
    test_df = df.filter(
        (f.col('tweet_timestamp') >= f.lit(test_start)) & (f.col('tweet_timestamp') <= f.lit(test_end)))
    test_df.write.format('parquet').mode('overwrite').save(test_path)
    t2 = timer()
    print("split to test took %.3f" % (t2 - t1))

    return (proc.spark.read.parquet(train_path),
            proc.spark.read.parquet(test_path))


def mergeFeaturesToTest(df, dict_dfs, te_test_dfs, y_mean_all_df, proc, output_name):
    """Apply the train-side dictionaries and encodings to the test data.

    Args:
        df: test Spark DataFrame with the ``tweet`` and ``mention`` columns.
        dict_dfs: list of ``{'col_name': ..., 'dict': DataFrame}`` entries;
            must contain the ``tweet`` and ``mention`` dictionaries. The
            ``mentioned_bucket_id`` / ``mentioned_count`` entries may already
            be present (as in the list returned by ``categorifyFeatures``).
            The list is not modified.
        te_test_dfs: encoding tables to merge, as produced by
            ``encodingFeatures`` / ``get_encoding_features_dicts``.
        y_mean_all_df: single-row DataFrame with one mean per target, used
            to fill nulls in the encoded columns.
        proc: pyrecdp ``DataProcessor`` that runs the operations.
        output_name: name handed to ``proc.transform`` for the result.

    Returns:
        The DataFrame returned by ``proc.transform``.
    """
    # categorify test data with train generated dictionary
    # 1. both mentioned_bucket_id and mentioned_count are looked up from the
    #    mention dictionary. Work on a copy so the caller's list is left
    #    untouched, and only add the two entries that are still missing.
    dict_dfs = list(dict_dfs)
    known_names = [entry['col_name'] for entry in dict_dfs]
    mention_dicts = [entry['dict'] for entry in dict_dfs if entry['col_name'] == 'mention']
    for mention_dict in mention_dicts:
        if 'mentioned_bucket_id' not in known_names:
            dict_dfs.append({'col_name': 'mentioned_bucket_id', 'dict': mention_dict})
            known_names.append('mentioned_bucket_id')
        if 'mentioned_count' not in known_names:
            dict_dfs.append({'col_name': 'mentioned_count', 'dict': mention_dict.drop('dict_col_id').withColumnRenamed('count', 'dict_col_id')})
            known_names.append('mentioned_count')
    op_feature_add = FeatureAdd({"mentioned_bucket_id": "f.col('mention')", "mentioned_count": "f.col('mention')"}, op='inline')

    # 2. categorify
    op_categorify_multiItems = Categorify([{'bucketized_tweet_word': 'tweet'}], dict_dfs=dict_dfs, doSplit=True, sep=' ')
    op_categorify_singleItem = Categorify(['mentioned_bucket_id', 'mentioned_count'], dict_dfs=dict_dfs)
    proc.reset_ops([op_feature_add, op_categorify_multiItems, op_categorify_singleItem])

    # 3. get most and second used bucketized_tweet_word
    op_feature_add_sorted_bucketized_tweet_word = FeatureAdd(
        cols={'sorted_bucketized_tweet_word': "f.expr('sortIntArrayByFrequency(bucketized_tweet_word)')"}, op='inline')
    op_feature_add_convert = FeatureAdd(
        cols={'most_used_word_bucket_id': "f.when(f.size(f.col('sorted_bucketized_tweet_word'))>0, f.col('sorted_bucketized_tweet_word').getItem(0)).otherwise(np.nan)",
             'second_used_word_bucket_id': "f.when(f.size(f.col('sorted_bucketized_tweet_word'))>1, f.col('sorted_bucketized_tweet_word').getItem(1)).otherwise(np.nan)"}, op='inline')
    proc.append_ops([op_feature_add_sorted_bucketized_tweet_word, op_feature_add_convert])

    # 4. merge dicts to original table
    op_merge_to_test = ModelMerge(te_test_dfs)
    proc.append_ops([op_merge_to_test])

    # 5. set null in encoding features to y_mean of the matching target
    #    ('TE_' also matches the 'GTE_' group encodings)
    y_mean_all = y_mean_all_df.collect()[0]
    for tgt in target_list:
        to_fill_list = [feature for feature in final_feature_list
                        if 'TE_' in feature and tgt in feature]
        proc.append_ops([FillNA(to_fill_list, y_mean_all[tgt])])

    # select features
    op_select = SelectFeature(target_list + final_feature_list)
    proc.append_ops([op_select])

    t1 = timer()
    df = proc.transform(df, name=output_name)
    t2 = timer()
    print("mergeFeaturesToTest took %.3f" % (t2 - t1))

    # The caller assigns the result, so hand the transformed data back
    # (previously the function implicitly returned None).
    return df


def get_encoding_features_dicts(proc):
    """Load the saved target means and per-feature encoding tables.

    Everything is read as parquet from
    ``<path_prefix>/<current_path>/<dicts_path>``: ``targets_mean`` plus one
    ``train/<name>`` and one ``test/<name>`` folder per feature, where
    ``<name>`` is ``TE_<column>`` for a single column and
    ``GTE_<col1>_<col2>`` for a column group.

    Args:
        proc: object providing ``spark``, ``path_prefix``, ``current_path``
            and ``dicts_path`` (a pyrecdp ``DataProcessor`` in this file).

    Returns:
        ``(te_train_dfs, te_test_dfs, y_mean_all_df)``; the first two are
        lists of ``{'col_name': ..., 'dict': DataFrame}`` entries. Train
        entries are keyed by ``['fold'] + columns``, test entries by the
        feature itself (a string or a list of column names).
    """
    targets = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']
    dicts_root = "%s/%s/%s" % (proc.path_prefix, proc.current_path, proc.dicts_path)
    y_mean_all_df = proc.spark.read.parquet("%s/targets_mean" % dicts_root)

    # plain strings are single-column encodings, lists are column groups
    features = [
        'engaged_with_user_id',
        'language',
        'dt_dow',
        'tweet_type',
        'most_used_word_bucket_id',
        'second_used_word_bucket_id',
        'mentioned_count',
        'mentioned_bucket_id',
        ['has_mention', 'engaging_user_id'],
        ['mentioned_count', 'engaging_user_id'],
        ['mentioned_bucket_id', 'engaging_user_id'],
        ['language', 'engaged_with_user_id'],
        ['language', 'engaging_user_id'],
        ['dt_dow', 'engaged_with_user_id'],
        ['dt_dow', 'engaging_user_id'],
        ['dt_hour', 'engaged_with_user_id'],
        ['dt_hour', 'engaging_user_id'],
        ['tweet_type', 'engaged_with_user_id'],
        ['tweet_type', 'engaging_user_id'],
    ]
    # targets that are skipped for the given single-column feature
    excludes = {
        'dt_dow': ['reply_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'],
        'tweet_type': ['like_timestamp', 'retweet_with_comment_timestamp'],
    }

    te_train_dfs = []
    te_test_dfs = []
    for c in features:
        is_group = isinstance(c, list)
        if str(c) in excludes:
            target_tmp = [tgt for tgt in targets if tgt not in excludes[c]]
        else:
            target_tmp = targets
        prefix = 'GTE_' + '_'.join(c) if is_group else f'TE_{c}'
        # the folder name stays empty when no target is left for the feature
        out_name = prefix if target_tmp else ""

        train_table = proc.spark.read.parquet("%s/train/%s" % (dicts_root, out_name))
        test_table = proc.spark.read.parquet("%s/test/%s" % (dicts_root, out_name))
        te_train_dfs.append({'col_name': ['fold'] + (c if is_group else [c]), 'dict': train_table})
        te_test_dfs.append({'col_name': c, 'dict': test_table})

    return (te_train_dfs, te_test_dfs, y_mean_all_df)



In [None]:
# Driver cell: runs the whole pipeline (decode -> split -> categorify ->
# encode -> merge features into test) against the folders configured below.
path_prefix = "hdfs://"
current_path = "/recsys2021_0608_example/"
original_folder = "/recsys2021_0608/"
dicts_folder = "recsys_dicts/"
recsysSchema = RecsysSchema()

##### 1. Start spark and initialize data processor #####
# Jar placed on the driver and executor class paths below.
# NOTE(review): presumably provides the 'sortIntArrayByFrequency' function
# used by the categorify steps - confirm.
scala_udf_jars = "/mnt/nvme2/chendi/BlueWhale/recdp/ScalaProcessUtils/target/recdp-scala-extensions-0.1.0-jar-with-dependencies.jar"

# t0 is not read anywhere in this cell.
t0 = timer()
spark = SparkSession.builder.master('yarn')\
    .appName("Recsys2021_data_process")\
    .config("spark.executor.memory", "20g")\
    .config("spark.executor.memoryOverhead", "10g")\
    .config("spark.sql.broadcastTimeout", "7200")\
    .config("spark.cleaner.periodicGC.interval", "10min")\
    .config("spark.executorEnv.HF_DATASETS_OFFLINE", "1")\
    .config("spark.executorEnv.TRANSFORMERS_OFFLINE", "1")\
    .config("spark.driver.extraClassPath", f"{scala_udf_jars}")\
    .config("spark.executor.extraClassPath", f"{scala_udf_jars}")\
    .getOrCreate()

# schema is not used below; the input is read from parquet.
schema = recsysSchema.toStructType()

# 1.1 create RecDP DataProcessor
# 1.2 prepare dataFrames
proc = DataProcessor(spark, path_prefix,
                     current_path=current_path, dicts_path=dicts_folder, shuffle_disk_capacity="1200GB")
df = spark.read.parquet(path_prefix + original_folder)
# Fix the misspelled 'enaging_' column names so they match the
# 'engaging_user_*' names used by the feature lists.
df = df.withColumnRenamed('enaging_user_following_count', 'engaging_user_following_count')
df = df.withColumnRenamed('enaging_user_is_verified', 'engaging_user_is_verified')

# # fast test, comment for full dataset
# df.sample(0.01).write.format("parquet").mode("overwrite").save("%s/sample_0_0_1" % current_path)
# df = spark.read.parquet("%s/sample_0_0_1" % current_path)

# ===============================================
# decode tweet_tokens and extract the base features
df = decodeBertTokenizerAndExtractFeatures(df, proc, output_name="decoded_with_extracted_features")

# ===============================================
# split by date into train (with fold ids) and test
df, test_df = splitByDate(df, proc, train_output="train", test_output="test", numFolds=5)

# ===============================================
# generate dictionaries from train and categorify the train data
df, dict_dfs = categorifyFeatures(df, proc, output_name="train_with_categorified_features", gen_dict=True, sampleRatio=1)

# ===============================================
# encoding features; only a 3% sample of train is written to the output
df, te_train_dfs, te_test_dfs, y_mean_all_df = encodingFeatures(df, proc, output_name="train_with_features_sample_0_0_3", gen_dict=True, sampleRatio=0.03)

# ===============================================
# adding features to test
### The code below prepares the inputs for running mergeFeaturesToTest on the test data separately ###
# test_df = spark.read.parquet("/recsys2021_0608_example/test")
# dict_names = ['tweet', 'mention']
# dict_dfs = [{'col_name': name, 'dict': spark.read.parquet(
#     "%s/%s/%s/%s" % (proc.path_prefix, proc.current_path, proc.dicts_path, name))} for name in dict_names]
# te_train_dfs, te_test_dfs, y_mean_all_df = get_encoding_features_dicts(proc)
##################################################################################
test_df = mergeFeaturesToTest(test_df, dict_dfs, te_test_dfs, y_mean_all_df, proc, output_name="test_with_features")


* per core memory size is 5.000 GB and shuffle_disk maximum capacity is 1200.000 GB
* None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
* BertTokenizer decode and feature extacting took 2992.285

======

* min_timestamp is 2021-02-04 08:00:00, max_timestamp is 2021-02-25 07:59:59, 20 days max is 2021-02-24 08:00:00
* {'train': (1612396800, 1613952000), 'test': (1613952000, 1614211199)}
* split to train took 595.243
* split to test took 311.072

======

* Generate Dictionary took 847.997

======

* bhj total threshold is 67.109 M rows, one bhj threshold is 30.000 M rows, flush_threshold is 960.000 GB
* ('tweet', DataFrame[dict_col: string, count: bigint, dict_col_id: int], 148528557)
* etstimated_to_shuffle_size for tweet is 123.897 GB, will do smj
* do smj to bucketized_tweet_word
* bhj total threshold is 67.109 M rows, one bhj threshold is 30.000 M rows, flush_threshold is 960.000 GB
* ('mentioned_bucket_id', DataFrame[dict_col: string, count: bigint, dict_col_id: int], 4891669)
* ('mentioned_count', DataFrame[dict_col: string, dict_col_id: bigint], 4891669)
* mentioned_bucket_id will do bhj
* mentioned_count will do bhj
* do bhj to mentioned_bucket_id
* do bhj to mentioned_count
* categorify and getMostAndSecondUsedWordBucketId took 3731.166

======

* generating target encoding for engaged_with_user_id upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 92.6 seconds
* generating target encoding for language upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 21.1 seconds
* generating target encoding for dt_dow upon ['retweet_timestamp'] took 11.7 seconds
* generating target encoding for tweet_type upon ['retweet_with_comment_timestamp', 'like_timestamp'] took 14.7 seconds
* generating target encoding for most_used_word_bucket_id upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 17.6 seconds
* generating target encoding for second_used_word_bucket_id upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 16.5 seconds
* generating target encoding for mentioned_count upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 20.6 seconds
* generating target encoding for mentioned_bucket_id upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 14.1 seconds
* generating target encoding for ['has_mention', 'engaging_user_id'] upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 120.0 seconds
* generating target encoding for ['mentioned_count', 'engaging_user_id'] upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 126.9 seconds
* generating target encoding for ['mentioned_bucket_id', 'engaging_user_id'] upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 119.3 seconds
* generating target encoding for ['language', 'engaged_with_user_id'] upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 126.6 seconds
* generating target encoding for ['language', 'engaging_user_id'] upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 144.0 seconds
* generating target encoding for ['dt_dow', 'engaged_with_user_id'] upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 136.2 seconds
* generating target encoding for ['dt_dow', 'engaging_user_id'] upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 157.3 seconds
* generating target encoding for ['dt_hour', 'engaged_with_user_id'] upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 153.2 seconds
* generating target encoding for ['dt_hour', 'engaging_user_id'] upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 177.4 seconds
* generating target encoding for ['tweet_type', 'engaged_with_user_id'] upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 118.2 seconds
* generating target encoding for ['tweet_type', 'engaging_user_id'] upon ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp'] took 141.2 seconds
* Generate encoding feature totally took 1738.177

======

* bhj total threshold is 67.109 M rows, one bhj threshold is 30.000 M rows, flush_threshold is 960.000 GB
* (['fold', 'tweet_type'], DataFrame[tweet_type: string, fold: double, TE_tweet_type_retweet_with_comment_timestamp: float, TE_tweet_type_like_timestamp: float], 18)
* (['fold', 'dt_dow'], DataFrame[dt_dow: int, fold: double, TE_dt_dow_retweet_timestamp: float], 42)
* (['fold', 'most_used_word_bucket_id'], DataFrame[most_used_word_bucket_id: string, fold: double, TE_most_used_word_bucket_id_reply_timestamp: float, TE_most_used_word_bucket_id_retweet_timestamp: float, TE_most_used_word_bucket_id_retweet_with_comment_timestamp: float, TE_most_used_word_bucket_id_like_timestamp: float], 120)
* (['fold', 'second_used_word_bucket_id'], DataFrame[second_used_word_bucket_id: string, fold: double, TE_second_used_word_bucket_id_reply_timestamp: float, TE_second_used_word_bucket_id_retweet_timestamp: float, TE_second_used_word_bucket_id_retweet_with_comment_timestamp: float, TE_second_used_word_bucket_id_like_timestamp: float], 126)
* (['fold', 'mentioned_bucket_id'], DataFrame[mentioned_bucket_id: int, fold: double, TE_mentioned_bucket_id_reply_timestamp: float, TE_mentioned_bucket_id_retweet_timestamp: float, TE_mentioned_bucket_id_retweet_with_comment_timestamp: float, TE_mentioned_bucket_id_like_timestamp: float], 132)
* (['fold', 'language'], DataFrame[language: string, fold: double, TE_language_reply_timestamp: float, TE_language_retweet_timestamp: float, TE_language_retweet_with_comment_timestamp: float, TE_language_like_timestamp: float], 396)
* (['fold', 'mentioned_count'], DataFrame[mentioned_count: bigint, fold: double, TE_mentioned_count_reply_timestamp: float, TE_mentioned_count_retweet_timestamp: float, TE_mentioned_count_retweet_with_comment_timestamp: float, TE_mentioned_count_like_timestamp: float], 25968)
* (['fold', 'engaged_with_user_id'], DataFrame[engaged_with_user_id: string, fold: double, TE_engaged_with_user_id_reply_timestamp: float, TE_engaged_with_user_id_retweet_timestamp: float, TE_engaged_with_user_id_retweet_with_comment_timestamp: float, TE_engaged_with_user_id_like_timestamp: float], 68843219)
* (['fold', 'language', 'engaged_with_user_id'], DataFrame[language: string, engaged_with_user_id: string, fold: double, GTE_language_engaged_with_user_id_reply_timestamp: float, GTE_language_engaged_with_user_id_retweet_timestamp: float, GTE_language_engaged_with_user_id_retweet_with_comment_timestamp: float, GTE_language_engaged_with_user_id_like_timestamp: float], 89264853)
* (['fold', 'tweet_type', 'engaged_with_user_id'], DataFrame[tweet_type: string, engaged_with_user_id: string, fold: double, GTE_tweet_type_engaged_with_user_id_reply_timestamp: float, GTE_tweet_type_engaged_with_user_id_retweet_timestamp: float, GTE_tweet_type_engaged_with_user_id_retweet_with_comment_timestamp: float, GTE_tweet_type_engaged_with_user_id_like_timestamp: float], 95344319)
* (['fold', 'mentioned_bucket_id', 'engaging_user_id'], DataFrame[mentioned_bucket_id: int, engaging_user_id: string, fold: double, GTE_mentioned_bucket_id_engaging_user_id_reply_timestamp: float, GTE_mentioned_bucket_id_engaging_user_id_retweet_timestamp: float, GTE_mentioned_bucket_id_engaging_user_id_retweet_with_comment_timestamp: float, GTE_mentioned_bucket_id_engaging_user_id_like_timestamp: float], 129179090)
* (['fold', 'has_mention', 'engaging_user_id'], DataFrame[has_mention: int, engaging_user_id: string, fold: double, GTE_has_mention_engaging_user_id_reply_timestamp: float, GTE_has_mention_engaging_user_id_retweet_timestamp: float, GTE_has_mention_engaging_user_id_retweet_with_comment_timestamp: float, GTE_has_mention_engaging_user_id_like_timestamp: float], 131959138)
* (['fold', 'language', 'engaging_user_id'], DataFrame[language: string, engaging_user_id: string, fold: double, GTE_language_engaging_user_id_reply_timestamp: float, GTE_language_engaging_user_id_retweet_timestamp: float, GTE_language_engaging_user_id_retweet_with_comment_timestamp: float, GTE_language_engaging_user_id_like_timestamp: float], 149779869)
* (['fold', 'dt_dow', 'engaged_with_user_id'], DataFrame[dt_dow: int, engaged_with_user_id: string, fold: double, GTE_dt_dow_engaged_with_user_id_reply_timestamp: float, GTE_dt_dow_engaged_with_user_id_retweet_timestamp: float, GTE_dt_dow_engaged_with_user_id_retweet_with_comment_timestamp: float, GTE_dt_dow_engaged_with_user_id_like_timestamp: float], 155971686)
* (['fold', 'tweet_type', 'engaging_user_id'], DataFrame[tweet_type: string, engaging_user_id: string, fold: double, GTE_tweet_type_engaging_user_id_reply_timestamp: float, GTE_tweet_type_engaging_user_id_retweet_timestamp: float, GTE_tweet_type_engaging_user_id_retweet_with_comment_timestamp: float, GTE_tweet_type_engaging_user_id_like_timestamp: float], 162333565)
* (['fold', 'mentioned_count', 'engaging_user_id'], DataFrame[mentioned_count: bigint, engaging_user_id: string, fold: double, GTE_mentioned_count_engaging_user_id_reply_timestamp: float, GTE_mentioned_count_engaging_user_id_retweet_timestamp: float, GTE_mentioned_count_engaging_user_id_retweet_with_comment_timestamp: float, GTE_mentioned_count_engaging_user_id_like_timestamp: float], 167086112)
* (['fold', 'dt_hour', 'engaged_with_user_id'], DataFrame[dt_hour: int, engaged_with_user_id: string, fold: double, GTE_dt_hour_engaged_with_user_id_reply_timestamp: float, GTE_dt_hour_engaged_with_user_id_retweet_timestamp: float, GTE_dt_hour_engaged_with_user_id_retweet_with_comment_timestamp: float, GTE_dt_hour_engaged_with_user_id_like_timestamp: float], 201562399)
* (['fold', 'dt_dow', 'engaging_user_id'], DataFrame[dt_dow: int, engaging_user_id: string, fold: double, GTE_dt_dow_engaging_user_id_reply_timestamp: float, GTE_dt_dow_engaging_user_id_retweet_timestamp: float, GTE_dt_dow_engaging_user_id_retweet_with_comment_timestamp: float, GTE_dt_dow_engaging_user_id_like_timestamp: float], 247354193)
* (['fold', 'dt_hour', 'engaging_user_id'], DataFrame[dt_hour: int, engaging_user_id: string, fold: double, GTE_dt_hour_engaging_user_id_reply_timestamp: float, GTE_dt_hour_engaging_user_id_retweet_timestamp: float, GTE_dt_hour_engaging_user_id_retweet_with_comment_timestamp: float, GTE_dt_hour_engaging_user_id_like_timestamp: float], 337378319)
* ['fold', 'tweet_type'] will do bhj
* ['fold', 'dt_dow'] will do bhj
* ['fold', 'most_used_word_bucket_id'] will do bhj
* ['fold', 'second_used_word_bucket_id'] will do bhj
* ['fold', 'mentioned_bucket_id'] will do bhj
* ['fold', 'language'] will do bhj
* ['fold', 'mentioned_count'] will do bhj
* etstimated_to_shuffle_size for ['fold', 'engaged_with_user_id'] is 3.328 GB, will do smj
* etstimated_to_shuffle_size for ['fold', 'language', 'engaged_with_user_id'] is 3.238 GB, will do smj
* etstimated_to_shuffle_size for ['fold', 'tweet_type', 'engaged_with_user_id'] is 3.148 GB, will do smj
* etstimated_to_shuffle_size for ['fold', 'mentioned_bucket_id', 'engaging_user_id'] is 3.058 GB, will do smj
* etstimated_to_shuffle_size for ['fold', 'has_mention', 'engaging_user_id'] is 2.968 GB, will do smj
* etstimated_to_shuffle_size for ['fold', 'language', 'engaging_user_id'] is 2.878 GB, will do smj
* etstimated_to_shuffle_size for ['fold', 'dt_dow', 'engaged_with_user_id'] is 2.788 GB, will do smj
* etstimated_to_shuffle_size for ['fold', 'tweet_type', 'engaging_user_id'] is 2.698 GB, will do smj
* etstimated_to_shuffle_size for ['fold', 'mentioned_count', 'engaging_user_id'] is 2.608 GB, will do smj
* etstimated_to_shuffle_size for ['fold', 'dt_hour', 'engaged_with_user_id'] is 2.518 GB, will do smj
* etstimated_to_shuffle_size for ['fold', 'dt_dow', 'engaging_user_id'] is 2.428 GB, will do smj
* etstimated_to_shuffle_size for ['fold', 'dt_hour', 'engaging_user_id'] is 2.338 GB, will do smj
* do bhj to ['fold', 'tweet_type']
* do bhj to ['fold', 'dt_dow']
* do bhj to ['fold', 'most_used_word_bucket_id']
* do bhj to ['fold', 'second_used_word_bucket_id']
* do bhj to ['fold', 'mentioned_bucket_id']
* do bhj to ['fold', 'language']
* do bhj to ['fold', 'mentioned_count']
* do smj to ['fold', 'engaged_with_user_id']
* do smj to ['fold', 'language', 'engaged_with_user_id']
* do smj to ['fold', 'tweet_type', 'engaged_with_user_id']
* do smj to ['fold', 'mentioned_bucket_id', 'engaging_user_id']
* do smj to ['fold', 'has_mention', 'engaging_user_id']
* do smj to ['fold', 'language', 'engaging_user_id']
* do smj to ['fold', 'dt_dow', 'engaged_with_user_id']
* do smj to ['fold', 'tweet_type', 'engaging_user_id']
* do smj to ['fold', 'mentioned_count', 'engaging_user_id']
* do smj to ['fold', 'dt_hour', 'engaged_with_user_id']
* do smj to ['fold', 'dt_dow', 'engaging_user_id']
* do smj to ['fold', 'dt_hour', 'engaging_user_id']
* encodingFeatures took 362.135

======

* bhj total threshold is 67.109 M rows, one bhj threshold is 30.000 M rows, flush_threshold is 960.000 GB
* ('tweet', DataFrame[dict_col: string, count: bigint, dict_col_id: int], 148528557)
* etstimated_to_shuffle_size for tweet is 20.076 GB, will do smj
* do smj to bucketized_tweet_word
* bhj total threshold is 67.109 M rows, one bhj threshold is 30.000 M rows, flush_threshold is 960.000 GB
* ('mentioned_bucket_id', DataFrame[dict_col: string, count: bigint, dict_col_id: int], 4891669)
* ('mentioned_count', DataFrame[dict_col: string, dict_col_id: bigint], 4891669)
* ('mentioned_bucket_id', DataFrame[dict_col: string, count: bigint, dict_col_id: int], 4891669)
* ('mentioned_count', DataFrame[dict_col: string, dict_col_id: bigint], 4891669)
* mentioned_bucket_id will do bhj
* mentioned_count will do bhj
* mentioned_bucket_id will do bhj
* mentioned_count will do bhj
* do bhj to mentioned_bucket_id
* do bhj to mentioned_count
* do bhj to mentioned_bucket_id
* do bhj to mentioned_count
* bhj total threshold is 67.109 M rows, one bhj threshold is 30.000 M rows, flush_threshold is 960.000 GB
* ('tweet_type', DataFrame[tweet_type: string, TE_tweet_type_retweet_with_comment_timestamp: float, TE_tweet_type_like_timestamp: float], 3)
* ('dt_dow', DataFrame[dt_dow: int, TE_dt_dow_retweet_timestamp: float], 7)
* ('most_used_word_bucket_id', DataFrame[most_used_word_bucket_id: string, TE_most_used_word_bucket_id_reply_timestamp: float, TE_most_used_word_bucket_id_retweet_timestamp: float, TE_most_used_word_bucket_id_retweet_with_comment_timestamp: float, TE_most_used_word_bucket_id_like_timestamp: float], 20)
* ('second_used_word_bucket_id', DataFrame[second_used_word_bucket_id: string, TE_second_used_word_bucket_id_reply_timestamp: float, TE_second_used_word_bucket_id_retweet_timestamp: float, TE_second_used_word_bucket_id_retweet_with_comment_timestamp: float, TE_second_used_word_bucket_id_like_timestamp: float], 21)
* ('mentioned_bucket_id', DataFrame[mentioned_bucket_id: int, TE_mentioned_bucket_id_reply_timestamp: float, TE_mentioned_bucket_id_retweet_timestamp: float, TE_mentioned_bucket_id_retweet_with_comment_timestamp: float, TE_mentioned_bucket_id_like_timestamp: float], 22)
* ('language', DataFrame[language: string, TE_language_reply_timestamp: float, TE_language_retweet_timestamp: float, TE_language_retweet_with_comment_timestamp: float, TE_language_like_timestamp: float], 66)
* ('mentioned_count', DataFrame[mentioned_count: bigint, TE_mentioned_count_reply_timestamp: float, TE_mentioned_count_retweet_timestamp: float, TE_mentioned_count_retweet_with_comment_timestamp: float, TE_mentioned_count_like_timestamp: float], 4328)
* ('engaged_with_user_id', DataFrame[engaged_with_user_id: string, TE_engaged_with_user_id_reply_timestamp: float, TE_engaged_with_user_id_retweet_timestamp: float, TE_engaged_with_user_id_retweet_with_comment_timestamp: float, TE_engaged_with_user_id_like_timestamp: float], 22887349)
* (['language', 'engaged_with_user_id'], DataFrame[language: string, engaged_with_user_id: string, GTE_language_engaged_with_user_id_reply_timestamp: float, GTE_language_engaged_with_user_id_retweet_timestamp: float, GTE_language_engaged_with_user_id_retweet_with_comment_timestamp: float, GTE_language_engaged_with_user_id_like_timestamp: float], 35483505)
* (['tweet_type', 'engaged_with_user_id'], DataFrame[tweet_type: string, engaged_with_user_id: string, GTE_tweet_type_engaged_with_user_id_reply_timestamp: float, GTE_tweet_type_engaged_with_user_id_retweet_timestamp: float, GTE_tweet_type_engaged_with_user_id_retweet_with_comment_timestamp: float, GTE_tweet_type_engaged_with_user_id_like_timestamp: float], 36043484)
* (['has_mention', 'engaging_user_id'], DataFrame[has_mention: int, engaging_user_id: string, GTE_has_mention_engaging_user_id_reply_timestamp: float, GTE_has_mention_engaging_user_id_retweet_timestamp: float, GTE_has_mention_engaging_user_id_retweet_with_comment_timestamp: float, GTE_has_mention_engaging_user_id_like_timestamp: float], 47561701)
* (['mentioned_bucket_id', 'engaging_user_id'], DataFrame[mentioned_bucket_id: int, engaging_user_id: string, GTE_mentioned_bucket_id_engaging_user_id_reply_timestamp: float, GTE_mentioned_bucket_id_engaging_user_id_retweet_timestamp: float, GTE_mentioned_bucket_id_engaging_user_id_retweet_with_comment_timestamp: float, GTE_mentioned_bucket_id_engaging_user_id_like_timestamp: float], 52816686)
* (['language', 'engaging_user_id'], DataFrame[language: string, engaging_user_id: string, GTE_language_engaging_user_id_reply_timestamp: float, GTE_language_engaging_user_id_retweet_timestamp: float, GTE_language_engaging_user_id_retweet_with_comment_timestamp: float, GTE_language_engaging_user_id_like_timestamp: float], 62810211)
* (['tweet_type', 'engaging_user_id'], DataFrame[tweet_type: string, engaging_user_id: string, GTE_tweet_type_engaging_user_id_reply_timestamp: float, GTE_tweet_type_engaging_user_id_retweet_timestamp: float, GTE_tweet_type_engaging_user_id_retweet_with_comment_timestamp: float, GTE_tweet_type_engaging_user_id_like_timestamp: float], 63468214)
* (['dt_dow', 'engaged_with_user_id'], DataFrame[dt_dow: int, engaged_with_user_id: string, GTE_dt_dow_engaged_with_user_id_reply_timestamp: float, GTE_dt_dow_engaged_with_user_id_retweet_timestamp: float, GTE_dt_dow_engaged_with_user_id_retweet_with_comment_timestamp: float, GTE_dt_dow_engaged_with_user_id_like_timestamp: float], 69049586)
* (['mentioned_count', 'engaging_user_id'], DataFrame[mentioned_count: bigint, engaging_user_id: string, GTE_mentioned_count_engaging_user_id_reply_timestamp: float, GTE_mentioned_count_engaging_user_id_retweet_timestamp: float, GTE_mentioned_count_engaging_user_id_retweet_with_comment_timestamp: float, GTE_mentioned_count_engaging_user_id_like_timestamp: float], 92173751)
* (['dt_hour', 'engaged_with_user_id'], DataFrame[dt_hour: int, engaged_with_user_id: string, GTE_dt_hour_engaged_with_user_id_reply_timestamp: float, GTE_dt_hour_engaged_with_user_id_retweet_timestamp: float, GTE_dt_hour_engaged_with_user_id_retweet_with_comment_timestamp: float, GTE_dt_hour_engaged_with_user_id_like_timestamp: float], 102369370)
* (['dt_dow', 'engaging_user_id'], DataFrame[dt_dow: int, engaging_user_id: string, GTE_dt_dow_engaging_user_id_reply_timestamp: float, GTE_dt_dow_engaging_user_id_retweet_timestamp: float, GTE_dt_dow_engaging_user_id_retweet_with_comment_timestamp: float, GTE_dt_dow_engaging_user_id_like_timestamp: float], 116332109)
* (['dt_hour', 'engaging_user_id'], DataFrame[dt_hour: int, engaging_user_id: string, GTE_dt_hour_engaging_user_id_reply_timestamp: float, GTE_dt_hour_engaging_user_id_retweet_timestamp: float, GTE_dt_hour_engaging_user_id_retweet_with_comment_timestamp: float, GTE_dt_hour_engaging_user_id_like_timestamp: float], 192785849)
* tweet_type will do bhj
* dt_dow will do bhj
* most_used_word_bucket_id will do bhj
* second_used_word_bucket_id will do bhj
* mentioned_bucket_id will do bhj
* language will do bhj
* mentioned_count will do bhj
* engaged_with_user_id will do bhj
* etstimated_to_shuffle_size for ['language', 'engaged_with_user_id'] is 17.399 GB, will do smj
* etstimated_to_shuffle_size for ['tweet_type', 'engaged_with_user_id'] is 16.897 GB, will do smj
* etstimated_to_shuffle_size for ['has_mention', 'engaging_user_id'] is 16.396 GB, will do smj
* etstimated_to_shuffle_size for ['mentioned_bucket_id', 'engaging_user_id'] is 15.894 GB, will do smj
* etstimated_to_shuffle_size for ['language', 'engaging_user_id'] is 15.392 GB, will do smj
* etstimated_to_shuffle_size for ['tweet_type', 'engaging_user_id'] is 14.890 GB, will do smj
* etstimated_to_shuffle_size for ['dt_dow', 'engaged_with_user_id'] is 14.388 GB, will do smj
* etstimated_to_shuffle_size for ['mentioned_count', 'engaging_user_id'] is 13.886 GB, will do smj
* etstimated_to_shuffle_size for ['dt_hour', 'engaged_with_user_id'] is 13.384 GB, will do smj
* etstimated_to_shuffle_size for ['dt_dow', 'engaging_user_id'] is 12.882 GB, will do smj
* etstimated_to_shuffle_size for ['dt_hour', 'engaging_user_id'] is 12.380 GB, will do smj
* do bhj to tweet_type
* do bhj to dt_dow
* do bhj to most_used_word_bucket_id
* do bhj to second_used_word_bucket_id
* do bhj to mentioned_bucket_id
* do bhj to language
* do bhj to mentioned_count
* do bhj to engaged_with_user_id
* do smj to ['language', 'engaged_with_user_id']
* do smj to ['tweet_type', 'engaged_with_user_id']
* do smj to ['has_mention', 'engaging_user_id']
* do smj to ['mentioned_bucket_id', 'engaging_user_id']
* do smj to ['language', 'engaging_user_id']
* do smj to ['tweet_type', 'engaging_user_id']
* do smj to ['dt_dow', 'engaged_with_user_id']
* do smj to ['mentioned_count', 'engaging_user_id']
* do smj to ['dt_hour', 'engaged_with_user_id']
* do smj to ['dt_dow', 'engaging_user_id']
* do smj to ['dt_hour', 'engaging_user_id']
* mergeFeaturesToTest took 1259.446

======
