In [16]:
# Stop the SparkSession left over from an earlier run so that the next cell
# can build a fresh one. On a freshly started kernel `spark` has not been
# created yet; a missing name simply means there is nothing to stop.
try:
    spark.stop()
except NameError:
    pass

In [17]:
import init

import findspark
findspark.init()

import os
import pandas as pd
import numpy as np
from pyspark.sql import *
from pyspark import *
import pyspark.sql.functions as f
from pyspark.sql.types import *
import time
from pyrecdp.data_processor import *
from pyrecdp.encoder import *
from pyrecdp.utils import *

# Create (or re-attach to) the Spark session, running against the YARN master.
spark = (
    SparkSession.builder
    .master('yarn')
    .appName("Recsys2021_data_process")
    .getOrCreate()
)

# Location settings for the processed data set.
current_path = "/recsys2021_0608_processed/sample_0_3_20days/"
path_prefix = "hdfs://"
dicts_folder = "recsys_dicts/"

In [None]:
# Load one lookup-table parquet from local disk, then print its schema and
# its first rows (one field per line, values cut at 50 characters).
lookup_tbl = spark.read.parquet(
    "file:///mnt/nvme2/chendi/BlueWhale/EricFeatures/CPD_hashtags_engaging_user_id_fold.parquet"
)
lookup_tbl.printSchema()
lookup_tbl.show(truncate=50, vertical=True)

In [5]:
# Down-sample the "0.15" training set (fraction 0.66, fixed seed 3) and store
# the result next to it under the "0.1" name.
train = spark.read.parquet(
    "file:///mnt/nvme2/chendi/BlueWhale/sample_0_3/stage4_20days/add_chendi_feature/rsc_0622_best/train_with_features_0.15"
)
(
    train
    .sample(fraction=0.66, seed=3)
    .write
    .format('parquet')
    .save("file:///mnt/nvme2/chendi/BlueWhale/sample_0_3/stage4_20days/add_chendi_feature/rsc_0622_best/train_with_features_0.1")
)


In [19]:
from collections import *
from transformers import BertTokenizer

# Column layout of the raw validation dump, in file order (the CSV reader
# below applies this schema positionally).
# NOTE(review): FloatType is a 32-bit float. If the *_timestamp columns hold
# Unix epoch seconds they cannot be represented exactly at that width —
# confirm whether a wider type is needed before relying on their values.
dtypes = {
    'text_tokens': StringType(),
    'hashtags': StringType(),
    'tweet_id': StringType(),
    'present_media': StringType(),
    'present_links': StringType(),
    'present_domains': StringType(),
    'tweet_type': StringType(),
    'language': StringType(),
    'tweet_timestamp': FloatType(),
    'engaged_with_user_id': StringType(),
    'engaged_with_user_follower_count': IntegerType(),
    'engaged_with_user_following_count': IntegerType(),
    'engaged_with_user_is_verified': BooleanType(),
    'engaged_with_user_account_creation': IntegerType(),
    'engaging_user_id': StringType(),
    'engaging_user_follower_count': IntegerType(),
    'engaging_user_following_count': IntegerType(),
    'engaging_user_is_verified': BooleanType(),
    'engaging_user_account_creation': IntegerType(),
    'engagee_follows_engager': BooleanType(),
    'reply_timestamp': FloatType(),
    'retweet_timestamp': FloatType(),
    'retweet_with_comment_timestamp': FloatType(),
    'like_timestamp': FloatType()
}

# Build the explicit read schema; every column is declared nullable.
schema = StructType()
for name, dtype in dtypes.items():
    schema = schema.add(name, dtype, True)

# Read the \x01-separated dump and keep about 1% of its rows. The seed is
# pinned (3, the value the other sampling cells use) so that a re-run
# reproduces the same sample instead of a different one each time.
test = "file:///mnt/nvme2/chendi/BlueWhale/valid"
df = spark.read.format("csv").option("sep", "\x01").schema(schema).load(test)
df = df.sample(fraction=0.01, seed=3)

tokenizer = BertTokenizer.from_pretrained(
        'bert-base-multilingual-cased', do_lower_case=False)

# `text_tokens` is split on tabs and every piece is parsed as an integer token
# id before decoding. A null or empty cell yields null instead of crashing the
# whole job inside the UDF.
tokenizer_decode = f.udf(lambda x: None if not x else tokenizer.decode(
    [int(n) for n in x.split('\t')]))
# Undo the spacing the decoder leaves inside t.co links and after '@'.
format_url = f.udf(lambda x: None if x is None else x.replace(
    'https : / / t. co / ', 'https://t.co/').replace('@ ', '@'))

df = df.withColumn('tweet', format_url(tokenizer_decode(f.col('text_tokens'))))
#df = df.drop('reply_timestamp').drop('retweet_timestamp').drop('retweet_with_comment_timestamp').drop('like_timestamp')
df.write.format('parquet').mode('overwrite').save("file:///mnt/nvme2/chendi/BlueWhale/docker/valid_0.01.parquet")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [15]:
import pyspark.sql.functions as F
# Columns kept from the feature table, in output order.
to_select = [
    # --- fold, ids, raw tweet / user columns and engagement timestamps ---
    'fold',
    'mentioned_bucket_id',
    'engaging_user_id',
    'mentioned_count',
    'has_mention',
    'second_used_word_bucket_id',
    'most_used_word_bucket_id',
    'mention',
    'engaged_with_user_id',
    'language',
    'has_rt',
    'tweet_type',
    'tweet_hour',
    'tweet_dow',
    'tweet_timestamp',
    'engaged_with_user_follower_count',
    'engaged_with_user_following_count',
    'engaged_with_user_is_verified',
    'engaged_with_user_account_creation',
    'engaging_user_follower_count',
    'engaging_user_following_count',
    'engaging_user_is_verified',
    'engaging_user_account_creation',
    'engagee_follows_engager',
    'reply_timestamp',
    'retweet_timestamp',
    'retweet_with_comment_timestamp',
    'like_timestamp',
    'present_domains',
    'present_links',
    'hashtags',
    'tweet_id',
    'present_media',
    'dt_dow',
    'dt_hour',
    # 'a_ff_rate' and 'b_ff_rate' are deliberately not selected here.
    #'a_ff_rate',
    #'b_ff_rate',

    # --- TE_*_indicator columns ---
    'TE_language_reply_timestamp_indicator',
    'TE_language_retweet_timestamp_indicator',
    'TE_language_retweet_with_comment_timestamp_indicator',
    'TE_language_like_timestamp_indicator',
    'TE_tweet_dow_reply_timestamp_indicator',
    'TE_tweet_dow_retweet_timestamp_indicator',
    'TE_tweet_dow_retweet_with_comment_timestamp_indicator',
    'TE_tweet_dow_like_timestamp_indicator',
    'TE_tweet_type_reply_timestamp_indicator',
    'TE_tweet_type_retweet_timestamp_indicator',
    'TE_tweet_type_retweet_with_comment_timestamp_indicator',
    'TE_tweet_type_like_timestamp_indicator',
    'TE_engaged_with_user_id_reply_timestamp_indicator',
    'TE_engaged_with_user_id_retweet_timestamp_indicator',
    'TE_engaged_with_user_id_retweet_with_comment_timestamp_indicator',
    'TE_engaged_with_user_id_like_timestamp_indicator',
    'TE_engaging_user_id_reply_timestamp_indicator',
    'TE_engaging_user_id_retweet_timestamp_indicator',
    'TE_engaging_user_id_retweet_with_comment_timestamp_indicator',
    'TE_engaging_user_id_like_timestamp_indicator',

    # --- GTE_* columns ---
    'GTE_engaged_with_user_id_reply_timestamp_indicator_engaging_user_id',
    'GTE_engaged_with_user_id_retweet_timestamp_indicator_engaging_user_id',
    'GTE_engaged_with_user_id_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_engaged_with_user_id_like_timestamp_indicator_engaging_user_id',
    'GTE_language_reply_timestamp_indicator_engaging_user_id',
    'GTE_language_retweet_timestamp_indicator_engaging_user_id',
    'GTE_language_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_language_like_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_reply_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_retweet_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_like_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_reply_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_retweet_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_like_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_dow_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_dow_like_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_like_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_like_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_reply_timestamp_indicator_engaging_user_id',
    'GTE_tweet_type_retweet_timestamp_indicator_engaging_user_id',
    'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_tweet_type_like_timestamp_indicator_engaging_user_id',
    'GTE_has_rt_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_has_rt_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_has_rt_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_has_rt_like_timestamp_indicator_engaged_with_user_id',
    'GTE_language_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_language_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_language_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_language_like_timestamp_indicator_engaged_with_user_id',

    # --- media / link flags ---
    'has_photo',
    'has_video',
    'has_gif',
    'has_links',

    # --- TE_* columns for the word-bucket and mention features ---
    'TE_most_used_word_bucket_id_reply_timestamp',
    'TE_most_used_word_bucket_id_retweet_timestamp',
    'TE_most_used_word_bucket_id_retweet_with_comment_timestamp',
    'TE_most_used_word_bucket_id_like_timestamp',
    'TE_second_used_word_bucket_id_reply_timestamp',
    'TE_second_used_word_bucket_id_retweet_timestamp',
    'TE_second_used_word_bucket_id_retweet_with_comment_timestamp',
    'TE_second_used_word_bucket_id_like_timestamp',
    'TE_mentioned_count_reply_timestamp',
    'TE_mentioned_count_retweet_timestamp',
    'TE_mentioned_count_retweet_with_comment_timestamp',
    'TE_mentioned_count_like_timestamp',
    'TE_mentioned_bucket_id_reply_timestamp',
    'TE_mentioned_bucket_id_retweet_timestamp',
    'TE_mentioned_bucket_id_retweet_with_comment_timestamp',
    'TE_mentioned_bucket_id_like_timestamp',
    'TE_has_mention_engaging_user_id_reply_timestamp',
    'TE_has_mention_engaging_user_id_retweet_timestamp',
    'TE_has_mention_engaging_user_id_retweet_with_comment_timestamp',
    'TE_has_mention_engaging_user_id_like_timestamp',
    'TE_mentioned_count_engaging_user_id_reply_timestamp',
    'TE_mentioned_count_engaging_user_id_retweet_timestamp',
    'TE_mentioned_count_engaging_user_id_retweet_with_comment_timestamp',
    'TE_mentioned_count_engaging_user_id_like_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_reply_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_retweet_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_retweet_with_comment_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_like_timestamp',
]


# Read the "chendi_0.15" training features, keep only the `to_select` columns
# and append the two following/follower ratios.
train = spark.read.parquet(
    "file:///mnt/nvme2/chendi/BlueWhale/sample_0_3/stage4_20days/add_chendi_feature/train_with_features_chendi_0.15"
)
train = (
    train
    .select(*to_select)
    .withColumn(
        'a_ff_rate',
        F.col("engaged_with_user_following_count") / F.col("engaged_with_user_follower_count"),
    )
    .withColumn(
        'b_ff_rate',
        F.col("engaging_user_following_count") / F.col("engaging_user_follower_count"),
    )
)

# Persist a 20% sample (fixed seed 3), replacing any previous output.
(
    train
    .sample(fraction=0.2, seed=3)
    .write
    .format('parquet')
    .mode('overwrite')
    .save("file:///mnt/nvme2/chendi/BlueWhale/sample_0_3/stage4_20days/add_chendi_feature/train_with_features_fold_0.03")
)

#valid = spark.read.parquet("file:///mnt/nvme2/chendi/BlueWhale/sample_0_3/stage4_20days/add_chendi_feature/validate_with_features_chendi_te_2")
#columns = valid.select(*to_select).columns
#\
#    .withColumn('a_ff_rate', F.col("engaged_with_user_following_count")/F.col("engaged_with_user_follower_count"))\
#    .withColumn('b_ff_rate', F.col("engaging_user_following_count") /F.col("engaging_user_follower_count"))
#valid.write.format('parquet').save("file:///mnt/nvme2/chendi/BlueWhale/sample_0_3/stage4_20days/add_chendi_feature/train_with_features_0.15")

In [16]:
# Print every entry of `feature_list` that `columns` does not contain.
# NOTE(review): no live cell in the visible notebook assigns `columns` (the
# only assignment sits in commented-out code) and `feature_list` is defined in
# a later cell — confirm the intended run order.
for feature in feature_list:
    if feature in columns:
        continue
    print(feature)

In [15]:
# Feature columns of interest, in a fixed order.
feature_list = [
    # --- plain (non-encoded) features ---
    'has_photo',
    'has_video',
    'has_gif',
    'a_ff_rate',
    'b_ff_rate',
    'dt_dow',
    'dt_hour',
    'engaged_with_user_follower_count',
    'engaged_with_user_following_count',
    'engaging_user_follower_count',
    'engaging_user_following_count',
    'engaged_with_user_is_verified',
    'engaging_user_is_verified',

    # --- GTE_* columns ---
    'GTE_language_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_language_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_language_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_language_like_timestamp_indicator_engaged_with_user_id',
    'GTE_language_reply_timestamp_indicator_engaging_user_id',
    'GTE_language_retweet_timestamp_indicator_engaging_user_id',
    'GTE_language_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_language_like_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_dow_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_dow_like_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_dow_reply_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_retweet_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_like_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_like_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_reply_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_retweet_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_like_timestamp_indicator_engaging_user_id',
    'GTE_tweet_type_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_like_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_reply_timestamp_indicator_engaging_user_id',
    'GTE_tweet_type_retweet_timestamp_indicator_engaging_user_id',
    'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_tweet_type_like_timestamp_indicator_engaging_user_id',

    # --- TE_*_indicator columns ---
    'TE_engaged_with_user_id_reply_timestamp_indicator',
    'TE_engaged_with_user_id_retweet_timestamp_indicator',
    'TE_engaged_with_user_id_retweet_with_comment_timestamp_indicator',
    'TE_engaged_with_user_id_like_timestamp_indicator',
    'TE_language_reply_timestamp_indicator',
    'TE_language_retweet_timestamp_indicator',
    'TE_language_retweet_with_comment_timestamp_indicator',
    'TE_language_like_timestamp_indicator',
    'TE_tweet_dow_retweet_timestamp_indicator',
    'TE_tweet_type_reply_timestamp_indicator',
    'TE_tweet_type_retweet_timestamp_indicator',

    # --- word-bucket / mention features and their TE_* columns ---
    'mentioned_bucket_id',
    'mentioned_count',
    'second_used_word_bucket_id',
    'most_used_word_bucket_id',
    'has_mention',
    'TE_most_used_word_bucket_id_reply_timestamp',
    'TE_most_used_word_bucket_id_retweet_timestamp',
    'TE_most_used_word_bucket_id_retweet_with_comment_timestamp',
    'TE_most_used_word_bucket_id_like_timestamp',
    'TE_second_used_word_bucket_id_reply_timestamp',
    'TE_second_used_word_bucket_id_retweet_timestamp',
    'TE_second_used_word_bucket_id_retweet_with_comment_timestamp',
    'TE_second_used_word_bucket_id_like_timestamp',
    'TE_mentioned_count_reply_timestamp',
    'TE_mentioned_count_retweet_timestamp',
    'TE_mentioned_count_retweet_with_comment_timestamp',
    'TE_mentioned_count_like_timestamp',
    'TE_mentioned_bucket_id_reply_timestamp',
    'TE_mentioned_bucket_id_retweet_timestamp',
    'TE_mentioned_bucket_id_retweet_with_comment_timestamp',
    'TE_mentioned_bucket_id_like_timestamp',
    'TE_has_mention_engaging_user_id_reply_timestamp',
    'TE_has_mention_engaging_user_id_retweet_timestamp',
    'TE_has_mention_engaging_user_id_retweet_with_comment_timestamp',
    'TE_has_mention_engaging_user_id_like_timestamp',
    'TE_mentioned_count_engaging_user_id_reply_timestamp',
    'TE_mentioned_count_engaging_user_id_retweet_timestamp',
    'TE_mentioned_count_engaging_user_id_retweet_with_comment_timestamp',
    'TE_mentioned_count_engaging_user_id_like_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_reply_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_retweet_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_retweet_with_comment_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_like_timestamp',
]


In [59]:
# Count the validation rows whose `has_mention` equals 1.
valid = spark.read.parquet(
    "/recsys2021_0608_processed/sample_0_3_20days_eric_features/validate_with_features_chendi_te"
)
(
    valid
    .select('has_mention')
    .filter('has_mention == 1')
    .count()
)

2024666

In [60]:
# Display the `tweet` column of the validation set without truncating values.
valid = spark.read.parquet(
    "/recsys2021_0608_processed/sample_0_3_20days_eric_features/validate_with_features_chendi_te"
)
(
    valid
    .select('tweet')
    .show(truncate=False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|tweet                                                                                                                                                                                                                                                                                                                                                          |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
#valid = spark.read.parquet("/recsys2021_0608_processed/sample_0_3_20days_eric_features/validate_with_features_chendi_te_2")
#valid.printSchema()
# Load the mention / bucket-id dictionary and print its schema.
word_dict_df = spark.read.parquet("file:///mnt/nvme2/chendi/BlueWhale/docker/data/mention_with_bucketid")
# The statement here used to end at the dot (`word_dict_df.`), which is a
# SyntaxError. The output recorded for this cell is a schema print, so the
# call is completed as printSchema().
word_dict_df.printSchema()
#word_dict_df.drop('dict_col_id').drop('count').withColumn('bucket_id', f.col('bucket_id').cast(ByteType())).write.format('parquet').mode("overwrite").save("file:///mnt/nvme2/chendi/BlueWhale/docker/data/tweet_word_bucketid.parquet")

root
 |-- mention: string (nullable = true)
 |-- count: long (nullable = true)
 |-- bucket_id: double (nullable = true)



In [32]:
# Feature columns of interest, in a fixed order.
feature_list = [
    # --- plain (non-encoded) features ---
    'has_photo',
    'has_video',
    'has_gif',
    'a_ff_rate',
    'b_ff_rate',
    'dt_dow',
    'dt_hour',
    'engaged_with_user_follower_count',
    'engaged_with_user_following_count',
    'engaging_user_follower_count',
    'engaging_user_following_count',
    'engaged_with_user_is_verified',
    'engaging_user_is_verified',

    # --- GTE_* columns ---
    'GTE_language_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_language_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_language_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_language_like_timestamp_indicator_engaged_with_user_id',
    'GTE_language_reply_timestamp_indicator_engaging_user_id',
    'GTE_language_retweet_timestamp_indicator_engaging_user_id',
    'GTE_language_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_language_like_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_dow_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_dow_like_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_dow_reply_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_retweet_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_tweet_dow_like_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_like_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_hour_reply_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_retweet_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_tweet_hour_like_timestamp_indicator_engaging_user_id',
    'GTE_tweet_type_reply_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_retweet_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_like_timestamp_indicator_engaged_with_user_id',
    'GTE_tweet_type_reply_timestamp_indicator_engaging_user_id',
    'GTE_tweet_type_retweet_timestamp_indicator_engaging_user_id',
    'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaging_user_id',
    'GTE_tweet_type_like_timestamp_indicator_engaging_user_id',

    # --- TE_*_indicator columns ---
    'TE_engaged_with_user_id_reply_timestamp_indicator',
    'TE_engaged_with_user_id_retweet_timestamp_indicator',
    'TE_engaged_with_user_id_retweet_with_comment_timestamp_indicator',
    'TE_engaged_with_user_id_like_timestamp_indicator',
    'TE_language_reply_timestamp_indicator',
    'TE_language_retweet_timestamp_indicator',
    'TE_language_retweet_with_comment_timestamp_indicator',
    'TE_language_like_timestamp_indicator',
    'TE_tweet_dow_retweet_timestamp_indicator',
    'TE_tweet_type_reply_timestamp_indicator',
    'TE_tweet_type_retweet_timestamp_indicator',

    # --- word-bucket / mention features and their TE_* columns ---
    'mentioned_bucket_id',
    'mentioned_count',
    'second_used_word_bucket_id',
    'most_used_word_bucket_id',
    'has_mention',
    'TE_most_used_word_bucket_id_reply_timestamp',
    'TE_most_used_word_bucket_id_retweet_timestamp',
    'TE_most_used_word_bucket_id_retweet_with_comment_timestamp',
    'TE_most_used_word_bucket_id_like_timestamp',
    'TE_second_used_word_bucket_id_reply_timestamp',
    'TE_second_used_word_bucket_id_retweet_timestamp',
    'TE_second_used_word_bucket_id_retweet_with_comment_timestamp',
    'TE_second_used_word_bucket_id_like_timestamp',
    'TE_mentioned_count_reply_timestamp',
    'TE_mentioned_count_retweet_timestamp',
    'TE_mentioned_count_retweet_with_comment_timestamp',
    'TE_mentioned_count_like_timestamp',
    'TE_mentioned_bucket_id_reply_timestamp',
    'TE_mentioned_bucket_id_retweet_timestamp',
    'TE_mentioned_bucket_id_retweet_with_comment_timestamp',
    'TE_mentioned_bucket_id_like_timestamp',
    'TE_has_mention_engaging_user_id_reply_timestamp',
    'TE_has_mention_engaging_user_id_retweet_timestamp',
    'TE_has_mention_engaging_user_id_retweet_with_comment_timestamp',
    'TE_has_mention_engaging_user_id_like_timestamp',
    'TE_mentioned_count_engaging_user_id_reply_timestamp',
    'TE_mentioned_count_engaging_user_id_retweet_timestamp',
    'TE_mentioned_count_engaging_user_id_retweet_with_comment_timestamp',
    'TE_mentioned_count_engaging_user_id_like_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_reply_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_retweet_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_retweet_with_comment_timestamp',
    'TE_mentioned_bucket_id_engaging_user_id_like_timestamp',
]


In [44]:
# Directory holding the lookup tables scanned below; this replaces whatever
# value `current_path` held before.
current_path = "file:///mnt/nvme2/chendi/BlueWhale/docker/data/valid_lookup"

# Lookup-table directory names under `current_path`, one parquet data set each.
file_list = [
    "lookup_table_engaged_with_user_id_fold",
    "lookup_table_engaged_with_user_id_fold_engaging_user_id",
    "lookup_table_engaged_with_user_id_fold_has_rt",
    "lookup_table_engaged_with_user_id_fold_language",
    "lookup_table_engaged_with_user_id_fold_tweet_dow",
    "lookup_table_engaged_with_user_id_fold_tweet_hour",
    "lookup_table_engaged_with_user_id_fold_tweet_type",
    "lookup_table_fold_engaging_user_id",
    "lookup_table_fold_engaging_user_id_tweet_hour",
    "lookup_table_fold_language",
    "lookup_table_fold_language_engaging_user_id",
    "lookup_table_fold_tweet_dow",
    "lookup_table_fold_tweet_dow_engaging_user_id",
    "lookup_table_fold_tweet_type",
    "lookup_table_fold_tweet_type_engaging_user_id",
    "lookup_table_has_mention_engaging_user_id",
    "lookup_table_mentioned_bucket_id",
    "lookup_table_mentioned_bucket_id_engaging_user_id",
    "lookup_table_mentioned_count",
    "lookup_table_mentioned_count_engaging_user_id",
    "lookup_table_most_used_word_bucket_id",
    "lookup_table_original_engaged_with_user_id",
    "lookup_table_original_engaged_with_user_id_original_tweet_type_original_language",
    "lookup_table_original_engaging_user_id",
    "lookup_table_original_hashtags_original_present_media_original_tweet_type_original_language",
    "lookup_table_original_present_domains_original_language_engagee_follows_engager_original_tweet_type_original_present_media_engaged_with_user_is_verified",
    "lookup_table_original_present_domains_original_present_media_original_tweet_type_original_language",
    "lookup_table_original_present_links_original_present_media_original_tweet_type_original_language",
    "lookup_table_original_present_media_original_tweet_type_original_language_engaged_with_user_is_verified_engaging_user_is_verified_engagee_follows_engager",
    "lookup_table_second_used_word_bucket_id",
    "lookup_table_text_tokens_original_tweet_type_original_language",
    "lookup_table_tw_first_word_original_tweet_type_original_language",
    "lookup_table_tw_last_word_original_tweet_type_original_language",
]
# Scan every lookup table listed in `file_list` and record
#   original_columns[table] -> all of its columns whose name contains 'TE'
#   after_choose[table]     -> the subset of those that is also in `feature_list`
# `num_feature` counts the selected columns over all tables.
after_choose = {}
original_columns = {}
num_feature = 0
wanted_features = set(feature_list)  # constant-time membership test in the loop
for fn in file_list:
    # Only the read is best-effort: a table that cannot be loaded is reported
    # and skipped. `Exception` (rather than a bare except) lets
    # KeyboardInterrupt / SystemExit through, and errors in the bookkeeping
    # below are no longer silently hidden.
    try:
        df = spark.read.parquet("%s/%s" % (current_path, fn))
    except Exception as err:
        print("skipping %s (%s: %s)" % (fn, type(err).__name__, err))
        continue
    original_columns[fn] = [c for c in df.columns if 'TE' in c]
    chosen = [c for c in original_columns[fn] if c in wanted_features]
    if chosen:
        after_choose[fn] = chosen
        num_feature += len(chosen)

print("numFeatures from lookup is %d" % num_feature)
for table_name, fl in after_choose.items():
    print("*** ", table_name, " ***")
    for fe in fl:
        print(fe)
    print()

numFeatures from lookup is 71
***  lookup_table_engaged_with_user_id_fold  ***
TE_engaged_with_user_id_reply_timestamp_indicator
TE_engaged_with_user_id_retweet_timestamp_indicator
TE_engaged_with_user_id_retweet_with_comment_timestamp_indicator
TE_engaged_with_user_id_like_timestamp_indicator

***  lookup_table_engaged_with_user_id_fold_language  ***
GTE_language_reply_timestamp_indicator_engaged_with_user_id
GTE_language_retweet_timestamp_indicator_engaged_with_user_id
GTE_language_retweet_with_comment_timestamp_indicator_engaged_with_user_id
GTE_language_like_timestamp_indicator_engaged_with_user_id

***  lookup_table_engaged_with_user_id_fold_tweet_dow  ***
GTE_tweet_dow_reply_timestamp_indicator_engaged_with_user_id
GTE_tweet_dow_retweet_timestamp_indicator_engaged_with_user_id
GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaged_with_user_id
GTE_tweet_dow_like_timestamp_indicator_engaged_with_user_id

***  lookup_table_engaged_with_user_id_fold_tweet_hour  ***
GTE_tweet

In [52]:
#for feature in feature_list:
#    found = False
#    for table_name, fl in after_choose.items():
#        if feature in fl:
#            found = True
#            break
#    if found:
#        continue
#    print(feature)

# Emit one `CMBD_MODEL(...)` line per selected lookup table. The join columns
# are the table's columns whose name does not contain 'TE'.
for table_name, fl in after_choose.items():
    df = spark.read.parquet("%s/%s" % ("file:///mnt/nvme2/chendi/BlueWhale/docker/data/valid_lookup", table_name))
    join_cols = [c for c in df.columns if 'TE' not in c]
    # Format explicitly: print() with comma-separated arguments inserts a space
    # between them, which put blanks inside the quotes around the table name
    # (lookup_table=' name ') and so produced a wrong name in the emitted line.
    print("CMBD_MODEL(join_cols=%r, lookup_table='%s')," % (join_cols, table_name))


CMBD_MODEL(join_cols= ['engaged_with_user_id'] , lookup_table=' lookup_table_engaged_with_user_id_fold '),
CMBD_MODEL(join_cols= ['engaged_with_user_id', 'language'] , lookup_table=' lookup_table_engaged_with_user_id_fold_language '),
CMBD_MODEL(join_cols= ['engaged_with_user_id', 'tweet_dow'] , lookup_table=' lookup_table_engaged_with_user_id_fold_tweet_dow '),
CMBD_MODEL(join_cols= ['engaged_with_user_id', 'tweet_hour'] , lookup_table=' lookup_table_engaged_with_user_id_fold_tweet_hour '),
CMBD_MODEL(join_cols= ['engaged_with_user_id', 'tweet_type'] , lookup_table=' lookup_table_engaged_with_user_id_fold_tweet_type '),
CMBD_MODEL(join_cols= ['engaging_user_id', 'tweet_hour'] , lookup_table=' lookup_table_fold_engaging_user_id_tweet_hour '),
CMBD_MODEL(join_cols= ['language'] , lookup_table=' lookup_table_fold_language '),
CMBD_MODEL(join_cols= ['engaging_user_id', 'language'] , lookup_table=' lookup_table_fold_language_engaging_user_id '),
CMBD_MODEL(join_cols= ['tweet_dow'] , lookup

In [43]:
# For every table in `go_delete`, drop the listed columns and save the result
# beside the original with a "_1" suffix.
# NOTE(review): `go_delete` is not assigned in any visible cell — confirm where
# it comes from.
for table_name, fl in go_delete.items():
    df = spark.read.parquet(
        '%s/%s' % ("file:///mnt/nvme2/chendi/BlueWhale/docker/data/valid_lookup", table_name)
    )
    df = df.drop(*fl)
    df.write.format('parquet').save(
        '%s/%s_1' % ("file:///mnt/nvme2/chendi/BlueWhale/docker/data/valid_lookup", table_name)
    )

In [15]:
# Read `train_with_features` from under whatever `current_path` currently
# holds, then write a 10% sample of it, overwriting the target if it exists.
# No seed is passed to sample(), so each run draws a different sample.
df = spark.read.parquet("%s/train_with_features" % current_path)
# `%time` is an IPython line magic: it times only the statement on this line.
%time df.sample(0.1).write.format('parquet').mode('overwrite').save("/recsys2021_0608_processed/sample_0_0_3/train_with_features")

CPU times: user 15.7 ms, sys: 9.35 ms, total: 25.1 ms
Wall time: 1min 41s


In [16]:
# Read `train_with_features` from under whatever `current_path` currently
# holds, then write a 10% sample of it, overwriting the target if it exists.
# No seed is passed to sample(), so each run draws a different sample.
df = spark.read.parquet("%s/train_with_features" % current_path)
# `%time` is an IPython line magic: it times only the statement on this line.
%time df.sample(0.1).write.format('parquet').mode('overwrite').save("/recsys2021_0608_processed/sample_0_0_0_3/train_with_features")

CPU times: user 15.2 ms, sys: 8.04 ms, total: 23.3 ms
Wall time: 1min 27s


In [20]:
# Merge the first-word and last-word categorify dictionaries into a single
# word -> categorified_word table.
first_df = spark.read.parquet(
    "%s/train_categorified_lookup/tw_first_word_categorified_tw_first_word" % current_path
)
last_df = spark.read.parquet(
    "%s/train_categorified_lookup/tw_last_word_categorified_tw_last_word" % current_path
)

# union() keeps first_df's column names, so only those two names need renaming.
# Each word then keeps the first categorified value found in its group.
df = (
    first_df
    .union(last_df)
    .withColumnRenamed('tw_first_word', 'word')
    .withColumnRenamed('categorified_tw_first_word', 'categorified_word')
    .groupby('word')
    .agg(f.first(f.col('categorified_word')).alias('categorified_word'))
)
(
    df.write
    .format('parquet')
    .mode('overwrite')
    .save("%s/train_categorified_lookup/word_categorified_word" % current_path)
)
