In [None]:
spark.stop()

In [1]:
from init import *
from pyrecdp.utils import *
from pyrecdp.data_processor import *
from RecsysSchema import RecsysSchema

import logging
from timeit import default_timer as timer
import os
from pyspark import *
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import pandas as pd

path_prefix = "hdfs://"
folder = "/recsys2021/decompress/"
file = "/recsys2021/decompress/part-00036"
files = ['part-00036', 'part-00037', 'part-00038', 'part-00039', 'part-00040', 'part-00041', 'part-00042', 'part-00043', 'part-00044', 'part-00045'] 
#path = [os.path.join(path_prefix, folder, file) for file in files]
#path = os.path.join(path_prefix, folder)
path = os.path.join(path_prefix, file)
recsysSchema = RecsysSchema()

['/mnt/nvme2/chendi/BlueWhale/frameworks.bigdata.bluewhale/RecDP/examples/notebooks', '/hadoop/spark/python', '/hadoop/spark/python/lib/py4j-0.10.8.1-src.zip', '/usr/lib64/python37.zip', '/usr/lib64/python3.7', '/usr/lib64/python3.7/lib-dynload', '', '/root/.local/lib/python3.7/site-packages', '/usr/local/lib64/python3.7/site-packages', '/usr/local/lib/python3.7/site-packages', '/usr/lib64/python3.7/site-packages', '/usr/lib/python3.7/site-packages', '/usr/local/lib/python3.7/site-packages/IPython/extensions', '/root/.ipython', '/mnt/nvme2/chendi/BlueWhale/frameworks.bigdata.bluewhale/RecDP']


DataFrame[
*  text_tokens: string, 
*  hashtags: string, 
*  tweet_id: string, 
*  present_media: string, 
*  present_links: string, 
*  present_domains: string, 
*  tweet_type: string, 
*  language: string, 
*  tweet_timestamp: int, 
*  engaged_with_user_id: string, 
*  engaged_with_user_follower_count: int, 
*  engaged_with_user_following_count: int, 
*  engaged_with_user_is_verified: boolean, 
*  engaged_with_user_account_creation: int, 
*  enaging_user_id: string, 
*  enaging_user_follower_count: int, 
*  enaging_user_following_count: int, 
*  enaging_user_is_verified: boolean, 
*  enaging_user_account_creation: int, 
*  engagee_follows_engager: boolean, 
*  reply_timestamp: float,    
*  retweet_timestamp: float, 
*  retweet_with_comment_timestamp: float, 
*  like_timestamp: float

]

In [2]:
##### 1. Start spark and initialize data processor #####
t0 = timer()
spark = SparkSession\
    .builder\
    .master('yarn')\
    .appName("Recsys2021_DATA_PROCESS") \
    .getOrCreate()

# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

schema = recsysSchema.toStructType()

In [3]:
df = spark.read.schema(schema).option('sep', '\x01').csv(path)
proc = DataProcessor(spark)

In [4]:
###### 2. define operations and append them to data processor ######
# pre-define
# 0.1 define udfs
replace = udf(lambda x:  '_'.join(x.split('\t')[:2]) if x else "", StringType())
count = udf(lambda x: str(x).count('\t')+1 if x else 0, LongType())
# 0.2 define dictionary
media = {
    '': 0,
    'GIF': 1,
    'GIF_GIF': 2,
    'GIF_Photo': 3,
    'GIF_Video': 4,
    'Photo': 5,
    'Photo_GIF': 6,
    'Photo_Photo': 7,
    'Photo_Video': 8,
    'Video': 9,
    'Video_GIF': 10,
    'Video_Photo': 11,
    'Video_Video': 12
}

tweet_type = {'Quote': 0, 'Retweet': 1, 'TopLevel': 2}

# 1. define operations

# 1.1 fill na
op_fillna_num = FillNA(['reply_timestamp', 'retweet_timestamp',
                        'retweet_with_comment_timestamp', 'like_timestamp'], 0)
op_fillna_str = FillNA(['present_domains', 'present_links', 'hashtags', 'present_media'], "")


# 1.2 feature modify and add
op_feature_modification_type_convert = FeatureModification(cols=['tweet_timestamp',
                                                                 'engaged_with_user_follower_count',
                                                                 'engaged_with_user_following_count',
                                                                 'engaged_with_user_account_creation',
                                                                 'enaging_user_follower_count',
                                                                 'enaging_user_following_count',
                                                                 'enaging_user_account_creation', 'reply_timestamp',
                                                                 'retweet_timestamp',
                                                                 'retweet_with_comment_timestamp',
                                                                 'like_timestamp'], op='toInt')
op_feature_modification_present_media_replace = FeatureModification(
    cols = {'present_media': "concat_ws('_', split(col('present_media'),'\t'))"}, op = 'inline')
#op_feature_modification_present_media_replace = FeatureModification(
#    cols=['present_media'], udfImpl=replace)

#op_feature_add_len = FeatureAdd(
#    cols={'len_hashtags': 'hashtags', 'len_domains': 'present_domains', 'len_links': 'present_links'}, udfImpl=count)
op_feature_add_len_hashtags = FeatureAdd(
    cols={'len_hashtags': "when(col('hashtags') == '', lit(0)).otherwise(size(split(col('hashtags'), '\t')))"}, op = 'inline')
op_feature_add_len_domains = FeatureAdd(
    cols={'len_domains': "when(col('present_domains') == '', lit(0)).otherwise(size(split(col('present_domains'), '\t')))"}, op = 'inline')
op_feature_add_len_links = FeatureAdd(
    cols={'len_links': "when(col('present_links') == '', lit(0)).otherwise(size(split(col('present_links'), '\t')))"}, op = 'inline')
op_feature_add_engage_time = FeatureAdd(
    cols={'engage_time': "least(col('reply_timestamp'), col('retweet_timestamp'), col('retweet_with_comment_timestamp'), col('like_timestamp'))"}, op='inline')
op_new_feature_dt_dow = FeatureAdd(cols={
    "dt_dow": "dayofweek(from_unixtime(col('tweet_timestamp'))).cast(IntegerType())",
    "dt_hour": "hour(from_unixtime(col('tweet_timestamp'))).cast(IntegerType())",
    "dt_minute": "minute(from_unixtime(col('tweet_timestamp'))).cast(IntegerType())",
    "dt_second": "second(from_unixtime(col('tweet_timestamp'))).cast(IntegerType())"}, op='inline')

op_feature_change = FeatureModification(cols={
    "reply_timestamp": "when(col('reply_timestamp') > 0, 1).otherwise(0)",
    "retweet_timestamp": "when(col('retweet_timestamp') > 0, 1).otherwise(0)",
    "retweet_with_comment_timestamp": "when(col('retweet_with_comment_timestamp') > 0, 1).otherwise(0)",
    "like_timestamp": "when(col('like_timestamp') > 0, 1).otherwise(0)"}, op='inline')

ops = [op_fillna_num, op_fillna_str, 
       op_feature_modification_type_convert, op_feature_modification_present_media_replace,
       op_feature_add_len_hashtags, op_feature_add_len_domains, op_feature_add_len_links, 
       op_feature_add_engage_time, op_new_feature_dt_dow, op_feature_change]
proc.reset_ops(ops)

# 1.3 categorify    
# udf took lots of memory, process in advance
op_categorifyMultiItems = CategorifyMultiItems(
    ['present_domains', 'present_links', 'hashtags'])
op_categorify_present_media = CategorifyWithDictionary(
    ['present_media'], media)
op_categorify_tweet_type = CategorifyWithDictionary(
    ['tweet_type'], tweet_type)
# since language dict is small, we may use udf to make partition more even
op_categorify_language = Categorify(['language'], hint = 'udf')

ops_1 = [op_categorifyMultiItems, op_categorify_present_media, op_categorify_tweet_type, op_categorify_language]
proc.append_ops(ops_1)


In [5]:
##### 3. do data transform(data frame materialize) #####
t1 = timer()
df = proc.transform(df)
t2 = timer()
print("Data Process and udf categorify took %.3f" % (t2 - t1))

Data Process and udf categorify took 26.259


In [11]:
# below are using join

# since we observed extremely high mem footage to 
# do below joins, split each run to save memory

op_categorify_tweet_id = Categorify(['tweet_id'])
proc.reset_ops([op_categorify_tweet_id])
t1 = timer()
df = proc.transform(df)
t2 = timer()

op_categorify_user_id = Categorify(['engaged_with_user_id', 'enaging_user_id'], src_cols=[
                                   'engaged_with_user_id', 'enaging_user_id'])
proc.reset_ops([op_categorify_user_id])
t5 = timer()
df = proc.transform(df)
t6 = timer()

print("Categorify w/join took %.3f %.3f" % ((t2 - t1), (t6 - t5)))

Categorify w/join took 5.543 7.025


In [32]:
df.write.format('parquet').mode(
                'overwrite').save("/recsys2021/10files/processed_phase_1/")

In [34]:
df = spark.read.parquet("/recsys2021/10files/processed_phase_1/")

In [12]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

# define UDF
tokenizer_decode = udf(lambda x: tokenizer.decode( [ int(n) for n in x.split('\t') ] ))
format_url = udf(lambda x: x.replace('https : / / t. co / ', 'https://t.co/').replace('@ ', '@'))

# define operations
op_feature_modification_tokenizer_decode = FeatureAdd(
    cols={'tweet':'text_tokens'}, udfImpl=tokenizer_decode)
op_feature_modification_format_url = FeatureModification(
    cols=['tweet'], udfImpl=format_url)

# execute
proc.reset_ops([op_feature_modification_tokenizer_decode, op_feature_modification_format_url])
t1 = timer()
df = proc.transform(df)
t2 = timer()
print("BertTokenizer decode and format took %.3f" % (t2 - t1))


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


BertTokenizer decode and format took 22.334


In [13]:
import hashlib

def extract_hash(text, split_text='@', no=0):
    text = text.lower()
    uhash = ''
    text_split = text.split('@')
    if len(text_split)>(no+1):
        text_split = text_split[no+1].split(' ')
        cl_loop = True
        uhash += clean_text(text_split[0])
        while cl_loop:
            if len(text_split)>1:
                if text_split[1] in ['_']:
                    uhash += clean_text(text_split[1]) + clean_text(text_split[2])
                    text_split = text_split[2:]
                else:
                    cl_loop = False
            else:
                cl_loop = False
    hash_object = hashlib.md5(uhash.encode('utf-8'))
    return hash_object.hexdigest()

def clean_text(text):
    if len(text)>1:
        if text[-1] in ['!', '?', ':', ';', '.', ',']:
            return(text[:-1])
    return(text)

In [14]:
# features upon tweet
to_notsign = udf(lambda x: x.replace('\[CLS\] RT @', ''))
count_space = udf(lambda x: x.count(' '))
count_text_length = udf(lambda x: len(x))
user_defined_hash = udf(lambda x: extract_hash(x, split_text='RT @', no=0))
# features upon tweet_nortsign
count_at = udf(lambda x: x.count('@'))
user_define_hash_1 = udf(lambda x: extract_hash(x))
user_define_hash_2 = udf(lambda x: extract_hash(x, no=1))

# features upon tweet
op_feature_add_tweet_nortsign = FeatureAdd(cols={'tweet_nortsign': 'tweet'}, udfImpl=to_notsign)
op_feature_add_count_words = FeatureAdd(cols={'count_words': 'tweet'}, udfImpl=count_space)
op_feature_add_count_char = FeatureAdd(cols={'count_char': 'tweet'}, udfImpl=count_text_length)
op_feature_add_tw_uhash = FeatureAdd(cols={'tw_uhash': 'tweet'}, udfImpl=user_defined_hash)
op_feature_add_tw_hash = FeatureAdd(cols={'tw_hash': "hash(col('tweet'))%1000000000"}, op='inline')
# features upon tweet_nortsign
op_feature_add_count_at = FeatureAdd(cols={'count_ats': 'tweet_nortsign'}, udfImpl=count_at)
op_feature_add_tw_uhash0 = FeatureAdd(cols={'tw_hash0': 'tweet_nortsign'}, udfImpl=user_define_hash_1)
op_feature_add_tw_uhash1 = FeatureAdd(cols={'tw_hash1': 'tweet_nortsign'}, udfImpl=user_define_hash_2)
    
# execute
proc.reset_ops([op_feature_add_tweet_nortsign, op_feature_add_count_words, op_feature_add_count_char, 
                op_feature_add_tw_uhash, op_feature_add_tw_hash,
                op_feature_add_count_at, op_feature_add_tw_uhash0, op_feature_add_tw_uhash1])
t1 = timer()
df = proc.transform(df)
t2 = timer()
print("Adding Feature upon tweet and tweet_nortsign column took %.3f" % (t2 - t1))
# expect to spend about 1000secs

Adding Feature upon tweet and tweet_nortsign column took 7.115


In [15]:
df.write.format('parquet').mode(
                'overwrite').save("/recsys2021/1file/processed_phase_2/")

In [19]:
df = spark.read.parquet("/recsys2021/1file/processed_phase_2/")

In [23]:
i = 'tweet'
freqRange = [2, 100000]
skipList = ['', '[', ']', '.', '!', '@', '_', '#']
sep = ' '
sorted_data_df = df.select(explode(split(col(i), ' '))).groupBy('col').count().orderBy(
                desc('count'), 'col').select('col', 'count')
sorted_data = sorted_data_df.collect()
dict_data = dict((id['col'], [id['count'], idx]) for (id, idx) in zip(
    sorted_data, range(len(sorted_data))))
%time broadcast_dict = spark.sparkContext.broadcast(dict_data)


CPU times: user 4.51 s, sys: 338 ms, total: 4.84 s
Wall time: 5.31 s


In [24]:
def frequence_encode(x):
    dict_data = broadcast_dict.value
    li = []
    for v in x.split(sep):
        if v not in skipList and v in dict_data:
            f, i = dict_data[v]
            if f < freqRange[1] and f > freqRange[0]:
                li.append(i)
    return sorted(li, reverse=True)
udf_impl = udf(frequence_encode, ArrayType(IntegerType()))
converted_df = df.withColumn(i, udf_impl(col(i)))
%time converted_df.select(i).collect()

CPU times: user 18.7 s, sys: 416 ms, total: 19.1 s
Wall time: 35 s


[Row(tweet=[399154, 261800, 70050, 63102, 25931, 11846, 8675, 5069, 4208, 3912, 3182, 3030, 2318, 1719, 1633, 1018, 782, 202, 70]),
 Row(tweet=[716069, 71335, 33976, 28826, 11337, 7218, 3127, 1297, 731, 326, 200, 163, 75, 71, 69]),
 Row(tweet=[235425, 45200, 42837, 31286, 21742, 13353, 8371, 4518, 809]),
 Row(tweet=[23209]),
 Row(tweet=[26657, 14059, 5595]),
 Row(tweet=[286150, 80530, 33841, 16886, 13474, 7997, 5977, 3924, 2773, 1739, 1739, 1114, 687, 305, 304, 134, 125, 75]),
 Row(tweet=[825481, 62636, 2853]),
 Row(tweet=[772505, 546603, 266970, 193338, 175243, 156357, 117993, 66319, 57591, 47523, 41676, 32001, 28357, 17684, 11654, 11122, 10542, 9507, 1978, 1089, 597, 434, 119]),
 Row(tweet=[30457, 18542, 11012, 7567, 5163, 3375, 1263, 829, 738, 476, 471, 222, 221, 157, 91, 55]),
 Row(tweet=[410254, 74633, 27646, 27646, 27159, 21061, 16076, 12122, 8626, 4069, 2309, 221, 175, 110, 68, 68, 55]),
 Row(tweet=[645534, 644866, 643382, 642620, 642450, 22131]),
 Row(tweet=[380249, 273244, 100

In [41]:
@pandas_udf("string")
def frequence_encode(x_list):
    dict_data = broadcast_dict.value
    return x_list.str.split(' ').sort_values(ascending=False).map(lambda x: x[0]).replace(dict_data)

converted_df = df.withColumn(i, frequence_encode(col(i)))
%time converted_df.select(i).collect()

KeyboardInterrupt: 

In [None]:
#out = []
#for i in x_list.size:
#    x = x_list.iloc[i]
#    li = x.split(' ')
#    #if v not in skipList and v in dict_data:
#    #    f, i = dict_data[v]
#    #    if f < freqRange[1] and f > freqRange[0]:
#    #        li.append(i)
#    li.append(v)
#    sorted(li)
##return '/'.join(sorted(li, reverse=True))
#
#return str(li[0])

In [4]:
proc = DataProcessor(spark)
op_fillna_for_tweet = FillNA(['tweet'], "")
op_categorify_multiple_tweet = CategorifyMultiItems(['tweet'], strategy=1, sep=' ', skipList=['', '[', ']', '.', '!', '@', '_', '#'])
proc.reset_ops([op_fillna_for_tweet, op_categorify_multiple_tweet])
t1 = timer()
df = proc.transform(df)
t2 = timer()
print("Frequency encode tweet column took %.3f" % (t2 - t1))

Frequency encode tweet column took 71.011


In [7]:
op_feature_add_tw_freq_hash = FeatureAdd({'tw_freq_hash': "col('tw_hash')"}, op='inline')
op_feature_add_tw_first_word = FeatureAdd({'tw_first_word': "col('tweet').getItem(0)"}, op='inline')
op_feature_add_tw_second_word = FeatureAdd({'tw_second_word': "col('tweet').getItem(1)"}, op='inline')
op_feature_add_tw_last_word = FeatureAdd({'tw_last_word': "col('tweet').getItem(size(col('tweet')) - 1)"}, op='inline')
op_feature_add_tw_second_last_word = FeatureAdd({'tw_llast_word': "col('tweet').getItem(size(col('tweet')) - 1)"}, op='inline')
op_feature_add_tw_word_len = FeatureAdd({'tw_len': "size(col('tweet'))"}, op='inline')
op_feature_modification_fillna = FillNA(['tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word', 'tw_len'], -1)

proc.reset_ops([op_feature_add_tw_freq_hash, op_feature_add_tw_first_word, op_feature_add_tw_second_word,
                op_feature_add_tw_last_word, op_feature_add_tw_second_last_word, op_feature_add_tw_word_len,
                op_feature_modification_fillna])
t1 = timer()
df = proc.transform(df)
t2 = timer()
print("feature engineering upon Frequency encoded tweet column took %.3f" % (t2 - t1))


feature engineering upon Frequency encoded tweet column took 5.338


In [8]:
df.select('tw_freq_hash', 'tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word', 'tw_len').show(truncate=100)

+------------+-------------+--------------+------------+-------------+------+
|tw_freq_hash|tw_first_word|tw_second_word|tw_last_word|tw_llast_word|tw_len|
+------------+-------------+--------------+------------+-------------+------+
|   499420595|       399154|        261800|          70|           70|    19|
|  -525989518|       716069|         71335|          69|           69|    15|
|   560765826|       235425|         45200|         809|          809|     9|
|  -603094632|        23209|            -1|       23209|        23209|     1|
|    81094762|        26657|         14059|        5595|         5595|     3|
|  -682976397|       286150|         80530|          75|           75|    18|
|   173727706|       825481|         62636|        2853|         2853|     3|
|  -141265501|       772505|        546603|         119|          119|    23|
|   463336980|        30457|         18542|          55|           55|    16|
|  -468553637|       410254|         74633|          55|        

In [None]:
df.write.format('parquet').mode(
                'overwrite').save("/recsys2021/1file/processed_phase_3/")