In [None]:
# Stop the active Spark session. `spark` is only assigned in a later cell of
# this notebook, so this cell presumably exists to tear down a session left
# over from an earlier run -- it raises NameError on a fresh kernel.
spark.stop()

In [None]:
#!/env/bin/python

import init

import findspark
findspark.init()

import os
import pandas as pd
import numpy as np
from pyspark.sql import *
from pyspark import *
import pyspark.sql.functions as f
from timeit import default_timer as timer
import logging
from RecsysSchema import RecsysSchema
from pyrecdp.data_processor import *
from pyrecdp.utils import *
import hashlib

def categorifyAllFeatures(df, proc, output_name="categorified", gen_dicts=False):
    """Append the categorify pipeline to ``proc`` and run it over ``df``.

    Three operations are appended, in this order, all targeting the
    ``present_domains``, ``present_links`` and ``hashtags`` columns:

    1. ``FillNA`` with the empty string.
    2. ``Categorify`` with ``doSplit=True`` and ``keepMostFrequent=True``.
    3. ``FillNA`` with ``-1``.

    The wall-clock duration of ``proc.transform`` is printed to stdout.

    Args:
        df: Input handed to ``proc.transform``.
        proc: Processor exposing ``append_ops`` and ``transform``.
        output_name: Forwarded to ``proc.transform`` as ``name``.
        gen_dicts: Forwarded to ``Categorify`` as ``gen_dicts``.

    Returns:
        Whatever ``proc.transform`` returns.
    """
    multi_value_columns = ('present_domains', 'present_links', 'hashtags')

    # Each operation gets its own list so none of them share a mutable
    # argument.
    blank_out_nulls = FillNA(list(multi_value_columns), "")
    encode_columns = Categorify(
        list(multi_value_columns),
        gen_dicts=gen_dicts,
        doSplit=True,
        keepMostFrequent=True,
    )
    mark_unmapped = FillNA(list(multi_value_columns), -1)

    proc.append_ops([blank_out_nulls, encode_columns, mark_unmapped])

    # Only the transform call itself is timed.
    started = timer()
    df = proc.transform(df, name=output_name)
    finished = timer()
    print("Data Process 1 and udf categorify took %.3f" % (finished - started))

    return df




In [None]:
# --- Locations ---------------------------------------------------------------
# path_prefix is prepended to original_folder when reading the input below and
# is also handed to the DataProcessor together with the working and dictionary
# folders.
path_prefix = "hdfs://"
current_path = "/recsys2020_0608_categorify_example_1/"
original_folder = "/recsys2021_0608/"
dicts_folder = "recsys_dicts/"
recsysSchema = RecsysSchema()

# Jar placed on both the driver and the executor class path.
scala_udf_jars = "/mnt/nvme2/chendi/BlueWhale/recdp/ScalaProcessUtils/target/recdp-scala-extensions-0.1.0-jar-with-dependencies.jar"

# --- Spark session -----------------------------------------------------------
# t0 is captured before the session is requested; nothing in this cell reads it.
t0 = timer()
spark = (
    SparkSession.builder.master('yarn')
    .appName("Recsys2020_data_process")
    .config("spark.executor.memory", "30g")
    .config("spark.executor.memoryOverhead", "5g")
    .config("spark.driver.extraClassPath", f"{scala_udf_jars}")
    .config("spark.executor.extraClassPath", f"{scala_udf_jars}")
    .config("spark.executor.cores", "4")
    .getOrCreate()
)

# Built from the RecsysSchema instance; not used further in this cell.
schema = recsysSchema.toStructType()

# --- RecDP data processor ----------------------------------------------------
proc = DataProcessor(
    spark,
    path_prefix,
    current_path=current_path,
    dicts_path=dicts_folder,
    shuffle_disk_capacity="1200GB",
)

# --- Load, rename, categorify ------------------------------------------------
# The two 'enaging_*' columns are renamed to the 'engaging_*' spelling before
# any processing happens.
df = (
    spark.read.parquet(path_prefix + original_folder)
    .withColumnRenamed('enaging_user_following_count', 'engaging_user_following_count')
    .withColumnRenamed('enaging_user_is_verified', 'engaging_user_is_verified')
)
df = categorifyAllFeatures(df, proc, gen_dicts=True)


* per core memory size is 7.500 GB and shuffle_disk maximum capacity is 1200.000 GB
* hashtags has numRows as 6541935
* language has numRows as 66
* present_domains has numRows as 896783
* present_links has numRows as 14804300
* tweet_id has numRows as 279597527
* user_id has numRows as 42850559
* present_media has numRows as 13
* tweet_type has numRows as 3
* categorify threshold is 50.000 M rows, flush_threshold is 960.000 GB
* ('tweet_type', DataFrame[dict_col: string, dict_col_id: bigint], 3)
* ('present_media', DataFrame[dict_col: string, dict_col_id: bigint], 13)
* ('language', DataFrame[dict_col: string, dict_col_id: int], 66)
* ('present_domains', DataFrame[dict_col: string, dict_col_id: int], 896783)
* ('hashtags', DataFrame[dict_col: string, dict_col_id: int], 6541935)
* ('present_links', DataFrame[dict_col: string, dict_col_id: int], 14804300)
* ('user_id', DataFrame[dict_col: string, dict_col_id: int], 42850559)
* ('tweet_id', DataFrame[dict_col: string, dict_col_id: int], 279597527)
* tweet_type will do udf
* present_media will do udf
* language will do udf
* present_domains will do udf
* hashtags will do udf
* present_links will do udf
* etstimated_to_shuffle_size for user_id is 81.653 GB, will do shj
* etstimated_to_shuffle_size for tweet_id is 78.153 GB, will do shj
* Data Process 1 and udf categorify took 799.987

### After switching to the Scala UDF, processing takes longer, but Spark no longer needs a larger `memoryOverhead`

* recdp-scala-extension is enabled
* per core memory size is 4.500 GB and shuffle_disk maximum capacity is 1200.000 GB
* hashtags has numRows as 6541935
* present_domains has numRows as 896783
* present_links has numRows as 14804300
* present_media has numRows as 13
* tweet_type has numRows as 3
* bhj total threshold is 60.398 M rows, one bhj threshold is 30.000 M rows, flush_threshold is 960.000 GB
* ('present_domains', DataFrame[dict_col: string, dict_col_id: int, count: bigint], 896783)
* ('hashtags', DataFrame[dict_col: string, dict_col_id: int, count: bigint], 6541935)
* ('present_links', DataFrame[dict_col: string, dict_col_id: int, count: bigint], 14804300)
* present_domains will do udf
* hashtags will do udf
* present_links will do udf
* Data Process 1 and udf categorify took 1045.646