In [1]:
#!/env/bin/python

import init

import findspark
findspark.init()

import os
import pandas as pd
import numpy as np
from pyspark.sql import *
from pyspark import *
import pyspark.sql.functions as f
from timeit import default_timer as timer
import logging
from pyrecdp.data_processor import *
from pyrecdp.utils import *
from pyspark.sql.functions import pandas_udf, PandasUDFType


def decodeBertTokenizer(df, proc, output_name, df_cnt=626242930, use_pandas_udf=False):
    """Add a ``tweet`` column derived from ``text_tokens`` and time the transform.

    Real BERT decoding is currently disabled (see the commented-out lines
    below). Each ``text_tokens`` value is only split on tab characters and
    re-joined with ``_``, so the function measures UDF overhead rather than
    tokenizer cost.

    Args:
        df: Spark DataFrame with a string column named ``text_tokens``.
        proc: Processor object exposing ``reset_ops`` and ``transform``
            (the callers in this notebook pass a RecDP ``DataProcessor``).
        output_name: Forwarded to ``proc.transform`` as ``name``.
        df_cnt: Forwarded to ``proc.transform`` as ``df_cnt``; presumably the
            expected row count of ``df`` -- confirm against RecDP. Defaults to
            the value that used to be hard-coded here.
        use_pandas_udf: If true, run the conversion as a pandas UDF; otherwise
            (default, the previous behaviour) as a plain Python UDF.

    Returns:
        The DataFrame returned by ``proc.transform``.
    """
    # BERT decoding is intentionally switched off for this UDF benchmark:
    # from transformers import BertTokenizer
    # tokenizer = BertTokenizer.from_pretrained(
    #     'bert-base-multilingual-cased', do_lower_case=False)

    def decode_and_clean_tweet_text(x):
        # x = tokenizer.decode([int(n) for n in x.split('\t')])
        # x = x.replace('https : / / t. co / ', 'https://t.co/').replace('@ ', '@')
        # Null cells arrive as None (or NaN inside a pandas Series); pass them
        # through as null instead of failing on ``.split``.
        if not isinstance(x, str):
            return None
        return "_".join(x.split('\t'))

    if use_pandas_udf:
        @pandas_udf('string')
        def tokenizer_decode(v):
            # dtype=object keeps None as a real null rather than the text 'None'.
            return pd.Series(
                [decode_and_clean_tweet_text(token) for token in v], dtype=object)
    else:
        tokenizer_decode = f.udf(decode_and_clean_tweet_text)

    # Single operation: new column 'tweet' computed from 'text_tokens'.
    op_feature_modification_tokenizer_decode = FeatureAdd(
        cols={'tweet': 'text_tokens'}, udfImpl=tokenizer_decode)

    # execute
    proc.reset_ops([op_feature_modification_tokenizer_decode])
    t1 = timer()
    df = proc.transform(df, name=output_name, df_cnt=df_cnt)
    t2 = timer()
    print("BertTokenizer decode took %.3f" % (t2 - t1))

    return df

/mnt/nvme2/chendi/BlueWhale/frameworks.bigdata.bluewhale/RecDP


In [4]:
# Benchmark run "udf_column": Spark session with the OAP native SQL engine
# (columnar plugin + columnar shuffle manager) and the Arrow data source.
path_prefix = "hdfs://"
current_path = "/recsys2021_0608_udf_test/"
original_folder = "/recsys2021_0608/"

# Locally built OAP jars; these absolute paths are specific to the machine
# this notebook was run on.
native_sql_path = "/mnt/nvme2/chendi/intel-bigdata/OAP/native-sql-engine/native-sql-engine/core/target/spark-columnar-core-1.2.0-snapshot-jar-with-dependencies.jar"
native_arrow_datasource_path = "/mnt/nvme2/chendi/intel-bigdata/OAP/native-sql-engine/arrow-data-source/standard/target/spark-arrow-datasource-standard-1.2.0-snapshot-jar-with-dependencies.jar"

##### 1. Start spark and initialize data processor #####
t0 = timer()
spark = (
    SparkSession.builder.master('yarn')
    .appName("udf_column")
    .config("spark.sql.broadcastTimeout", "7200")
    .config("spark.cleaner.periodicGC.interval", "10min")
    .config("spark.executorEnv.HF_DATASETS_OFFLINE", "1")
    .config("spark.executorEnv.TRANSFORMERS_OFFLINE", "1")
    .config("spark.executorEnv.LD_LIBRARY_PATH", "/usr/local/lib64/")
    .config("spark.driver.extraClassPath",
            f"{native_sql_path}:{native_arrow_datasource_path}")
    .config("spark.executor.extraClassPath",
            f"{native_sql_path}:{native_arrow_datasource_path}")
    .config("spark.sql.extensions", "com.intel.oap.ColumnarPlugin")
    .config("spark.shuffle.manager", "org.apache.spark.shuffle.sort.ColumnarShuffleManager")
    .config("spark.executor.memory", "10g")
    .config("spark.executor.memoryOverhead", "16g")
    # Spark's off-heap switch is "spark.memory.offHeap.enabled". The key used
    # before ("spark.memory.offHeap.use") is not a Spark setting, so off-heap
    # memory stayed disabled despite offHeap.size being set.
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "12G")
    .config("spark.executor.extraJavaOptions", "-XX:MaxDirectMemorySize=25G")
    .config("spark.oap.sql.columnar.arrowudf", "false")
    .getOrCreate()
)


# 1.1 prepare dataFrames
# 1.2 create RecDP DataProcessor
proc = DataProcessor(spark, path_prefix,
                     current_path=current_path, shuffle_disk_capacity="1200GB")
# Read through the Arrow data source and keep only the column the UDF needs.
df = spark.read.format("arrow").load(path_prefix + original_folder)
df = df.select("text_tokens")

# ===============================================
# decode tweet_tokens
df = decodeBertTokenizer(df, proc, output_name="decoded_with_extracted_features")

per core memory size is 2.500 GB and shuffle_disk maximum capacity is 1200.000 GB
BertTokenizer decode took 236.789


In [2]:
# Benchmark run "udf_row": Spark session without the columnar plugin,
# reading the same folder as Parquet.
path_prefix = "hdfs://"
current_path = "/recsys2021_0608_udf_test/"
original_folder = "/recsys2021_0608/"

# Not referenced by this session's configuration; kept so the cell still
# defines the same names as before.
native_sql_path = "/mnt/nvme2/chendi/intel-bigdata/OAP/native-sql-engine/native-sql-engine/core/target/spark-columnar-core-1.2.0-snapshot-jar-with-dependencies.jar"
native_arrow_datasource_path = "/mnt/nvme2/chendi/intel-bigdata/OAP/native-sql-engine/arrow-data-source/standard/target/spark-arrow-datasource-standard-1.2.0-snapshot-jar-with-dependencies.jar"

##### 1. Start spark and initialize data processor #####
t0 = timer()
# NOTE: getOrCreate() hands back an already active session if one exists in
# this kernel; restart the kernel before running this cell so the settings
# below actually take effect.
spark = (
    SparkSession.builder.master('yarn')
    .appName("udf_row")
    .config("spark.sql.broadcastTimeout", "7200")
    .config("spark.cleaner.periodicGC.interval", "10min")
    .config("spark.executorEnv.HF_DATASETS_OFFLINE", "1")
    .config("spark.executorEnv.TRANSFORMERS_OFFLINE", "1")
    .config("spark.executor.memory", "10g")
    .config("spark.executor.memoryOverhead", "16g")
    # Spark's off-heap switch is "spark.memory.offHeap.enabled". The key used
    # before ("spark.memory.offHeap.use") is not a Spark setting, so off-heap
    # memory stayed disabled despite offHeap.size being set.
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "12G")
    .config("spark.executor.extraJavaOptions", "-XX:MaxDirectMemorySize=25G")
    .getOrCreate()
)


# 1.1 prepare dataFrames
# 1.2 create RecDP DataProcessor
proc = DataProcessor(spark, path_prefix,
                     current_path=current_path, shuffle_disk_capacity="1200GB")
# Read as Parquet and keep only the column the UDF needs.
df = spark.read.parquet(path_prefix + original_folder)
df = df.select("text_tokens")

# ===============================================
# decode tweet_tokens
df = decodeBertTokenizer(df, proc, output_name="decoded_with_extracted_features")

per core memory size is 2.500 GB and shuffle_disk maximum capacity is 1200.000 GB
BertTokenizer decode took 276.205


======
#### Pandas UDF
* per core memory size is 2.500 GB and shuffle_disk maximum capacity is 1200.000 GB
* BertTokenizer decode took 237.324 seconds

======
#### Python UDF
* per core memory size is 2.500 GB and shuffle_disk maximum capacity is 1200.000 GB
* BertTokenizer decode took 276.205 seconds

In [3]:
# Sanity check: print the first rows of the transformed DataFrame so the
# source column (text_tokens) and the derived column (tweet) can be compared.
df.show()

+--------------------+--------------------+
|         text_tokens|               tweet|
+--------------------+--------------------+
|101	56898	137	144...|101_56898_137_144...|
|101	108	139	11403...|101_108_139_11403...|
|101	17160	55112	1...|101_17160_55112_1...|
|101	17116	15045	1...|101_17116_15045_1...|
|101	56898	137	653...|101_56898_137_653...|
|101	56898	137	552...|101_56898_137_552...|
|101	164	108	9519	...|101_164_108_9519_...|
|101	30407	10113	1...|101_30407_10113_1...|
|101	29869	86598	1...|101_29869_86598_1...|
|101	787	33691	201...|101_787_33691_201...|
|101	16437	29846	1...|101_16437_29846_1...|
|101	14516	59148	1...|101_14516_59148_1...|
|101	142	183	10804...|101_142_183_10804...|
|101	1097	20119	15...|101_1097_20119_15...|
|101	73293	10918	7...|101_73293_10918_7...|
|101	56898	137	300...|101_56898_137_300...|
|101	56898	137	337...|101_56898_137_337...|
|101	10092	1963	60...|101_10092_1963_60...|
|101	56898	137	101...|101_56898_137_101...|
|101	43527	15211	1...|101_43527_