In [1]:
#!/env/bin/python

import init

import findspark
findspark.init()

import os
import pandas as pd
import numpy as np
from pyspark.sql import *
from pyspark import *
import pyspark.sql.functions as f
from timeit import default_timer as timer
import logging
from pyrecdp.data_processor import *
from pyrecdp.utils import *
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Filesystem scheme and the HDFS folders this notebook refers to.
path_prefix = "hdfs://"
current_path = "/recsys2021_0608_scalaudf_test/"
original_folder = "/recsys2021_0608/"

# Jar with the Scala UDF extensions; it is put on both the driver and the
# executor class paths below so the JVM can resolve the UDF classes.
scala_udf_jars = "/mnt/nvme2/chendi/BlueWhale/recdp/ScalaProcessUtils/target/recdp-scala-extensions-0.1.0-jar-with-dependencies.jar"

# Record the start time, then create (or reuse) the YARN-backed Spark session.
t0 = timer()
spark = (
    SparkSession.builder.master('yarn')
    .appName("scala_udf_column")
    .config("spark.sql.broadcastTimeout", "7200")
    .config("spark.cleaner.periodicGC.interval", "10min")
    .config("spark.driver.extraClassPath", scala_udf_jars)
    .config("spark.executor.extraClassPath", scala_udf_jars)
    .config("spark.executor.memory", "20g")
    .config("spark.executor.memoryOverhead", "6g")
    .getOrCreate()
)



/mnt/nvme2/chendi/BlueWhale/recdp


### Example 1.1: CategorifyByFreqForArray

In [9]:
# Dictionary used to map hashtag values to ids.
dict_df = spark.read.parquet("/recsys2020_0608_categorify_example/recsys_dicts/hashtags")

# Keep only the hashtags column, plus an untouched copy of it so the sample
# output shows the raw value next to the categorified one.
df = (
    spark.read.parquet("/recsys2021_0608/")
    .select("hashtags")
    .withColumn("hashtags_orig", f.col("hashtags"))
)

proc = DataProcessor(spark, "hdfs://", "/recdp_operations_example/")

# Categorify via the UDF code path with both doSplit and keepMostFrequent set.
# NOTE(review): presumably doSplit tokenises the column and keepMostFrequent
# keeps a single id per row -- confirm against the pyrecdp documentation.
op_categorify = Categorify(
    ['hashtags'],
    dict_dfs=[{'col_name': 'hashtags', 'dict': dict_df}],
    hint='udf',
    doSplit=True,
    keepMostFrequent=True,
)
proc.reset_ops([op_categorify])
proc.get_sample(df)

recdp-scala-extension is enabled
per core memory size is 5.000 GB and shuffle_disk maximum capacity is 8589934592.000 GB
+--------+--------------------+
|hashtags|       hashtags_orig|
+--------+--------------------+
|    null|                null|
|      80|1A703973FF98425EE...|
|    null|                null|
|    null|                null|
|    null|                null|
|    null|                null|
|     119|A31D528F61979FC70...|
|    null|                null|
|    null|                null|
|    null|                null|
|    null|                null|
|    null|                null|
|    null|                null|
|    null|                null|
|    null|                null|
|    null|                null|
|    null|                null|
|    null|                null|
|    null|                null|
|    null|                null|
+--------+--------------------+
only showing top 20 rows



### Example 1.2: CategorifyForArray

In [10]:
# Dictionary used to map hashtag values to ids.
dict_df = spark.read.parquet("/recsys2020_0608_categorify_example/recsys_dicts/hashtags")

# Keep only the hashtags column, plus an untouched copy of it so the sample
# output shows the raw value next to the categorified one.
df = (
    spark.read.parquet("/recsys2021_0608/")
    .select("hashtags")
    .withColumn("hashtags_orig", f.col("hashtags"))
)

proc = DataProcessor(spark, "hdfs://", "/recdp_operations_example/")

# Categorify via the UDF code path with doSplit set (no keepMostFrequent).
# NOTE(review): presumably doSplit tokenises the column so each row yields an
# array of ids -- confirm against the pyrecdp documentation.
op_categorify = Categorify(
    ['hashtags'],
    dict_dfs=[{'col_name': 'hashtags', 'dict': dict_df}],
    hint='udf',
    doSplit=True,
)
proc.reset_ops([op_categorify])
proc.get_sample(df)

recdp-scala-extension is enabled
per core memory size is 5.000 GB and shuffle_disk maximum capacity is 8589934592.000 GB
+--------------------+--------------------+
|            hashtags|       hashtags_orig|
+--------------------+--------------------+
|                  []|                null|
|                [80]|1A703973FF98425EE...|
|                  []|                null|
|                  []|                null|
|                  []|                null|
|                  []|                null|
|[5680, 6617, 120,...|A31D528F61979FC70...|
|                  []|                null|
|                  []|                null|
|                  []|                null|
|                  []|                null|
|                  []|                null|
|                  []|                null|
|                  []|                null|
|                  []|                null|
|                  []|                null|
|                  []|                null|

### Example 2: Categorify

In [11]:
# Dictionary used to map mention values to ids.
dict_df = spark.read.parquet("/recsys2021_0608_example/recsys_dicts/mention")

# Keep only the mention column, plus an untouched copy of it so the sample
# output shows the raw value next to the categorified one.
df = (
    spark.read.parquet("/recsys2021_0608_example/train_with_categorified_features_test")
    .select("mention")
    .withColumn("mention_orig", f.col("mention"))
)

proc = DataProcessor(spark, "hdfs://", "/recdp_operations_example/")

# Plain Categorify on a scalar column, using the UDF code path.
op_categorify = Categorify(
    ['mention'],
    dict_dfs=[{'col_name': 'mention', 'dict': dict_df}],
    hint='udf',
)
proc.reset_ops([op_categorify])
proc.get_sample(df)

recdp-scala-extension is enabled
per core memory size is 5.000 GB and shuffle_disk maximum capacity is 8589934592.000 GB
+-------+-------------+
|mention| mention_orig|
+-------+-------------+
|     22|             |
|     22|             |
|      7|pedrojubierna|
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
|     22|             |
+-------+-------------+
only showing top 20 rows



### Example 3: CodegenSeparator

In [7]:
# Instantiate each CodegenSeparator variant on the JVM through the py4j
# gateway and register it as a SQL function under its own class name.
gateway = spark.sparkContext._gateway
codegen_pkg = gateway.jvm.org.apache.spark.sql.api
for udf_name in (
    "CodegenSeparator",
    "CodegenSeparator0",
    "CodegenSeparator1",
    "CodegenSeparator2",
):
    spark._jsparkSession.udf().register(udf_name, getattr(codegen_pkg, udf_name)())

# Apply one of the registered functions to the timestamp column in place.
df = (
    spark.read.parquet("/recsys2021_0608_example/train_with_categorified_features_test")
    .select("tweet_timestamp")
    .withColumn('tweet_timestamp', f.expr("CodegenSeparator2(tweet_timestamp)"))
)
print(df.dtypes)
df.show()

[('tweet_timestamp', 'bigint')]
+---------------+
|tweet_timestamp|
+---------------+
|     1613574302|
|     1612628499|
|     1613233141|
|     1613373619|
|     1614053869|
|     1613130630|
|     1613721703|
|     1612706347|
|     1613905759|
|     1613854589|
|     1613217462|
|     1612584698|
|     1613590692|
|     1613805413|
|     1612540790|
|     1613509274|
|     1613437436|
|     1614099587|
|     1612914478|
|     1612815075|
+---------------+
only showing top 20 rows



### Example 4: sortStringArrayByFrequency and sortIntegerArrayByFrequency

In [23]:
# Register the JVM-side UDF under the SQL name used in the expression below.
spark.udf.registerJavaFunction("SortStringArrayByFrequency","com.intel.recdp.SortStringArrayByFrequency")

# Collect every tweet_id per engaged_with_user_id into an array, then add a
# column holding the result of the registered UDF applied to that array.
df = (
    spark.read.parquet("/recsys2021_0608_example/train_with_categorified_features_test")
    .select("engaged_with_user_id", 'tweet_id')
    .groupby('engaged_with_user_id')
    .agg(f.collect_list('tweet_id').alias('tweet_id'))
    .withColumn('sorted_tweet_id', f.expr('SortStringArrayByFrequency(tweet_id)'))
)

df.show(vertical=True, truncate=100)

-RECORD 0--------------------------------------------------------------------------------------------------------------------
 engaged_with_user_id | 000097973EC6EAD0F129E47A652711F8                                                                     
 tweet_id             | [9A8B5B7DCF2211FB642C78DF8DF54699]                                                                   
 sorted_tweet_id      | [9A8B5B7DCF2211FB642C78DF8DF54699]                                                                   
-RECORD 1--------------------------------------------------------------------------------------------------------------------
 engaged_with_user_id | 00019AA271E5715BC857B49D97C162AB                                                                     
 tweet_id             | [29C0142E44F15DCFE3EB7E03F961FF83]                                                                   
 sorted_tweet_id      | [29C0142E44F15DCFE3EB7E03F961FF83]                                                            