In [1]:
#!/env/bin/python

import init

import findspark
findspark.init()

import os
import pandas as pd
import numpy as np
from pyspark.sql import *
from pyspark import *
import pyspark.sql.functions as f
from timeit import default_timer as timer
import logging
from pyrecdp.data_processor import *
from pyrecdp.utils import *
from pyspark.sql.functions import pandas_udf, PandasUDFType

# ---- Storage locations ------------------------------------------------------
# URI scheme prefix, this notebook's working folder and the input data folder.
path_prefix = "hdfs://"
current_path = "/recsys2021_0608_scalaudf_test/"
original_folder = "/recsys2021_0608/"

# Jar holding the Scala UDF extensions; it is placed on both the driver and the
# executor classpath below so that the JVM side can load the UDF classes.
scala_udf_jars = "/mnt/nvme2/chendi/BlueWhale/recdp/ScalaProcessUtils/target/recdp-scala-extensions-0.1.0-jar-with-dependencies.jar"

##### 1. Start the Spark session #####
t0 = timer()  # wall-clock reference taken right before the session is requested

# Session options, applied in exactly this order.
spark_options = [
    ("spark.sql.broadcastTimeout", "7200"),
    ("spark.cleaner.periodicGC.interval", "10min"),
    ("spark.driver.extraClassPath", f"{scala_udf_jars}"),
    ("spark.executor.extraClassPath", f"{scala_udf_jars}"),
    ("spark.executor.memory", "20g"),
    ("spark.executor.memoryOverhead", "6g"),
]

session_builder = SparkSession.builder.master('yarn').appName("scala_udf_column")
for option_key, option_value in spark_options:
    session_builder = session_builder.config(option_key, option_value)

# Reuses an already running session if there is one, otherwise starts a new one.
spark = session_builder.getOrCreate()



/mnt/nvme2/chendi/BlueWhale/recdp


### Example 1: CategorifyForArray

In [5]:
# Load the hashtag dictionary and hand it to the JVM-side broadcast helper.
dict_df = spark.read.parquet("/recsys2020_0608_categorify_example/recsys_dicts/hashtags")
gateway = spark.sparkContext._gateway
scala_api = gateway.jvm.org.apache.spark.sql.api  # py4j view of the Scala UDF package
categorify_broadcast_handler = scala_api.CategorifyBroadcast.broadcast(
    spark.sparkContext._jsc,
    dict_df._jdf,
)
print(categorify_broadcast_handler)

# Make the Scala UDF callable from Spark SQL under the name "CategorifyForArray".
spark._jsparkSession.udf().register(
    "CategorifyForArray",
    scala_api.CategorifyForArray(categorify_broadcast_handler),
)

# Keep the non-null hashtag strings and split them on tabs into an array column.
train_df = spark.read.parquet("/recsys2021_0608_example/train_with_categorified_features_test")
df = (
    train_df
    .select("hashtags")
    .filter("hashtags is not null")
    .withColumn("hashtags_splitted", f.split(f.col("hashtags"), '\t'))
)
print(df.dtypes)

# Apply the registered UDF to the array column and inspect schema and sample rows.
df = df.withColumn('hashtags_idx', f.expr("CategorifyForArray(hashtags_splitted)"))
print(df.dtypes)
df.show()

Broadcast(27)
[('hashtags', 'string'), ('hashtags_splitted', 'array<string>')]
[('hashtags', 'string'), ('hashtags_splitted', 'array<string>'), ('hashtags_idx', 'array<int>')]
+--------------------+--------------------+--------------------+
|            hashtags|   hashtags_splitted|        hashtags_idx|
+--------------------+--------------------+--------------------+
|349472F2F74A475A0...|[349472F2F74A475A...|[20848, 38675, 21...|
|2FCFA4566CBF1C1DB...|[2FCFA4566CBF1C1D...|[1206604, 1516246...|
|CCA7C7232D8BC8177...|[CCA7C7232D8BC817...|               [752]|
|CBC290422A3CCA26B...|[CBC290422A3CCA26...|            [330671]|
|563F733233D158ED5...|[563F733233D158ED...|[71899, 229729, 6...|
|EEF5D60A3A8DBF74B...|[EEF5D60A3A8DBF74...|             [98037]|
|94E858B77C242683E...|[94E858B77C242683...|               [905]|
|A338C5C1637BEDA84...|[A338C5C1637BEDA8...|                 [0]|
|791E94CB06C33F419...|[791E94CB06C33F41...|               [477]|
|0360E2682218E7DFA...|[0360E2682218E7DF...| 

### Example 2: Categorify

In [4]:
# Load the mention dictionary and hand it to the JVM-side broadcast helper.
dict_df = spark.read.parquet("/recsys2021_0608_example/recsys_dicts/mention")
gateway = spark.sparkContext._gateway
scala_api = gateway.jvm.org.apache.spark.sql.api  # py4j view of the Scala UDF package
categorify_broadcast_handler = scala_api.CategorifyBroadcast.broadcast(
    spark.sparkContext._jsc,
    dict_df._jdf,
)
print(categorify_broadcast_handler)

# Make the Scala UDF callable from Spark SQL under the name "Categorify".
spark._jsparkSession.udf().register(
    "Categorify",
    scala_api.Categorify(categorify_broadcast_handler),
)

# Keep only the rows whose mention is not the empty string.
train_df = spark.read.parquet("/recsys2021_0608_example/train_with_categorified_features_test")
mention_is_set = f.col('mention') != ""
df = train_df.select("mention").filter(mention_is_set)
print(df.dtypes)

# Apply the registered UDF to the mention column and inspect schema and sample rows.
df = df.withColumn('mention_idx', f.expr("Categorify(mention)"))
print(df.dtypes)
df.show()

Broadcast(18)
[('mention', 'string')]
[('mention', 'string'), ('mention_idx', 'int')]
+-------------------------+-----------+
|                  mention|mention_idx|
+-------------------------+-----------+
|            pedrojubierna|          7|
|                   maddow|         22|
|              smritiirani|         22|
|                       mb|         22|
|                     jcxx|          1|
|              TomScibelli|         22|
|            rottenheights|         14|
|               Shemalesbr|         22|
|                     capt|         22|
|               villainnap|         22|
|             ritsuitotaro|         19|
|              jinjinpocha|         22|
|              frajervenca|         21|
|                mrjamesob|         22|
|               JaimeRdzNL|         22|
|CaramelCorn1971をフォロー|         22|
|                  Soledad|         22|
|          wiosnabiedronia|         22|
|                ObviousIy|          2|
|          ORDems'Portland|         13|

### Example 3: CodegenSeparator

In [None]:
gateway = spark.sparkContext._gateway
scala_api = gateway.jvm.org.apache.spark.sql.api  # py4j view of the Scala UDF package

# Register every CodegenSeparator variant under its own class name, in this order.
separator_udf_names = (
    "CodegenSeparator",
    "CodegenSeparator0",
    "CodegenSeparator1",
    "CodegenSeparator2",
)
for udf_name in separator_udf_names:
    udf_instance = getattr(scala_api, udf_name)()
    spark._jsparkSession.udf().register(udf_name, udf_instance)

# Read the input data and keep only the tweet timestamp column.
source_df = spark.read.parquet(path_prefix + original_folder)
df = source_df.select("tweet_timestamp")

# Only CodegenSeparator2 is applied; its result overwrites the column in place.
df = df.withColumn('tweet_timestamp', f.expr("CodegenSeparator2(tweet_timestamp)"))
print(df.dtypes)
df.show()

### Example 4: sortStringArrayByFrequency and sortIntegerArrayByFrequency

In [None]:
# 1.1 create RecDP DataProcessor
# The SQL function "sortStringArrayByFrequency" used below is not registered by
# any statement in this notebook, so on a fresh kernel the f.expr() call cannot
# resolve it. The original cell announced this step in its comments but never
# performed it.
# NOTE(review): presumably constructing a DataProcessor registers the RecDP Scala
# UDFs (sortStringArrayByFrequency among them) on this session - confirm against
# the installed pyrecdp version, and confirm the constructor arguments.
proc = DataProcessor(spark, path_prefix, current_path=current_path)

# 1.2 prepare dataFrames
# Collect all reply_timestamp values of each tweet_id into one array column.
df = spark.read.parquet(path_prefix + original_folder)
df = (
    df.select("reply_timestamp", 'tweet_id')
    .groupby('tweet_id')
    .agg(f.collect_list('reply_timestamp').alias('reply_timestamp'))
)

# 1.3 apply the UDF to the collected array and show a sample of the result
df = df.withColumn('sorted_reply_timestamp', f.expr('sortStringArrayByFrequency(reply_timestamp)'))

df.show()