# Initialize a Spark session in single-task mode

In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the session. The "local[1]" master runs Spark in-process
# with a single worker thread, so the benchmarks below execute one task at a
# time and their timings are not affected by parallelism.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SingleTaskApp")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

# Generate 1 million random strings with random lengths between 1 and 20,000

In [None]:
# Build the benchmark input: NUM_STRINGS rows, each holding a random string
# whose length is an integer drawn uniformly from 1..MAX_STR_LEN (inclusive).
# Fixed seeds keep the generated data reproducible for a given Spark
# configuration, so timings of the hashing approaches stay comparable
# across reruns.
NUM_STRINGS = 1_000_000
MAX_STR_LEN = 20_000
DATA_SEED = 42

spark.sql(f"""
    WITH t1 AS (
      -- floor(rand * MAX) + 1 is an integer in [1, MAX]; the explicit INT
      -- cast avoids handing a DOUBLE to substr as its length argument.
      SELECT CAST(floor(rand({DATA_SEED}) * {MAX_STR_LEN}) AS INT) + 1 AS str_len
      FROM range({NUM_STRINGS})
    )
    SELECT
      -- The base64 text of a MAX-character string is longer than MAX, so
      -- taking the first str_len characters always yields exactly str_len.
      substr(
        base64(
          randstr({MAX_STR_LEN}, {DATA_SEED + 1})
        ), 1, str_len
      ) as str
    FROM t1
""").write.mode("overwrite").parquet("randstring.parquet")


# First approach: Using Spark SQL built-in functions

In [None]:
# Baseline: hash every string with Spark SQL's built-in sha2 (256-bit
# variant), reading the generated dataset directly from its parquet directory.
native_hashes = spark.sql("""
    SELECT str, sha2(str, 256) as hash 
    FROM parquet.`randstring.parquet`
""")

# Persist the (str, hash) pairs; writing forces the whole computation to run.
native_hashes.write.mode("overwrite").parquet("randsha.parquet")



completes in 70.2s

# Second approach: Using Python's hashlib as a UDF

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import hashlib

@udf(returnType=StringType())
def sha256_hash(text):
    """Return the SHA-256 digest of ``text`` as a lowercase hex string.

    Args:
        text: The string to hash, or ``None`` for a NULL column value.

    Returns:
        The 64-character hexadecimal digest of the UTF-8 encoded text, or
        ``None`` when ``text`` is ``None``.
    """
    # NULL rows arrive as None; propagate them instead of failing on
    # ``None.encode`` so a single NULL does not abort the whole job.
    if text is None:
        return None
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

# Same pipeline as the built-in run, except that the hash column is computed
# by the Python UDF.
random_strings = spark.read.parquet("randstring.parquet")
udf_hashes = random_strings.withColumn("hash", sha256_hash("str"))

# Persist the result; writing forces the whole computation to run.
udf_hashes.write.mode("overwrite").parquet("randsha2.parquet")

completes in 77.2s

# Conclusion

In this benchmark, a PySpark Python UDF adds only limited overhead: hashing with Python's `hashlib` SHA-256 as a UDF completed in 77.2 s, compared with 70.2 s for Spark's native `sha2` function (roughly 10% slower).