In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the session used by every later cell. The builder chain sets
# 1G memory limits, points `spark.jars` at the custom UDF jar, enables Arrow for
# PySpark and selects the Kryo serializer.
spark = (
    SparkSession.builder
    .appName("Import custom jar")
    .config("spark.executor.memory", "1G")
    .config("spark.driver.memory", "1G")
    .config("spark.driver.maxResultSize", "1G")
    .config("spark.jars", "/opt/scala-project_2.12-0.1.0-SNAPSHOT.jar")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Restrict Spark's log output to errors so cell outputs stay readable.
spark.sparkContext.setLogLevel("ERROR")

# Last expression of the cell: display the session object.
spark

In [2]:
spark.version

'3.5.1'

## Create DataFrames

In [3]:
from pyspark.sql.types import Row, IntegerType
from pyspark.sql.functions import col

# Positional rows; the column names are supplied separately to createDataFrame.
row_list_data = [Row('Alice', 1), Row('Braga', 2), Row('Steve', 3)]

# Create the frame and cast `id` to IntegerType in a single chain.
df = (
    spark.createDataFrame(row_list_data, ['name', 'id'])
    .withColumn("id", col("id").cast(IntegerType()))
)

# Expose the frame to SQL queries under the name `users`.
df.createOrReplaceTempView('users')

# Show the resulting schema and contents.
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- id: integer (nullable = true)



                                                                                

+-----+---+
| name| id|
+-----+---+
|Alice|  1|
|Braga|  2|
|Steve|  3|
+-----+---+



## Access via Spark SQL

In [4]:
from pyspark.sql.types import StringType

# Register the JVM class com.myudfs.LowerCaseString (an implementation of the
# Java UDF interface) as the SQL function `lowerCaseString`, declaring a string
# return type.
spark.udf.registerJavaFunction(
    name="lowerCaseString",
    javaClassName="com.myudfs.LowerCaseString",
    returnType=StringType(),
)

# Call MathUDFs.registerUdf() on the JVM through the py4j gateway so the class
# registers its own UDF(s) with Spark.
# NOTE(review): presumably this is what makes `isGreaterThanZero` available to
# SQL; confirm against the Scala source.
spark.sparkContext._jvm.com.myudfs.MathUDFs.registerUdf()

JavaObject id=o80

In [5]:
# Run the `isGreaterThanZero` SQL function against the `users` temp view.
greater_than_zero_query = """
    SELECT isGreaterThanZero(id), name
    FROM users
    """
spark.sql(greater_than_zero_query).show()

                                                                                

+---------------------+-----+
|isGreaterThanZero(id)| name|
+---------------------+-----+
|                 true|Alice|
|                 true|Braga|
|                 true|Steve|
+---------------------+-----+



                                                                                

In [6]:
# Run the registered `lowerCaseString` SQL function against the `users` temp view.
lower_case_query = """
    SELECT id, lowerCaseString(name)
    FROM users
    """
spark.sql(lower_case_query).show()

+---+---------------------+
| id|lowerCaseString(name)|
+---+---------------------+
|  1|                alice|
|  2|                braga|
|  3|                steve|
+---+---------------------+



## Access via DataFrame API

In [7]:
from py4j.java_gateway import java_import
from pyspark.sql.column import Column, _to_java_column, _to_seq 

def greater_than_zero_udf(df_col: str):
    """Apply the JVM-side ``MathUDFs.isGreaterThanZeroUDF`` to a column.

    Args:
        df_col: Name of the DataFrame column passed to the UDF.

    Returns:
        A ``Column`` wrapping the result of applying the JVM UDF to ``df_col``.
    """
    context = spark.sparkContext
    # Obtain the UDF object from the JVM through the py4j gateway.
    jvm_udf = context._jvm.com.myudfs.MathUDFs.isGreaterThanZeroUDF()
    # Convert the Python-side column name into the JVM argument sequence.
    jvm_args = _to_seq(context, [df_col], _to_java_column)
    return Column(jvm_udf.apply(jvm_args))

def multiplier_udf(df_col: str, multiply_by: int):
    """Apply the JVM-side ``MathUDFs.multiplyBy(multiply_by)`` UDF to a column.

    Args:
        df_col: Name of the DataFrame column passed to the UDF.
        multiply_by: Value handed to ``MathUDFs.multiplyBy`` when the UDF is
            created on the JVM.

    Returns:
        A ``Column`` wrapping the result of applying the JVM UDF to ``df_col``.
    """
    context = spark.sparkContext
    # Ask the JVM for a UDF object configured with `multiply_by`.
    jvm_udf = context._jvm.com.myudfs.MathUDFs.multiplyBy(multiply_by)
    # Convert the Python-side column name into the JVM argument sequence.
    jvm_args = _to_seq(context, [df_col], _to_java_column)
    return Column(jvm_udf.apply(jvm_args))

def lower_case_udf(df_col: str):
    """Apply the registered ``lowerCaseString`` function to a column.

    ``com.myudfs.LowerCaseString`` is registered with Spark as a Java UDF class
    under the SQL name ``lowerCaseString`` (see the ``registerJavaFunction``
    call in the Spark SQL section). Such a class has no static ``apply`` that
    could be invoked on the py4j class handle, so the function is invoked by its
    registered name instead.

    Args:
        df_col: Name of the DataFrame column passed to the UDF.

    Returns:
        A ``Column`` holding the result of ``lowerCaseString(df_col)``.

    Note:
        The ``lowerCaseString`` registration must have run in the current
        session before the returned column is evaluated.
    """
    # Function-scope import keeps this cell self-contained; `call_udf` is
    # available in PySpark 3.4 and later.
    from pyspark.sql.functions import call_udf

    return call_udf("lowerCaseString", df_col)

In [8]:
# Append one column per JVM-backed UDF helper and print the result.
(
    df
    .withColumn("greater_than_zero", greater_than_zero_udf("id"))
    .withColumn("multiplied", multiplier_udf("id", 10))
    .show()
)

+-----+---+-----------------+----------+
| name| id|greater_than_zero|multiplied|
+-----+---+-----------------+----------+
|Alice|  1|             true|        10|
|Braga|  2|             true|        20|
|Steve|  3|             true|        30|
+-----+---+-----------------+----------+

