# User Defined Functions (UDFs)

In [1]:
# Prerequisites
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType

In [2]:
# Get SparkSession
spark = SparkSession.builder.master("local") \
    .appName("hello_UDFs") \
    .getOrCreate() 
print("Spark Version: ", spark.version)

Spark Version:  3.5.0


###  Define UDF

In [3]:
# Cubed function as a UDF
def cubed(s):
    return s*s*s

In [5]:
# Register UDF
spark.udf.register("cubed", cubed, LongType())

<function __main__.cubed(s)>

In [6]:
# Generate Temporary View
spark.range(1,10).createOrReplaceTempView("udf_test")

In [8]:
spark.sql("SELECT * from udf_test").show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [12]:
# Query with UDF
spark.sql("SELECT id, cubed(id) as id_cubed FROM udf_test").show(n=11)

+---+--------+
| id|id_cubed|
+---+--------+
|  1|       1|
|  2|       8|
|  3|      27|
|  4|      64|
|  5|     125|
|  6|     216|
|  7|     343|
|  8|     512|
|  9|     729|
+---+--------+



## Pandas UDFs

In [13]:
# Prerequisites
import pandas as pd

from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType

In [14]:
# Python New cubed function
def py_cubed(a: pd.Series) -> pd.Series:
    return a*a*a

In [15]:
# Create Pandas UDF
df_cubed = pandas_udf(cubed, returnType=LongType())

In [17]:
# Create a Pandas Series and check cubed() function
x = pd.Series([1, 2, 3, 4])
print(cubed(x))

0     1
1     8
2    27
3    64
dtype: int64


In [18]:
# Use cubed wuth Spark DataFrame
df = spark.range(1,5)
df.show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
+---+



In [20]:
# Execute function as a Spark vectorized UDF
df.select("id", cubed(col("id"))).show()

+---+----------------+
| id|((id * id) * id)|
+---+----------------+
|  1|               1|
|  2|               8|
|  3|              27|
|  4|              64|
+---+----------------+

