# Hello Higher-Order Functions

In [3]:
# Prerequisites
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [4]:
# Obtain a SparkSession running against a local master (getOrCreate reuses an
# already-active session if there is one), then report the Spark version.
spark = (
    SparkSession.builder
    .master("local")
    .appName("hello_higher_order_functions")
    .getOrCreate()
)
print("Spark Version: ", spark.version)

Spark Version:  3.5.0


### Create DataFrame

In [7]:
# Build a one-column DataFrame whose single column, `celsius`, holds an array
# of integer temperature readings per row.
schema = StructType(
    [
        StructField("celsius", ArrayType(IntegerType())),
    ]
)

# Two rows; each row is a one-element list because the schema has one column,
# and that element is the array of readings.
t_list = (
    [[35, 36, 32, 30, 40, 42, 38]],
    [[31, 32, 34, 55, 60]],
)

df_t = spark.createDataFrame(t_list, schema)
# truncate=False prints the full arrays instead of shortening them with "...".
df_t.show(truncate=False)



+----------------------------+
|celsius                     |
+----------------------------+
|[35, 36, 32, 30, 40, 42, 38]|
|[31, 32, 34, 55, 60]        |
+----------------------------+



In [10]:
# Print the schema tree to confirm `celsius` is an array of integers.
df_t.printSchema()

root
 |-- celsius: array (nullable = true)
 |    |-- element: integer (containsNull = true)



### Create Temporary View

In [11]:
# Register df_t under the view name `tC` so the SQL queries in the following
# cells can refer to it in their FROM clauses.
df_t.createOrReplaceTempView("tC")

### transform()

In [13]:
# Convert every Celsius reading to Fahrenheit with transform(), which applies
# the lambda to each array element and returns a new array.
# `div` is integer division, so fractional results are truncated
# (e.g. 32 C yields 89 rather than 89.6).
spark.sql(
    """SELECT celsius, 
          transform(celsius, t -> ((t * 9) div 5) + 32) as fahrenheit 
          FROM tC"""
).show(truncate=False)

+----------------------------+-------------------------------+
|celsius                     |fahrenheit                     |
+----------------------------+-------------------------------+
|[35, 36, 32, 30, 40, 42, 38]|[95, 96, 89, 86, 104, 107, 100]|
|[31, 32, 34, 55, 60]        |[87, 89, 93, 131, 140]         |
+----------------------------+-------------------------------+



### filter()

In [14]:
# Keep only the readings strictly above 38 C; filter() returns the array
# elements for which the lambda predicate holds.
spark.sql(
    """SELECT celsius, 
          filter(celsius, t -> t > 38) as high
          FROM tC"""
).show()

+--------------------+--------+
|             celsius|    high|
+--------------------+--------+
|[35, 36, 32, 30, ...|[40, 42]|
|[31, 32, 34, 55, 60]|[55, 60]|
+--------------------+--------+



### exists()

In [18]:
# Flag each row with whether its array contains a reading of exactly 38 C;
# exists() yields a boolean per row.
spark.sql(
    """SELECT celsius,
          exists(celsius, t -> t = 38) as threshold
          from tC"""
).show()

+--------------------+---------+
|             celsius|threshold|
+--------------------+---------+
|[35, 36, 32, 30, ...|     true|
|[31, 32, 34, 55, 60]|    false|
+--------------------+---------+



### reduce()

In [19]:
# Average each row's readings and convert the mean to Fahrenheit with reduce():
#   - start value 0,
#   - merge lambda (acc, t) -> acc + t sums the elements; Spark passes the
#     running accumulator as the FIRST argument and the array element second,
#   - finish lambda turns the total into a mean and converts it.
# NOTE(review): `div` is integer division and the finish lambda evaluates left
# to right, so the mean is truncated (acc div size) before the * 9 div 5
# scaling. The result is therefore a truncated approximation (96 for the first
# row, whose exact mean is about 97.06 F). Multiplying before dividing, or
# using `/`, would be more accurate but would change the displayed values.
spark.sql("""SELECT celsius,
          reduce(celsius, 0, (acc, t) -> acc + t,
          acc -> (acc div size(celsius) * 9 div 5) + 32) as avgFahrenheit 
          FROM tC""").show()

+--------------------+-------------+
|             celsius|avgFahrenheit|
+--------------------+-------------+
|[35, 36, 32, 30, ...|           96|
|[31, 32, 34, 55, 60]|          107|
+--------------------+-------------+

