In [None]:
# Spark Session
from pyspark.sql import SparkSession

# Configure the session step by step, then create it (or reuse a running one).
session_builder = SparkSession.builder.appName("User Defined Functions")
session_builder = session_builder.master("spark://spark-master:7077")
session_builder = session_builder.config("spark.executor.memory", "512M")
spark = session_builder.getOrCreate()

# Last expression of the cell: display the session object.
spark

In [None]:
# Read the employee CSV with a header row and inferred column types.
emp = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("hdfs://namenode:9000/input/data/employee_records.csv")
)

# Show how many partitions the loaded DataFrame has.
emp.rdd.getNumPartitions()

In [None]:
# Create a function to generate 10% of Salary as Bonus

def bonus(salary):
    """Return 10% of ``salary`` as a bonus.

    Args:
        salary: Salary value; anything ``int()`` accepts (int, float,
            numeric string), or ``None``.

    Returns:
        ``int(salary) * 0.1`` as a float, or ``None`` when ``salary`` is
        ``None``.
    """
    # This function is wrapped as a Spark UDF later in the notebook, and a NULL
    # column value arrives as None. int(None) would raise TypeError, so
    # propagate the null instead of failing.
    if salary is None:
        return None
    return int(salary) * 0.1

In [None]:
# Register as UDF
from pyspark.sql import functions as F

# F.udf falls back to a string return type when none is given, which would make
# the bonus column text. Declare "double" so the column is numeric; this is the
# same return type the notebook passes to spark.udf.register for this function.
bonus_udf = F.udf(bonus, "double")

emp.withColumn("bonus", bonus_udf(F.col("salary"))).show()

In [None]:
# Create new column as bonus using a UDF callable from SQL expressions
from pyspark.sql.functions import expr

# Expose `bonus` to Spark SQL under the name "bonus_sql_udf", returning double.
spark.udf.register("bonus_sql_udf", bonus, "double")

# Call the registered UDF by name inside a SQL expression string.
bonus_via_sql = expr("bonus_sql_udf(salary)")
emp.withColumn("bonus", bonus_via_sql).show()

In [None]:
# Create new column as bonus without UDF, using a plain SQL expression
bonus_expression = expr("salary * 0.1")
emp_with_bonus = emp.withColumn("bonus", bonus_expression)
emp_with_bonus.show()

In [None]:
# Switch off adaptive query execution and its partition coalescing.
for aqe_setting in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    spark.conf.set(aqe_setting, False)

In [None]:
from pyspark.sql import functions as F

# Load the skewed employee CSV with a header row and inferred column types.
skew_reader = spark.read.format("csv").option("header", True).option("inferSchema", True)
emp_skew = skew_reader.load("hdfs://namenode:9000/input/data/employee_records_skewed.csv")

# Self-join the rows with salary below 25000 on department_id.
salary_below_cap = F.col("salary") < 25000
left_side = emp_skew.filter(salary_below_cap)
right_side = emp_skew.filter(salary_below_cap)
emp_skew = left_side.join(right_side, on="department_id")

# Coalesce the joined result to 2 partitions.
emp_skew = emp_skew.coalesce(2)

In [None]:
# Register as UDF
from pyspark.sql import functions as F

# UDF that squares its input; no return type is declared, so F.udf's default applies.
dept_square = F.udf(lambda dept_id: dept_id ** 2)

# Apply the UDF to the DataFrame (skewed), which should trigger a spill.
# The "noop" sink runs the whole job without storing the output.
emp_skew_squared = emp_skew.withColumn(
    "dept_square",
    dept_square(F.col("department_id")),
)
emp_skew_squared.write.format("noop").mode("overwrite").save()

In [None]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()