In [None]:
# Spark Session
from pyspark.sql import SparkSession

# Configure the session step by step: application name, the master at
# spark://spark-master:7077, and 512M of memory per executor.
session_builder = SparkSession.builder.appName("User Defined Functions")
session_builder = session_builder.master("spark://spark-master:7077")
session_builder = session_builder.config("spark.executor.memory", "512M")

# Reuse an already-running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

# Display the session summary as the cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/11 16:24:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Load the employee records CSV from HDFS; the first row is treated as the
# header and column types are inferred from the data.
employee_csv = "hdfs://namenode:9000/input/data/employee_records.csv"
emp = spark.read.options(header=True, inferSchema=True).format("csv").load(employee_csv)

# Show how many partitions back the loaded DataFrame.
emp.rdd.getNumPartitions()

                                                                                

8

In [None]:
# Create a function to generate 10% of Salary as Bonus


def bonus(salary):
    """Return 10% of ``salary`` as a bonus.

    Args:
        salary: Salary value; anything ``int()`` accepts, or ``None``.

    Returns:
        ``int(salary) * 0.1`` as a float, or ``None`` when ``salary`` is
        ``None``.
    """
    # Spark hands SQL NULLs to Python UDFs as None; int(None) would raise a
    # TypeError and fail the whole job, so propagate the null instead.
    if salary is None:
        return None
    return int(salary) * 0.1

In [5]:
# Register as UDF
from pyspark.sql import functions as F

# Declare the return type explicitly: without it F.udf defaults to StringType,
# which would make "bonus" a string column rather than a numeric one.
bonus_udf = F.udf(bonus, "double")

# Add the bonus column by applying the Python UDF to each salary value.
emp.withColumn("bonus", bonus_udf(F.col("salary"))).show()

[Stage 2:>                                                          (0 + 1) / 1]

+----------+----------+--------------------+----------+--------------------+--------------------+------+-------------+------------------+
|first_name| last_name|           job_title|       dob|               email|               phone|salary|department_id|             bonus|
+----------+----------+--------------------+----------+--------------------+--------------------+------+-------------+------------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653|            8|           51265.3|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836|            7|           99983.6|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900|           10|           13190.0|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506|            1|48550.600000000006|
|  Michelle|   Elliott|      Air c

                                                                                

In [None]:
# Create new column as bonus using UDF
from pyspark.sql.functions import expr

# Expose the Python function to Spark SQL under the name "bonus_sql_udf",
# returning a double.
spark.udf.register(name="bonus_sql_udf", f=bonus, returnType="double")

# Call the registered UDF by name from inside a SQL expression string.
emp.withColumn(colName="bonus", col=expr("bonus_sql_udf(salary)")).show()

[Stage 3:>                                                          (0 + 1) / 1]

+----------+----------+--------------------+----------+--------------------+--------------------+------+-------------+------------------+
|first_name| last_name|           job_title|       dob|               email|               phone|salary|department_id|             bonus|
+----------+----------+--------------------+----------+--------------------+--------------------+------+-------------+------------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653|            8|           51265.3|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836|            7|           99983.6|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900|           10|           13190.0|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506|            1|48550.600000000006|
|  Michelle|   Elliott|      Air c

                                                                                

In [7]:
# Same bonus column, but computed with a plain Spark SQL expression, so no
# Python UDF is involved.

emp.withColumn(colName="bonus", col=expr("salary * 0.1")).show()

+----------+----------+--------------------+----------+--------------------+--------------------+------+-------------+-------+
|first_name| last_name|           job_title|       dob|               email|               phone|salary|department_id|  bonus|
+----------+----------+--------------------+----------+--------------------+--------------------+------+-------------+-------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653|            8|51265.3|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836|            7|99983.6|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900|           10|13190.0|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506|            1|48550.6|
|  Michelle|   Elliott|      Air cabin crew|1975-03-31|tiffanyjohnston@e...|       (705)900-5337|604738|       

In [8]:
# Switch off adaptive query execution and its partition coalescing for the
# cells that follow.
spark.conf.set(key="spark.sql.adaptive.enabled", value=False)
spark.conf.set(key="spark.sql.adaptive.coalescePartitions.enabled", value=False)

In [None]:
from pyspark.sql import functions as F

# Raw skewed employee records, read from HDFS with header and inferred types.
skew_source = (
    spark.read.options(header=True, inferSchema=True)
    .format("csv")
    .load("hdfs://namenode:9000/input/data/employee_records_skewed.csv")
)

# Keep only rows with salary below 25000 on both sides of the join.
low_salary = F.col("salary") < 25000

# Self-join the low-salary rows on department_id, then collapse the result
# into at most two partitions.
emp_skew = (
    skew_source.filter(low_salary)
    .join(skew_source.filter(low_salary), on="department_id")
    .coalesce(2)
)

                                                                                

In [None]:
# Register as UDF
from pyspark.sql import functions as F

# Python UDF that squares its input. No return type is given, so the UDF uses
# F.udf's default.
dept_square = F.udf(lambda dept_id: dept_id**2)

# Apply the UDF to the skewed DataFrame, which should trigger a spill.
skew_with_square = emp_skew.withColumn(
    "dept_square", dept_square(F.col("department_id"))
)

# Writing to the "noop" format runs the job without persisting any output.
skew_with_square.write.format("noop").mode("overwrite").save()

                                                                                

In [11]:
# Stop the Spark session now that the notebook has finished.
spark.stop()