In [None]:
# Build (or reuse) the Spark session for this notebook
from pyspark.sql import SparkSession

# Resources requested from the cluster: total core cap, cores per executor,
# and memory per executor.
_session_conf = {
    "spark.cores.max": 16,
    "spark.executor.cores": 4,
    "spark.executor.memory": "512M",
}

_builder = SparkSession.builder.appName("Optimizing Shuffles").master(
    "spark://spark-master:7077"
)
for _conf_key, _conf_value in _session_conf.items():
    _builder = _builder.config(_conf_key, _conf_value)

spark = _builder.getOrCreate()

# Last expression of the cell: renders the session summary
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/31 17:04:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Check Spark defaultParallelism
# NOTE(review): the recorded output below is 8, although spark.cores.max was set to 16
# when the session was built — presumably the cluster granted fewer cores; confirm.

spark.sparkContext.defaultParallelism

8

In [None]:
# Switch off Adaptive Query Execution and its partition coalescing so that
# spark.sql.shuffle.partitions is the only thing deciding the shuffle partition count.
for _aqe_flag in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    spark.conf.set(_aqe_flag, False)

In [None]:
# Read the employee CSV from HDFS using an explicit schema; the first line of
# the file is treated as a header.
# NOTE(review): the recorded per-partition counts further down sum to
# 1,000,000 rows (1M, not 10M) — confirm against the actual file.

_schema = (
    "first_name string, last_name string, job_title string, dob string, "
    "email string, phone string, salary double, department_id int"
)

_emp_path = "hdfs://namenode:9000/input/data/employee_records.csv"

emp = (
    spark.read
    .format("csv")
    .schema(_schema)
    .option("header", True)
    .load(_emp_path)
)

In [None]:
from pyspark.sql.functions import spark_partition_id

# Row count per input partition of `emp`: tag every row with the id of the
# partition it lives in, then count rows per id.
(
    emp
    .withColumn("partition_id", spark_partition_id())
    .groupBy("partition_id")
    .count()
    .show()
)



+------------+------+
|partition_id| count|
+------------+------+
|           1|130406|
|           3|130393|
|           5|130403|
|           7| 87281|
|           0|130421|
|           2|130400|
|           4|130384|
|           6|130312|
+------------+------+



                                                                                

In [14]:
# Average salary per department (lazy: nothing executes until an action runs)
from pyspark.sql.functions import avg

emp_avg = (
    emp.groupBy("department_id")
    .agg(avg("salary").alias("avg_sal"))
)

In [8]:
# Check Spark Shuffle Partition setting
# (returned as a string; the recorded value below is '200')

spark.conf.get("spark.sql.shuffle.partitions")

'200'

In [15]:
# Benchmark run: execute the aggregation against the "noop" sink, which runs
# the full job without persisting any output. The job description labels the
# run so it can be found and compared with the other runs.
spark.sparkContext.setJobDescription("shuffle partitions 200")
try:
    emp_avg.write.format("noop").mode("overwrite").save()
finally:
    # Clear the label even if the write fails, so that jobs triggered by
    # later cells are not reported under this benchmark's description.
    spark.sparkContext.setJobDescription(None)

                                                                                

In [None]:
# Number of aggregated rows (departments) held by each partition of `emp_avg`;
# only non-empty partitions produce a row.
# NOTE(review): the recorded output shows a single partition, which is not what
# 200 shuffle partitions with AQE off would be expected to give — presumably the
# cell was last run under different settings; re-run to confirm.
(
    emp_avg
    .withColumn("partition_id", spark_partition_id())
    .groupBy("partition_id")
    .count()
    .show()
)



+------------+-----+
|partition_id|count|
+------------+-----+
|           0|   10|
+------------+-----+



                                                                                

SHUFFLE PARTITIONS AS A MULTIPLE OF THE CORE COUNT

In [18]:
# Use 16 shuffle partitions — the same number as the spark.cores.max value
# requested when the session was built.
spark.conf.set("spark.sql.shuffle.partitions", 16)

In [19]:
# Benchmark run with 16 shuffle partitions: re-define the aggregation and
# execute it against the "noop" sink (full job, no output persisted).
spark.sparkContext.setJobDescription("shuffle partitions 16")
try:
    emp_avg = emp.groupBy("department_id").agg(avg("salary").alias("avg_sal"))
    emp_avg.write.format("noop").mode("overwrite").save()
finally:
    # Clear the label even if the run fails, so that jobs triggered by later
    # cells are not reported under this benchmark's description.
    spark.sparkContext.setJobDescription(None)

                                                                                

In [None]:
# Number of aggregated rows (departments) held by each partition of `emp_avg`;
# only non-empty partitions produce a row.
(
    emp_avg
    .withColumn("partition_id", spark_partition_id())
    .groupBy("partition_id")
    .count()
    .show()
)



+------------+-----+
|partition_id|count|
+------------+-----+
|           3|    2|
|           2|    2|
|          14|    1|
|           6|    1|
|           9|    2|
|          11|    1|
|          15|    1|
+------------+-----+



                                                                                

SHUFFLE PARTITIONS = 1 (REPLICATING WHAT AQE DOES)

In [21]:
# Force a single shuffle partition. The recorded AQE run at the end of the
# notebook also ends up with one partition holding all 10 groups, so this
# setting reproduces that layout by hand.
spark.conf.set("spark.sql.shuffle.partitions", 1)

In [22]:
# Benchmark run with a single shuffle partition: re-define the aggregation and
# execute it against the "noop" sink (full job, no output persisted).
spark.sparkContext.setJobDescription("shuffle partitions 1")
try:
    emp_avg = emp.groupBy("department_id").agg(avg("salary").alias("avg_sal"))
    emp_avg.write.format("noop").mode("overwrite").save()
finally:
    # Clear the label even if the run fails, so that jobs triggered by later
    # cells are not reported under this benchmark's description.
    spark.sparkContext.setJobDescription(None)

                                                                                

In [None]:
# Number of aggregated rows (departments) held by each partition of `emp_avg`;
# only non-empty partitions produce a row.
(
    emp_avg
    .withColumn("partition_id", spark_partition_id())
    .groupBy("partition_id")
    .count()
    .show()
)



+------------+-----+
|partition_id|count|
+------------+-----+
|           0|   10|
+------------+-----+



                                                                                

REACTIVATE AQE AND COMPARE

In [32]:
# Switch Adaptive Query Execution and its partition coalescing back on
for _aqe_flag in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    spark.conf.set(_aqe_flag, True)

In [33]:
# Put the shuffle partition count back to 200 — the value spark.conf.get
# reported earlier in this notebook — so the AQE run starts from the same
# setting as the first benchmark.
spark.conf.set("spark.sql.shuffle.partitions", 200)

In [34]:
# Benchmark run with AQE enabled: re-define the aggregation and execute it
# against the "noop" sink (full job, no output persisted).
spark.sparkContext.setJobDescription("shuffle partitions AQE")
try:
    emp_avg = emp.groupBy("department_id").agg(avg("salary").alias("avg_sal"))
    emp_avg.write.format("noop").mode("overwrite").save()
finally:
    # Clear the label even if the run fails, so that jobs triggered by later
    # cells are not reported under this benchmark's description.
    spark.sparkContext.setJobDescription(None)

In [None]:
# Number of aggregated rows (departments) held by each partition of `emp_avg`;
# only non-empty partitions produce a row.
(
    emp_avg
    .withColumn("partition_id", spark_partition_id())
    .groupBy("partition_id")
    .count()
    .show()
)

+------------+-----+
|partition_id|count|
+------------+-----+
|           0|   10|
+------------+-----+



In [24]:
# Shut down the Spark session created at the top of the notebook
spark.stop()