# Spark AQE Coalesce Explained

In [None]:
# Build (or reuse) the SparkSession that every later cell refers to as `spark`.

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Spark AQE Explained")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# Bare expression: lets the notebook render the session as the cell output.
spark

In [None]:
# Show the session's current AQE and shuffle-partition settings, one per line.
for conf_key in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
    "spark.sql.shuffle.partitions",
    "spark.sql.adaptive.advisoryPartitionSizeInBytes",  # approx 64MB by default
):
    print(spark.conf.get(conf_key))

In [None]:
# Turn AQE (and its partition coalescing) off and set the shuffle partition count,
# so the next aggregation shows the behaviour without adaptive execution.
aqe_off_settings = {
    "spark.sql.adaptive.enabled": False,
    "spark.sql.adaptive.coalescePartitions.enabled": False,
    "spark.sql.shuffle.partitions": 289,
}
for conf_key, conf_value in aqe_off_settings.items():
    spark.conf.set(conf_key, conf_value)

In [None]:
# Read the example data set (CSV with a header row; no schema inference is requested,
# which is why `amount` is cast to double explicitly below).
df = spark.read.format("csv").option("header", True).load("dataset/sales.csv")
df.printSchema()
print(f"Initial Partition after read: {df.rdd.getNumPartitions()}")

# GroupBy operation to trigger a shuffle.
# The functions module is imported under an alias instead of
# `from pyspark.sql.functions import sum`, which would replace Python's
# builtin sum() for the rest of the notebook session.
from pyspark.sql import functions as F

df_count = (
    df.selectExpr("city_id", "cast(amount as double) as amount_double")
    .groupBy("city_id")
    .agg(F.sum("amount_double"))
)
# With AQE disabled, the shuffle output should keep the configured
# spark.sql.shuffle.partitions count rather than being coalesced.
print(f"Output shuffle partitions: {df_count.rdd.getNumPartitions()}")

In [None]:
# Turn AQE and its partition coalescing on. The shuffle partition count is set to
# 289, the same value used for the AQE-disabled run, so the two runs are comparable.
aqe_on_settings = {
    "spark.sql.adaptive.enabled": True,
    "spark.sql.adaptive.coalescePartitions.enabled": True,
    "spark.sql.shuffle.partitions": 289,
}
for conf_key, conf_value in aqe_on_settings.items():
    spark.conf.set(conf_key, conf_value)

In [None]:
# Load the example sales CSV again (header row used for column names) and report
# how many partitions the read produced.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("dataset/sales.csv")
)
df.printSchema()
print(f"Initial Partition after read: {df.rdd.getNumPartitions()}")

In [None]:
# GroupBy operation to trigger a shuffle.
# The aggregate per city_id is expected to be smaller than the ~64MB advisory
# partition size, so AQE should coalesce the shuffle output into a single partition.
#
# The functions module is imported under an alias instead of
# `from pyspark.sql.functions import sum`, which would replace Python's
# builtin sum() for the rest of the notebook session.
from pyspark.sql import functions as F

df_count = (
    df.selectExpr("city_id", "cast(amount as double) as amount_double")
    .groupBy("city_id")
    .agg(F.sum("amount_double"))
)
print(f"Output shuffle partitions: {df_count.rdd.getNumPartitions()}")

In [None]:
# GroupBy operation to trigger a shuffle, this time on trx_id, which has far more
# distinct values than city_id and therefore produces more shuffle data.
# The aggregate per trx_id is expected to exceed the ~64MB advisory partition
# size, so AQE should leave the output spread across multiple partitions.
#
# The functions module is imported under an alias instead of
# `from pyspark.sql.functions import sum`, which would replace Python's
# builtin sum() for the rest of the notebook session.
from pyspark.sql import functions as F

df_count = (
    df.selectExpr("trx_id", "cast(amount as double) as amount_double")
    .groupBy("trx_id")
    .agg(F.sum("amount_double"))
)
print(f"Output shuffle partitions: {df_count.rdd.getNumPartitions()}")