# Spark AQE Coalesce Explained

In [None]:
# Start (or attach to) the Spark session used throughout this notebook

from pyspark.sql import SparkSession

# Point the builder at the standalone cluster master and name the application
spark = (
    SparkSession.builder.master("spark://spark-master:7077")
    .appName("Spark AQE Explained")
    .getOrCreate()
)

# Show the session object as the cell output
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/04 16:22:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Print the session's current AQE and shuffle-partition settings, one value per line
for conf_key in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
    "spark.sql.shuffle.partitions",
    "spark.sql.adaptive.advisoryPartitionSizeInBytes",  # 64MB by default
):
    print(spark.conf.get(conf_key))

true
true
200
67108864b


In [6]:
# Baseline configuration: fix the shuffle partition count at 5 and switch off
# both AQE and its partition coalescing
spark.conf.set("spark.sql.shuffle.partitions", 5)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
spark.conf.set("spark.sql.adaptive.enabled", False)

In [4]:
# Load the example sales data set into a Spark DataFrame
import pandas as pd

# pandas downloads and parses the CSV; Spark then builds a DataFrame from it
# using the explicit DDL schema below
data_file_https_url = "https://media.githubusercontent.com/media/subhamkharwal/pyspark-zero-to-hero/refs/heads/master/datasets/sales.csv"
schema = "transacted_at string, trx_id long, retailer_id long, description string, amount float, city_id float"
df = spark.createDataFrame(pd.read_csv(data_file_https_url), schema)
df.printSchema()
print(f"Initial Partition after read: {df.rdd.getNumPartitions()}")

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: long (nullable = true)
 |-- retailer_id: long (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- city_id: float (nullable = true)

Initial Partition after read: 8


In [None]:
# GroupBy operation to trigger a shuffle.
# The functions module is imported under the alias F rather than with
# `from pyspark.sql.functions import sum`, so this cell does not shadow
# Python's builtin sum().
from pyspark.sql import functions as F

df_count = (
    df.selectExpr("city_id", "cast(amount as double) as amount_double")
    .groupBy("city_id")
    .agg(F.sum("amount_double"))
)
# Report how many partitions the aggregated (post-shuffle) result has
print("Output shuffle partitions: " + str(df_count.rdd.getNumPartitions()))

Output shuffle partitions: 5


In [8]:
# Action: run the grouped aggregation and return its number of result rows
df_count.count()

24/11/04 16:26:44 WARN TaskSetManager: Stage 0 contains a task of very large size (8830 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

246

In [9]:
# AQE configuration: raise the shuffle partition count to 289 and switch on
# both AQE and its partition coalescing
spark.conf.set("spark.sql.shuffle.partitions", 289)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", True)
spark.conf.set("spark.sql.adaptive.enabled", True)

In [10]:
# Load the example sales data set again into a fresh Spark DataFrame
# (pandas parses the CSV; Spark applies the explicit DDL schema below)
data_file_https_url = "https://media.githubusercontent.com/media/subhamkharwal/pyspark-zero-to-hero/refs/heads/master/datasets/sales.csv"
schema = "transacted_at string, trx_id long, retailer_id long, description string, amount float, city_id float"
df = spark.createDataFrame(pd.read_csv(data_file_https_url), schema)
df.printSchema()
print(f"Initial Partition after read: {df.rdd.getNumPartitions()}")

root
 |-- transacted_at: string (nullable = true)
 |-- trx_id: long (nullable = true)
 |-- retailer_id: long (nullable = true)
 |-- description: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- city_id: float (nullable = true)

Initial Partition after read: 8


In [None]:
# GroupBy operation to trigger a shuffle.
# The aggregated output when grouping by city_id is smaller than the 64MB
# advisory partition size, so the data is written to a single partition.
# The functions module is imported under the alias F rather than with
# `from pyspark.sql.functions import sum`, so this cell does not shadow
# Python's builtin sum().
from pyspark.sql import functions as F

df_count = (
    df.selectExpr("city_id", "cast(amount as double) as amount_double")
    .groupBy("city_id")
    .agg(F.sum("amount_double"))
)
# Report how many partitions the aggregated (post-shuffle) result has.
# The captured output shows a stage running in this cell, i.e. this call
# executes the shuffle in order to learn the final partition count.
print("Output shuffle partitions: " + str(df_count.rdd.getNumPartitions()))

24/11/04 16:28:36 WARN TaskSetManager: Stage 3 contains a task of very large size (8830 KiB). The maximum recommended task size is 1000 KiB.
[Stage 3:>                                                          (0 + 8) / 8]

Output shuffle partitions: 1




In [12]:
# Action: run the city_id aggregation and return its number of result rows
df_count.count()

24/11/04 16:28:48 WARN TaskSetManager: Stage 4 contains a task of very large size (8830 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

246

In [None]:
# GroupBy operation to trigger a shuffle, this time on trx_id, which has far
# more distinct values than city_id and therefore produces more shuffle data.
# Original explanation: the output grouped by trx_id is > 64MB, so the data is
# written to multiple partitions.
# NOTE(review): the actual shuffle size is not shown in this notebook; the
# resulting partition count may also be driven by
# spark.sql.adaptive.coalescePartitions.parallelismFirst rather than the 64MB
# advisory size - confirm in the Spark UI.
# The functions module is imported under the alias F rather than with
# `from pyspark.sql.functions import sum`, so this cell does not shadow
# Python's builtin sum().
from pyspark.sql import functions as F

df_count = (
    df.selectExpr("trx_id", "cast(amount as double) as amount_double")
    .groupBy("trx_id")
    .agg(F.sum("amount_double"))
)
# Report how many partitions the aggregated (post-shuffle) result has
print("Output shuffle partitions: " + str(df_count.rdd.getNumPartitions()))

24/11/04 16:31:53 WARN TaskSetManager: Stage 22 contains a task of very large size (8830 KiB). The maximum recommended task size is 1000 KiB.
[Stage 22:>                                                         (0 + 8) / 8]

Output shuffle partitions: 8




In [17]:
# Action: run the trx_id aggregation and return its number of result rows
df_count.count()

24/11/04 16:32:07 WARN TaskSetManager: Stage 23 contains a task of very large size (8830 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

1101906

In [14]:
# Cross-check: number of distinct city_id values in the source data
df.select("city_id").distinct().count()

24/11/04 16:31:27 WARN TaskSetManager: Stage 10 contains a task of very large size (8830 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

246

In [15]:
# Cross-check: number of distinct trx_id values in the source data
df.select("trx_id").distinct().count()

24/11/04 16:31:36 WARN TaskSetManager: Stage 16 contains a task of very large size (8830 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

1101906

In [18]:
# Stop the Spark session now that the demonstration is complete
spark.stop()