In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, spark_partition_id, asc

In [2]:
# Create (or reuse) a SparkSession attached to the standalone cluster master.
# spark.sql.shuffle.partitions=8 is the partition count requested for shuffle
# exchanges triggered by wide transformations later in the notebook.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("DataFrameShuffleExample")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/30 11:27:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Build the input DataFrame: 20 rows of (value, key) where key = value % 5.
data = [(n, n % 5) for n in range(20)]
df = spark.createDataFrame(data, schema=["value", "key"])

In [5]:
# Narrow transformation (stage 1): keep rows with value > 5, then project
# the two columns. No shuffle is needed for a filter + select.
df2 = (
    df.where(col("value") > 5)
    .select(col("value"), col("key"))
)

In [None]:
# Tag every row of the filtered DataFrame with the id of the partition that
# holds it, then display the rows ordered by that id.
print("Stage 1 Partition Info:")
df2_with_pid = df2.withColumn("partition_id", spark_partition_id())
df2_with_pid.orderBy(asc("partition_id")).show()

In [6]:
# Wide transformation (stage 2): grouping by key requires a shuffle so that
# all rows sharing a key are aggregated together.
df3 = (
    df2.groupBy(col("key"))
    .agg(spark_sum(col("value")).alias("sum_value"))
)

In [7]:
# collect() is the action that executes the query; it returns the aggregated
# rows to the driver as a list of Row objects.
result = df3.collect()

                                                                                

In [29]:
# Print a header followed by each collected Row on its own line.
print("Result:", *result, sep="\n")

Result:
Row(key=2, sum_value=36)
Row(key=3, sum_value=39)
Row(key=4, sum_value=42)
Row(key=1, sum_value=33)
Row(key=0, sum_value=25)


In [None]:
# Inspect the partition layout of the aggregated DataFrame (after the shuffle)
# by appending each row's partition id as an extra column.
print("Stage 2 Partition Info (after shuffle):")
df3_with_pid = df3.select("*", spark_partition_id().alias("partition_id"))
df3_with_pid.show()

In [30]:
# Print the parsed, analyzed and optimized logical plans plus the physical plan.
df3.explain(mode="extended")

== Parsed Logical Plan ==
'Aggregate ['key], ['key, sum('value) AS sum_value#93]
+- Project [value#84L, key#85L]
   +- Filter (value#84L > cast(5 as bigint))
      +- LogicalRDD [value#84L, key#85L], false

== Analyzed Logical Plan ==
key: bigint, sum_value: bigint
Aggregate [key#85L], [key#85L, sum(value#84L) AS sum_value#93L]
+- Project [value#84L, key#85L]
   +- Filter (value#84L > cast(5 as bigint))
      +- LogicalRDD [value#84L, key#85L], false

== Optimized Logical Plan ==
Aggregate [key#85L], [key#85L, sum(value#84L) AS sum_value#93L]
+- Filter (isnotnull(value#84L) AND (value#84L > 5))
   +- LogicalRDD [value#84L, key#85L], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   *(2) HashAggregate(keys=[key#85L], functions=[sum(value#84L)], output=[key#85L, sum_value#93L])
   +- AQEShuffleRead coalesced
      +- ShuffleQueryStage 0
         +- Exchange hashpartitioning(key#85L, 8), ENSURE_REQUIREMENTS, [plan_id=147]
            +- *(1) HashAggregat

In [22]:
# Stop the SparkSession now that the example is finished.
spark.stop()