## Hint 실습해보기

In [1]:
from pyspark.sql import (
    Row,
    SparkSession)
import pyspark.sql.functions as F

In [2]:
# Create (or reuse) the SparkSession for the join-hint exercises,
# connected to the standalone cluster master.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("spark-hint")
    # Turn off Adaptive Query Execution (AQE).
    .config("spark.sql.adaptive.enabled", "false")
    # Turn off automatic broadcast joins (-1 disables the size threshold).
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/02 11:03:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Practice data: the large "fact" side of the joins (10M rows).
#   user_id - sequential ids 0..9_999_999 (range()'s `id` column, renamed)
#   amount  - pseudo-random value, rand() scaled by 100
# rand() gets a fixed seed so that a Restart & Run All reproduces the same
# amounts; without one, Spark picks a different seed on every run.
fact_df = (
    spark
    .range(0, 10_000_000)
    .withColumnRenamed("id", "user_id")
    .withColumn("amount", F.rand(seed=42) * 100)
)
fact_df.take(2)

[Row(user_id=0, amount=97.68326426258781),
 Row(user_id=1, amount=25.12428175168362)]

In [5]:
# Practice data: the small "dimension" side of the joins.
# 100 rows (user_id 0..99), each with the constant country "US".
dim_df = (
    spark.range(start=0, end=100)
    .withColumnRenamed("id", "user_id")
    .withColumn("country", F.lit("US"))
)
dim_df.take(2)

[Row(user_id=0, country='US'), Row(user_id=1, country='US')]

In [5]:
# 힌트 없이 Join

In [6]:
# Baseline: inner join on user_id with no hint, then print the formatted physical plan.
join_no_hint = fact_df.join(dim_df, on="user_id", how="inner")
join_no_hint.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (11)
+- Project (10)
   +- SortMergeJoin Inner (9)
      :- Sort (4)
      :  +- Exchange (3)
      :     +- Project (2)
      :        +- Range (1)
      +- Sort (8)
         +- Exchange (7)
            +- Project (6)
               +- Range (5)


(1) Range
Output [1]: [id#0L]
Arguments: Range (0, 10000000, step=1, splits=Some(10))

(2) Project
Output [2]: [id#0L AS user_id#2L, (rand(4581568525187603755) * 100.0) AS amount#4]
Input [1]: [id#0L]

(3) Exchange
Input [2]: [user_id#2L, amount#4]
Arguments: hashpartitioning(user_id#2L, 200), ENSURE_REQUIREMENTS, [plan_id=49]

(4) Sort
Input [2]: [user_id#2L, amount#4]
Arguments: [user_id#2L ASC NULLS FIRST], false, 0

(5) Range
Output [1]: [id#7L]
Arguments: Range (0, 100, step=1, splits=Some(10))

(6) Project
Output [1]: [id#7L AS user_id#9L]
Input [1]: [id#7L]

(7) Exchange
Input [1]: [user_id#9L]
Arguments: hashpartitioning(user_id#9L, 200), ENSURE_REQUIREMENTS, [plan_id=50]

(8) Sort
Input [1]: [us

In [7]:
# SortMergeJoin: Spark가 Broadcast를 쓰지 않고, 양쪽 데이터를 조인 키 기준으로 셔플(Exchange)한 뒤 정렬(Sort)해서 조인함
# Exchange: 네트워크 셔플 발생 (파티션 재분배)

In [8]:
# 힌트 강제 
# 큰 테이블.join(작은 테이블.hint("broadcast"), 조건)

In [9]:
# Same inner join, but the small table carries the "broadcast" hint.
join_broadcast = fact_df.join(
    dim_df.hint("broadcast"),
    on="user_id",
    how="inner",
)
join_broadcast.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (8)
+- Project (7)
   +- BroadcastHashJoin Inner BuildRight (6)
      :- Project (2)
      :  +- Range (1)
      +- BroadcastExchange (5)
         +- Project (4)
            +- Range (3)


(1) Range
Output [1]: [id#0L]
Arguments: Range (0, 10000000, step=1, splits=Some(10))

(2) Project
Output [2]: [id#0L AS user_id#2L, (rand(4581568525187603755) * 100.0) AS amount#4]
Input [1]: [id#0L]

(3) Range
Output [1]: [id#7L]
Arguments: Range (0, 100, step=1, splits=Some(10))

(4) Project
Output [1]: [id#7L AS user_id#9L]
Input [1]: [id#7L]

(5) BroadcastExchange
Input [1]: [user_id#9L]
Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=79]

(6) BroadcastHashJoin
Left keys [1]: [user_id#2L]
Right keys [1]: [user_id#9L]
Join type: Inner
Join condition: None

(7) Project
Output [3]: [user_id#2L, amount#4, US AS country#11]
Input [3]: [user_id#2L, amount#4, user_id#9L]

(8) AdaptiveSparkPlan
Output [3]: [user_id#2L, amount#4,

In [10]:
# 힌트 적용 성공 → 플랜에 BroadcastHashJoin, BroadcastExchange가 나타남 (양쪽 Sort와 셔플 Exchange가 사라짐)

In [11]:
# merge 강제 

In [12]:
# Same inner join, this time with the "merge" hint on the small table.
join_merge = fact_df.join(
    dim_df.hint("merge"),
    on="user_id",
    how="inner",
)
join_merge.explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (11)
+- Project (10)
   +- SortMergeJoin Inner (9)
      :- Sort (4)
      :  +- Exchange (3)
      :     +- Project (2)
      :        +- Range (1)
      +- Sort (8)
         +- Exchange (7)
            +- Project (6)
               +- Range (5)


(1) Range
Output [1]: [id#0L]
Arguments: Range (0, 10000000, step=1, splits=Some(10))

(2) Project
Output [2]: [id#0L AS user_id#2L, (rand(4581568525187603755) * 100.0) AS amount#4]
Input [1]: [id#0L]

(3) Exchange
Input [2]: [user_id#2L, amount#4]
Arguments: hashpartitioning(user_id#2L, 200), ENSURE_REQUIREMENTS, [plan_id=107]

(4) Sort
Input [2]: [user_id#2L, amount#4]
Arguments: [user_id#2L ASC NULLS FIRST], false, 0

(5) Range
Output [1]: [id#7L]
Arguments: Range (0, 100, step=1, splits=Some(10))

(6) Project
Output [1]: [id#7L AS user_id#9L]
Input [1]: [id#7L]

(7) Exchange
Input [1]: [user_id#9L]
Arguments: hashpartitioning(user_id#9L, 200), ENSURE_REQUIREMENTS, [plan_id=108]

(8) Sort
Input [1]: [

In [13]:
# 파티션수 제어

In [16]:
# Number of partitions in the broadcast-join result (no repartition hint applied).
join_broadcast.rdd.getNumPartitions()

10

In [19]:
# Broadcast join followed by a "repartition" hint that asks for 8 partitions
# in the join result.
control = fact_df.join(
    dim_df.hint("broadcast"),
    on="user_id",
).hint("repartition", 8)

# To inspect the resulting plan: control.explain(mode="formatted")

In [18]:
# Number of partitions in the join result that carries the repartition hint.
control.rdd.getNumPartitions()

8

In [22]:
# hint가 무시되는 경우
# 별칭을 안 맞춘 경우 

In [8]:
# Register both DataFrames as temp views so they can be queried with SQL.
dim_df.createOrReplaceTempView("dim_table")
fact_df.createOrReplaceTempView("fact_table")

# Case where a hint is ignored: the query aliases dim_table as `d`, but the
# hint names the table as BROADCAST(dim_table). Spark only sees the alias `d`
# at that point, cannot find a relation called `dim_table`, and drops the hint.
spark.sql("""
    SELECT /*+ BROADCAST(dim_table) */ * FROM fact_table f
    JOIN dim_table d 
    ON f.user_id = d.user_id
""").explain()

== Physical Plan ==
*(5) SortMergeJoin [user_id#2L], [user_id#9L], Inner
:- *(2) Sort [user_id#2L ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(user_id#2L, 200), ENSURE_REQUIREMENTS, [plan_id=150]
:     +- *(1) Project [id#0L AS user_id#2L, (rand(660007605823935748) * 100.0) AS amount#4]
:        +- *(1) Range (0, 10000000, step=1, splits=10)
+- *(4) Sort [user_id#9L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(user_id#9L, 200), ENSURE_REQUIREMENTS, [plan_id=156]
      +- *(3) Project [id#7L AS user_id#9L, US AS country#11]
         +- *(3) Range (0, 100, step=1, splits=10)




26/02/02 11:08:19 WARN HintErrorLogger: Count not find relation 'dim_table' specified in hint 'BROADCAST(dim_table)'.


In [9]:
# → 플랜이 SortMergeJoin임: 힌트가 무시됨. 별칭을 준 테이블은 힌트에서도 별칭으로 지정해야 함 (예: /*+ BROADCAST(d) */)

In [10]:
# Stop the SparkSession now that the exercises are finished.
spark.stop()