In [None]:
# Spark Session
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("Optimizing Joins")
    .master("spark://spark-master:7077")
    .config("spark.cores.max", 16)
    .config("spark.executor.cores", 4)
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/04 16:46:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Disable AQE and Broadcast join

spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

#### Join Big and Small table - SortMerge vs BroadCast Join

In [None]:
# Read EMP CSV data

_schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

emp = (
    spark.read.format("csv")
    .schema(_schema)
    .option("header", True)
    .load("hdfs://namenode:9000/input/data/employee_records.csv")
)

In [None]:
# Read DEPT CSV data

_dept_schema = "department_id int, department_name string, description string, city string, state string, country string"

dept = (
    spark.read.format("csv")
    .schema(_dept_schema)
    .option("header", True)
    .load("hdfs://namenode:9000/input/data/department_data.csv")
)

In [5]:
emp.count()

                                                                                

1000000

In [6]:
dept.count()

10

In [None]:
# Join Datasets
from pyspark.sql import functions as F

df_joined = emp.join(
    F.broadcast(dept), on=emp.department_id == dept.department_id, how="left_outer"
)

In [8]:
df_joined.write.format("noop").mode("overwrite").save()

                                                                                

In [9]:
df_joined.explain()

== Physical Plan ==
*(2) BroadcastHashJoin [department_id#7], [department_id#16], LeftOuter, BuildRight, false
:- FileScan csv [first_name#0,last_name#1,job_title#2,dob#3,email#4,phone#5,salary#6,department_id#7] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[hdfs://namenode:9000/input/data/employee_records.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<first_name:string,last_name:string,job_title:string,dob:string,email:string,phone:string,s...
+- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, false] as bigint)),false), [plan_id=109]
   +- *(1) Filter isnotnull(department_id#16)
      +- FileScan csv [department_id#16,department_name#17,description#18,city#19,state#20,country#21] Batched: false, DataFilters: [isnotnull(department_id#16)], Format: CSV, Location: InMemoryFileIndex(1 paths)[hdfs://namenode:9000/input/data/department_data.csv], PartitionFilters: [], PushedFilters: [IsNotNull(department_id)], Rea

In [None]:
# This is what happens without Broadcast
df_joined_no_bc = emp.join(
    dept, on=emp.department_id == dept.department_id, how="left_outer"
)
df_joined_no_bc.write.format("noop").mode("overwrite").save()

                                                                                

In [None]:
# This is what happens when we Broadcast a very large dataframe
emp = emp.coalesce(1)  # 1 huge partition
df_joined_no_bc = dept.join(
    F.broadcast(emp), on=emp.department_id == dept.department_id, how="left_outer"
)
df_joined_no_bc.write.format("noop").mode("overwrite").save()

24/11/04 16:53:50 WARN TaskSetManager: Lost task 0.0 in stage 9.0 (TID 229) (172.18.0.8 executor 1): java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:61)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:348)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1$adapted(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$$$Lambda$1227/0x00000008409ef040.apply(Unknown Source)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1849)
	at java.base/java.io.ObjectOutputStream.write(ObjectOut

Py4JJavaError: An error occurred while calling o82.save.
: java.util.concurrent.ExecutionException: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 9.0 failed 4 times, most recent failure: Lost task 0.4 in stage 9.0 (TID 233) (172.18.0.6 executor 3): java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:61)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:348)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1$adapted(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$$$Lambda$953/0x00000008408ea040.apply(Unknown Source)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1849)
	at java.base/java.io.ObjectOutputStream.write(ObjectOutputStream.java:708)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1(Utils.scala:271)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1$adapted(Utils.scala:271)
	at org.apache.spark.util.Utils$$$Lambda$956/0x00000008408ec040.apply(Unknown Source)
	at org.apache.spark.util.Utils$.writeByteBufferImpl(Utils.scala:249)
	at org.apache.spark.util.Utils$.writeByteBuffer(Utils.scala:271)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2$adapted(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer$$Lambda$955/0x00000008408eb840.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.util.io.ChunkedByteBuffer.writeExternal(ChunkedByteBuffer.scala:103)
	at org.apache.spark.scheduler.DirectTaskResult.$anonfun$writeExternal$1(TaskResult.scala:60)
	at org.apache.spark.scheduler.DirectTaskResult$$Lambda$964/0x00000008408f0840.apply$mcV$sp(Unknown Source)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1495)
	at org.apache.spark.scheduler.DirectTaskResult.writeExternal(TaskResult.scala:59)
	at java.base/java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1450)
	at java.base/java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1421)
	at java.base/java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1169)
	at java.base/java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:345)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.SerializerHelper$.serializeToChunkedBuffer(SerializerHelper.scala:42)

Driver stacktrace:
	at java.base/java.util.concurrent.FutureTask.report(FutureTask.java:122)
	at java.base/java.util.concurrent.FutureTask.get(FutureTask.java:205)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:209)
	at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:517)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeBroadcast$1(SparkPlan.scala:208)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:204)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:207)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareRelation(BroadcastHashJoinExec.scala:221)
	at org.apache.spark.sql.execution.joins.HashJoin.codegenOuter(HashJoin.scala:444)
	at org.apache.spark.sql.execution.joins.HashJoin.codegenOuter$(HashJoin.scala:443)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenOuter(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.joins.HashJoin.doConsume(HashJoin.scala:357)
	at org.apache.spark.sql.execution.joins.HashJoin.doConsume$(HashJoin.scala:354)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:196)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:151)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce(WholeStageCodegenExec.scala:485)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce$(WholeStageCodegenExec.scala:458)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:97)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.joins.HashJoin.doProduce(HashJoin.scala:351)
	at org.apache.spark.sql.execution.joins.HashJoin.doProduce$(HashJoin.scala:350)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:97)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:92)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:660)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:723)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:384)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:382)
	at org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec.writeWithV2(WriteToDataSourceV2Exec.scala:266)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run(WriteToDataSourceV2Exec.scala:360)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run$(WriteToDataSourceV2Exec.scala:359)
	at org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec.run(WriteToDataSourceV2Exec.scala:266)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:104)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:133)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:856)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:318)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 9.0 failed 4 times, most recent failure: Lost task 0.4 in stage 9.0 (TID 233) (172.18.0.6 executor 3): java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:61)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:348)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1$adapted(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$$$Lambda$953/0x00000008408ea040.apply(Unknown Source)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1849)
	at java.base/java.io.ObjectOutputStream.write(ObjectOutputStream.java:708)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1(Utils.scala:271)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1$adapted(Utils.scala:271)
	at org.apache.spark.util.Utils$$$Lambda$956/0x00000008408ec040.apply(Unknown Source)
	at org.apache.spark.util.Utils$.writeByteBufferImpl(Utils.scala:249)
	at org.apache.spark.util.Utils$.writeByteBuffer(Utils.scala:271)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2$adapted(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer$$Lambda$955/0x00000008408eb840.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.util.io.ChunkedByteBuffer.writeExternal(ChunkedByteBuffer.scala:103)
	at org.apache.spark.scheduler.DirectTaskResult.$anonfun$writeExternal$1(TaskResult.scala:60)
	at org.apache.spark.scheduler.DirectTaskResult$$Lambda$964/0x00000008408f0840.apply$mcV$sp(Unknown Source)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1495)
	at org.apache.spark.scheduler.DirectTaskResult.writeExternal(TaskResult.scala:59)
	at java.base/java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1450)
	at java.base/java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1421)
	at java.base/java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1169)
	at java.base/java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:345)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.SerializerHelper$.serializeToChunkedBuffer(SerializerHelper.scala:42)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2790)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2726)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2725)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2725)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1211)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1211)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1211)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2989)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2928)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2917)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:976)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2258)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2298)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2323)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1022)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:408)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1021)
	at org.apache.spark.sql.execution.SparkPlan.executeCollectIterator(SparkPlan.scala:455)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.$anonfun$relationFuture$1(BroadcastExchangeExec.scala:137)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$1(SQLExecution.scala:217)
	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:61)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:348)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1$adapted(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$$$Lambda$953/0x00000008408ea040.apply(Unknown Source)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1849)
	at java.base/java.io.ObjectOutputStream.write(ObjectOutputStream.java:708)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1(Utils.scala:271)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1$adapted(Utils.scala:271)
	at org.apache.spark.util.Utils$$$Lambda$956/0x00000008408ec040.apply(Unknown Source)
	at org.apache.spark.util.Utils$.writeByteBufferImpl(Utils.scala:249)
	at org.apache.spark.util.Utils$.writeByteBuffer(Utils.scala:271)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2$adapted(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer$$Lambda$955/0x00000008408eb840.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.util.io.ChunkedByteBuffer.writeExternal(ChunkedByteBuffer.scala:103)
	at org.apache.spark.scheduler.DirectTaskResult.$anonfun$writeExternal$1(TaskResult.scala:60)
	at org.apache.spark.scheduler.DirectTaskResult$$Lambda$964/0x00000008408f0840.apply$mcV$sp(Unknown Source)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1495)
	at org.apache.spark.scheduler.DirectTaskResult.writeExternal(TaskResult.scala:59)
	at java.base/java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1450)
	at java.base/java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1421)
	at java.base/java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1169)
	at java.base/java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:345)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.SerializerHelper$.serializeToChunkedBuffer(SerializerHelper.scala:42)


#### Join Big and Big table - SortMerge without Buckets

In [12]:
# Read Sales data

sales = spark.read.parquet("hdfs://namenode:9000/input/data/sales.parquet")

                                                                                

In [14]:
sales.count()

                                                                                

1102575

In [None]:
# Read City data

city_schema = (
    "city_id string, city string, state string, state_abv string, country string"
)

city = (
    spark.read.format("csv")
    .schema(city_schema)
    .option("header", True)
    .load("hdfs://namenode:9000/input/data/cities.csv")
)

In [16]:
city.count()

                                                                                

2349391

In [None]:
# Join Data

df_sales_joined = sales.join(city, on=sales.city_id == city.city_id, how="left_outer")

In [18]:
df_sales_joined.write.format("noop").mode("overwrite").save()

                                                                                

In [31]:
# Partitions deep dive

print(sales.rdd.getNumPartitions())
print(city.rdd.getNumPartitions())
print(df_sales_joined.rdd.getNumPartitions())

8
8
200


In [None]:
sales.withColumn("partition_id", F.spark_partition_id()).groupby(
    "partition_id"
).count().show()

+------------+-------+
|partition_id|  count|
+------------+-------+
|           6|  53999|
|           3|1048576|
+------------+-------+



In [39]:
sales.select("trx_id").distinct().count()

                                                                                

1101905

In [44]:
sales.select("city_id").distinct().count()

245

In [53]:
sales.groupby("city_id").count().orderBy("count", ascending=False).show()

+----------+-----+
|   city_id|count|
+----------+-----+
|  45522086| 9796|
| 856233063| 9724|
| 781290085| 9715|
|1217211842| 9697|
|1607451095| 9662|
|2055198208| 9647|
|1678157838| 9641|
| 559832710| 9612|
| 485114748| 9596|
| 831124332| 9586|
|  28424447| 9575|
|1717498102| 9573|
|1985376021| 9562|
|1141716004| 9555|
| 287177635| 9552|
|2096878759| 9546|
|1676567417| 9543|
|1606354386| 9538|
|2052535508| 9525|
| 275328977| 9522|
+----------+-----+
only showing top 20 rows



In [40]:
sales.count()

1102575

In [None]:
sales.withColumn("partition_id", F.spark_partition_id()).groupby("city_id").agg(
    F.countDistinct("partition_id").alias("count")
).orderBy("count", ascending=False).show()

+----------+-----+
|   city_id|count|
+----------+-----+
| 198096267|    2|
| 293622227|    2|
|2096878759|    2|
| 985710244|    2|
|1606354386|    2|
|1678157838|    2|
| 407629665|    2|
| 930259917|    2|
| 220566264|    2|
| 632305754|    2|
|1683103551|    2|
|  45522086|    2|
| 350411713|    2|
|2078628066|    2|
| 414653088|    2|
|1840913006|    2|
| 971443623|    2|
| 381349011|    2|
|1802488818|    2|
|2033929143|    2|
+----------+-----+
only showing top 20 rows



In [None]:
# REPARTITIONING OF CITY_ID COLUMN. NOW ALL RECORDS FOR THE SAME CITY_ID WILL BE AT THE SAME PARTITION
sales = sales.repartition(8, "city_id")
sales.withColumn("partition_id", F.spark_partition_id()).groupby("city_id").agg(
    F.countDistinct("partition_id").alias("count")
).orderBy("count", ascending=False).show()



+----------+-----+
|   city_id|count|
+----------+-----+
|1620965190|    1|
| 576817662|    1|
|1223420625|    1|
| 831124332|    1|
|2056066328|    1|
|1802488818|    1|
|1610115143|    1|
|1610133005|    1|
|1194163531|    1|
| 216135201|    1|
|1574873504|    1|
|  45522086|    1|
|1602735059|    1|
| 380143978|    1|
|1296036143|    1|
| 287177635|    1|
|  77397141|    1|
| 585903816|    1|
|1243655802|    1|
| 275328977|    1|
+----------+-----+
only showing top 20 rows



                                                                                

In [None]:
city.withColumn("partition_id", F.spark_partition_id()).groupby(
    "partition_id"
).count().show()

+------------+------+
|partition_id| count|
+------------+------+
|           1|309558|
|           6|309876|
|           3|309622|
|           5|309730|
|           4|309817|
|           7|181567|
|           2|309607|
|           0|309614|
+------------+------+



##### Write Sales and City data in Buckets

In [None]:
# Write Sales data in Buckets

sales.write.format("csv").mode("overwrite").bucketBy(4, "city_id").option(
    "header", True
).option("path", "hdfs://namenode:9000/input/data/sales_bucket").saveAsTable(
    "sales_bucket"
)

                                                                                

In [None]:
# Write City data in Buckets

city.write.format("csv").mode("overwrite").bucketBy(4, "city_id").option(
    "header", True
).option("path", "hdfs://namenode:9000/input/data/city_bucket").saveAsTable(
    "city_bucket"
)

                                                                                

In [27]:
# Check tables

spark.sql("show tables in default").show()

+---------+------------+-----------+
|namespace|   tableName|isTemporary|
+---------+------------+-----------+
|  default| city_bucket|      false|
|  default|sales_bucket|      false|
+---------+------------+-----------+



#### Join Sales and City data - SortMerge with Bucket

In [28]:
# Read Sales table

sales_bucket = spark.read.table("sales_bucket")

In [29]:
# Read City table

city_bucket = spark.read.table("city_bucket")

In [None]:
# Join datasets

df_joined_bucket = city_bucket.join(
    sales_bucket, on=sales_bucket.city_id == city_bucket.city_id, how="left_outer"
)

In [33]:
# Write dataset

df_joined_bucket.write.format("noop").mode("overwrite").save()

                                                                                

In [70]:
df_joined_bucket.explain()

== Physical Plan ==
*(4) SortMergeJoin [city_id#1270L], [cast(city_id#1277 as bigint)], LeftOuter
:- *(1) Sort [city_id#1270L ASC NULLS FIRST], false, 0
:  +- FileScan csv spark_catalog.default.sales_bucket[transacted_at#1265,trx_id#1266L,retailer_id#1267L,description#1268,amount#1269,city_id#1270L] Batched: false, Bucketed: true, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[hdfs://namenode:9000/input/data/sales_bucket], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<transacted_at:string,trx_id:bigint,retailer_id:bigint,description:string,amount:double,cit..., SelectedBucketsCount: 4 out of 4
+- *(3) Sort [cast(city_id#1277 as bigint) ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(cast(city_id#1277 as bigint), 4), ENSURE_REQUIREMENTS, [plan_id=2196]
      +- *(2) Filter isnotnull(city_id#1277)
         +- FileScan csv spark_catalog.default.city_bucket[city_id#1277,city#1278,state#1279,state_abv#1280,country#1281] Batched: false, Bucketed: 

In [34]:
spark.stop()