In [1]:
# Spark Session
import os

from pyspark.sql import SparkSession

# The fallback master URL contains what looks like a container-generated hostname,
# which will differ on another cluster. Export SPARK_MASTER_URL to point the
# notebook at a different standalone master without editing this cell.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://0b8ee91998c1:7077")

spark = (
    SparkSession
    .builder
    .appName("Spark Memory Management")
    .master(SPARK_MASTER_URL)
    .config("spark.cores.max", 4)               # cap on cores requested by this application
    .config("spark.executor.cores", 2)          # cores per executor
    .config("spark.executor.memory", "512MB")   # executor heap used in the memory calculation below
    .getOrCreate()
)

# Display the session summary as the cell output
spark

**Spark Memory Calculation Per Executor**

In [1]:
# Usable JVM on-heap memory: about 89% of the 512 MB executor memory
executor_memory_mb = 512
usable_heap_ratio = 0.89
executor_memory_mb * usable_heap_ratio

455.68

In [2]:
# Remove the fixed 300 MB Reserved Memory from the usable heap
usable_heap_mb = 455.68
reserved_memory_mb = 300
usable_heap_mb - reserved_memory_mb

155.68

In [3]:
# Unified Spark Memory (Storage + Execution): spark.memory.fraction of the remainder, 0.6 by default
heap_after_reserved_mb = 155.68
memory_fraction = 0.6
heap_after_reserved_mb * memory_fraction

93.408

In [4]:
# User (undefined) memory, not managed by Spark: the remaining 40% with the default fraction
heap_after_reserved_mb = 155.68
user_memory_fraction = 0.4
heap_after_reserved_mb * user_memory_fraction

62.272000000000006

In [5]:
# Storage Memory: spark.memory.storageFraction (0.5) of the unified Spark memory
unified_memory_mb = 93.408
storage_fraction = 0.5
unified_memory_mb * storage_fraction

46.704

In [6]:
# Execution Memory: the other half of the unified Spark memory
unified_memory_mb = 93.408
execution_fraction = 0.5
unified_memory_mb * execution_fraction

46.704

In [8]:
# Execution Memory available to each of the executor's 2 cores
execution_memory_mb = 46.704
cores_per_executor = 2
execution_memory_mb / cores_per_executor

23.352

**Out Of Memory Error Demo on Executors**

In [None]:
# Switch off Adaptive Query Execution (and its partition coalescing) and
# automatic broadcast joins (threshold set to -1).
session_overrides = {
    "spark.sql.adaptive.enabled": False,
    "spark.sql.adaptive.coalescePartitions.enabled": False,
    "spark.sql.autoBroadcastJoinThreshold": -1,
}

# Apply the overrides to the running session in the order listed above
for conf_key, conf_value in session_overrides.items():
    spark.conf.set(conf_key, conf_value)

In [11]:
%%sh
# List the OOM demo input files: long format, human-readable sizes, oldest first
ls -l -h -t -r data/input/oom/

total 42M
-rw-r--r-- 1 jovyan users 7.3K Sep 18 22:11 spark_oom_files.7z
-rw-r--r-- 1 jovyan users  11M Sep 18 22:12 text_file_xs.txt
-rw-r--r-- 1 jovyan users  11M Sep 18 22:12 text_file_singleline_xs.txt
-rw-r--r-- 1 jovyan users  21M Sep 18 22:12 text_file_s.txt


In [2]:
# Load text_file_singleline_xs.txt, the input used to demonstrate the OOM
oom_input_path = "data/input/oom/text_file_singleline_xs.txt"
text_reader = spark.read.format("text")
df = text_reader.load(oom_input_path)
df.show()

In [3]:
# Mark the DataFrame for caching, then count its rows (the count is the cell output)
df.cache()
df.count()

1

In [6]:
# Explode data to count words
from pyspark.sql.functions import lower, split, explode, count, lit

# Word-count pipeline: lower-case each line, split it on single spaces, emit one
# row per word, then count the occurrences of every word.
df_final = (
    df.withColumn("value", lower("value"))
    .withColumn("splitted_val", split("value", " "))  # ['is', 'an']
    .withColumn("exploded_val", explode("splitted_val"))
    .drop("splitted_val", "value")
    # The alias is spelled with ASCII letters only, so the column can be
    # referenced as "count" (a look-alike non-ASCII first letter would not match).
    .groupBy("exploded_val").agg(count(lit(1)).alias("count"))
)

In [7]:
# Action that executes the word-count job; in the recorded run below it aborts
# with java.lang.OutOfMemoryError on the executor.
df_final.show(n=20, truncate=True)

Py4JJavaError: An error occurred while calling o119.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 4 times, most recent failure: Lost task 0.4 in stage 4.0 (TID 14) (172.23.0.3 executor 2): java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.unsafe.types.UTF8String.fromBytes(UTF8String.java:154)
	at org.apache.spark.unsafe.types.UTF8String.fromString(UTF8String.java:184)
	at org.apache.spark.unsafe.types.UTF8String.split(UTF8String.java:1528)
	at org.apache.spark.unsafe.types.UTF8String.split(UTF8String.java:1501)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.hashAgg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
	at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:143)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:57)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:111)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$606/0x00000005014a43c8.apply(Unknown Source)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.base/java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:2935)
	at scala.Option.getOrElse(Option.scala:201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2935)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2927)
	at scala.collection.immutable.List.foreach(List.scala:334)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2927)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1295)
	at scala.Option.foreach(Option.scala:437)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3207)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3141)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3130)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.unsafe.types.UTF8String.fromBytes(UTF8String.java:154)
	at org.apache.spark.unsafe.types.UTF8String.fromString(UTF8String.java:184)
	at org.apache.spark.unsafe.types.UTF8String.split(UTF8String.java:1528)
	at org.apache.spark.unsafe.types.UTF8String.split(UTF8String.java:1501)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.hashAgg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)
	at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:143)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:57)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:111)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$606/0x00000005014a43c8.apply(Unknown Source)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.base/java.lang.Thread.run(Unknown Source)
