### Start SparkSession

In [None]:
import os, subprocess, sys
print("JAVA_HOME:", os.environ.get("JAVA_HOME"))
print(os.environ.get("HADOOP_HOME"))

In [6]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


spark = (
    SparkSession
    .builder
    .appName("halltape_pyspark_local")
    .getOrCreate()
)

print("Spark session:", spark.sparkContext.uiWebUrl)

Spark session: http://10090-LT-X0017.na.msds.rhi.com:4040


### Read

In [7]:
PATH = 'data/customs_data.csv'

In [8]:
spark.read.csv(PATH).show()

+--------------------+
|                 _c0|
+--------------------+
|month;country;cod...|
|01/2016;IT;620469...|
|01/2016;CN;900190...|
|01/2016;BY;841430...|
|01/2016;US;901850...|
|01/2016;EE;902110...|
|01/2016;FR;381600...|
|01/2016;MX;852351...|
|01/2016;JP;620452...|
|01/2016;KR;611020...|
|01/2016;KG;852713...|
|01/2016;ZA;842123...|
|01/2016;CN;851810...|
|01/2016;TR;841790...|
|01/2016;IT;390610...|
|01/2016;CZ;870840...|
|01/2016;ES;640419...|
|01/2016;IT;940490...|
|01/2016;UA;820780...|
|01/2016;CN;330410...|
+--------------------+
only showing top 20 rows



In [11]:
df = spark.read.csv(PATH, sep=';', header=True)

df.show(truncate=False)

df.show(2, False, True)

+-------+-------+----------+------+------+--------+------+--------+-------------+-----------+-----------------------------+
|month  |country|code      |value |netto |quantity|region|district|direction_eng|measure_eng|load_date                    |
+-------+-------+----------+------+------+--------+------+--------+-------------+-----------+-----------------------------+
|01/2016|IT     |6204695000|131   |1     |7       |46000 |01      |IM           |ShT        |2024-07-01T00:00:00.000+03:00|
|01/2016|CN     |9001900009|112750|18    |0       |46000 |01      |IM           |1          |2024-01-01T00:00:00.000+03:00|
|01/2016|BY     |8414302004|392   |57    |8       |50000 |06      |IM           |ShT        |2024-06-01T00:00:00.000+03:00|
|01/2016|US     |9018509000|54349 |179   |0       |40000 |02      |IM           |1          |2024-04-01T00:00:00.000+03:00|
|01/2016|EE     |9021101000|17304 |372   |0       |46000 |01      |IM           |1          |2024-02-01T00:00:00.000+03:00|
|01/2016

### PrintSchema

In [8]:
df.printSchema()

root
 |-- month: string (nullable = true)
 |-- country: string (nullable = true)
 |-- code: string (nullable = true)
 |-- value: string (nullable = true)
 |-- netto: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- region: string (nullable = true)
 |-- district: string (nullable = true)
 |-- direction_eng: string (nullable = true)
 |-- measure_eng: string (nullable = true)
 |-- load_date: string (nullable = true)



In [12]:
result = df\
    .withColumnRenamed("direction_eng", "direction")\
    .withColumnRenamed("measure_eng", "measure")

result.columns

['month',
 'country',
 'code',
 'value',
 'netto',
 'quantity',
 'region',
 'district',
 'direction',
 'measure',
 'load_date']

### Select

In [22]:
result.show(4)

+-------+-------+----------+------+-----+--------+------+--------+---------+-------+--------------------+
|  month|country|      code| value|netto|quantity|region|district|direction|measure|           load_date|
+-------+-------+----------+------+-----+--------+------+--------+---------+-------+--------------------+
|01/2016|     IT|6204695000|   131|    1|       7| 46000|      01|       IM|    ShT|2024-07-01T00:00:...|
|01/2016|     CN|9001900009|112750|   18|       0| 46000|      01|       IM|      1|2024-01-01T00:00:...|
|01/2016|     BY|8414302004|   392|   57|       8| 50000|      06|       IM|    ShT|2024-06-01T00:00:...|
|01/2016|     US|9018509000| 54349|  179|       0| 40000|      02|       IM|      1|2024-04-01T00:00:...|
+-------+-------+----------+------+-----+--------+------+--------+---------+-------+--------------------+
only showing top 4 rows



In [21]:
result\
    .select('country')\
    .distinct()\
    .show(10, truncate=False)

+-------+
|country|
+-------+
|LT     |
|MM     |
|DZ     |
|CI     |
|TC     |
|FI     |
|SC     |
|AZ     |
|UA     |
|RO     |
+-------+
only showing top 10 rows



### GroupBy

In [27]:
(
    result
    .groupBy('country').agg(F.count('*').alias('total_rows'))
    .orderBy(F.col('total_rows').desc())
    .show()
)

+-------+----------+
|country|total_rows|
+-------+----------+
|     BY|   3509568|
|     KZ|   2519896|
|     CN|   2454792|
|     DE|   1542311|
|     UA|   1158498|
|     IT|   1102837|
|     US|    835936|
|     PL|    666690|
|     FR|    593040|
|     JP|    571756|
|     TR|    463432|
|     KR|    446907|
|     GB|    443091|
|     AM|    438705|
|     CZ|    407360|
|     KG|    403565|
|     ES|    401644|
|     IN|    374151|
|     NL|    365193|
|     UZ|    329707|
+-------+----------+
only showing top 20 rows



### Filter

In [28]:
result.show()

+-------+-------+----------+------+------+--------+------+--------+---------+-------+--------------------+
|  month|country|      code| value| netto|quantity|region|district|direction|measure|           load_date|
+-------+-------+----------+------+------+--------+------+--------+---------+-------+--------------------+
|01/2016|     IT|6204695000|   131|     1|       7| 46000|      01|       IM|    ShT|2024-07-01T00:00:...|
|01/2016|     CN|9001900009|112750|    18|       0| 46000|      01|       IM|      1|2024-01-01T00:00:...|
|01/2016|     BY|8414302004|   392|    57|       8| 50000|      06|       IM|    ShT|2024-06-01T00:00:...|
|01/2016|     US|9018509000| 54349|   179|       0| 40000|      02|       IM|      1|2024-04-01T00:00:...|
|01/2016|     EE|9021101000| 17304|   372|       0| 46000|      01|       IM|      1|2024-02-01T00:00:...|
|01/2016|     FR|3816000000|323488|253600|       0| 40000|      02|       IM|      1|2024-02-01T00:00:...|
|01/2016|     MX|8523519300|  1611|  

In [13]:
df_de = (
    result
    .where(F.col('country') == 'DE')
    .where(F.col('value').isNotNull())
)

print(df_de.count())

# df_de2 = (
#     result
#     .where(''' country == "DE" ''')
#     .where(''' value IS NOT NULL ''')
# )

# print(df_de.count() == df_de2.count())

1542311


### Save to CSV

In [11]:
df_de.columns

['month',
 'country',
 'code',
 'value',
 'netto',
 'quantity',
 'region',
 'district',
 'direction',
 'measure',
 'load_date']

In [16]:
final = (
    df_de
    .select(
        'month',
        'country',
        'code',
        'value',
        'netto',
        'quantity',
        'region',
        'district',
        'direction',
        'measure',
        F.col('load_date').cast('date'),
    )
)

final.show(2, truncate=False)

+-------+-------+----------+-----+-----+--------+------+--------+---------+-------+----------+
|month  |country|code      |value|netto|quantity|region|district|direction|measure|load_date |
+-------+-------+----------+-----+-----+--------+------+--------+---------+-------+----------+
|01/2016|DE     |4016995709|5901 |172  |0       |46000 |01      |IM       |1      |2024-01-01|
|01/2016|DE     |8708809109|1213 |94   |0       |45000 |01      |IM       |1      |2024-01-01|
+-------+-------+----------+-----+-----+--------+------+--------+---------+-------+----------+
only showing top 2 rows



In [12]:
final.show(4)

+-------+-------+----------+-----+-----+--------+------+--------+---------+-------+----------+
|  month|country|      code|value|netto|quantity|region|district|direction|measure| load_date|
+-------+-------+----------+-----+-----+--------+------+--------+---------+-------+----------+
|01/2016|     DE|4016995709| 5901|  172|       0| 46000|      01|       IM|      1|2024-01-01|
|01/2016|     DE|8708809109| 1213|   94|       0| 45000|      01|       IM|      1|2024-01-01|
|01/2016|     DE|7013419000| 7020| 1611|     492| 45000|      01|       IM|    ShT|2024-02-01|
|01/2016|     DE|3923309090|46294| 8048|       0| 45000|      01|       IM|      1|2024-04-01|
+-------+-------+----------+-----+-----+--------+------+--------+---------+-------+----------+
only showing top 4 rows



In [None]:
# ----------------------------------------------------------------------------------
# This block of code demonstrates various methods for saving a DataFrame (final) 
# and controlling the number and structure of the output files.
# ----------------------------------------------------------------------------------

# Uncontrolled saving by file count
# (This will result in as many files as the current number of partitions in 'final' DataFrame)
# (
#     final
#     .write
#     .format('csv')
#     .options(header='True', sep=';')
#     .mode('overwrite')
#     .csv('data/final_no_control')
# )

# partition_num = final.rdd.getNumPartitions()
# print(f'Кол-во партиций {partition_num}') # Output: 'Кол-во партиций [number]'

# # Controlled saving by file count - ONE FILE
# # The coalesce(1) operation reduces the number of partitions to 1, forcing a single output file.
# (
#     final
#     .coalesce(1)
#     .write
#     .format('csv')
#     .options(header='True', sep=';')
#     .csv('data/final_one_file')
# )

# partition_num = final.coalesce(1).rdd.getNumPartitions()
# print(f'Кол-во партиций {partition_num}') # Output: 'Кол-во партиций 1'


# # Saving with Partitioning
# # Creates separate directories based on the distinct values in the 'load_date' column.
# (
#     final
#     .write
#     .partitionBy('load_date')
#     .format('csv')
#     .options(header='True', sep=';')
#     .csv('data/final_partitioned')
# )

# print_df = final.select('load_date').distinct()
# print(f'Load_date distinct: {print_df.count()}') # Output: 'Load_date distinct: [number]'


# Saving with Partitioning and repartition within each partition (Controlled Partitioning)
# The repartition(1, 'load_date') ensures that for each distinct 'load_date', 
# only ONE file is written within its corresponding directory, reducing file sprawl.
(
    final
    .repartition(1, 'load_date')
    .write
    .partitionBy('load_date')
    .format('csv')
    .options(header='True', sep=';')
    .csv('data/final_partitioned_repart')
)

# partition_num = final.repartition(1, 'load_date').rdd.getNumPartitions()
# print(f'Кол-во партиций {partition_num}') # Output: 'Кол-во партиций [number of distinct load_dates]'

### Read Transformed

In [18]:
reader_no_control = (
    spark
    .read
    .csv('data/final_no_control/', header=True, sep=';')
    .where(''' load_date = "2024-01-01" ''')
)

reader_final_one_file = (
    spark
    .read
    .csv('data/final_one_file/', header=True, sep=';')
    .where(''' load_date = "2024-01-01" ''')
)

reader_partitioned = (
    spark
    .read
    .csv('data/final_partitioned', header=True, sep=';')
    .where(''' load_date = "2024-01-01" ''')
)

reader_partitioned_repart = (
    spark
    .read
    .csv('data/final_partitioned_repart', header=True, sep=';')
    .where(''' load_date = "2024-01-01" ''')
)


#reader_no_control.count() # number of files read: 16 | size of files read: 88.4 MiB | 2.5 s (90 ms, 301 ms, 384 ms)

#reader_final_one_file.count() # number of files read: 1 | size of files read: 88.4 MiB | 3.2 s (306 ms, 407 ms, 420 ms )

#reader_partitioned.count() # number of files read: 16 | size of files read: 16.4 MiB | 305 ms (32 ms, 39 ms, 54 ms )

reader_partitioned_repart.count() # number of files read: 1 | size of files read: 16.4 MiB | 179 ms (9 ms, 43 ms, 44 ms )

350998

### JOIN

In [None]:
data = [
    (14000, "Northern"),
    (11000, "Southern"),
    (10000, "Eastern"),
    (26000, "Western"),
    (56000, "Central"),
]
region_df = spark.createDataFrame(data, schema='region_id long, name string')

region_df.show()


customs_data = (
    spark
    .read
    .csv('data/customs_data.csv', header=True, sep=';')
)

customs_data.show(2)

In [22]:
# Turn off Broadcast JOIN
import time
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [25]:
# mo broadcast join

start_time = time.time()

customs_data.join(region_df, customs_data.region==region_df.region_id, "left").count()

end_time = time.time()

print(f"Elapsed time for join operation: {end_time - start_time:.2f} seconds")

Elapsed time for join operation: 27.30 seconds


In [24]:
# with broadcast join

start_time = time.time()

customs_data.join(F.broadcast(region_df), customs_data.region == region_df.region_id, "left").count()

end_time = time.time()

print(f"Elapsed time for broadcast join operation: {end_time - start_time:.2f} seconds")

Elapsed time for broadcast join operation: 19.21 seconds


### Cache | Persist

In [27]:
customs_data.cache().count()

26392290

In [28]:
customs_data.unpersist()

DataFrame[month: string, country: string, code: string, value: string, netto: string, quantity: string, region: string, district: string, direction_eng: string, measure_eng: string, load_date: string]

In [29]:
from pyspark.storagelevel import StorageLevel

customs_data.persist(StorageLevel.DISK_ONLY).count()

26392290

### Repartition & Coalesce

In [32]:
print(spark.sparkContext.getConf().get("spark.driver.memory"))

None


In [33]:
data = [
    (1,'one'),
    (2,'two'),
    (3,'three'),
    (4,'four'),
    (5,'five'),
    (6,'six'),
    (7, 'seven'),
    (8, 'eight'),
    (9, 'nine'),
]

df = spark.createDataFrame(data, ['id', 'number'])

df.show()

+---+------+
| id|number|
+---+------+
|  1|   one|
|  2|   two|
|  3| three|
|  4|  four|
|  5|  five|
|  6|   six|
|  7| seven|
|  8| eight|
|  9|  nine|
+---+------+



In [38]:
df.rdd.getNumPartitions()

8

In [43]:
mix.repartition(3).rdd.glom().collect()

[[Row(id=1, number='one'),
  Row(id=4, number='four'),
  Row(id=7, number='seven'),
  Row(id=9, number='nine')],
 [Row(id=2, number='two')],
 [Row(id=3, number='three'),
  Row(id=5, number='five'),
  Row(id=6, number='six'),
  Row(id=8, number='eight')]]

In [45]:
mix.coalesce(2).rdd.glom().collect()

[[Row(id=3, number='three'),
  Row(id=6, number='six'),
  Row(id=5, number='five'),
  Row(id=9, number='nine')],
 [Row(id=7, number='seven'),
  Row(id=1, number='one'),
  Row(id=2, number='two'),
  Row(id=8, number='eight'),
  Row(id=4, number='four')]]

In [47]:
mix.toPandas().head()

ImportError: Pandas >= 1.0.5 must be installed; however, it was not found.

In [48]:
# OUT OF MEMORY

d = spark.read.csv('data/customs_data.csv', header=True, sep='\t')
d.collect()

Py4JJavaError: An error occurred while calling o393.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 6 in stage 97.0 failed 1 times, most recent failure: Lost task 6.0 in stage 97.0 (TID 536) (10090-LT-X0017.na.msds.rhi.com executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:64)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:363)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1$adapted(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$$$Lambda$2838/0x000002352ce0af28.apply(Unknown Source)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1862)
	at java.base/java.io.ObjectOutputStream.write(ObjectOutputStream.java:714)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1(Utils.scala:209)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1$adapted(Utils.scala:209)
	at org.apache.spark.util.Utils$$$Lambda$2841/0x000002352ce0c5b0.apply(Unknown Source)
	at org.apache.spark.util.Utils$.writeByteBufferImpl(Utils.scala:187)
	at org.apache.spark.util.Utils$.writeByteBuffer(Utils.scala:209)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2$adapted(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer$$Lambda$2840/0x000002352ce0c1d8.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.util.io.ChunkedByteBuffer.writeExternal(ChunkedByteBuffer.scala:103)
	at java.base/java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1465)
	at java.base/java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1436)
	at java.base/java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1181)
	at java.base/java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1572)
	at java.base/java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1529)
	at java.base/java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1438)
	at java.base/java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1181)
	at java.base/java.io.ObjectOutputStream.writeArray(ObjectOutputStream.java:1381)
	at java.base/java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177)
	at java.base/java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:350)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2463)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:4148)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4322)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4320)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4320)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:4145)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:64)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:363)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$.$anonfun$serializeToChunkedBuffer$1$adapted(SerializerHelper.scala:40)
	at org.apache.spark.serializer.SerializerHelper$$$Lambda$2838/0x000002352ce0af28.apply(Unknown Source)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at java.base/java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1862)
	at java.base/java.io.ObjectOutputStream.write(ObjectOutputStream.java:714)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1(Utils.scala:209)
	at org.apache.spark.util.Utils$.$anonfun$writeByteBuffer$1$adapted(Utils.scala:209)
	at org.apache.spark.util.Utils$$$Lambda$2841/0x000002352ce0c5b0.apply(Unknown Source)
	at org.apache.spark.util.Utils$.writeByteBufferImpl(Utils.scala:187)
	at org.apache.spark.util.Utils$.writeByteBuffer(Utils.scala:209)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$writeExternal$2$adapted(ChunkedByteBuffer.scala:103)
	at org.apache.spark.util.io.ChunkedByteBuffer$$Lambda$2840/0x000002352ce0c1d8.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.util.io.ChunkedByteBuffer.writeExternal(ChunkedByteBuffer.scala:103)
	at java.base/java.io.ObjectOutputStream.writeExternalData(ObjectOutputStream.java:1465)
	at java.base/java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1436)
	at java.base/java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1181)
	at java.base/java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1572)
	at java.base/java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1529)
	at java.base/java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1438)
	at java.base/java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1181)
	at java.base/java.io.ObjectOutputStream.writeArray(ObjectOutputStream.java:1381)
	at java.base/java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177)
	at java.base/java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:350)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 59979)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "C:\Users\ivazhu01\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ivazhu01\AppData\Local\Programs\Python\Python311\Lib\socket.py", line 705, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\ivazhu01\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
       

In [49]:
spark.stop()

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it