# Spark Structured Streaming and Apache Iceberg

Let's stream some data into an Iceberg table using Spark Structured Streaming over Spark Connect.


In [1]:
from pyspark.sql import SparkSession


# Open (or reuse) a Spark Connect session against the server listening on localhost:15002.
spark: SparkSession = (
    SparkSession.builder
    .remote("sc://localhost:15002")
    .getOrCreate()
)

# Leave the session as the last expression so the notebook displays it.
spark

<pyspark.sql.connect.session.SparkSession at 0x2029952cc20>

## Verify that Spark Connect is set up correctly
Run a "hello world" DataFrame example: create a small DataFrame, write it to a table, and display it.

In [2]:
from datetime import datetime, date
from pyspark.sql import Row

# Row factory with the five column names used by the sample table.
SampleRow = Row("a", "b", "c", "d", "e")

# Three sample records: an int, a float, a string, a date and a datetime per row.
sample_rows = [
    SampleRow(1, 2.0, "string1", date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    SampleRow(2, 3.0, "string2", date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    SampleRow(4, 5.0, "string3", date(2000, 3, 1), datetime(2000, 1, 3, 12, 0)),
]
df = spark.createDataFrame(sample_rows)

# Create (or replace) the sample table from the DataFrame, then print its rows.
df.writeTo("local.db.sample_table").createOrReplace()
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 01:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 01:00:00|
|  4|5.0|string3|2000-03-01|2000-01-03 01:00:00|
+---+---+-------+----------+-------------------+



## Hello-world 

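Example streaming data: rows generated by Spark's built-in `rate` source are written to an Iceberg table in micro-batches. See the Structured Streaming programming guide: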

https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html



In [14]:
import time

tableName = "hello_world_iceberg"
# How long the demo stream is left running before it is stopped.
STREAM_RUN_SECONDS = 60

# Drop any table left over from a previous run of this cell.
# NOTE(review): the checkpoint directory used below is not removed by this drop —
# confirm whether a stale checkpoint on the server affects re-runs.
spark.sql(f"drop table if exists {tableName}")

# The "rate" source produces rows with `timestamp` and `value` columns.
df = spark.readStream.format("rate").option("rowsPerSecond", 1000000).load()

# Write the streaming DataFrame to an Iceberg table in 5-second micro-batches.
query = (
    df.writeStream.trigger(processingTime="5 seconds")
    .format("iceberg")
    .option(
        "checkpointLocation",
        f"/opt/bitnami/spark/spark-warehouse/_checkpoint/{tableName}",
    )
    .toTable(tableName)
)

try:
    # Let the stream run for STREAM_RUN_SECONDS (60 seconds).
    time.sleep(STREAM_RUN_SECONDS)
finally:
    # Always stop the query, even if the wait is interrupted, so the stream
    # does not keep running on the Spark server.
    query.stop()

In [4]:
# Read back the table the stream wrote to and print the first rows.
streamed_rows = spark.sql(f"select * from {tableName}")
streamed_rows.show()

+--------------------+-----+
|           timestamp|value|
+--------------------+-----+
|2025-03-16 07:11:...| 4940|
|2025-03-16 07:11:...| 4944|
|2025-03-16 07:11:...| 4948|
|2025-03-16 07:11:...| 4952|
|2025-03-16 07:11:...| 4956|
|2025-03-16 07:11:...| 4960|
|2025-03-16 07:11:...| 4964|
|2025-03-16 07:11:...| 4968|
|2025-03-16 07:11:...| 4972|
|2025-03-16 07:11:...| 4976|
|2025-03-16 07:11:...| 4980|
|2025-03-16 07:11:...| 4984|
|2025-03-16 07:11:...| 4988|
|2025-03-16 07:11:...| 4890|
|2025-03-16 07:11:...| 4894|
|2025-03-16 07:11:...| 4898|
|2025-03-16 07:11:...| 4902|
|2025-03-16 07:11:...| 4906|
|2025-03-16 07:11:...| 4910|
|2025-03-16 07:11:...| 4914|
+--------------------+-----+
only showing top 20 rows



## Let's look at table metadata

In [None]:
# Query the table's `snapshots` metadata table and print every column untruncated.
snapshots = spark.sql(f"select * from {tableName}.snapshots")
snapshots.show(truncate=False)

+-----------------------+-------------------+-------------------+---------+------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|committed_at           |snapshot_id        |parent_id          |operation|manifest_list                                                                                                                       |summar

In [9]:
# Query the table's `files` metadata table and print every column untruncated.
data_files = spark.sql(f"select * from {tableName}.files")
data_files.show(truncate=False)

+-------+-------------------------------------------------------------------------------------------------------------------------+-----------+-------+------------+------------------+----------------------+----------------------+-----------------+----------------+----------------------------------------------------------------+----------------------------------------------------------------+------------+-------------+------------+-------------+--------------------+--------------+---------------------+-----------------------------------------------------------------------------------------------------------+
|content|file_path                                                                                                                |file_format|spec_id|record_count|file_size_in_bytes|column_sizes          |value_counts          |null_value_counts|nan_value_counts|lower_bounds                                                    |upper_bounds                                            