# Spark Structured Streaming and Delta Lake

Let's stream some data into a Delta Lake table


In [1]:
from pyspark.sql import SparkSession

# Build a session against the Spark Connect endpoint on localhost:15002,
# reusing an existing session if one is already available.
session_builder = SparkSession.builder.remote("sc://localhost:15002")
spark = session_builder.getOrCreate()

# Bare last expression so the notebook renders the session object.
spark

## Verify that Spark Connect is set up correctly
Run a hello-world DataFrame example to confirm the connection works.

In [None]:
from datetime import date, datetime
from pyspark.sql import Row

# Three sample rows covering int, float, string, date and timestamp columns.
sample_rows = [
    Row(a=1, b=2.0, c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3.0, c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5.0, c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0)),
]

# Round-trip the rows through the remote session and print them.
df = spark.createDataFrame(sample_rows)
df.show()

## Hello-world 

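Generate example streaming data and write it to a Delta table in micro-batches. For background, see the Structured Streaming programming guide: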

https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html



In [None]:
import time

tableName = "hello_my_delta"

# How long to let the stream run before stopping it.
STREAM_RUN_SECONDS = 60

# Drop any previous copy of the table so a rerun of this cell starts clean.
# NOTE(review): the checkpoint directory used below is not cleared here —
# confirm whether it should be reset together with the table.
spark.sql(f"drop table if exists {tableName}")

# Streaming source configured to produce 10 rows per second.
df = spark.readStream.format("rate").option("rowsPerSecond", 10).load()

# Write the streaming DataFrame to a table in micro batches (one every 5 seconds)
query = (
    df.writeStream.trigger(processingTime="5 seconds")
    .format("delta")
    .option("checkpointLocation", f"/opt/bitnami/spark/checkpoint/{tableName}")
    .toTable(tableName)
)

# Let the stream run for a while, and always stop it afterwards — even if the
# cell is interrupted — so the query is not left running on the server.
try:
    time.sleep(STREAM_RUN_SECONDS)
finally:
    query.stop()

In [None]:
# Query the table the stream wrote to and print the rows without truncating values.
streamed_rows = spark.sql(f'select * from {tableName}')
streamed_rows.show(truncate=False)

## Let's look at table metadata

In [None]:
# Print two metadata views of the table, in order:
#   1. DESCRIBE DETAIL  - table-level details
#   2. DESCRIBE HISTORY - the log of operations performed on the table
metadata_statements = (
    f"DESCRIBE DETAIL {tableName}",
    f"DESCRIBE HISTORY {tableName}",
)
for metadata_sql in metadata_statements:
    spark.sql(metadata_sql).show(truncate=False)


## Let's compact all these small files

In [None]:
# Run table maintenance in order: compaction first, then cleanup of unreferenced files.
# NOTE(review): Delta's documented default vacuum retention is 7 days, so the
# vacuum here presumably removes nothing from files written minutes ago — confirm.
for maintenance_sql in (f"optimize {tableName}", f"vacuum {tableName}"):
    spark.sql(maintenance_sql).show(truncate=False)

In [None]:
# Re-inspect the table's details and history now that maintenance has run.
for describe_kind in ("DETAIL", "HISTORY"):
    describe_result = spark.sql(f"DESCRIBE {describe_kind} {tableName}")
    describe_result.show(truncate=False)