# Run your first Structured Streaming workload
[docs](https://docs.databricks.com/aws/en/structured-streaming/tutorial)

## Use Auto Loader to read streaming data from object storage

In [0]:
# Input directory of JSON event files, and the directory passed to Auto Loader
# as its schema location.
file_path = "/databricks-datasets/structured-streaming/events"
checkpoint_path = "/tmp/ss-tutorial/_checkpoint"

# Build the streaming reader with the Auto Loader ("cloudFiles") source.
# Both source options are supplied in one `options` call; the keys contain
# dots, so they are passed through dict unpacking.
raw_df = (
    spark.readStream.format("cloudFiles")
    .options(**{
        "cloudFiles.format": "json",
        "cloudFiles.schemaLocation": checkpoint_path,
    })
    .load(file_path)
)

## Perform a streaming transformation

In [0]:
from pyspark.sql.functions import col, current_timestamp

# Extra columns appended to every record: the path of the file the record was
# read from, and the timestamp at which the record is processed.
audit_columns = [
    col("_metadata.file_path").alias("source_file"),
    current_timestamp().alias("processing_time"),
]

# Keep all original columns and add the audit columns after them.
transformed_df = raw_df.select("*", *audit_columns)

## Perform an incremental batch write to Delta Lake

In [0]:
# Output location for the stream, and the directory where the query keeps its
# checkpoint state.
target_path = "/tmp/ss-tutorial/"
checkpoint_path = "/tmp/ss-tutorial/_checkpoint"

# The method chain must be wrapped in parentheses: without them Python treats
# the indented `.trigger(...)` line as a new (badly indented) statement and the
# cell fails to parse.
(transformed_df.writeStream
    # Run as an incremental batch rather than a continuously running query.
    .trigger(availableNow=True)
    .option("checkpointLocation", checkpoint_path)
    .option("path", target_path)
    .start()
)

## Read data from Delta Lake, transform, and write to Delta Lake

In [0]:
# Streaming side of the join: read the source table as a stream.
streaming_source = spark.readStream.table("<table-name1>")

# Static side of the join: read the second table as a regular (batch) table.
static_lookup = spark.read.table("<table-name2>")

# Left-join the stream to the static table and write the result to a third
# table, as an availableNow-triggered query with its own checkpoint.
(
    streaming_source.join(static_lookup, on="<id>", how="left")
    .writeStream
    .trigger(availableNow=True)
    .option("checkpointLocation", "<checkpoint-path>")
    .toTable("<table-name3>")
)

## Read data from Kafka, transform, and write to Kafka

In [0]:
# Source: subscribe to a Kafka topic, starting from the latest offsets.
kafka_source = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "<server:ip>")
    .option("subscribe", "<topic>")
    .option("startingOffsets", "latest")
    .load()
)

# Transform: left-join the stream to a table read as a static DataFrame.
joined_stream = kafka_source.join(
    spark.read.table("<table-name>"), on="<id>", how="left"
)

# Sink: write the joined stream back to a Kafka topic.
(
    joined_stream.writeStream.format("kafka")
    .option("kafka.bootstrap.servers", "<server:ip>")
    .option("topic", "<topic>")
    .option("checkpointLocation", "<checkpoint-path>")
    .start()
)

---

# Structured Streaming patterns on Databricks

! 

## Write to Cassandra as a sink for Structured Streaming in Python

In [0]:
# Cassandra contact points, given as a comma-separated list of hosts.
spark.conf.set("spark.cassandra.connection.host", "host1,host2")

# Append the streaming DataFrame `df` to a Cassandra table through the
# Spark Cassandra data source.
(
    df.writeStream
    .format("org.apache.spark.sql.cassandra")
    .outputMode("append")
    .options(
        checkpointLocation="/path/to/checkpoint",
        keyspace="keyspace_name",
        table="table_name",
    )
    .start()
)

## Write to Azure Synapse Analytics using foreachBatch() in Python

In [0]:
from pyspark.sql.functions import *
from pyspark.sql import *

def writeToSQLWarehouse(df, epochId):
  '''
  Write one micro-batch DataFrame to Azure Synapse, overwriting the target table.

  Intended to be passed to `foreachBatch`.

  df: the DataFrame holding the rows of the current micro-batch.
  epochId: identifier of the micro-batch; not used by this function.
  '''
  # Connector options, applied in the order listed here.
  connector_options = [
      ("url", "jdbc:sqlserver://<the-rest-of-the-connection-string>"),
      ("forward_spark_azure_storage_credentials", "true"),
      ("dbtable", "my_table_in_dw_copy"),
      ("tempdir", "wasbs://<your-container-name>@<your-storage-account-name>.blob.core.windows.net/<your-directory-name>"),
  ]

  batch_writer = df.write.format("com.databricks.spark.sqldw").mode('overwrite')
  for option_name, option_value in connector_options:
    batch_writer = batch_writer.option(option_name, option_value)
  batch_writer.save()

# Use a single shuffle partition for the aggregation below.
spark.conf.set("spark.sql.shuffle.partitions", "1")

# Running count of rows per key, where key = value % 10 of the built-in
# "rate" test source.
key_counts = (
  spark.readStream.format("rate").load()
    .selectExpr("value % 10 as key")
    .groupBy("key")
    .count()
    .toDF("key", "count")
)

# Hand every micro-batch of updated counts to writeToSQLWarehouse.
query = (
  key_counts.writeStream
    .foreachBatch(writeToSQLWarehouse)
    .outputMode("update")
    .start()
)

## Write to Amazon DynamoDB using foreach() in Python (the original guide covers Scala and Python; only Python is used here)

In [0]:
# 1 : Define the target table name and the DynamoDB helper functions
# Name of the DynamoDB table that the functions and writers below create and write to.
table_name = "PythonForeachTest"

def get_dynamodb():
  '''
  Return a boto3 DynamoDB service resource.

  The credentials and region are the placeholder values below; replace them
  before running. boto3 is imported inside the function rather than at module
  level (presumably so the import happens wherever the function is called,
  e.g. on executors -- confirm).
  '''
  import boto3

  connection_settings = {
      'aws_access_key_id': "<access key>",
      'aws_secret_access_key': "<secret key>",
      'region_name': "<region name>",
  }
  return boto3.resource('dynamodb', **connection_settings)

def createTableIfNotExists():
    '''
    Create the DynamoDB table named by `table_name` if it does not exist.

    This must be run on the Spark driver, and not inside foreach.
    When the table has to be created, this call blocks until DynamoDB
    reports that the table exists. When the table is already listed,
    nothing is done.
    '''
    dynamodb = get_dynamodb()

    # NOTE(review): list_tables returns results in pages; an account with many
    # tables may need a paginator here -- confirm against the boto3 docs.
    existing_tables = dynamodb.meta.client.list_tables()['TableNames']
    if table_name in existing_tables:
        return

    print("Creating table %s" % table_name)
    table = dynamodb.create_table(
        TableName=table_name,
        KeySchema=[ { 'AttributeName': 'key', 'KeyType': 'HASH' } ],
        AttributeDefinitions=[ { 'AttributeName': 'key', 'AttributeType': 'S' } ],
        ProvisionedThroughput = { 'ReadCapacityUnits': 5, 'WriteCapacityUnits': 5 }
    )

    print("Waiting for table to be ready")
    # The wait belongs inside the function, on the creation path: `table` is a
    # local that is only bound when the table was just created. At module level
    # (where this line used to sit) the name is undefined and raises NameError.
    table.meta.client.get_waiter('table_exists').wait(TableName=table_name)

In [0]:
# 2 : Use a function
def sendToDynamoDB_simple(row):
  '''
  Send a single output row to DynamoDB.

  When passed to `foreach`, this function is called in the executor once for
  each generated output row.

  row: the output row; its 'key' and 'count' fields are written.
  '''
  # The client object is created here, in the executor;
  # client objects created in the driver must not be used.
  target_table = get_dynamodb().Table(table_name)

  # The key is stored as a string; the count is stored as-is.
  item = { 'key': str(row['key']), 'count': row['count'] }
  target_table.put_item(Item=item)

In [0]:
# 2 : Use a class
class SendToDynamoDB_ForeachWriter:
  '''
  Writer object that sends sets of rows to DynamoDB.

  When used with `foreach`, copies of this class are used to write multiple
  rows in the executor. See the python docs for `DataStreamWriter.foreach`
  for more details.
  '''

  def open(self, partition_id, epoch_id):
    '''
    Prepare this copy of the writer before any rows are sent.

    All initialization lives here so that a fresh copy of the class is
    initialized in the executor where open() is called. Always returns True.
    '''
    self.dynamodb = get_dynamodb()
    return True

  def process(self, row):
    '''
    Send one row to DynamoDB; called for each row after open().

    This implementation writes a single row per call. For further
    enhancements, contact the Spark+DynamoDB connector team:
    https://github.com/audienceproject/spark-dynamodb
    '''
    target_table = self.dynamodb.Table(table_name)
    item = { 'key': str(row['key']), 'count': row['count'] }
    target_table.put_item(Item=item)

  def close(self, err):
    '''
    Finish up after all the rows have been processed.

    err: the error passed in by the caller, if any; a truthy value is re-raised.
    '''
    if not err:
      return
    raise err

In [0]:
# 3
from pyspark.sql.functions import *

# Use a single shuffle partition for the aggregation below.
spark.conf.set("spark.sql.shuffle.partitions", "1")

# Running count of rows per key, where key = value % 10 of the built-in
# "rate" test source.
key_counts = (
  spark.readStream.format("rate").load()
    .selectExpr("value % 10 as key")
    .groupBy("key")
    .count()
    .toDF("key", "count")
)

# Send every updated row to DynamoDB through the class-based writer.
# Alternative: pass the function instead, i.e. .foreach(sendToDynamoDB_simple);
# use one or the other.
query = (
  key_counts.writeStream
    .foreach(SendToDynamoDB_ForeachWriter())
    .outputMode("update")
    .start()
)

## Stream-Stream joins

จะอยู่ในอีก [Notebook](url)