diff --git a/SUMMARY.adoc b/SUMMARY.adoc index 88850dfaf..a1badf2f8 100644 --- a/SUMMARY.adoc +++ b/SUMMARY.adoc @@ -29,8 +29,8 @@ .. link:spark-sql-streaming-Dataset-groupBy.adoc[groupBy Operator -- Streaming Aggregation (with Implicit State Logic)] . link:spark-sql-streaming-KeyValueGroupedDataset.adoc[KeyValueGroupedDataset -- Streaming Aggregation] -.. link:spark-sql-streaming-KeyValueGroupedDataset-mapGroupsWithState.adoc[mapGroupsWithState Operator -- Stateful Stream Aggregation (with Explicit State Logic)] -.. link:spark-sql-streaming-KeyValueGroupedDataset-flatMapGroupsWithState.adoc[flatMapGroupsWithState Operator -- Arbitrary Stateful Stream Aggregation (with Explicit State Logic)] +.. link:spark-sql-streaming-KeyValueGroupedDataset-mapGroupsWithState.adoc[mapGroupsWithState Operator -- Stateful Streaming Aggregation (with Explicit State Logic)] +.. link:spark-sql-streaming-KeyValueGroupedDataset-flatMapGroupsWithState.adoc[flatMapGroupsWithState Operator -- Arbitrary Stateful Streaming Aggregation (with Explicit State Logic)] . link:spark-sql-streaming-DataStreamWriter.adoc[DataStreamWriter -- Writing Datasets To Streaming Data Sinks] .. link:spark-sql-streaming-ForeachWriter.adoc[ForeachWriter] @@ -58,6 +58,10 @@ .. link:spark-sql-streaming-QueryProgressEvent.adoc[QueryProgressEvent] .. link:spark-sql-streaming-QueryTerminatedEvent.adoc[QueryTerminatedEvent] +. link:spark-sql-streaming-webui.adoc[Web UI] + +. link:spark-sql-streaming-logging.adoc[Logging] + == Extending Structured Streaming . link:spark-sql-streaming-StreamSourceProvider.adoc[StreamSourceProvider -- Streaming Data Source Provider] @@ -119,25 +123,25 @@ . link:spark-sql-streaming-OffsetSeqMetadata.adoc[OffsetSeqMetadata] -=== State in Stateful Stream Processing +=== State in Stateful Streaming Aggregations -. link:spark-sql-streaming-GroupState.adoc[GroupState -- Contract for State Per Group For Stateful Aggregation] +. link:spark-sql-streaming-GroupState.adoc[GroupState -- State Per Group in Stateful Streaming Aggregation] .. link:spark-sql-streaming-GroupStateImpl.adoc[GroupStateImpl] .. link:spark-sql-streaming-GroupStateTimeout.adoc[GroupStateTimeout] . link:spark-sql-streaming-StateStore.adoc[StateStore] .. link:spark-sql-streaming-StateStoreOps.adoc[StateStoreOps -- Implicits Methods for Creating StateStoreRDD] -.. link:spark-sql-streaming-StateStoreRDD.adoc[StateStoreRDD] +.. link:spark-sql-streaming-StateStoreProvider.adoc[StateStoreProvider] .. link:spark-sql-streaming-StateStoreUpdater.adoc[StateStoreUpdater] .. link:spark-sql-streaming-StateStoreWriter.adoc[StateStoreWriter -- Recording Metrics For Writing to StateStore] -.. link:spark-sql-streaming-StateStoreCoordinator.adoc[StateStoreCoordinator -- Tracking Locations of StateStores for StateStoreRDD] -... link:spark-sql-streaming-StateStoreCoordinatorRef.adoc[StateStoreCoordinatorRef Interface for Communication with StateStoreCoordinator] -.. link:spark-sql-streaming-StateStoreProvider.adoc[StateStoreProvider] . link:spark-sql-streaming-HDFSBackedStateStore.adoc[HDFSBackedStateStore] .. link:spark-sql-streaming-HDFSBackedStateStoreProvider.adoc[HDFSBackedStateStoreProvider] +. link:spark-sql-streaming-StateStoreRDD.adoc[StateStoreRDD -- RDD for Updating State (in StateStore)] +. link:spark-sql-streaming-StateStoreCoordinator.adoc[StateStoreCoordinator -- Tracking Locations of StateStores for StateStoreRDD] +.. link:spark-sql-streaming-StateStoreCoordinatorRef.adoc[StateStoreCoordinatorRef Interface for Communication with StateStoreCoordinator] + == Varia . link:spark-sql-streaming-StreamProgress.adoc[StreamProgress Custom Scala Map] -. link:spark-sql-streaming-logging.adoc[Logging] diff --git a/spark-sql-streaming-GroupState.adoc b/spark-sql-streaming-GroupState.adoc index 3adf80798..a24e7e0a9 100644 --- a/spark-sql-streaming-GroupState.adoc +++ b/spark-sql-streaming-GroupState.adoc @@ -1,4 +1,4 @@ -== [[GroupState]] GroupState -- Contract for State Per Group For Stateful Aggregation +== [[GroupState]] GroupState -- State Per Group in Stateful Streaming Aggregation `GroupState` is the <> for working with a state (of type `S`) per group for arbitrary stateful aggregation (using link:spark-sql-streaming-KeyValueGroupedDataset.adoc#mapGroupsWithState[mapGroupsWithState] or link:spark-sql-streaming-KeyValueGroupedDataset.adoc#flatMapGroupsWithState[flatMapGroupsWithState] operators). diff --git a/spark-sql-streaming-HDFSBackedStateStore.adoc b/spark-sql-streaming-HDFSBackedStateStore.adoc index f1bdd8de3..0b185667f 100644 --- a/spark-sql-streaming-HDFSBackedStateStore.adoc +++ b/spark-sql-streaming-HDFSBackedStateStore.adoc @@ -1,6 +1,6 @@ == [[HDFSBackedStateStore]] HDFSBackedStateStore -`HDFSBackedStateStore` is a link:spark-sql-streaming-StateStore.adoc[StateStore] using HDFS-compatible file system for versioned state persistence. +`HDFSBackedStateStore` is a link:spark-sql-streaming-StateStore.adoc[StateStore] that uses a HDFS-compatible file system for versioned state persistence. `HDFSBackedStateStore` is created exclusively when `HDFSBackedStateStoreProvider` <> (which is when...FIXME). diff --git a/spark-sql-streaming-KeyValueGroupedDataset-flatMapGroupsWithState.adoc b/spark-sql-streaming-KeyValueGroupedDataset-flatMapGroupsWithState.adoc index 9e5e1c394..eeba77d77 100644 --- a/spark-sql-streaming-KeyValueGroupedDataset-flatMapGroupsWithState.adoc +++ b/spark-sql-streaming-KeyValueGroupedDataset-flatMapGroupsWithState.adoc @@ -1,4 +1,4 @@ -== [[flatMapGroupsWithState]] flatMapGroupsWithState Operator -- Arbitrary Stateful Stream Aggregation (with Explicit State Logic) +== [[flatMapGroupsWithState]] flatMapGroupsWithState Operator -- Arbitrary Stateful Streaming Aggregation (with Explicit State Logic) [source, scala] ---- diff --git a/spark-sql-streaming-KeyValueGroupedDataset-mapGroupsWithState.adoc b/spark-sql-streaming-KeyValueGroupedDataset-mapGroupsWithState.adoc index 47a1ca2b4..097eaebb4 100644 --- a/spark-sql-streaming-KeyValueGroupedDataset-mapGroupsWithState.adoc +++ b/spark-sql-streaming-KeyValueGroupedDataset-mapGroupsWithState.adoc @@ -1,4 +1,4 @@ -== [[mapGroupsWithState]] mapGroupsWithState Operator -- Stateful Stream Aggregation (with Explicit State Logic) +== [[mapGroupsWithState]] mapGroupsWithState Operator -- Stateful Streaming Aggregation (with Explicit State Logic) [source, scala] ---- diff --git a/spark-sql-streaming-StateStore.adoc b/spark-sql-streaming-StateStore.adoc index f0650fa8c..efef1845f 100644 --- a/spark-sql-streaming-StateStore.adoc +++ b/spark-sql-streaming-StateStore.adoc @@ -37,6 +37,8 @@ trait StateStore { | [[get]] `get` | +Used exclusively when `StateStoreRDD` link:spark-sql-streaming-StateStoreRDD.adoc#compute[is executed]. + | [[getRange]] `getRange` | diff --git a/spark-sql-streaming-StateStoreProvider.adoc b/spark-sql-streaming-StateStoreProvider.adoc index 511923a2e..a2079f780 100644 --- a/spark-sql-streaming-StateStoreProvider.adoc +++ b/spark-sql-streaming-StateStoreProvider.adoc @@ -14,4 +14,4 @@ CAUTION: FIXME CAUTION: FIXME -NOTE: `getStore` is used when `StateStore` is link:spark-sql-streaming-StateStore.adoc#get[requested for a state store] (given a `StateStoreProviderId`). +NOTE: `getStore` is used exclusively when `StateStore` is link:spark-sql-streaming-StateStore.adoc#get[requested for a state store] (given a `StateStoreProviderId`). diff --git a/spark-sql-streaming-StateStoreRDD.adoc b/spark-sql-streaming-StateStoreRDD.adoc index 03a9c7b6a..d53ab240a 100644 --- a/spark-sql-streaming-StateStoreRDD.adoc +++ b/spark-sql-streaming-StateStoreRDD.adoc @@ -1,8 +1,10 @@ -== [[StateStoreRDD]] StateStoreRDD +== [[StateStoreRDD]] StateStoreRDD -- RDD for Updating State (in StateStore) -`StateStoreRDD` is a specialized `RDD` for <> using link:spark-sql-streaming-StateStore.adoc[StateStore]. +`StateStoreRDD` is an `RDD` for <> with link:spark-sql-streaming-StateStore.adoc[StateStore] (and data from partitions of a <>). -`StateStoreRDD` is <> as the result of link:spark-sql-streaming-StateStoreOps.adoc#mapPartitionsWithStateStore[executing physical operators] (i.e. `FlatMapGroupsWithStateExec`, `StateStoreRestoreExec`, `StateStoreSaveExec`, `StreamingDeduplicateExec`). +`StateStoreRDD` is <> when link:spark-sql-streaming-StateStoreOps.adoc#mapPartitionsWithStateStore[executing physical operators] that work with state (i.e. `FlatMapGroupsWithStateExec`, `StateStoreRestoreExec`, `StateStoreSaveExec`, `StreamingDeduplicateExec`). + +CAUTION: FIXME graffle with operator -> logical operator -> planner -> physical operator -> StateStoreRDD `StateStoreRDD` uses `StateStoreCoordinator` for <> for job scheduling. diff --git a/spark-sql-streaming-webui.adoc b/spark-sql-streaming-webui.adoc new file mode 100644 index 000000000..bbd354ff3 --- /dev/null +++ b/spark-sql-streaming-webui.adoc @@ -0,0 +1,3 @@ +== Web UI + +Web UI...FIXME