<a href="https://colab.research.google.com/github/joao-dias-25/dataeng-spark/blob/main/spark_streaming/1_read_write_stream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read & Write Stream
- readStream()
- writeStream()
- Streaming Dataframe

# Setting up PySpark

In [1]:
# Install PySpark for this notebook; %pip (rather than !pip) targets the running kernel's environment.
%pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession used by every cell below,
# running against a local master under the app name 'Test streaming'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Test streaming')
    .getOrCreate()
)

# readStream with format "rate"
- readStream
- format("rate")

In [3]:
import pyspark.sql.functions as F

# Build a streaming DataFrame from the "rate" source.
# Nothing is executed yet; load() only returns the DataFrame definition.
stream = (
    spark.readStream
    .format("rate")
    .load()
)

In [4]:
# Inspect the Python type of the object returned by spark.readStream.format("rate").load().
type(stream)

In [5]:
# Check whether `stream` is a streaming DataFrame (expected: True).
stream.isStreaming

True

In [6]:
# For contrast: a DataFrame created from in-memory rows is not a streaming
# DataFrame, so isStreaming should be False here.
data = [
    ("c1", "v1"),
    ("c2", "v2"),
]
columns = ["col1", "col2"]
df = spark.createDataFrame(data, schema=columns)
df.isStreaming

False

In [7]:
# Ordinary DataFrame operations that only inspect metadata, such as printSchema(),
# can be applied to a streaming DataFrame.
stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



In [8]:
# Actions such as count() or show() cannot be run directly on a streaming DataFrame:
# queries with streaming sources must be executed with writeStream.start().
# The AnalysisException is the point of this cell, so it is caught and printed
# instead of being left uncaught; this lets "Restart & Run All" continue past it.
from pyspark.sql.utils import AnalysisException

try:
    stream.count()  # stream.show() fails the same way
except AnalysisException as err:
    print(f"AnalysisException: {err}")

AnalysisException: Queries with streaming sources must be executed with writeStream.start();
rate

# Transform streaming dataframe

In [9]:
# Add a derived column, value2 = value * 2, to the streaming DataFrame.
transformed = stream.withColumn(
    "value2",
    F.col("value") * 2,
)

In [10]:
# Check whether the transformed DataFrame is still a streaming DataFrame.
transformed.isStreaming

True

# write streaming dataframe - format memory
- writeStream
- format("memory")
- queryName
- outputMode
- start

In [11]:
# Start the streaming query. Rows are appended to an in-memory table that can
# be read back under the query name 'rate_report'.
query = (
    transformed.writeStream
    .outputMode('append')      # only add new rows to the result table
    .queryName('rate_report')  # name of the in-memory table
    .format('memory')          # sink: in-memory table
    .start()
)

# Checking result table

In [12]:
# Inspect the type of the handle returned by writeStream...start() (a StreamingQuery).
type(query)

In [21]:
# Print the current row count of the in-memory result table, then show its
# 20 most recent rows without truncating column values.
# The rate source here produces one line per second.
print(spark.table("rate_report").count())
(
    spark.table("rate_report")
    .orderBy(F.col('timestamp').desc())
    .show(n=20, truncate=False)
)

349
+-----------------------+-----+------+
|timestamp              |value|value2|
+-----------------------+-----+------+
|2024-11-23 14:06:52.261|348  |696   |
|2024-11-23 14:06:51.261|347  |694   |
|2024-11-23 14:06:50.261|346  |692   |
|2024-11-23 14:06:49.261|345  |690   |
|2024-11-23 14:06:48.261|344  |688   |
|2024-11-23 14:06:47.261|343  |686   |
|2024-11-23 14:06:46.261|342  |684   |
|2024-11-23 14:06:45.261|341  |682   |
|2024-11-23 14:06:44.261|340  |680   |
|2024-11-23 14:06:43.261|339  |678   |
|2024-11-23 14:06:42.261|338  |676   |
|2024-11-23 14:06:41.261|337  |674   |
|2024-11-23 14:06:40.261|336  |672   |
|2024-11-23 14:06:39.261|335  |670   |
|2024-11-23 14:06:38.261|334  |668   |
|2024-11-23 14:06:37.261|333  |666   |
|2024-11-23 14:06:36.261|332  |664   |
|2024-11-23 14:06:35.261|331  |662   |
|2024-11-23 14:06:34.261|330  |660   |
|2024-11-23 14:06:33.261|329  |658   |
+-----------------------+-----+------+
only showing top 20 rows



In [23]:
# Current status of the streaming query (message, isDataAvailable, isTriggerActive).
query.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [24]:
# Check whether the streaming query is currently active.
query.isActive

True

In [25]:
# List of recent progress reports, one per batch; each carries the batch id,
# row counts, timings, and source/sink details.
query.recentProgress

[{'id': 'e8bec84d-c1fd-47fd-9434-861fea6224e1',
  'runId': '7a425ea8-e17d-49f2-be4d-e3fc87cf49ff',
  'name': 'rate_report',
  'timestamp': '2024-11-23T14:06:12.265Z',
  'batchId': 307,
  'numInputRows': 1,
  'inputRowsPerSecond': 90.90909090909092,
  'processedRowsPerSecond': 11.627906976744187,
  'durationMs': {'addBatch': 28,
   'commitOffsets': 26,
   'getBatch': 0,
   'latestOffset': 0,
   'queryPlanning': 5,
   'triggerExecution': 86,
   'walCommit': 27},
  'stateOperators': [],
  'sources': [{'description': 'RateStreamV2[rowsPerSecond=1, rampUpTimeSeconds=0, numPartitions=default',
    'startOffset': 307,
    'endOffset': 308,
    'latestOffset': 308,
    'numInputRows': 1,
    'inputRowsPerSecond': 90.90909090909092,
    'processedRowsPerSecond': 11.627906976744187}],
  'sink': {'description': 'MemorySink', 'numOutputRows': 1}},
 {'id': 'e8bec84d-c1fd-47fd-9434-861fea6224e1',
  'runId': '7a425ea8-e17d-49f2-be4d-e3fc87cf49ff',
  'name': 'rate_report',
  'timestamp': '2024-11-23T1

In [26]:
# Batch id taken from the most recent progress report.
query.lastProgress['batchId']

480

# Stop streaming

In [27]:
# Stop the running streaming query.
query.stop()

In [28]:
# The in-memory table can still be queried with SQL after the query has been stopped.
spark.sql("select * from rate_report").show()

+--------------------+-----+------+
|           timestamp|value|value2|
+--------------------+-----+------+
|2024-11-23 14:01:...|    0|     0|
|2024-11-23 14:01:...|    1|     2|
|2024-11-23 14:01:...|    2|     4|
|2024-11-23 14:01:...|    3|     6|
|2024-11-23 14:01:...|    4|     8|
|2024-11-23 14:01:...|    5|    10|
|2024-11-23 14:01:...|    6|    12|
|2024-11-23 14:01:...|    7|    14|
|2024-11-23 14:01:...|    8|    16|
|2024-11-23 14:01:...|    9|    18|
|2024-11-23 14:01:...|   10|    20|
|2024-11-23 14:01:...|   11|    22|
|2024-11-23 14:01:...|   12|    24|
|2024-11-23 14:01:...|   13|    26|
|2024-11-23 14:01:...|   14|    28|
|2024-11-23 14:01:...|   15|    30|
|2024-11-23 14:01:...|   16|    32|
|2024-11-23 14:01:...|   17|    34|
|2024-11-23 14:01:...|   18|    36|
|2024-11-23 14:01:...|   19|    38|
+--------------------+-----+------+
only showing top 20 rows



In [29]:
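# awaitTermination: query.awaitTermination() blocks until the query stops or fails.
# It is not called here, so the notebook stays interactive.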


# Increase rows per second (rate)


In [30]:

# Same pipeline as before, but the rate source is configured to emit
# 20 rows per second and the results go to a separate in-memory table.
stream = (
    spark.readStream
    .format("rate")
    .option("rowsPerSecond", 20)
    .load()
)

# Derived column: value2 = value * 2.
transformed = stream.withColumn(
    "value2",
    F.col("value") * 2,
)

# Start the query, appending rows to the in-memory table 'rate_report_2'.
query = (
    transformed.writeStream
    .outputMode('append')
    .queryName('rate_report_2')
    .format('memory')
    .start()
)


In [35]:
# Print the current row count of the second result table, then show up to
# 100 of its rows without truncating column values.
print(spark.table("rate_report_2").count())
(
    spark.table("rate_report_2")
    .show(n=100, truncate=False)
)

860
+-----------------------+-----+------+
|timestamp              |value|value2|
+-----------------------+-----+------+
|2024-11-23 14:18:10.743|0    |0     |
|2024-11-23 14:18:10.793|1    |2     |
|2024-11-23 14:18:10.843|2    |4     |
|2024-11-23 14:18:10.893|3    |6     |
|2024-11-23 14:18:10.943|4    |8     |
|2024-11-23 14:18:10.993|5    |10    |
|2024-11-23 14:18:11.043|6    |12    |
|2024-11-23 14:18:11.093|7    |14    |
|2024-11-23 14:18:11.143|8    |16    |
|2024-11-23 14:18:11.193|9    |18    |
|2024-11-23 14:18:11.243|10   |20    |
|2024-11-23 14:18:11.293|11   |22    |
|2024-11-23 14:18:11.343|12   |24    |
|2024-11-23 14:18:11.393|13   |26    |
|2024-11-23 14:18:11.443|14   |28    |
|2024-11-23 14:18:11.493|15   |30    |
|2024-11-23 14:18:11.543|16   |32    |
|2024-11-23 14:18:11.593|17   |34    |
|2024-11-23 14:18:11.643|18   |36    |
|2024-11-23 14:18:11.693|19   |38    |
|2024-11-23 14:18:11.743|20   |40    |
|2024-11-23 14:18:11.793|21   |42    |
|2024-11-23 14:18:11.

In [36]:
# Number of input rows reported for the first source in the most recent progress report.
query.lastProgress['sources'][0]['numInputRows']

20

In [37]:
# Summarise each recent progress report: its timestamp, batch id, and input
# row count, followed by a "--" separator line.
for batch in query.recentProgress:
    print(
        f"timestamp - {batch['timestamp']}",
        f"batchId - {batch['batchId']}",
        f"numInputRows - {batch['numInputRows']}",
        "--",
        sep="\n",
    )

timestamp - 2024-11-23T14:22:13.752Z
batchId - 243
numInputRows - 20
--
timestamp - 2024-11-23T14:22:14.749Z
batchId - 244
numInputRows - 20
--
timestamp - 2024-11-23T14:22:15.745Z
batchId - 245
numInputRows - 20
--
timestamp - 2024-11-23T14:22:16.745Z
batchId - 246
numInputRows - 20
--
timestamp - 2024-11-23T14:22:17.746Z
batchId - 247
numInputRows - 20
--
timestamp - 2024-11-23T14:22:18.744Z
batchId - 248
numInputRows - 20
--
timestamp - 2024-11-23T14:22:19.751Z
batchId - 249
numInputRows - 20
--
timestamp - 2024-11-23T14:22:20.748Z
batchId - 250
numInputRows - 20
--
timestamp - 2024-11-23T14:22:21.749Z
batchId - 251
numInputRows - 20
--
timestamp - 2024-11-23T14:22:22.745Z
batchId - 252
numInputRows - 20
--
timestamp - 2024-11-23T14:22:23.743Z
batchId - 253
numInputRows - 20
--
timestamp - 2024-11-23T14:22:24.752Z
batchId - 254
numInputRows - 20
--
timestamp - 2024-11-23T14:22:25.746Z
batchId - 255
numInputRows - 20
--
timestamp - 2024-11-23T14:22:26.747Z
batchId - 256
numInputRows 

In [39]:
# Stop the second streaming query (the one writing to 'rate_report_2').
query.stop()