## Source

In [1]:
# Build a streaming DataFrame from the 'rate' source, configured for
# 2 rows per second spread over 2 partitions.
df = (
    spark.readStream
    .format('rate')
    .option('rowsPerSecond', 2)
    .option('numPartitions', 2)
    .load()
)

In [2]:
# Inspect the columns exposed by the streaming source before transforming it.
df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)



## Transform

We will transform the stream by adding a column that holds the partition id of each row:

In [3]:
from pyspark.sql.functions import spark_partition_id

In [4]:
df_with_partition = df.withColumn(
    'partition',
    spark_partition_id(),  # id of the partition each row belongs to
)

In [5]:
# Confirm that the new 'partition' column was appended to the schema.
df_with_partition.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)
 |-- partition: integer (nullable = false)



## Sink

In [6]:
# Start a streaming query that appends every new row to a 'memory' sink;
# the query name ('mytable') is the name used to query the results via SQL.
query = (
    df_with_partition.writeStream
    .format('memory')
    .outputMode('append')
    .queryName('mytable')
    .start()
)

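We can query the in-memory table using the name given in `queryName`. Right after starting the query the table may still be empty, because no micro-batch has been committed yet: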

In [7]:
# Count the rows currently available in 'mytable'.
spark.sql('select count(*) as count from mytable').show()

+-----+
|count|
+-----+
|    0|
+-----+



If we wait a little bit, we will see that the table keeps growing:

In [8]:
# Run the same count again; the result reflects the rows appended since the previous check.
spark.sql('select count(*) as count from mytable').show()

+-----+
|count|
+-----+
|   14|
+-----+



In [9]:
# Show the 10 most recent rows without truncating the timestamp column.
latest_rows_sql = 'select * from mytable order by timestamp desc limit 10'
spark.sql(latest_rows_sql).show(truncate=False)

+----------------------+-----+---------+
|timestamp             |value|partition|
+----------------------+-----+---------+
|2022-09-12 17:49:34.16|15   |1        |
|2022-09-12 17:49:33.66|14   |0        |
|2022-09-12 17:49:33.16|13   |1        |
|2022-09-12 17:49:32.66|12   |0        |
|2022-09-12 17:49:32.16|11   |1        |
|2022-09-12 17:49:31.66|10   |0        |
|2022-09-12 17:49:31.16|9    |1        |
|2022-09-12 17:49:30.66|8    |0        |
|2022-09-12 17:49:30.16|7    |1        |
|2022-09-12 17:49:29.66|6    |0        |
+----------------------+-----+---------+



## Stop

Finally, we must remember to stop the query; otherwise the memory sink keeps accumulating rows and will eventually fill the memory of the driver:

In [10]:
# Stop the streaming query started above so that no further rows are appended.
query.stop()