In [0]:
# Create the three working folders used by this notebook: one for the stream
# checkpoint, one that the stream reads from, and one that it writes to.
for folder in (
    "/Volumes/youtube/raja_de/streaming_dumps/stream_checkpoint",
    "/Volumes/youtube/raja_de/streaming_dumps/stream_read",
    "/Volumes/youtube/raja_de/streaming_dumps/stream_write",
):
    dbutils.fs.mkdirs(folder)

In [0]:
# DDL-style schema string: three columns, written as "name type" pairs.
# NOTE: `schema_defined` is rebound to a StructType in a later cell, so this
# string form is superseded before the stream reader is built.
schema_defined = ",".join(["File String", "Shop String", "Sales_count integer"])

# Reading stream data

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import sum as _sum

# --- Schema -----------------------------------------------------------------
# Explicit schema for the incoming CSV files; every column is nullable.
column_specs = [
    ("File", StringType()),
    ("Shop", StringType()),
    ("Sales_count", IntegerType()),
]
schema_defined = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in column_specs]
)

# --- Locations --------------------------------------------------------------
# Both folders live under the /Volumes/ path created at the top of the notebook.
# NOTE(review): the visible write cell passes its own checkpoint literal
# (".../stream_checkpoint/parquet_once") instead of `checkpoint_path`.
input_path = "/Volumes/youtube/raja_de/streaming_dumps/stream_read/"
checkpoint_path = "/Volumes/youtube/raja_de/streaming_dumps/stream_checkpoint/"

# --- Streaming source -------------------------------------------------------
# Streaming CSV reader over `input_path`, using the schema above and the
# "header" option enabled.
csv_stream_reader = (
    spark.readStream
         .format("csv")
         .schema(schema_defined)
         .option("header", True)
)
df = csv_stream_reader.load(input_path)

# --- Aggregation ------------------------------------------------------------
# Sum of Sales_count per Shop, exposed as Total_Sales.
# NOTE(review): the visible write cell streams `df`, not `df1`.
total_sales = _sum("Sales_count").alias("Total_Sales")
df1 = df.groupBy("Shop").agg(total_sales)


In [0]:
# Sink configuration: append the raw stream `df` as Parquet files, with a
# dedicated checkpoint folder for this query.
parquet_sink = (
    df.writeStream
      .format("parquet")
      .outputMode("append")
      .option("path", "/Volumes/youtube/raja_de/streaming_dumps/stream_write/")
      .option("checkpointLocation", "/Volumes/youtube/raja_de/streaming_dumps/stream_checkpoint/parquet_once")
)

# availableNow=True: per the Spark trigger documentation, the query processes
# the data available when it starts and then stops, instead of running
# indefinitely.
query = parquet_sink.trigger(availableNow=True).start()

# Block this cell until the query has finished.
query.awaitTermination()


In [0]:
# Sample CSV payloads dropped into the stream input folder, one file per batch.
#
# These definitions must be live code: the dbutils.fs.put calls below reference
# batch2..batch5, and with the definitions commented out the cell raises
# NameError on a fresh kernel (it only worked when the names were left over
# from an earlier run).
#
# NOTE(review): the header row here (emp_id,name,age,department) does not match
# the File/Shop/Sales_count schema given to the stream reader — confirm which
# of the two is the intended sample data.
batch1 = """emp_id,name,age,department
1,Alice,29,HR
2,Bob,31,Finance
3,Charlie,27,IT
"""
batch2 = """emp_id,name,age,department
4,David,45,Finance
5,Eve,30,HR
"""
batch3 = """emp_id,name,age,department
6,Frank,33,Sales
7,Grace,28,Finance
"""
batch4 = """emp_id,name,age,department
8,Henry,41,IT
9,Ivy,26,HR
"""
batch5 = """emp_id,name,age,department
10,Jack,38,Sales
11,Karen,34,Finance
"""

# Write the files; the third argument (True) overwrites a file that is already
# present. The batch1 write was disabled in the original cell and is left
# disabled here so the set of files written is unchanged — uncomment it to
# seed the first batch on an empty folder.
# dbutils.fs.put("/Volumes/youtube/raja_de/streaming_dumps/stream_read/batch1.csv", batch1, True)
dbutils.fs.put("/Volumes/youtube/raja_de/streaming_dumps/stream_read/batch2.csv", batch2, True)
dbutils.fs.put("/Volumes/youtube/raja_de/streaming_dumps/stream_read/batch3.csv", batch3, True)
dbutils.fs.put("/Volumes/youtube/raja_de/streaming_dumps/stream_read/batch4.csv", batch4, True)
dbutils.fs.put("/Volumes/youtube/raja_de/streaming_dumps/stream_read/batch5.csv", batch5, True)

In [0]:
# Batch-read whatever the streaming query wrote to the output folder and
# render it in the notebook.
written_rows = (
    spark.read
         .format("parquet")
         .load("/Volumes/youtube/raja_de/streaming_dumps/stream_write/")
)
display(written_rows)