In [0]:
# This cell ingests data into bronze via streaming
# Detects new files automatically and processes them incrementally.

# --- Pipeline locations -------------------------------------------------

# Fully qualified name of the bronze table the stream appends to
target_bronze_table = "leomar.1bronze.claims_stream"

# Volume directory that is watched for incoming source files
source_autoloader_path = "/Volumes/leomar/1bronze/autoloader_files/"

# Volume directory used for the stream's checkpoint data
checkpoint_path = "/Volumes/leomar/1bronze/checkpoint/bronze_claims_stream/"

# Explicit schema applied when reading the claims stream
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# (column name, Spark type) pairs, in column order. Every column is nullable.
_claim_stream_columns = [
    # Business columns
    ("Amount", DoubleType()),
    ("CPTCodes", StringType()),
    ("ClaimDate", TimestampType()),
    ("ClaimID", StringType()),
    ("EventTimestamp", TimestampType()),
    ("ICD10Codes", StringType()),
    ("MemberID", StringType()),
    ("ProviderID", StringType()),
    ("Status", StringType()),
    # Technical columns (presumably the Auto Loader rescued-data column plus
    # ingestion metadata filled in after the read; confirm against the pipeline)
    ("_rescued_data", StringType()),
    ("_source_file", StringType()),
    ("_ingestion_timestamp", TimestampType()),
]

claim_stream_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in _claim_stream_columns
])

# Reader options for Auto Loader ("cloudFiles"), passed to the reader unchanged.
_autoloader_read_options = {
    "cloudFiles.format": "json",  # Source files are JSON
    "cloudFiles.schemaLocation": checkpoint_path,  # Directory where Auto Loader keeps its schema-tracking data
    "multiLine": "true",  # A single JSON record may span several lines
    "escape": '"',  # NOTE(review): looks like a CSV-reader option; confirm it has any effect for JSON
}

# Incrementally read new files from the source directory using the explicit schema
streaming_df = (
    spark.readStream
    .format("cloudFiles")
    .options(**_autoloader_read_options)
    .schema(claim_stream_schema)
    .load(source_autoloader_path)
)

# Attach ingestion metadata to every row (Unity Catalog compatible approach)
from pyspark.sql.functions import current_timestamp, col

# Path of the file each row was read from, taken from the hidden _metadata column
_source_file_col = col("_metadata.file_path")
# Time at which the row is processed by this stream
_ingestion_timestamp_col = current_timestamp()

streaming_df_with_metadata = streaming_df.withColumn(
    "_source_file", _source_file_col
).withColumn(
    "_ingestion_timestamp", _ingestion_timestamp_col
)

# Start the stream as a one-shot incremental run using the AvailableNow trigger.
# AvailableNow processes all data that is pending when the query starts and then
# stops on its own. It is used here because this notebook runs on serverless
# compute, where continuously running (default / time-based) triggers are not
# supported. It replaces the deprecated trigger(once=True).
streaming_query = (streaming_df_with_metadata.writeStream
    .format("delta")
    .outputMode("append")  # Bronze layer is append-only
    .option("checkpointLocation", checkpoint_path)  # Tracks progress so a re-run resumes where the last one stopped
    .option("mergeSchema", "true")  # Allow new columns to be added to the target table
    .queryName("claims_stream_ingestion_query")
    .trigger(availableNow=True)
    .toTable(target_bronze_table)
)

display(streaming_query)

# Block the cell until the query terminates. With the AvailableNow trigger the
# query stops by itself once all pending files have been processed; a failure
# in the stream is raised here.
streaming_query.awaitTermination()

# If you need to stop the stream early, you can run this command in a separate cell.
# streaming_query.stop()
     