In [0]:
# ==========================
# Real-Time Retail Pipeline
# Event Hub → Bronze → Silver → Gold
# ==========================

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, from_json, sum as _sum, avg as _avg, countDistinct
import json
# Load pipeline settings; later sections read the Event Hub connection string
# and the Delta table paths from this dict.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the driver's locale default encoding.
with open("/dbfs/FileStore/configs/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
# ==========================
# 1. Spark Session
# ==========================
# Reuse the active session when one already exists; otherwise build a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName("RealTimeRetailPipeline")
    .getOrCreate()
)

# ==========================
# 2. Event Hub Configuration
# ==========================
eh_connection_string = config["eventhub"]["connection_string"]

# The Event Hubs Spark connector expects the connection string in encrypted
# form. The JVM helper is reached through the `spark` session rather than the
# notebook-only `sc` global, so this section does not depend on a name the
# script never defines.
ehConf = {
    "eventhubs.connectionString": (
        spark.sparkContext._jvm.org.apache.spark.eventhubs.EventHubsUtils
        .encrypt(eh_connection_string)
    ),
}

# ==========================
# 3. Define Schema
# ==========================
# Every field of the incoming JSON payload is declared as a string here.
schema = (
    StructType()
    .add("InvoiceNo", StringType())
    .add("StockCode", StringType())
    .add("Description", StringType())
    .add("Quantity", StringType())
    .add("InvoiceDate", StringType())
    .add("UnitPrice", StringType())
    .add("CustomerID", StringType())
    .add("Country", StringType())
)

# ==========================
# 4. Bronze: Raw Ingestion from Event Hub
# ==========================
df_raw = spark.readStream.format("eventhubs").options(**ehConf).load()

# Decode the binary `body` column to text, parse it as JSON against `schema`,
# then flatten the resulting struct into top-level columns.
bronze_df = (
    df_raw
    .select(col("body").cast("string").alias("json"))
    .select(from_json(col("json"), schema).alias("data"))
    .select("data.*")
)

bronze_path = config["delta_tables"]["bronze_path"]

# Do not display() the streaming DataFrames here. Each display() call on a
# streaming DataFrame starts its own unbounded streaming query, so previews of
# bronze_df and df_clean would run alongside the writer below as extra readers
# of the same Event Hub. ehConf sets no consumer group, so all of them would
# share one, and concurrent receivers on a shared consumer group disconnect
# each other. To inspect the data, read the Delta table at `bronze_path`.

# Convert the numeric fields, which the schema parses as strings.
df_clean = (
    bronze_df
    .withColumn("Quantity", col("Quantity").cast(IntegerType()))
    .withColumn("UnitPrice", col("UnitPrice").cast(DoubleType()))
)

# Append the cleaned stream to the bronze Delta table. The checkpoint lives in
# a `_checkpoint` subdirectory under the table path.
bronze_query = (
    df_clean.writeStream
    .format("delta")
    .option("checkpointLocation", bronze_path + "/_checkpoint")
    .outputMode("append")
    .start(bronze_path)
)