Step 1: Create Catalog Structure (SQL)

In [0]:
%sql
-- List the catalogs currently visible in this workspace
SHOW CATALOGS

In [0]:
%sql
-- Create the retailplex_platform catalog and make it the session default
CREATE CATALOG IF NOT EXISTS retailplex_platform;
USE CATALOG retailplex_platform;

-- Create one schema per medallion layer (landing -> bronze -> silver -> gold).
-- The silver schema was missing from the original list; IF NOT EXISTS keeps this idempotent.
CREATE SCHEMA IF NOT EXISTS retailplex_platform.landing;
CREATE SCHEMA IF NOT EXISTS retailplex_platform.bronze;
CREATE SCHEMA IF NOT EXISTS retailplex_platform.silver;
CREATE SCHEMA IF NOT EXISTS retailplex_platform.gold;

-- Volume that receives the raw landing files
CREATE VOLUME IF NOT EXISTS retailplex_platform.landing.raw_files;


-- Volumes for the bronze schema files and the streaming checkpoints
CREATE VOLUME IF NOT EXISTS retailplex_platform.bronze.schemas
COMMENT 'Volume for bronze schemas';
CREATE VOLUME IF NOT EXISTS retailplex_platform.bronze.checkpoints
COMMENT 'Volume for bronze checkpoints';


Step 2: Upload Your JSON File


In [0]:

# The upload methods will not work here. It is better to upload the file directly to: /Volumes/retailplex_platform/landing/raw_files/incoming_multiplex_data/multiplex_data.jsonl

Step 3: Process the Stream and Save It to the Bronze Layer

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [0]:
# Session-level Spark settings, applied before any Spark action runs.
# Kept as an ordered table so the keys and values can be read at a glance.
_spark_settings = {
    "spark.databricks.delta.optimizeWrite.enabled": "true",
    "spark.databricks.delta.autoCompact.enabled": "true",
    "spark.sql.shuffle.partitions": "200",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.sql.adaptive.skewJoin.enabled": "true",
    "spark.sql.streaming.schemaInference": "true",
    "spark.sql.broadcastTimeout": "1200",
}
for _conf_key, _conf_value in _spark_settings.items():
    spark.conf.set(_conf_key, _conf_value)

In [0]:
print("🌊 Setting up Auto Loader for Streaming Source...")

# Auto Loader locations, all under the retailplex_platform volumes, in this order:
#   directory to read from, schema location, checkpoint location
streaming_source_path, schema_location, checkpoint_location = (
    "/Volumes/retailplex_platform/" + relative_dir
    for relative_dir in (
        "landing/raw_files/incoming_multiplex_data/",
        "bronze/schemas/retailplex_stream/",
        "bronze/checkpoints/retailplex_stream/",
    )
)

# Bronze Delta table, addressed by its three-level catalog.schema.table name
bronze_table = ".".join(("retailplex_platform", "bronze", "multiplex_stream"))


In [0]:

# Explicit schema for the multiplex records: three nullable string fields
multiplex_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("topic", "timestamp", "data")
    ]
)


In [0]:
def create_autoloader_stream():
    """
    Ingest the multiplex JSON files with Auto Loader and append them to the Bronze table.

    Reads JSON files from ``streaming_source_path`` using the explicit
    ``multiplex_schema``, adds a ``year_month`` column (derived from
    ``timestamp``) and an ``_ingestion_timestamp`` column, then appends the
    rows to ``bronze_table`` in Delta format, partitioned by ``topic`` and
    ``year_month``.

    The notebook-level names ``spark``, ``F``, ``multiplex_schema``,
    ``streaming_source_path``, ``schema_location``, ``checkpoint_location``
    and ``bronze_table`` must be defined before this function is called.

    Returns:
        None. The call blocks until the streaming query has terminated.
    """
    raw_stream = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "json")
        .option("cloudFiles.schemaLocation", schema_location)
        # NOTE(review): an explicit schema is supplied below, so column type
        # inference presumably has no effect here; confirm before removing it.
        .option("cloudFiles.inferColumnTypes", "true")
        .option("cloudFiles.maxFilesPerTrigger", 10)
        .option("cloudFiles.includeExistingFiles", "true")
        .option("cloudFiles.validateOptions", "true")
        .schema(multiplex_schema)
        .load(streaming_source_path)
    )

    enriched_stream = (
        raw_stream
        # `timestamp` is declared as a string in multiplex_schema, so date_format
        # relies on Spark casting it implicitly.
        # NOTE(review): confirm the source timestamp format parses; otherwise
        # year_month would presumably be null.
        .withColumn("year_month", F.date_format("timestamp", "yyyy-MM"))
        .withColumn("_ingestion_timestamp", F.current_timestamp())
    )

    bronze_query = (
        enriched_stream.writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", checkpoint_location)
        .option("mergeSchema", "true")
        .partitionBy("topic", "year_month")
        .trigger(availableNow=True)
        # toTable() is the documented DataStreamWriter method for starting a
        # stream into a table; the original used the undocumented .table() alias.
        .toTable(bronze_table)
    )
    bronze_query.awaitTermination()



In [0]:
# Run the Bronze ingestion; the function waits for the streaming query to terminate
create_autoloader_stream()

In [0]:
# Landing directory that holds the reference-data CSV files
base_in = "/Volumes/retailplex_platform/landing/raw_files/refdata/"
# NOTE(review): base_bz is not referenced anywhere in this cell; confirm whether it is still needed
base_bz = "/Volumes/retailplex_platform/bronze/refdata"

# (bronze table name, source CSV file name)
pairs = [
  ("customer_segments", "CUSTOMER_SEGMENTS.csv"),
  ("product_categories", "PRODUCT_CATEGORIES.csv"),
  ("product_subcategories", "PRODUCT_SUBCATEGORIES.csv"),
  ("suppliers", "SUPPLIERS.csv"),
  ("geography", "GEOGRAPHY.csv")
]

# Kept so the session catalog is switched exactly as before
spark.sql("USE CATALOG retailplex_platform")

# base_in ends with "/", so strip it before joining; otherwise the COPY INTO
# source would contain a doubled slash ("refdata//FILE.csv").
refdata_dir = base_in.rstrip("/")

for tbl, file in pairs:
    # Fully qualified (like bronze_table) so the target does not depend on the session catalog
    target_table = f"retailplex_platform.bronze.{tbl}"
    source_file = f"{refdata_dir}/{file}"

    # Create the table without columns; COPY INTO below runs with mergeSchema enabled
    spark.sql(f"""
      CREATE TABLE IF NOT EXISTS {target_table}
      USING DELTA
    """)
    # Load the CSV (header row, inferred column types) into the bronze table
    spark.sql(f"""
      COPY INTO {target_table}
      FROM '{source_file}'
      FILEFORMAT = CSV
      FORMAT_OPTIONS ('header'='true', 'inferSchema'='true')
      COPY_OPTIONS ('mergeSchema'='true')
    """)


In [0]:
# Summary of the Auto Loader flow. Step 8 describes the trigger(availableNow=True)
# run used by this notebook rather than a fixed 60-second trigger interval.
print("\n🔄 AUTO LOADER WORKFLOW: \
    \n1. 📁 New JSON file lands in source directory,\
    \n2. 🔍 Auto Loader detects the file (within seconds),\
    \n3. 📋 Checks file against known schema,\
    \n4. 🔄 If new columns found → evolves schema,\
    \n5. ⚡ Processes file in next micro-batch,\
    \n6. ✅ Updates internal tracking (never reprocess),\
    \n7. 🎯 Sends data to your streaming processing function,\
    \n8. 🔁 Stops once all available files are processed (availableNow trigger in our setup)")

In [0]:
# The basic Auto Loader setup (the variable name matches the one used in create_autoloader_stream)
print(
    "\n1️⃣ BASIC SETUP:"
    "\n   multiplex_stream = spark.readStream "
    "\n   This creates a streaming DataFrame (not a regular DataFrame)"
    "\n   - readStream = streaming mode (vs read for batch)"
    "\n   - Continuously monitors for new files"
)


# Explains the .format('cloudFiles') call
print(
    "\n2️⃣ FORMAT SPECIFICATION: "
    "\n   .format('cloudFiles')"
    "\n   - 'cloudFiles' is the Auto Loader format "
    "\n   - This tells Spark to use Auto Loader instead of regular file reading "
    "\n   - Auto Loader = Databricks' optimized streaming file reader"
)


# Explains cloudFiles.format; 'delta' was removed from the list because it is not an Auto Loader file format
print(
    "\n3️⃣ FILE FORMAT: "
    "\n   .option('cloudFiles.format', 'json')  "
    "\n   - Tells Auto Loader the files are JSON format "
    "\n   - Other options: parquet, csv, avro, orc, text, binaryFile, etc. "
    "\n   - Auto Loader will parse JSON automatically"
)


# Explains cloudFiles.schemaLocation; the WHERE line shows the schema_location value used in this notebook
print(
    "\n4️⃣ SCHEMA LOCATION: "
    "\n   .option('cloudFiles.schemaLocation', schema_location)  "
    "\n   - WHERE: /Volumes/retailplex_platform/bronze/schemas/retailplex_stream/ "
    "\n   - WHAT: Auto Loader stores the inferred schema here "
    "\n   - WHY: Enables schema evolution and consistency across runs "
    "\n   - BENEFIT: If job restarts, it remembers the schema"
)

# Illustrative layout of the schema location
print(
    "\n   📁 What gets stored in schema location: "
    "\n   ├── _schemas/ "
    "\n   │   └── schema_version_1.json  # Discovered schema"
    "\n   ├── _checkpoint/"
    "\n   │   └── schema evolution tracking"
    "\n   └── metadata files"
)


# Explains cloudFiles.inferColumnTypes
print(
    "\n5️⃣ COLUMN TYPE INFERENCE: "
    "\n   .option('cloudFiles.inferColumnTypes', 'true') "
    "\n   - Auto-detects data types (string, int, double, boolean) "
    "\n   - Without this: everything would be treated as strings "
    "\n   - Example: '123' becomes integer 123, not string '123'"
)


# Explains cloudFiles.schemaEvolutionMode; the snippet is now quoted like the other option examples.
# NOTE(review): create_autoloader_stream does not set this option; this section is explanatory only.
print(
    "\n6️⃣ SCHEMA EVOLUTION: "
    "\n   .option('cloudFiles.schemaEvolutionMode', 'addNewColumns') "
    "\n   - addNewColumns: New fields in JSON get added to schema "
    "\n   - rescue: Puts unknown columns in _rescued_data column "
    "\n   - failOnNewColumns: Fails if new columns appear "
    "\n   - none: No schema evolution allowed"
)


# Explains cloudFiles.maxFilesPerTrigger (stray " ')" removed from the snippet line)
print(
    "\n7️⃣ PROCESSING RATE CONTROL:"
    "\n   .option('cloudFiles.maxFilesPerTrigger', 10)"
    "\n   - Processes maximum 10 files per micro-batch"
    "\n   - Prevents overwhelming the system with too many files"
    "\n   - Helps with consistent processing latency"
    "\n   - Tune based on file size and processing capacity"
)


# Worked scenarios for the 10-files-per-trigger limit
print(
    "\n   📊 File Processing Strategy:"
    "\n   Scenario 1: 5 files arrive → Process all 5"
    "\n   Scenario 2: 50 files arrive → Process 10, queue the rest"
    "\n   Scenario 3: No files → Wait for next trigger"
)

# Explains cloudFiles.includeExistingFiles (stray " ')" removed from the snippet line)
print(
    "\n8️⃣ EXISTING FILES:"
    "\n   .option('cloudFiles.includeExistingFiles', 'true')"
    "\n   - true: Process files that already exist (backfill)"
    "\n   - false: Only process files that arrive after stream starts"
    "\n   - Useful for: Initial data load or catching up"
)


# Explains cloudFiles.validateOptions (stray quote/parenthesis characters removed from two lines)
print(
    "\n9️⃣ VALIDATION:"
    "\n   .option('cloudFiles.validateOptions', 'true')"
    "\n   - Validates all Auto Loader options at startup"
    "\n   - Catches configuration errors early"
    "\n   - Recommended for production"
)

# Explains the explicit .schema(...) call (stray leading quote removed from the snippet line)
print(
    "\n🔟 PROVIDING SCHEMA:"
    "\n   .schema(multiplex_schema) "
    "\n   - Provides explicit schema instead of full inference"
    "\n   - Faster startup (no schema inference time)"
    "\n   - More predictable behavior"
    "\n   - Catches schema violations early"
)

# Trade-offs of supplying an explicit schema
print(
    "\n   💡 Schema vs Inference Trade-offs:"
    "\n   With Explicit Schema:"
    "\n   ✅ Faster startup"
    "\n   ✅ Predictable structure "
    "\n   ✅ Type safety"
    "\n   ❌ Must maintain schema manually"
)

# Trade-offs of relying on full schema inference
print(
    "\n   With Full Inference:"
    "\n   ✅ Automatic discovery"
    "\n   ✅ Handles unknown structures"
    "\n   ❌ Slower first run"
    "\n   ❌ Less predictable"
)

# Explains the .load(...) call; the variable name and path match streaming_source_path in this notebook
print(
    "\n1️⃣1️⃣ SOURCE PATH:"
    "\n   .load(streaming_source_path)"
    "\n   - WHERE: /Volumes/retailplex_platform/landing/raw_files/incoming_multiplex_data/"
    "\n   - WHAT: Directory to monitor for files"
    "\n   - BEHAVIOR: Recursively monitors subdirectories"
)