In [0]:
from pyspark.sql import functions as F

In [0]:
# initializing required paths for downstream notebooks
layers = ['source', 'bronze', 'silver', 'gold']

# single point of change for the environment-specific names used in every path below
storage_account = 'storagegemeente'
checkpoint_container = 'checkpoints'
catalog = 'gemeente'
dataset = 'reports'
adls_host = f'{storage_account}.dfs.core.windows.net'

# azure storage paths for location control (one container per layer)
reports_adls_path = {layer: f'abfss://{layer}@{adls_host}/{dataset}' for layer in layers}
source_path = reports_adls_path['source']
bronze_path = reports_adls_path['bronze']
silver_path = reports_adls_path['silver']
gold_path = reports_adls_path['gold']

# autoloader bronze paths (checkpoint and schema state live in a dedicated container, one folder per layer)
checkpoint_adls_path = {layer: f'abfss://{checkpoint_container}@{adls_host}/{layer}/{dataset}/checkpoint' for layer in layers}
schema_adls_path = {layer: f'abfss://{checkpoint_container}@{adls_host}/{layer}/{dataset}/schema' for layer in layers}
bronze_checkpoint_path = checkpoint_adls_path['bronze']
bronze_schema_path = schema_adls_path['bronze']

# unity catalog delta table paths, as three-level names <catalog>.<layer>.<dataset>
uc_path = {layer: f'{catalog}.{layer}.{dataset}' for layer in layers}
bronze_reports = uc_path['bronze']
silver_reports = uc_path['silver']
gold_reports = uc_path['gold']

In [0]:
# incremental ingestion of the json files in the source container via autoloader
# the schema location is where autoloader keeps its schema state between runs
autoloader_options = {
    'cloudFiles.format': 'json',
    'cloudFiles.schemaLocation': bronze_schema_path,
}

source_stream = spark.readStream.format('cloudFiles').options(**autoloader_options)
df = source_stream.load(source_path)

In [0]:
# audit column: stamp each record with the system time at which it was ingested
ingestion_ts = F.current_timestamp()
df = df.withColumn('ingestion_timestamp', ingestion_ts)

In [0]:
# every newly ingested report starts out with problem status 'open'
initial_status = F.lit('open')
df = df.withColumn('status', initial_status)

In [0]:
# creating the first delta table for the medallion architecture
# utilizing unity catalog for lineage and governance while keeping control over storage location
# (the 'path' option pins the table's data files to bronze_path)
bronze_query = (
    df.writeStream
    .format('delta')
    .option('checkpointLocation', bronze_checkpoint_path)
    .outputMode('append')
    .trigger(availableNow=True)
    .option('path', bronze_path)
    .table(bronze_reports)
)

# availableNow processes everything currently available and then stops on its own;
# wait for that here so the notebook only continues once the bronze load has finished
# and any failure inside the streaming query is raised in this cell instead of being lost
bronze_query.awaitTermination()

In [0]:
# variables for passing downstream to the next notebook/task in the databricks job
# named layer_vars rather than `vars` so the python builtin vars() is not shadowed
layer_vars = {
    'silver_path': silver_path,
    'gold_path': gold_path,
    'bronze_reports': bronze_reports,
    'silver_reports': silver_reports,
    'gold_reports': gold_reports,
}

# published under the task value key 'layer_vars_key'; the payload is a plain dict of strings
dbutils.jobs.taskValues.set(key='layer_vars_key', value=layer_vars)