In [0]:
# fetch the variables stored under 'layer_vars_key' by the 'silver_task' job task
vars = dbutils.jobs.taskValues.get(taskKey='silver_task', key='layer_vars_key')

# unpack the entries this notebook works with
gold_path, silver_reports, gold_reports = (
    vars[name] for name in ('gold_path', 'silver_reports', 'gold_reports')
)

In [0]:
import random
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
# read the silver reports table as a Spark DataFrame
df = spark.table(silver_reports)

In [0]:
# derive the priority label ('prioriteit') from the normalised problem type;
# the mapping is evaluated in the order written below, and a problem type
# that is not listed ends up with a null priority (no otherwise() branch)
priority_by_problem = {
    'losse stoeptegel': 'laag',
    'kapotte lantaarnpaal': 'middel',
    'kapotte vuilnisbak': 'hoog',
}

problem = F.col('problem_norm')
priority = None
for problem_name, level in priority_by_problem.items():
    matches = problem == problem_name
    # the first entry opens the CASE expression, later entries extend it
    priority = F.when(matches, level) if priority is None else priority.when(matches, level)

df = df.withColumn('prioriteit', priority)

In [0]:
# keep only the columns the business layer needs, in this order
gold_columns = [
    'id',
    'problem_norm',
    'prioriteit',
    'street_name',
    'house_number',
    'postcode',
    'lon',
    'lat',
    'status',
    'status_updated_at',
    'reported_on',
]
df_gold = df.select(*gold_columns)

In [0]:
# !!simulation only: pretends that some reports have been solved!!
# When more than 50 reports are open, 5-10 of the 50 oldest open reports are
# picked at random and their status is set to 'solved'.
# For those reports status_updated_at becomes reported_on plus a random
# 120-599 minutes, to support SCD type 1.

df_open = df_gold.filter(F.col('status') == 'open')

if df_open.count() > 50:
    oldest_open = df_open.orderBy(F.col('reported_on').asc()).limit(50)

    # shuffle the 50 oldest reports and keep a random-sized handful of ids
    n_to_solve = random.randint(5, 10)
    picked = oldest_open.orderBy(F.rand()).limit(n_to_solve).select(F.col('id'))
    ids_to_solve = [row.id for row in picked.collect()]

    should_solve = F.col('id').isin(ids_to_solve)

    # whole minutes between reporting and solving: floor(rand * 480) + 120
    delay_minutes = F.floor(F.rand() * (600 - 120)) + 120
    solved_at = F.col('reported_on') + F.expr('INTERVAL 1 MINUTE') * delay_minutes

    # rows that were not picked keep their current status and timestamp
    new_status = F.when(should_solve, F.lit('solved')).otherwise(F.col('status'))
    new_updated_at = F.when(should_solve, solved_at).otherwise(F.col('status_updated_at'))

    df_gold = (
        df_gold
        .withColumn('status', new_status)
        .withColumn('status_updated_at', new_updated_at)
    )
                   

In [0]:
# Upsert df_gold into the gold table so that an unexpected re-run of the task
# does not create duplicate records.
# The existing table is opened as a DeltaTable, which provides the merge operation.
# On the first run there is no table yet, so it is created with an overwrite write.

if not spark.catalog.tableExists(gold_reports):
    print('Gold table does not exist → creating new table')
    writer = df_gold.write.mode('overwrite').format('delta').option('path', gold_path)
    writer.saveAsTable(gold_reports)
else:
    print('Gold table exists → started merge')
    gold_table = DeltaTable.forName(spark, gold_reports)
    merge_builder = gold_table.alias('existing').merge(
        df_gold.alias('new'),
        'existing.id = new.id',
    )
    # rows with a known id are overwritten by the new row, unknown ids are inserted
    merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [0]:
# values handed to the next notebook through the job's task values
vars = dict(gold_reports=gold_reports)
dbutils.jobs.taskValues.set(key='layer_vars_key', value=vars)
