In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable

In [0]:
# CONFIG
# Fully-qualified name of the SCD2 dimension table this notebook maintains.
target_table = 'databricks_catalog.gold.Dim_location_scd2'

# Business-key column used to pair source rows with target rows.
bk = 'LocationID'

# Columns that feed the row hash (the business key itself is included).
attr = [
    'LocationID',
    'Borough',
    'Zone',
    'service_zone',
]


#### Data Reading

In [0]:
# Read every column of the silver taxi lookup table; this is the raw input
# from which the SCD2 source frame is derived.
source_query = 'select * from databricks_catalog.silver.taxi_lookup'
df_in = spark.sql(source_query)


#### Create row hash and SCD2 columns

In [0]:
# Build the SCD2 source frame: one row per LocationID, plus a change-detection
# hash and the validity/bookkeeping columns.

# NULL attribute values are replaced by empty strings before being joined with
# '||', so every column in `attr` contributes a field to the hashed text.
hash_inputs = [coalesce(col(c).cast('string'), lit("")) for c in attr]

# Keep a single row per LocationID (dropDuplicates keeps an arbitrary one when
# several rows share the key).
df_dedup = (
    df_in
    .select('LocationID', 'Borough', 'Zone', 'service_zone')
    .dropDuplicates(['LocationID'])
)

# NOTE: effective_to and is_current are written as *string* literals here
# ('9999-12-31' and 'True'), not as date/boolean values; downstream predicates
# compare against those strings.
df_src = (
    df_dedup
    .withColumn('row_hash', sha2(concat_ws('||', *hash_inputs), 256))
    .withColumn('effective_from', current_timestamp())
    .withColumn('effective_to', lit('9999-12-31'))
    .withColumn('is_current', lit('True'))
)

##### Create table if not exists (First run)

In [0]:
# First run: the target table does not exist yet, so seed it with the full
# source frame (every row is already flagged current by df_src).
if not spark.catalog.tableExists(target_table):
    df_src.write.mode('overwrite').format('delta').saveAsTable(target_table)

# Bind the DeltaTable handle on EVERY run. Previously `dt` was assigned only in
# the `else` branch, so on the first run the expire cell's dt.merge(...) failed
# with NameError. Right after the initial load the merge finds identical hashes
# and updates nothing, so running it unconditionally is safe.
dt = DeltaTable.forName(spark, target_table)
    

In [0]:
# Expire current records.
# Each source row is paired with the target row for the same business key that
# is still flagged current; only pairs whose row_hash differs are closed out.
match_on = f"trg.{bk}==src.{bk} and trg.is_current='True'"

# Values written onto an expired row: no longer current, and valid until now.
closed_out = {
    "is_current": lit('False'),
    "effective_to": lit(current_timestamp()),
}

expire_merge = dt.alias('trg').merge(df_src.alias('src'), match_on)
expire_merge.whenMatchedUpdate(
    condition="trg.row_hash!=src.row_hash",
    set=closed_out,
).execute()

#### Insert new and changed records

In [0]:
# Keys and hashes of the rows still flagged current after the expire step.
# is_current is stored as the string 'True'/'False' (that is what df_src writes
# and what the expire merge condition tests), so compare against the string
# literal instead of a Python bool, which only worked through an implicit
# string-to-boolean cast.
current_after_expire = (
    spark.table(target_table)
    .filter(col('is_current') == 'True')
    .select(bk, 'row_hash')
)

# A source row needs a fresh version when either
#   * no current target row exists for its key (brand-new key, or the previous
#     version was just expired), or
#   * the current target row carries a different hash.
df_insert = (
    df_src.alias('src')
    .join(current_after_expire.alias('curr'), on=bk, how='left')
    .filter(
        col('curr.row_hash').isNull()
        | (col('curr.row_hash') != col('src.row_hash'))
    )
    .select('src.*')
)

# Append the new versions; they carry the open-ended validity columns from df_src.
df_insert.write.mode('append').format('delta').saveAsTable(target_table)



In [0]:
%sql
-- Show every row of the dimension table (no filter on is_current).
SELECT *
FROM databricks_catalog.gold.dim_location_scd2