In [0]:
from pyspark.sql.functions import date_trunc, from_utc_timestamp, current_timestamp

def add_ingestion_date(input_df):
    """Return ``input_df`` with an extra ``ingestion_date`` column.

    The column holds the current timestamp, shifted from UTC to the
    ``Europe/Amsterdam`` time zone and then truncated to the start of the day.
    """
    # Current timestamp, interpreted as UTC and rendered in Amsterdam time.
    amsterdam_now = from_utc_timestamp(current_timestamp(), "Europe/Amsterdam")
    # Drop the time-of-day part so only the calendar day is kept.
    ingestion_day = date_trunc("day", amsterdam_now)
    return input_df.withColumn("ingestion_date", ingestion_day)

In [0]:
from  pyspark.sql.functions import input_file_name
def add_data_source(input_df):
    """Return ``input_df`` with an extra ``data_source`` column.

    The column is filled with the value of Spark's ``input_file_name()``
    for each row.
    """
    source_file = input_file_name()
    return input_df.withColumn("data_source", source_file)

In [0]:
def merge_delta_data(input_df, db_name, table_name, folder_path, merge_condition, partition_column):
    """Upsert ``input_df`` into the Delta table ``db_name.table_name``.

    If the table is already registered in the catalog, the rows of
    ``input_df`` (aliased ``src``) are merged into the Delta table stored at
    ``{folder_path}/{table_name}`` (aliased ``tgt``): matching rows are
    updated, non-matching rows are inserted.

    If the table is not registered yet, ``input_df`` is written as a
    partitioned Delta table to ``{folder_path}/{table_name}`` and that
    location is then registered in the metastore under
    ``db_name.table_name``.

    Args:
        input_df: DataFrame holding the rows to upsert.
        db_name: Name of the database (schema) that owns the table.
        table_name: Name of the table; also the last path segment of the
            table's storage location.
        folder_path: Parent folder under which the table's files live.
        merge_condition: SQL join condition for the merge, written in terms
            of the ``tgt`` and ``src`` aliases.
        partition_column: Column used to partition the table on first write.

    Relies on a ``spark`` session being available as a global, as it is in
    a notebook.
    """
    from delta.tables import DeltaTable

    # Enable dynamic partition pruning for this session before merging.
    spark.conf.set("spark.databricks.optimizer.dynamicPartitionPruning", "true")

    table_path = f"{folder_path}/{table_name}"
    qualified_name = f"{db_name}.{table_name}"

    if spark._jsparkSession.catalog().tableExists(qualified_name):
        delta_table = DeltaTable.forPath(spark, table_path)
        (
            delta_table.alias("tgt")
            .merge(input_df.alias("src"), merge_condition)
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
        )
        return

    # First load: write the Delta data files to the table's storage path.
    # This must be a path-based save, not saveAsTable: saveAsTable would
    # already register the table (at a metastore-chosen location), so the
    # CREATE TABLE below would fail and later merges via forPath would not
    # find the data.
    (
        input_df.write
        .mode("overwrite")
        .partitionBy(partition_column)
        .format("delta")
        .save(table_path)
    )

    # Register the table in the metastore so it can be queried by name;
    # without this step the data can only be read via its path.
    query = f"""
        CREATE TABLE {qualified_name}
        USING DELTA
        LOCATION '{table_path}'
        """
    spark.sql(query)
    