In [9]:
# Run parameters read by the loader cell further down.
pTableName = "table1"    # target table; also the sub-folder name under the incremental feed
pJoinKey = "salekey"     # column used to match incoming rows to target rows
pOrderKey = ""           # column that orders duplicate keys; empty string skips de-duplication
pTriggerType = "batch"   # "batch" processes what is available and stops; any other value keeps streaming

StatementMeta(, ea568abd-cea4-4d6a-b05b-d4a5abc7842b, 11, Finished, Available)

###### Explanation

Thanks to Spark Structured Streaming checkpointing, the cell below picks up only the files that have arrived since the last run. It uses `foreachBatch` on the `writeStream` to run a SQL `MERGE` statement against the target table for every micro-batch. For more information on `foreachBatch` and streaming queries, please see [this section](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#using-foreach-and-foreachbatch) of the Apache Spark documentation.

In [10]:
from pyspark.sql.types import *
from pyspark.sql.functions import col,lit,current_timestamp, input_file_name
import sys

# Base location under which the incremental feed and checkpoint folders live.
relbaselocation = "Files/AutoMerger"

# Let the file stream source infer its schema instead of requiring one up front.
spark.sql("set spark.sql.streaming.schemaInference=true")
# NOTE(review): this looks like a legacy Spark Streaming (DStream) setting;
# confirm it has any effect on a Structured Streaming query.
spark.sql("set spark.streaming.stopGracefullyOnShutdown=true")

def _build_merge_statement(target_table, join_key, order_key, column_names):
    """Compose the MERGE statement that upserts the `updates` view into a table.

    Args:
        target_table: Name of the table to merge into (used verbatim).
        join_key: Column that matches source rows to target rows (used verbatim).
        order_key: Column used to pick one row per join key; when empty or None
            the `updates` view is merged as-is.
        column_names: Target table column names; every one of them is updated on
            a match and inserted otherwise.

    Returns:
        The MERGE statement as a single string.
    """
    # Backtick-quote the target columns so reserved words or special characters
    # in a column name cannot break the generated statement.
    quoted = ["`" + name.replace("`", "``") + "`" for name in column_names]
    update_clause = ", ".join(q + " = source." + q for q in quoted)
    insert_columns = ", ".join(quoted)
    insert_values = ", ".join("source." + q for q in quoted)

    if order_key:
        # ROW_NUMBER (not DENSE_RANK) guarantees exactly one source row per join
        # key even when several rows tie on order_key; which tied row is kept is
        # arbitrary.
        source_sql = (
            "(SELECT X.* FROM ("
            "SELECT ROW_NUMBER() OVER (PARTITION BY CT.{key} ORDER BY CT.{order} DESC) AS DR, CT.* "
            "FROM updates AS CT) X WHERE X.DR = 1) source"
        ).format(key=join_key, order=order_key)
    else:
        source_sql = "updates AS source"

    return "\n".join([
        "MERGE INTO " + target_table + " target",
        "USING " + source_sql,
        "ON source." + join_key + " = target." + join_key,
        "WHEN MATCHED THEN UPDATE SET " + update_clause,
        "WHEN NOT MATCHED THEN INSERT (" + insert_columns + ") VALUES (" + insert_values + ")",
    ])


def loadIncrementals(table_name, join_key, order_key, trigger_type=None):
    """Stream new parquet files for a table and merge each micro-batch into it.

    Files are read from `<relbaselocation>/incrementalfeed/<table_name>/` and the
    stream's progress is checkpointed under
    `<relbaselocation>/checkpoints/<table_name>/`.

    Args:
        table_name: Target table name; also the feed and checkpoint sub-folder.
        join_key: Column used to match incoming rows to existing rows.
        order_key: Column used to keep only the latest row per join key within a
            micro-batch; pass '' to merge the micro-batch without de-duplication.
        trigger_type: 'batch' runs with trigger(availableNow=True) and then calls
            mssparkutils.notebook.exit with a success message; any other value
            starts the query with the default trigger. Defaults to the
            notebook-level `pTriggerType`.

    Raises:
        Whatever `query.awaitTermination()` raises when the streaming query fails.
    """
    # Fall back to the notebook parameter so three-argument calls keep working.
    if trigger_type is None:
        trigger_type = pTriggerType

    source_path = relbaselocation + "/incrementalfeed/" + table_name + "/"
    checkpoint_path = relbaselocation + "/checkpoints/" + table_name + "/"

    def upsertToDeltaCaptureCDC(microBatchOutputDF, batchId):
        """Merge one micro-batch into the target table (foreachBatch callback)."""
        # table_name / join_key / order_key come from the enclosing call, so the
        # micro-batch is never collected just to read them back; an empty
        # micro-batch therefore reaches the MERGE instead of failing on a lookup.
        session = microBatchOutputDF.sparkSession
        # NOTE(review): input_file_name() is evaluated on the micro-batch frame
        # rather than on the streaming read; confirm it is populated in this runtime.
        updates = (
            microBatchOutputDF
            .withColumn("changeTimestamp", current_timestamp())
            .withColumn("inputFile", input_file_name())
        )
        updates.createOrReplaceTempView("updates")

        # The target table's own column list drives the UPDATE and INSERT
        # clauses, so extra columns in the micro-batch are ignored.
        column_names = [
            row.col_name
            for row in session.sql("show columns in " + table_name).collect()
        ]
        session.sql(_build_merge_statement(table_name, join_key, order_key, column_names))

    printlog = "| Reading new data from location: " + source_path
    streamingUpserts = spark.readStream.format("parquet").load(source_path)

    # NOTE(review): foreachBatch presumably supersedes the sink format set here;
    # confirm before removing the format() call.
    writer = (
        streamingUpserts.writeStream
        .queryName(table_name)
        .format("delta")
        .foreachBatch(upsertToDeltaCaptureCDC)
        .option("checkpointLocation", checkpoint_path)
    )

    if trigger_type == 'batch':
        printlog += "| Running in batch mode"
        writer = writer.trigger(availableNow=True)
    else:
        printlog += "| Running in streaming mode"

    query = writer.start()
    # Blocks until the query stops; failures propagate to the caller unchanged.
    query.awaitTermination()

    if trigger_type == 'batch':
        mssparkutils.notebook.exit("Successfully loaded " + table_name + ". Print log:" + printlog)
    
# Run the incremental load with the notebook parameters defined in the first cell.
loadIncrementals(
    table_name=pTableName,
    join_key=pJoinKey,
    order_key=pOrderKey,
)

StatementMeta(, ea568abd-cea4-4d6a-b05b-d4a5abc7842b, 12, Finished, Available)