## Template for performing event-based ingestion and merging from Attunity change files

In [2]:
#Summary of algorithm
# - Attunity CDC sends change data for various tables into an ADLS Gen2 folder; each table has its own folder.
# - Each change data file comes with a schema file (dfm) that describes the schema of the data file.
# - Event Grid listens for new files landing in the subscribed folder and creates messages detailing the location and type of operation for each file.
# - Our main program reads messages from the message queue, sorts them by table, then processes the messages in batches of a predefined size. By sorting we get the smallest possible number of tables in each batch.
# - Within each batch, the process_files program groups by table and retrieves a unique schema file and the data files for each table in the group. From the schema file, it forms the schema and uses it to read the data.
# - For insert data, use a regular insert. For updates and deletes, use MERGE to merge the data into the target table.

In [3]:
# Notebook configuration: read the connection settings from widgets and set up storage access.
# Reset the widgets
# NOTE(review): Databricks documents that widgets should not be created in the same cell that
# removes them - confirm this cell behaves as intended on your runtime.
dbutils.widgets.removeAll()

# Every setting is supplied through a text widget whose default is an empty string.
dbutils.widgets.text("STORAGE_ACCOUNT", "")
dbutils.widgets.text("SAS_KEY", "")
dbutils.widgets.text("ACCOUNT_KEY", "")
dbutils.widgets.text("QUEUE_NAME", "")
dbutils.widgets.text("ROOT_PATH", "")

# Surrounding whitespace is stripped from every widget value.
account_name = dbutils.widgets.get("STORAGE_ACCOUNT").strip()
# Never paste a real SAS token into the notebook; pass it through the SAS_KEY widget.
sas = dbutils.widgets.get("SAS_KEY").strip()
account_key = dbutils.widgets.get("ACCOUNT_KEY").strip()
# Example value: 'queue1'
queue_name = dbutils.widgets.get("QUEUE_NAME").strip()

# Set up account access key
conf_key = "fs.azure.account.key.{storage_acct}.dfs.core.windows.net".format(storage_acct=account_name)
spark.conf.set(conf_key, account_key)

# Prefix that is prepended to "<table folder>/<file name>" when file paths are built.
root_path = dbutils.widgets.get("ROOT_PATH").strip()
# Maps a source table (folder) name to the path of its target table.
table_path = {"test": "/tmp/target_test", "test2": "/tmp/target_test2"}



## Utility functions to parse schema and load data

In [5]:

from delta.tables import *
from pyspark.sql.types import StructField, StructType , LongType, StringType, DoubleType , DateType 

def get_file_info(paths, file_path="", file_extension="json"):
  """Parse dfm schema files and group the described data files by schema.

  The assumption is that each table has its own directory to which all of
  its files are sent.

  Args:
    paths: location(s) of the dfm schema files, handed to the Spark JSON reader.
    file_path: prefix prepended to every data file name.
    file_extension: extension appended (after a dot) to every data file name.

  Returns:
    A dict mapping each distinct StructType to the list of data file paths
    that share it.
  """
  # Mapping between Oracle data types and Spark SQL types. Needs update
  type_lookup = {'STRING': StringType(), 'DATETIME': DateType(), 'NUMERIC': DoubleType()}

  dfm_rows = (
      spark.read
      .option("multiLine", True)
      .option("mode", "PERMISSIVE")
      .json(paths)
      .select(["dataInfo", "fileInfo"])
      .collect()
  )

  schema_by_file = {}
  for dfm_row in dfm_rows:
    file_schema = StructType()
    for column in dfm_row['dataInfo']['columns']:
      # default to String type if no mapping is found
      file_schema.add(column['name'], type_lookup.get(column['type'], StringType()))
    data_file = file_path + dfm_row['fileInfo']['name'] + "." + file_extension
    schema_by_file[data_file] = file_schema

  # Invert the mapping so that every unique schema lists the files using it
  files_by_schema = {}
  for data_file in sorted(schema_by_file):
    files_by_schema.setdefault(schema_by_file[data_file], []).append(data_file)
  return files_by_schema
def get_target_schema(change_schema):
    """Build a schema holding only the fields whose name lacks ``header__``.

    Args:
        change_schema: schema object exposing a ``fields`` list whose items
            have a ``name`` attribute.

    Returns:
        A new StructType with the remaining fields in their original order.
    """
    kept_fields = [col for col in change_schema.fields if "header__" not in col.name]
    return StructType(kept_fields)
  

def merge(updatesDF, target_tbl_path):
  """Apply a DataFrame of change records to the Delta table at ``target_tbl_path``.

  Rows flagged 'I' in ``header__change_oper`` are appended; rows flagged 'U'
  or 'D' are merged on (TRANSACTIONNUM, LOOPNUM).

  Args:
    updatesDF: change records with the columns header__change_oper, LOOPNUM,
      TRANSACTIONNUM, B, C and D.
    target_tbl_path: path of the target Delta table.
  """
  key_cols = ['LOOPNUM', 'TRANSACTIONNUM']
  data_cols = ['B', 'C', 'D']
  all_cols = key_cols + data_cols

  targetTable = DeltaTable.forPath(spark, target_tbl_path)
  # The frame is scanned twice (insert + merge), so cache it - and always
  # release the cache afterwards, otherwise every batch leaves a cached frame behind.
  updatesDF.cache()
  try:
    #processing the insert
    insert_df = updatesDF.filter("header__change_oper='I'").select(all_cols)
    insert_df.write.format("delta").mode("append").save(target_tbl_path)

    #processing the update/delete
    upsert_df = updatesDF.filter("header__change_oper='U' or header__change_oper='D'")
    (
      targetTable.alias("tgt_tbl")
      .merge(
          upsert_df.alias("updates"),
          "tgt_tbl.TRANSACTIONNUM = updates.TRANSACTIONNUM and tgt_tbl.LOOPNUM = updates.LOOPNUM")
      .whenMatchedUpdate(
          "updates.header__change_oper='U'",
          set={col: "updates." + col for col in data_cols})
      .whenMatchedDelete("updates.header__change_oper='D'")
      # process_files keeps only the latest change per key, so a row that was
      # inserted and then updated within one batch arrives here as a lone 'U'
      # with no matching target row; insert it instead of silently dropping it.
      .whenNotMatchedInsert(
          condition="updates.header__change_oper='U'",
          values={col: "updates." + col for col in all_cols})
      .execute()
    )
  finally:
    updatesDF.unpersist()
  
def process_files(dfm_filelist,path_prefix,target_tbl_path):
  """Load the data files described by the given dfm files and merge them into the target.

  For every distinct schema returned by get_file_info the matching csv files
  are read, reduced to the latest change per (LOOPNUM, TRANSACTIONNUM) - the
  row with the highest header__change_seq - and passed to merge().

  The ranking is done with the DataFrame API instead of a temp view: a fixed
  view name is shared by the whole Spark session, so concurrent main_proc
  threads would overwrite each other's view.

  Args:
    dfm_filelist: paths of the dfm schema files to process.
    path_prefix: prefix prepended to the data file names found in the dfm files.
    target_tbl_path: path of the target Delta table.
  """
  # Imported locally because the notebook's import cell only brings in pyspark.sql.types.
  from pyspark.sql import Window
  from pyspark.sql import functions as F

  if not dfm_filelist:
    # Nothing to read; avoid asking Spark to load an empty list of paths.
    return

  data_format = 'csv'
  key_cols = ['LOOPNUM', 'TRANSACTIONNUM']
  #to be implemented: derive the column lists dynamically from the target schema
  out_cols = ['header__change_oper'] + key_cols + ['B', 'C', 'D']
  latest_first = Window.partitionBy(*key_cols).orderBy(F.col("header__change_seq").desc())

  for schema, paths in get_file_info(dfm_filelist, path_prefix, data_format).items():
    data = spark.read.format(data_format).schema(schema).load(paths)
    # Keep only the most recent change record for every key
    updatesDF = (
        data
        .withColumn("RNK", F.rank().over(latest_first))
        .filter(F.col("RNK") == 1)
        .select(out_cols)
    )
    merge(updatesDF, target_tbl_path)
  
  

### Main procedure to process incoming data from Eventgrid

In [7]:
from azure.storage.queue import QueueService,QueueMessageFormat
import ast
import time
#Authenticate to Datalake where the files are landed. You can use ABFS or WASB depending on authentication method


#The table_path dictionary (defined in the configuration cell) maps each table name to its target table path
def main_proc():
  """Poll the storage queue forever and merge the change files announced by its messages.

  Each iteration reads up to max_batch_num * batch_size messages, collects the
  dfm schema files they announce grouped by table folder, calls process_files
  once per table with that table's own dfm files, and then deletes the
  messages that were read. When no dfm file was announced it waits wait_time
  seconds before polling again.

  Uses the notebook globals account_name, sas, queue_name, root_path and
  table_path. The loop never exits on its own.
  """
  # Imported locally because the notebook's import cell does not bring in json.
  import json

  #authenticate to the Storage Queue Service using a shared SAS key
  queue_service = QueueService(account_name=account_name,sas_token=sas)

  #visibility_timeout is the estimated time that your processing will last, so that other parallel clusters do not see
  #the same messages meanwhile. Messages return to the queue unless they are explicitly deleted, which is done after
  #successful processing. 32 is the max number of messages in one read; to get more, get_messages is called several times.
  batch = 0
  batch_size = 32
  max_batch_num = 5
  visibility_timeout = 5*60
  #wait time if current queue is empty before retry
  wait_time = 10
  while True:
    #Get estimate of queue length
    metadata = queue_service.get_queue_metadata(queue_name)
    count = metadata.approximate_message_count
    print("Begining processing, queue length is ", count)

    #This is to get more messages than the default limit of 32
    messages = []
    for _ in range(max_batch_num):
      batch_messages = queue_service.get_messages(
            queue_name, num_messages=batch_size, visibility_timeout=visibility_timeout)
      if not batch_messages:
        # An empty read means nothing else is visible right now; skip the remaining fetches
        break
      messages.extend(batch_messages)

    file_list = []
    # table folder -> set of dfm paths. Table and path are recorded together so they
    # cannot get out of step, and each table is later processed with its own files only.
    dfm_by_table = {}
    for message in messages:
      content = QueueMessageFormat.binary_base64decode(message.content).decode('utf8')
      # The payload is JSON; json.loads also accepts true/false/null, which
      # ast.literal_eval rejects.
      json_content = json.loads(content)
      #The logic in this example will process anything which is not a delete operation, including update. Change this to fit your scenario
      if json_content['data']['api'] != "DeleteFile":
        url_parts = json_content['data']['url'].split("/")
        tbl_name = url_parts[-2]
        file_name = root_path + "/".join(url_parts[-2:])
        file_list.append(file_name)
        if ".dfm" in file_name:
          dfm_by_table.setdefault(tbl_name, set()).add(file_name)

    #here is the main processing logic (Data transformation)
    for tbl in sorted(dfm_by_table):
      tbl_path = table_path.get(tbl, "/tmp/target_test")
      print("process: "+tbl+" path: "+tbl_path)
      process_files(sorted(dfm_by_table[tbl]), root_path+tbl+"/", tbl_path)

    # Delete everything that was read: processed messages are done, and messages
    # without a dfm file carry no work and would otherwise reappear after every
    # visibility timeout.
    for message in messages:
      queue_service.delete_message(queue_name, message.id, message.pop_receipt)

    if dfm_by_table:
      print("finish batch {0}, processed {1} files".format(batch, len(file_list)))
      batch = batch + 1
    else:
      #Wait for next batch
      print("Nothing in queue, wait {} seconds for next batch".format(wait_time))
      time.sleep(wait_time)




In [8]:
#calling function a single time
#NOTE: main_proc() never leaves its outer `while True` loop, so this call keeps polling the queue
#and does not return on its own; cells below it only run after this one is interrupted
main_proc()

In [9]:

import threading

# Launch five main_proc workers that run at the same time
WORKER_COUNT = 5
for _ in range(WORKER_COUNT):
    threading.Thread(target=main_proc).start()

In [10]:
# Expose both target Delta tables as temp views, then delete every row from them.
# createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.
for view_name, delta_path in (("test", "/tmp/target_test"), ("test2", "/tmp/target_test2")):
    spark.read.format("delta").load(delta_path).createOrReplaceTempView(view_name)

sql("delete from test2")
sql("delete from test")

In [11]:

%sql select count(*) from test2
--checking result

count(1)
0
