In [0]:
from pyspark.sql.types import *
import utils

# Column layout of the bronze table, in column order. Every column is
# nullable. The underscore-prefixed names presumably follow the XML reader's
# attribute-prefix convention (TODO confirm); `source_file` and `file_size`
# are overwritten from the file `_metadata` column when the stream is read.
_nullable_columns = [
    ("_dBSequence_ref", StringType()),
    ("_end", LongType()),
    ("_id", StringType()),
    ("_isDecoy", BooleanType()),
    ("_peptide_ref", StringType()),
    ("_post", StringType()),
    ("_pre", StringType()),
    ("_start", LongType()),
    ("source_file", StringType()),
    ("file_size", LongType()),
]

table_schema = StructType(
    [StructField(name, data_type, True) for name, data_type in _nullable_columns]
)

In [0]:
from pyspark.sql.functions import col

# XML element to ingest; its lower-cased form also names the target table
# and the checkpoint directory.
xml_tag_to_extract = "PeptideEvidence"

# Job parameters, read from the notebook widgets in a fixed order.
catalog, storage_account, container, path_to_monitor, schema = (
    dbutils.widgets.get(widget_name)
    for widget_name in (
        "catalog",
        "storage_account_name",
        "container",
        "path_to_monitor",
        "schema",
    )
)

# Fully qualified name of the table the stream writes to.
target_table = f"{catalog}.{schema}.{xml_tag_to_extract.lower()}_bronze"

# Directory watched for incoming files.
storage_location = f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_monitor}"

# Streaming checkpoint directory, one per extracted tag, in the same container.
checkpoint_location = f"abfss://{container}@{storage_account}.dfs.core.windows.net/checkpoint/{xml_tag_to_extract.lower()}"

# NOTE(review): presumably creates the target table from the schema when it
# does not exist yet -- the helper lives in `utils` and is not shown here.
utils.create_table_from_schema(spark, table_schema, target_table)


# Streaming read of the XML files under `storage_location` through the
# "cloudFiles" (Auto Loader) source, using `xml_tag_to_extract` as the row
# tag and the explicit `table_schema` instead of schema inference.
_xml_rows = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "xml")
    .option("rowTag", xml_tag_to_extract)
    .schema(table_schema)
    .load(storage_location)
)

# Attach the originating file's path and size, taken from the `_metadata`
# column, to every row.
uploaded_files = _xml_rows.withColumn(
    "source_file", col("_metadata.file_path")
).withColumn(
    "file_size", col("_metadata.file_size")
)

# Start a streaming write of `uploaded_files` into the Delta table
# `target_table`, tracking progress in `checkpoint_location`.
# `result` is the handle of the started streaming query.
# NOTE(review): the query runs asynchronously; if later cells or the job's
# exit status depend on the load having finished, call
# `result.awaitTermination()` -- confirm against how this notebook is run.
result = (
    uploaded_files.writeStream
    .format("delta")
    # The output mode of a streaming sink is set with outputMode(); a generic
    # option named "mode" is a batch-writer setting and has no effect here.
    .outputMode("append")
    .option("checkpointLocation", checkpoint_location)
    # Process all data available at start-up, then stop.
    .trigger(availableNow=True)
    # toTable() is the documented DataStreamWriter method for table sinks.
    .toTable(target_table)
)
