In [0]:

import pandas as pd
from pyspark.sql.types import *

# Create table from dataframe
from pyspark.sql import DataFrame

# Spark schema for the extracted records: a nullable array of `Modification`
# structs (each carrying a nested `cvParam` struct), the `PeptideSequence` and
# `_id` strings, and two non-nullable bookkeeping columns (`source_file`,
# `file_size`).
schema = (
    StructType()
    .add(
        'Modification',
        ArrayType(
            StructType()
            .add('_avgMassDelta', DoubleType(), True)
            .add('_location', LongType(), True)
            .add('_monoisotopicMassDelta', DoubleType(), True)
            .add('_residues', StringType(), True)
            .add(
                'cvParam',
                StructType()
                .add('_accession', StringType(), True)
                .add('_cvRef', StringType(), True)
                .add('_name', StringType(), True)
                .add('_value', StringType(), True),
                True,
            ),
            True,  # containsNull: array elements may be null
        ),
        True,
    )
    .add('PeptideSequence', StringType(), True)
    .add('_id', StringType(), True)
    .add('source_file', StringType(), False)
    .add('file_size', LongType(), False)
)


def create_table(df: DataFrame, table_name:str):
    """Create `table_name`, if it does not already exist, with the columns of `df`.

    Each top-level field of `df.schema` becomes one column whose type is the
    field's `simpleString()` representation. Field nullability is not carried
    into the generated DDL.

    Args:
        df: DataFrame whose schema supplies the column names and types.
        table_name: Name of the table; interpolated into the SQL verbatim.
    """
    columns = []
    for field in df.schema.fields:
        # Backtick-quote the column name (doubling any embedded backtick) so a
        # name that is a reserved word or contains spaces, dashes or dots is
        # still parsed as a single identifier.
        quoted_name = "`" + field.name.replace("`", "``") + "`"
        columns.append(f"{quoted_name} {field.dataType.simpleString()}")

    ddl = ", ".join(columns)
    spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} ({ddl})")


def create_table_from_schema(schema: StructType, table_name:str):
    """Create `table_name`, if it is missing, using `schema.toDDL()` as the column list.

    The generated statement is printed before it is run through `spark.sql`.

    Args:
        schema: StructType whose DDL rendering defines the table columns.
        table_name: Name of the table; interpolated into the SQL verbatim.
    """
    column_ddl = schema.toDDL()
    statement = "CREATE TABLE IF NOT EXISTS {} ({})".format(table_name, column_ddl)
    print(statement)
    spark.sql(statement)


In [0]:
from pyspark.sql.functions import col

# XML element passed to the reader as `rowTag`; also used (lower-cased) to name
# the target table and the checkpoint folder.
xml_tag_to_extract = "Peptide"

# Run parameters supplied through Databricks widgets.
catalog = dbutils.widgets.get("catalog")
storage_account = dbutils.widgets.get("storage_account_name")
container = dbutils.widgets.get("container")
path_to_monitor = dbutils.widgets.get("path_to_monitor")
# Stored under its own name: `schema` is the StructType defined earlier in this
# notebook and is still needed below, so it must not be overwritten with the
# database-schema name read from the widget.
schema_name = dbutils.widgets.get("schema")

target_table = f"{catalog}.{schema_name}.{xml_tag_to_extract.lower()}_bronze"

storage_root = f"abfss://{container}@{storage_account}.dfs.core.windows.net"
storage_location = f"{storage_root}/{path_to_monitor}"
checkpoint_location = f"{storage_root}/checkpoint/{xml_tag_to_extract.lower()}"

# Create the target table up front from the StructType that the stream also reads with.
create_table_from_schema(schema, target_table)

# Auto Loader stream over the monitored path; every row is tagged with the path
# and size of the file it came from (taken from the `_metadata` column).
uploaded_files = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "xml")
    .option("rowTag", xml_tag_to_extract)
    .schema(schema)
    .load(storage_location)
    .withColumn("source_file", col("_metadata.file_path"))
    .withColumn("file_size", col("_metadata.file_size"))
)

# Append the stream to the Delta target table. The output mode is set with
# outputMode(); a "mode" writer option does not configure it.
result = (
    uploaded_files.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_location)
    .trigger(availableNow=True)
    .toTable(target_table)
)