In [0]:
## Define the spark schema

from pyspark.sql.types import (
    StructType, StructField,
    StringType, DoubleType, LongType, BooleanType,
    ArrayType
)

def _build_schema() -> StructType:
    """Assemble the Spark schema used for reading and for the target table.

    Every field is nullable and every array allows null elements, except the
    two trailing columns ``source_file`` and ``file_size``, which are declared
    non-nullable. Those two are filled in from ``_metadata`` by the streaming
    read later in this notebook.
    """

    def nullable(name, data_type):
        # Field that is allowed to be null.
        return StructField(name, data_type, True)

    def text(name):
        # Nullable string field.
        return nullable(name, StringType())

    def struct_array(*fields):
        # Array of structs whose elements may themselves be null.
        return ArrayType(StructType(list(fields)), True)

    def cv_param_with_units():
        # cvParam entries carrying unit information; a fresh instance is
        # returned on every call so the two users do not share one object.
        return struct_array(
            text("_accession"),
            text("_cvRef"),
            text("_name"),
            text("_unitAccession"),
            text("_unitCvRef"),
            text("_unitName"),
            text("_value"),
        )

    fragment_array = struct_array(
        text("_measure_ref"),
        text("_values"),
    )

    # The cvParam nested under IonType has no unit columns.
    ion_type = struct_array(
        nullable("FragmentArray", fragment_array),
        nullable("_charge", LongType()),
        text("_index"),
        nullable(
            "cvParam",
            struct_array(
                text("_accession"),
                text("_cvRef"),
                text("_name"),
                text("_value"),
            ),
        ),
    )

    fragmentation = StructType([nullable("IonType", ion_type)])

    peptide_evidence_ref = struct_array(text("_peptideEvidence_ref"))

    identification_item = struct_array(
        nullable("Fragmentation", fragmentation),
        nullable("PeptideEvidenceRef", peptide_evidence_ref),
        nullable("_calculatedMassToCharge", DoubleType()),
        nullable("_chargeState", LongType()),
        nullable("_experimentalMassToCharge", DoubleType()),
        text("_id"),
        nullable("_passThreshold", BooleanType()),
        text("_peptide_ref"),
        nullable("_rank", LongType()),
        nullable("cvParam", cv_param_with_units()),
    )

    return StructType([
        nullable("SpectrumIdentificationItem", identification_item),
        text("_id"),
        text("_spectraData_ref"),
        text("_spectrumID"),
        nullable("cvParam", cv_param_with_units()),
        # Bookkeeping columns: declared NOT NULL, unlike everything above.
        StructField("source_file", StringType(), False),
        StructField("file_size", LongType(), False),
    ])


schema = _build_schema()



In [0]:
# Create table from dataframe
from pyspark.sql import DataFrame

def create_table(df: DataFrame, table_name:str):
    """Create ``table_name`` (if it does not exist yet) with the columns of ``df``.

    The column list is derived from ``df.schema``: each top-level field
    contributes its name and the ``simpleString()`` form of its data type.
    Nullability is not carried over into the generated DDL.

    Args:
        df: DataFrame whose schema defines the table columns.
        table_name: Name of the table to create; inserted into the statement
            as given.
    """

    def quote(identifier: str) -> str:
        # Backtick-quote the column name so names containing spaces, hyphens
        # or reserved words still form valid DDL; embedded backticks are
        # escaped by doubling them.
        return "`" + identifier.replace("`", "``") + "`"

    # Only top-level column names are quoted here; names of nested struct
    # members are rendered by simpleString() unchanged.
    ddl = ", ".join(
        f"{quote(field.name)} {field.dataType.simpleString()}"
        for field in df.schema.fields
    )
    sql = f"CREATE TABLE IF NOT EXISTS {table_name} ({ddl})"
    spark.sql(sql)


def create_table_from_schema(schema: StructType, table_name:str):
    """Create ``table_name`` (if it does not exist yet) from a StructType.

    The column definitions come from ``schema.toDDL()``. The generated
    statement is printed before it is executed.

    Args:
        schema: Schema describing the table columns.
        table_name: Name of the table to create; inserted into the statement
            as given.
    """
    column_ddl = schema.toDDL()
    statement = f"CREATE TABLE IF NOT EXISTS {table_name} ({column_ddl})"
    print(statement)
    spark.sql(statement)



In [0]:

from pyspark.sql.functions import col

# XML element that becomes one row of the bronze table.
xml_tag_to_extract = "SpectrumIdentificationResult"

# Job parameters supplied through notebook widgets.
catalog = dbutils.widgets.get("catalog")
# NOTE(review): the original interpolated `schema` into the table name, but
# `schema` is the StructType defined in the first cell, not a database schema
# name. The name is now read from a widget called "schema" -- confirm that this
# widget exists (or substitute the intended source of the schema name).
schema_name = dbutils.widgets.get("schema")
storage_account = dbutils.widgets.get("storage_account_name")
container = dbutils.widgets.get("container")
path_to_monitor = dbutils.widgets.get("path_to_monitor")

target_table = f"{catalog}.{schema_name}.{xml_tag_to_extract.lower()}_bronze"

storage_location = f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_monitor}"

checkpoint_location = f"abfss://{container}@{storage_account}.dfs.core.windows.net/checkpoint/{xml_tag_to_extract.lower()}"

# Make sure the target table exists with the expected columns before streaming.
# (The original passed an undefined name, `table_name`, here.)
create_table_from_schema(schema, target_table)

# Stream the XML files from the monitored location, one row per
# `xml_tag_to_extract` element, and record which file each row came from.
uploaded_files = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "xml")
    .option("rowTag", xml_tag_to_extract)
    .schema(schema)
    .load(storage_location)
    .withColumn("source_file", col("_metadata.file_path"))
    .withColumn("file_size", col("_metadata.file_size"))
)

# Append the rows to the Delta target table. The output mode is set through
# outputMode(); the original `.option("mode", "append")` is not a
# DataStreamWriter setting. toTable() starts the query and returns its handle.
spectrum_identification_result = (
    uploaded_files.writeStream
    .format("delta")
    .outputMode("append")
    .trigger(availableNow=True)
    .option("checkpointLocation", checkpoint_location)
    .toTable(target_table)
)
