In [0]:
from pyspark.sql.types import *

# Column layout shared by the target table and the streaming reader: the same
# object is handed to create_table_from_schema() and to readStream.schema()
# further down in this notebook. Every column is nullable.
# NOTE(review): the underscore-prefixed names presumably correspond to XML
# attributes of the extracted row tag -- confirm against the input files.
# `source_file` and `file_size` do not come from the XML; the ingestion cell
# fills them from `_metadata.file_path` and `_metadata.file_size`.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('_dBSequence_ref', StringType()),
        ('_end', LongType()),
        ('_id', StringType()),
        ('_isDecoy', BooleanType()),
        ('_peptide_ref', StringType()),
        ('_post', StringType()),
        ('_pre', StringType()),
        ('_start', LongType()),
        ('source_file', StringType()),
        ('file_size', LongType()),
    )
])

In [0]:
# Create table from dataframe
from pyspark.sql import DataFrame

def create_table(df: DataFrame, table_name: str):
    """Create a table whose columns mirror ``df``'s schema, if it does not exist yet.

    Builds a ``CREATE TABLE IF NOT EXISTS`` statement from the DataFrame's
    fields (column name plus ``dataType.simpleString()``) and executes it
    through the notebook-global ``spark`` session.

    Args:
        df: DataFrame whose schema supplies the column names and types.
        table_name: Name of the table to create. It is interpolated into the
            statement unchanged (not quoted), so multi-part names such as
            ``catalog.schema.table`` keep working.
    """
    # Backtick-quote every column name (doubling any embedded backtick) so
    # that reserved words and names containing characters such as '-', '.'
    # or spaces still yield valid DDL.
    column_defs = []
    for field in df.schema.fields:
        quoted_name = "`" + field.name.replace("`", "``") + "`"
        column_defs.append(f"{quoted_name} {field.dataType.simpleString()}")
    ddl = ", ".join(column_defs)
    sql = f"CREATE TABLE IF NOT EXISTS {table_name} ({ddl})"
    spark.sql(sql)


def create_table_from_schema(schema: StructType, table_name: str):
    """Create a table with the columns described by ``schema``, if it does not exist yet.

    Builds a ``CREATE TABLE IF NOT EXISTS`` statement from the schema's fields
    (column name plus ``dataType.simpleString()``), prints the statement, and
    executes it through the notebook-global ``spark`` session.

    Args:
        schema: StructType whose fields supply the column names and types.
        table_name: Name of the table to create. It is interpolated into the
            statement unchanged (not quoted), so multi-part names such as
            ``catalog.schema.table`` keep working.
    """
    # Backtick-quote every column name (doubling any embedded backtick) so
    # that reserved words and names containing characters such as '-', '.'
    # or spaces still yield valid DDL.
    column_defs = []
    for field in schema.fields:
        quoted_name = "`" + field.name.replace("`", "``") + "`"
        column_defs.append(f"{quoted_name} {field.dataType.simpleString()}")
    ddl = ", ".join(column_defs)
    sql = f"CREATE TABLE IF NOT EXISTS {table_name} ({ddl})"
    # Echo the generated statement so it is visible in the cell output.
    print(sql)
    spark.sql(sql)


In [0]:
from pyspark.sql.functions import col

# --- Configuration -----------------------------------------------------------
# XML element that becomes one row; its lower-cased form also names the
# schema/checkpoint folders and the target table.
xml_tag_to_extract = "PeptideEvidence"

container = "data"
storage_account = "senjkdtbxloader"

# Root of the storage container; every path below hangs off it.
storage_root = f"abfss://{container}@{storage_account}.dfs.core.windows.net"
tag_key = xml_tag_to_extract.lower()

storage_location = f"{storage_root}/folder"

# NOTE(review): defined but not referenced by the stream in this cell (the
# reader is given an explicit schema instead) -- confirm whether it should be
# passed as "cloudFiles.schemaLocation" or is used by another cell.
schema_location = f"{storage_root}/schema/{tag_key}"

checkpoint_location = f"{storage_root}/checkpoint/{tag_key}"

table_name = f"jk_libraries.bronze.{tag_key}"

# --- Target table ------------------------------------------------------------
# Make sure the table exists before the stream starts writing to it.
create_table_from_schema(schema, table_name)

# --- Source: incrementally read the XML files with the cloudFiles source ------
# `source_file` and `file_size` are overwritten here with the per-file values
# from the `_metadata` column.
uploaded_files = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "xml")
    .option("rowTag", xml_tag_to_extract)
    .schema(schema)
    .load(storage_location)
    .withColumn("source_file", col("_metadata.file_path"))
    .withColumn("file_size", col("_metadata.file_size"))
)

# --- Sink: append into the Delta table ----------------------------------------
# The output mode is set with outputMode(); the previous
# .option("mode", "append") is not how a streaming writer's output mode is set.
# toTable() is the documented DataStreamWriter method for starting the query
# against a table.
# NOTE(review): `result` is the handle returned once the query has started;
# call result.awaitTermination() if later cells need the load to be finished.
result = (
    uploaded_files.writeStream
    .format("delta")
    .outputMode("append")
    .trigger(availableNow=True)
    .option("checkpointLocation", checkpoint_location)
    .toTable(table_name)
)
