In [0]:
from pyspark.sql.functions import col, explode, struct, array, lit
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, BinaryType, ArrayType, DoubleType, MapType, BooleanType

# Name of the XML element handled by this notebook. Its lower-cased form is
# used as the last segment of the checkpoint path and of both table names.
xml_tag_to_extract = "PeptideEvidence"

# Storage container and account that make up the abfss:// checkpoint URI.
container = "data"
storage_account = "senjkdtbxloader"

# Checkpoint directory used by the silver-layer streaming write for this tag.
checkpoint_location = (
    f"abfss://{container}@{storage_account}.dfs.core.windows.net"
    f"/checkpoint/silver/{xml_tag_to_extract.lower()}"
)

# Fully qualified table names: the stream reads from the bronze table and
# writes to the silver table of the same tag.
source_table = f"jk_libraries.bronze.{xml_tag_to_extract.lower()}"
target_table = f"jk_libraries.silver.{xml_tag_to_extract.lower()}"


# Column layout used to create the silver target table. Every column is
# declared nullable. The underscore-prefixed names presumably mirror XML
# attributes of the extracted element -- TODO confirm against the bronze table.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("_dBSequence_ref", StringType()),
        ("_end", IntegerType()),
        ("_id", StringType()),
        ("_isDecoy", BooleanType()),
        ("_peptide_ref", StringType()),
        ("_post", StringType()),
        ("_pre", StringType()),
        ("_start", IntegerType()),
        ("source_file", StringType()),
        ("file_size", StringType()),
    )
])


# Create table from dataframe
from pyspark.sql import DataFrame

def create_table(df: DataFrame, table_name: str) -> None:
    """Create ``table_name`` with the columns of ``df`` if it does not exist.

    Only the column names and data types of ``df.schema`` are used; no rows
    are written. The statement is executed through the ambient ``spark``
    session.

    Args:
        df: DataFrame whose schema supplies the column names and types.
        table_name: Name of the table to create; inserted into the statement
            as given.
    """
    column_defs = []
    for field in df.schema.fields:
        # Backtick-quote the column name (doubling any embedded backtick) so
        # that names which are not plain identifiers still yield valid DDL.
        quoted_name = "`" + field.name.replace("`", "``") + "`"
        column_defs.append(f"{quoted_name} {field.dataType.simpleString()}")

    ddl = ", ".join(column_defs)
    spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} ({ddl})")


def create_table_from_schema(schema: StructType, table_name: str):
    """Create ``table_name`` from ``schema`` unless the table already exists.

    The column list is taken from ``schema.toDDL()``. The generated statement
    is printed and then executed through the ambient ``spark`` session.

    Args:
        schema: Struct type describing the table's columns.
        table_name: Name of the table to create; inserted into the statement
            as given.
    """
    column_ddl = schema.toDDL()
    statement = f"CREATE TABLE IF NOT EXISTS {table_name} ({column_ddl})"
    print(statement)
    spark.sql(statement)


# Ensure the silver target table exists, with the columns declared in
# `schema`; CREATE TABLE IF NOT EXISTS leaves an existing table unchanged.
create_table_from_schema(schema, target_table)

In [0]:

# Stream rows from the bronze table into the silver table, skipping reversed
# sequence references. `result` is the handle returned when the streaming
# write is started.
result = (
    spark.readStream
    .table(source_table)
    # Exclude rows whose DB-sequence reference contains the literal text
    # "_REVERSED". `contains` is used rather than LIKE because `_` is a
    # single-character wildcard in LIKE patterns, so "%_REVERSED%" would also
    # match values such as "xREVERSED".
    .filter(~col("_dBSequence_ref").contains("_REVERSED"))
    .writeStream
    .format("delta")
    # The output mode of a streaming write is set with outputMode();
    # "mode" is not an option of the streaming writer.
    .outputMode("append")
    # Process the data available now, then stop.
    .trigger(availableNow=True)
    # The checkpoint records progress so a re-run resumes where it stopped.
    .option("checkpointLocation", checkpoint_location)
    .table(target_table)
)