In [0]:
from pyspark.sql.functions import col, explode, struct, array, lit
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, BinaryType, ArrayType, DoubleType, MapType

# Job configuration: the XML tag handled by this notebook. Its lower-cased
# form is used as the table name suffix and as the checkpoint sub-directory.
xml_tag_to_extract = "DBSequence"
container = "data"
storage_account = "senjkdtbxloader"

# abfss:// URI used as the streaming checkpoint location for the silver write.
checkpoint_location = (
    f"abfss://{container}@{storage_account}.dfs.core.windows.net"
    f"/checkpoint/silver/{xml_tag_to_extract.lower()}"
)

# Silver (target) and bronze (source) tables share the same lower-cased suffix.
target_table = f"jk_libraries.silver.{xml_tag_to_extract.lower()}"
source_table = f"jk_libraries.bronze.{xml_tag_to_extract.lower()}"

# Schema of the silver table: every column is a nullable string.
schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in (
            "_accession",
            "_id",
            "_searchDatabase_ref",
            "source_file",
            "file_size",
            "protein_description",
        )
    ]
)


# Create table from dataframe
from pyspark.sql import DataFrame

def create_table(df: DataFrame, table_name:str) -> None:
    """Create ``table_name`` if it does not already exist, using ``df``'s schema.

    Each top-level field of ``df.schema`` becomes one column whose type is the
    field's ``simpleString()``. Only column names and types are carried over.

    Relies on a ``spark`` session being available as a global; it is not
    defined inside this function.

    Args:
        df: DataFrame whose schema defines the table columns.
        table_name: Name of the table; interpolated as-is into the statement.
    """
    # Backtick-quote every column name (doubling any embedded backtick) so
    # names containing spaces, dots, hyphens or reserved words still produce
    # a valid CREATE TABLE statement.
    column_defs = ", ".join(
        f"`{field.name.replace('`', '``')}` {field.dataType.simpleString()}"
        for field in df.schema.fields
    )
    spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} ({column_defs})")


def create_table_from_schema(schema: StructType, table_name:str):
    """Create ``table_name`` if it does not already exist from a ``StructType``.

    The column list is whatever ``schema.toDDL()`` returns. The generated
    statement is printed before being executed through the global ``spark``
    session (not defined inside this function).

    Args:
        schema: Schema describing the table columns.
        table_name: Name of the table; interpolated as-is into the statement.
    """
    column_ddl = schema.toDDL()
    statement = "CREATE TABLE IF NOT EXISTS {} ({})".format(table_name, column_ddl)
    # Echo the statement so the notebook output shows exactly what was run.
    print(statement)
    spark.sql(statement)


# Make sure the silver table exists before the streaming write targets it.
create_table_from_schema(schema=schema, table_name=target_table)


In [0]:
from pyspark.sql.functions import expr, element_at, map_from_entries, filter, transform, lit, struct, udf
from pyspark.sql.types import StructType, StructField,StringType


# Reduce every cvParam entry to a (_name, _value) struct and keep only the
# entries named "protein description".
protein_description_entries = filter(
    transform("cvParam", lambda param: struct(param["_name"], param["_value"])),
    lambda entry: entry["_name"] == lit("protein description"),
)

# Stream the bronze table, add protein_description (the value of the
# "protein description" cvParam), drop reversed accessions and keep only the
# columns written to the silver table.
bronze_data = (
    spark.readStream.table(source_table)
    .withColumn(
        "protein_description",
        element_at(
            map_from_entries(protein_description_entries),
            lit("protein description"),
        ),
    )
    # Use a literal substring match: in a LIKE pattern "_" is a
    # single-character wildcard, so "%_REVERSED%" would also drop accessions
    # where any character (not just an underscore) precedes "REVERSED".
    .filter(~col("_accession").contains("_REVERSED"))
    .select(
        "_accession",
        "_id",
        "_searchDatabase_ref",
        "source_file",
        "file_size",
        "protein_description",
    )
)


# Append the transformed bronze rows to the silver Delta table.
# availableNow processes the data that is currently available and then stops;
# progress is tracked in checkpoint_location.
result = (
    bronze_data.writeStream
    .format("delta")
    # The streaming output mode is set with outputMode(); a "mode" option is
    # not the streaming API for this.
    .outputMode("append")
    .option("checkpointLocation", checkpoint_location)
    .trigger(availableNow=True)
    # toTable() is the documented DataStreamWriter method for starting the
    # query against a table.
    .toTable(target_table)
)

