In [0]:
from pyspark.sql.functions import col, explode, struct, array, lit
from pyspark.sql.types import *
import utils

# Create table from dataframe
from pyspark.sql import DataFrame

# XML tag handled by this notebook; its lower-cased form is used to name the
# bronze/silver tables and the checkpoint directory below.
xml_tag_to_extract = "DBSequence"
tag_slug = xml_tag_to_extract.lower()

# Run parameters, read from the notebook widgets.
catalog = dbutils.widgets.get("catalog")
storage_account = dbutils.widgets.get("storage_account_name")
container = dbutils.widgets.get("container")
path_to_monitor = dbutils.widgets.get("path_to_monitor")
schema = dbutils.widgets.get("schema")

# Fully qualified table names: <catalog>.<schema>.<tag>_<layer>.
table_prefix = f"{catalog}.{schema}.{tag_slug}"
target_table = f"{table_prefix}_silver"
source_table = f"{table_prefix}_bronze"

# ABFSS directory used as the checkpoint location for the silver layer.
checkpoint_location = (
    f"abfss://{container}@{storage_account}.dfs.core.windows.net"
    f"/checkpoint/silver/{tag_slug}"
)

# Column layout of the target table. Every column is nullable; all are strings
# except file_size, which is a long.
table_schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("_accession", StringType()),
            ("_id", StringType()),
            ("_searchDatabase_ref", StringType()),
            ("source_file", StringType()),
            ("file_size", LongType()),
            ("protein_description", StringType()),
        )
    ]
)

# Hand the schema and target name to the project helper; what it does with an
# already-existing table is defined in utils.create_table_from_schema.
utils.create_table_from_schema(spark, table_schema, target_table)


In [0]:
from pyspark.sql.functions import expr, element_at, map_from_entries, filter, transform, lit, struct, udf
from pyspark.sql.types import StructType, StructField,StringType


# Expression that pulls the "protein description" value out of the cvParam
# array: each entry is reduced to a (_name, _value) struct, only the entries
# named "protein description" are kept, the survivors are turned into a map,
# and that key is looked up in the map.
protein_description_expr = element_at(
    map_from_entries(
        filter(
            transform("cvParam", lambda x: struct(x["_name"], x["_value"])),
            lambda x: x["_name"] == lit("protein description"),
        )
    ),
    lit("protein description"),
)

# Stream the bronze table, add the derived column, drop the "_REVERSED" rows
# and keep only the columns that make up the silver table.
bronze_data = (
    spark.readStream.table(source_table)
    .withColumn("protein_description", protein_description_expr)
    # Literal substring match. The previous LIKE pattern "%_REVERSED%" treated
    # "_" as a single-character wildcard, so it also removed accessions where
    # any other character preceded "REVERSED" (e.g. "XREVERSED").
    # Rows with a NULL _accession are still dropped: the predicate is NULL.
    .filter(~col("_accession").contains("_REVERSED"))
    .select(
        "_accession",
        "_id",
        "_searchDatabase_ref",
        "source_file",
        "file_size",
        "protein_description",
    )
)

# Append the transformed rows to the silver Delta table. availableNow=True
# processes the data available at start and then stops the query.
# NOTE(review): the returned query handle is not awaited here; confirm whether
# the caller needs result.awaitTermination() to surface stream failures.
result = (
    bronze_data.writeStream
    .format("delta")
    # outputMode is the streaming API for the write mode; the former
    # .option("mode", "append") did not set it, so append was only the default.
    .outputMode("append")
    .trigger(availableNow=True)
    .option("checkpointLocation", checkpoint_location)
    .table(target_table)
)

