In [0]:

from pyspark.sql.functions import expr, element_at, map_from_entries, filter, transform, lit, struct, udf, explode,col
from pyspark.sql.types import *
import utils


# XML element handled by this notebook. Its lower-cased form is embedded in
# the table names and in the checkpoint path built below.
xml_tag_to_extract = "Peptide"
_tag_lower = xml_tag_to_extract.lower()

# Run parameters, read from the notebook widgets.
catalog = dbutils.widgets.get("catalog")
schema = dbutils.widgets.get("schema")
storage_account = dbutils.widgets.get("storage_account_name")
container = dbutils.widgets.get("container")
# Not referenced anywhere in the visible part of this notebook.
path_to_monitor = dbutils.widgets.get("path_to_monitor")

# Fully qualified names: <catalog>.<schema>.<tag>_bronze feeds <tag>_silver.
source_table = f"{catalog}.{schema}.{_tag_lower}_bronze"
target_table = f"{catalog}.{schema}.{_tag_lower}_silver"

# Streaming checkpoint directory for the silver write, on ADLS (abfss).
checkpoint_location = (
    f"abfss://{container}@{storage_account}.dfs.core.windows.net"
    f"/checkpoint/silver/{_tag_lower}"
)


# Column layout of the silver table. The field names and their order match
# the columns selected for the streaming write further down in this notebook.
table_schema = StructType(
    [
        StructField("_id", StringType(), True),
        StructField("PeptideSequence", StringType(), True),
        StructField("source_file", StringType(), True),
        StructField("file_size", LongType(), True),
        StructField("massDelta", DoubleType(), True),
        StructField("location", LongType(), True),
        StructField("residues", StringType(), True),
        StructField("cv_accession_number", StringType(), True),
        StructField("cvname", StringType(), True),
    ]
)

# Table creation is delegated to the project helper.
# NOTE(review): presumably this is a no-op when the table already exists;
# confirm in utils.create_table_from_schema.
utils.create_table_from_schema(spark, table_schema, target_table)

# Read the bronze table as a stream and flatten it to one row per entry of
# the `Modification` array.
#
# NOTE(review): explode() emits no row when the array is null or empty, so
# source rows without any Modification never reach the silver table. Confirm
# this is intended (explode_outer would keep them).
df = (
    spark.readStream.table(source_table)
    # Under Spark's default case-insensitive resolution this replaces the
    # array column with its individual elements.
    .withColumn("modification", explode(col("Modification")))
    .select(
        "_id",
        "PeptideSequence",
        "source_file",
        "file_size",
        # Fields of the exploded struct, renamed to the silver column names.
        col("modification._avgMassDelta").alias("massDelta"),
        col("modification._location").alias("location"),
        col("modification._residues").alias("residues"),
        col("modification.cvParam._accession").alias("cv_accession_number"),
        col("modification.cvParam._name").alias("cvname"),
    )
)



# Append the flattened rows to the silver Delta table.
#
# - outputMode("append") sets the streaming output mode explicitly.
# - trigger(availableNow=True) processes the data available when the query
#   starts and then stops, so each run behaves like an incremental batch.
# - The checkpoint records progress, so a rerun resumes where the last one
#   ended.
#
# NOTE(review): toTable() starts the query and returns immediately. If the
# caller must block until the run finishes, or must see stream failures as a
# notebook error, call result.awaitTermination() afterwards. Confirm the
# intended job behaviour.
result = (
    df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_location)
    .trigger(availableNow=True)
    .toTable(target_table)
)
