In [0]:
# Authenticate to the ADLS Gen2 account `projectadf` with its storage account key.
# The key is fetched from a Databricks secret scope at run time so that it never
# appears in the notebook source, its outputs, or its revision history.
# NOTE(review): the scope and key names below are placeholders — point them at
# the secret that actually holds the account key in this workspace.
STORAGE_SECRET_SCOPE = "projectadf-scope"
STORAGE_SECRET_KEY = "projectadf-account-key"

spark.conf.set(
    "fs.azure.account.key.projectadf.dfs.core.windows.net",
    dbutils.secrets.get(scope=STORAGE_SECRET_SCOPE, key=STORAGE_SECRET_KEY))

In [0]:
from pyspark.sql.functions import col, trim, lower, initcap, current_timestamp

# Load the raw medical-procedure records from the metastore table.
RAW_PROCEDURES_TABLE = "healthcare_raw.medical_procedure"
procedures_raw_df = spark.table(RAW_PROCEDURES_TABLE)


In [0]:
# Build the cleaned procedures DataFrame from the raw table:
# validate required fields -> normalise the name -> de-duplicate -> stamp -> project.
procedures_cleaned_df = (procedures_raw_df
    # Data validation: ensure required fields are present.
    # A name that is empty or made up only of spaces is treated as missing too;
    # a plain null check would let it through and it would end up in the
    # processed table as an empty string once trimmed.
    .filter(
        col("ProcedureName").isNotNull()
        & (trim(col("ProcedureName")) != "")
        & col("AppointmentID").isNotNull()
    )

    # Clean text fields: capitalise each word and strip surrounding spaces
    .withColumn("procedure_name", trim(initcap(col("ProcedureName"))))

    # Remove duplicates (same procedure for same appointment shouldn't happen).
    # NOTE: no ordering is applied first, so which of several duplicate rows is
    # kept is left to Spark.
    .dropDuplicates(["ProcedureID", "AppointmentID"])

    # Add audit column (evaluated when an action runs, not when this cell runs)
    .withColumn("ingestion_date", current_timestamp())

    # Select and rename final columns
    .select(
        col("ProcedureID").alias("procedure_id"),
        "procedure_name",
        col("AppointmentID").alias("appointment_id"),
        "ingestion_date"
    )
)


In [0]:
# Persist the cleaned data as the processed Delta table (replaces existing contents).
procedures_cleaned_df.write.mode("overwrite").format("delta").saveAsTable("healthcare_processed.medical_procedures")

# Write to cloud storage
# Export from the table that was just written rather than from
# procedures_cleaned_df: a second action on the lazy DataFrame would re-run the
# whole plan, re-evaluating current_timestamp() and the de-duplication, so the
# Parquet copy could end up differing from the Delta table.
(spark.read.table("healthcare_processed.medical_procedures")
    .write.mode("overwrite")
    .parquet("abfss://source@projectadf.dfs.core.windows.net/medical_procedures"))


In [0]:
# End the notebook run and hand this value back to whatever invoked it.
exit_status = "Success"
dbutils.notebook.exit(exit_status)