In [0]:
# Authenticate Spark to the `projectadf` ADLS Gen2 account with its access key.
# The key is fetched from a Databricks secret scope at run time so that it never
# appears in the notebook source, its saved outputs, or version control.
# NOTE(review): the scope and key names below are placeholders — confirm them
# against the secret scope actually provisioned for this workspace.
spark.conf.set(
    "fs.azure.account.key.projectadf.dfs.core.windows.net",
    dbutils.secrets.get(scope="healthcare-secrets", key="projectadf-storage-account-key"),
)

In [0]:
from pyspark.sql.functions import col, trim, lower, initcap, current_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Load the `healthcare_raw.doctors` table as the raw input DataFrame for the
# cleaning step that follows.
doctors_raw_df = spark.table("healthcare_raw.doctors")

In [0]:
# Build the cleaned doctors DataFrame from the raw table.
#
# Output columns: doctor_id, doctor_name, specialization, contact_info,
# ingestion_date.
doctors_cleaned_df = (
    doctors_raw_df
    # Project straight to the final snake_case names while standardizing the
    # text fields. Doing this in a single select avoids relying on Spark's
    # case-insensitive column resolution to overwrite `Specialization` with
    # `specialization`.
    .select(
        col("DoctorID").alias("doctor_id"),
        trim(initcap(col("DoctorName"))).alias("doctor_name"),       # capitalize each word, then trim
        trim(lower(col("Specialization"))).alias("specialization"),  # standardize to lowercase
        trim(col("DoctorContact")).alias("contact_info"),
    )
    # Data quality: drop doctors without a usable name or specialization.
    # The check runs on the cleaned values so that blank / whitespace-only
    # strings (which trim to "") are rejected along with NULLs.
    .filter(
        col("doctor_name").isNotNull() & (col("doctor_name") != "")
        & col("specialization").isNotNull() & (col("specialization") != "")
    )
    # Keep one row per doctor id.
    # NOTE(review): no ordering is applied, so which duplicate survives is
    # arbitrary — confirm that is acceptable for this table.
    .dropDuplicates(["doctor_id"])
    # Audit column: timestamp at which this load was evaluated.
    .withColumn("ingestion_date", current_timestamp())
)


In [0]:
# Persist the cleaned doctors to the processed Delta table (full overwrite).
doctors_cleaned_df.write.mode("overwrite").format("delta").saveAsTable("healthcare_processed.doctors")

# Export the Parquet copy from the table that was just written rather than from
# `doctors_cleaned_df` again: every write is a separate Spark action that
# re-runs the whole lineage, so a second write of the DataFrame would stamp a
# different `ingestion_date` (and could keep different rows when duplicates are
# dropped) than what the Delta table holds. Reading the table back keeps both
# sinks identical.
(
    spark.read.table("healthcare_processed.doctors")
    .write.mode("overwrite")
    .parquet("abfss://source@projectadf.dfs.core.windows.net/doctors")
)


In [0]:
# Report how many doctor rows the cleaned DataFrame contains, then preview the
# first five rows as a sanity check.
processed_doctor_count = doctors_cleaned_df.count()
print("Total doctors processed:", processed_doctor_count)

preview_df = doctors_cleaned_df.limit(5)
display(preview_df)


Total doctors processed: 600


doctor_id,doctor_name,specialization,contact_info,ingestion_date
100,Thalia,emergency medicine,.@yopmail.com,2025-08-25T09:25:29.365Z
101,Mireielle,allergists,.@yopmail.com,2025-08-25T09:25:29.365Z
103,Elie,endocrinologist,.@yopmail.com,2025-08-25T09:25:29.365Z
104,Cacilie,emergency medicine,.@yopmail.com,2025-08-25T09:25:29.365Z
105,Lynea,emergency medicine,.@yopmail.com,2025-08-25T09:25:29.365Z


In [0]:
# End the notebook run and hand the status string back to whatever invoked it.
exit_status = "Success"
dbutils.notebook.exit(exit_status)