In [0]:
%sql
-- Make sure the clinicaltrial_gold database exists; IF NOT EXISTS makes this safe to re-run.
CREATE DATABASE IF NOT EXISTS clinicaltrial_gold;

1. Gold: Patient Summary

In [0]:
# Gold patient summary: the silver patients table enriched with per-patient
# dosing exposure, a serious-adverse-event flag and best response, written to
# clinicaltrial_gold.patient_summary.
#
# The table is replaced in a single Delta overwrite with overwriteSchema=true
# rather than being dropped first: a schema change is still accepted, but the
# existing table (and its history) survives if anything in this cell fails and
# readers never see a window in which the table does not exist.

# NOTE: `sum` and `max` imported here shadow the Python builtins for the rest of
# the notebook session.
from pyspark.sql.functions import col, sum, max, datediff

patients = spark.table("clinicaltrial_silver.patients")
dosing = spark.table("clinicaltrial_silver.drug_dosing")
aes = spark.table("clinicaltrial_silver.adverse_events")
outcomes = spark.table("clinicaltrial_silver.outcomes")


# Exposure summary: total of exposed_flag and the latest dose_date per patient.

exposure = (
    dosing.groupBy("patient_id")
    .agg(
        sum("exposed_flag").alias("doses_taken"),
        max("dose_date").alias("last_dose_date")
    )
)


# SAE flag: 1 if the patient has at least one adverse event marked serious.
# The cast happens before the aggregation so the flag is computed on the same
# integer representation that the safety-metrics cell sums.

sae_flag = (
    aes.groupBy("patient_id")
    .agg(
        max(col("serious").cast("int")).alias("any_serious_ae")
    )
)

# Left joins keep every row of `patients`. A patient with no dosing rows or no
# adverse-event rows would otherwise carry NULL in doses_taken / any_serious_ae,
# so those two counters are filled with 0. last_dose_date and best_response are
# intentionally left NULL when absent.
# NOTE(review): this assumes outcomes holds at most one row per patient_id;
# duplicates there would duplicate patient rows here - confirm upstream.
gold_patient_summary = (
    patients
    .join(exposure, on="patient_id", how="left")
    .join(sae_flag, on="patient_id", how="left")
    .join(outcomes.select("patient_id", "best_response"), on="patient_id", how="left")
    .fillna(0, subset=["doses_taken", "any_serious_ae"])
)

(
    gold_patient_summary
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("clinicaltrial_gold.patient_summary")
)

2. Gold: Safety Metrics (Trial Arm Level)

In [0]:
# Gold safety metrics: per treatment arm, the number of patients with at least
# one adverse event, the number of serious adverse events, the arm size and the
# resulting AE incidence rate, written to clinicaltrial_gold.safety_metrics.
#
# The table is replaced in a single Delta overwrite with overwriteSchema=true
# instead of being dropped first, so a failed run cannot leave the gold table
# missing.

from pyspark.sql.functions import col, countDistinct, sum
from pyspark.sql.types import IntegerType

aes = spark.table("clinicaltrial_silver.adverse_events")
patients = spark.table("clinicaltrial_silver.patients")

# AE counts per arm. Only arms that have at least one adverse-event row appear here.
safety = (
    aes.join(patients, "patient_id")
    .groupBy("treatment_arm")
    .agg(
        countDistinct("patient_id").alias("patients_with_ae"),
        sum(col("serious").cast("int")).alias("total_serious_ae")
    )
)

# Denominator: distinct patients per arm, regardless of whether they had an AE.
total_patients = (
    patients.groupBy("treatment_arm")
    .agg(countDistinct("patient_id").alias("total_patients"))
)

# Drive the result from total_patients with a LEFT join so an arm with zero
# adverse events is reported with counts of 0 and an incidence rate of 0.0
# instead of being dropped. Rows with a NULL treatment_arm are excluded
# explicitly: an equi-join never matches NULL keys, so the previous inner join
# excluded them too. The final select keeps the original column order.
gold_safety_metrics = (
    total_patients.where(col("treatment_arm").isNotNull())
    .join(safety, "treatment_arm", "left")
    .fillna(0, subset=["patients_with_ae", "total_serious_ae"])
    .withColumn("ae_incidence_rate", col("patients_with_ae") / col("total_patients"))
    .select(
        "treatment_arm",
        "patients_with_ae",
        "total_serious_ae",
        "total_patients",
        "ae_incidence_rate",
    )
)

(
    gold_safety_metrics.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("clinicaltrial_gold.safety_metrics")
)

3. Gold: Exposure Summary

In [0]:
# Gold exposure summary: per patient and treatment arm, the number of dosing
# rows, the total of exposed_flag, the mean dose_mg and a compliance percentage,
# written to clinicaltrial_gold.exposure_summary.
#
# The table is replaced in a single Delta overwrite with overwriteSchema=true
# instead of being dropped first, so a failed run cannot leave the gold table
# missing.

# NOTE: `sum` and `round` imported here shadow the Python builtins for the rest
# of the notebook session.
from pyspark.sql.functions import col, count, avg, sum, round

dosing = spark.table("clinicaltrial_silver.drug_dosing")
patients = spark.table("clinicaltrial_silver.patients")

# Inner join: dosing rows whose patient_id is not in `patients` are dropped.
# NOTE(review): planned_doses is simply the row count per patient, i.e. it
# assumes every row of drug_dosing is one planned dose - confirm against the
# silver table definition.
gold_exposure_summary = (
    dosing.join(patients, "patient_id")
    .groupBy("patient_id", "treatment_arm")
    .agg(
        count("*").alias("planned_doses"),
        sum("exposed_flag").alias("taken_doses"),
        avg("dose_mg").alias("avg_dose_mg")
    )
    # Percentage of rows flagged as exposed, rounded to two decimals.
    .withColumn(
        "compliance_rate",
        round(col("taken_doses") / col("planned_doses") * 100, 2)
    )
)

(
    gold_exposure_summary.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("clinicaltrial_gold.exposure_summary")
)

4. Gold: Efficacy Metrics

In [0]:
# Gold efficacy metrics: per treatment arm, the number of patients with an
# outcome row, the number of responders (best_response CR or PR), the number
# with disease control (CR, PR or SD), and the derived ORR and DCR ratios,
# written to clinicaltrial_gold.efficacy_metrics.

from pyspark.sql.functions import col, when, countDistinct, sum


outcomes = spark.table("clinicaltrial_silver.outcomes")
patients = spark.table("clinicaltrial_silver.patients")

# Numerators and denominator are all counted as DISTINCT patients. The
# denominator already was; counting the numerators per row instead would let a
# patient with several outcome rows be counted more than once and push ORR/DCR
# above 1. `when` without `otherwise` yields NULL for non-matching rows, and
# countDistinct ignores NULLs, so with one outcome row per patient the results
# are identical to a row-wise sum of 1/0.
# NOTE(review): n_patients only covers patients that have an outcome row (inner
# join); arms without any outcome are absent from the result - confirm this is
# the intended denominator.
efficacy = (
    outcomes.join(patients, "patient_id")
    .groupBy("treatment_arm")
    .agg(
        countDistinct("patient_id").alias("n_patients"),
        countDistinct(
            when(col("best_response").isin("CR", "PR"), col("patient_id"))
        ).alias("responders"),
        countDistinct(
            when(col("best_response").isin("CR", "PR", "SD"), col("patient_id"))
        ).alias("disease_control")
    )
    .withColumn("ORR", col("responders") / col("n_patients"))
    .withColumn("DCR", col("disease_control") / col("n_patients"))
)


# overwriteSchema lets a changed column set replace the existing table in one
# atomic overwrite instead of failing on a schema mismatch.
(
    efficacy.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("clinicaltrial_gold.efficacy_metrics")
)