Create the clinical_trial_bronze database

In [0]:
%sql
-- Create the database that the bronze tables in this notebook are saved into.
-- IF NOT EXISTS makes the statement a no-op when the database already exists.
CREATE DATABASE IF NOT EXISTS clinical_trial_bronze;

Read the source CSV files and write them to the bronze layer

In [0]:
# Bronze Patients
# Read the raw patients CSV, tag every row with ingestion metadata, and append
# the result to the Delta table clinical_trial_bronze.patients.
from pyspark.sql.functions import current_timestamp, lit

# Single definition of the source location: it is both the file that is read
# and the value stored in the source_file column, so the two cannot drift apart.
patients_csv_path = "/Volumes/workspace/clinicaltrial_data/clinicaltrial/patients.csv"

df_patients = (
    spark.read
    .option("header", True)       # first CSV row supplies the column names
    .option("inferSchema", True)  # column types are inferred from the file contents
    .csv(patients_csv_path)
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("source_file", lit(patients_csv_path))
)

# NOTE(review): mode("append") adds these rows to whatever the table already
# holds, so re-running this cell on the same file loads the same rows again.
# Confirm that is the intended behaviour for this layer.
(
    df_patients.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")  # allow columns not yet in the table to be added
    .saveAsTable("clinical_trial_bronze.patients")
)

In [0]:
# Bronze Visits
# Read the raw visits CSV, tag every row with ingestion metadata, and append
# the result to the Delta table clinical_trial_bronze.visits.
from pyspark.sql.functions import current_timestamp, lit

# Single definition of the source location: it is both the file that is read
# and the value stored in the source_file column, so the two cannot drift apart.
visits_csv_path = "/Volumes/workspace/clinicaltrial_data/clinicaltrial/visits.csv"

df_visits = (
    spark.read
    .option("header", True)       # first CSV row supplies the column names
    .option("inferSchema", True)  # column types are inferred from the file contents
    .csv(visits_csv_path)
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("source_file", lit(visits_csv_path))
)

# NOTE(review): mode("append") adds these rows to whatever the table already
# holds, so re-running this cell on the same file loads the same rows again.
# Confirm that is the intended behaviour for this layer.
(
    df_visits.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")  # allow columns not yet in the table to be added
    .saveAsTable("clinical_trial_bronze.visits")
)


In [0]:
# Bronze Labs
# Read the raw labs CSV, tag every row with ingestion metadata, and append
# the result to the Delta table clinical_trial_bronze.labs.

# Imported here so the cell runs on its own (e.g. after a session restart)
# instead of relying on an earlier cell having been executed first.
from pyspark.sql.functions import current_timestamp, lit

# Single definition of the source location: it is both the file that is read
# and the value stored in the source_file column, so the two cannot drift apart.
labs_csv_path = "/Volumes/workspace/clinicaltrial_data/clinicaltrial/labs.csv"

df_labs = (
    spark.read
    .option("header", True)       # first CSV row supplies the column names
    .option("inferSchema", True)  # column types are inferred from the file contents
    .csv(labs_csv_path)
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("source_file", lit(labs_csv_path))
)

# NOTE(review): mode("append") adds these rows to whatever the table already
# holds, so re-running this cell on the same file loads the same rows again.
# Confirm that is the intended behaviour for this layer.
(
    df_labs.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")  # allow columns not yet in the table to be added
    .saveAsTable("clinical_trial_bronze.labs")
)

In [0]:
# Bronze Adverse Events
# Read the raw adverse-events CSV, tag every row with ingestion metadata, and
# append the result to the Delta table clinical_trial_bronze.adverse_events.

# Imported here so the cell runs on its own (e.g. after a session restart)
# instead of relying on an earlier cell having been executed first.
from pyspark.sql.functions import current_timestamp, lit

# Single definition of the source location: it is both the file that is read
# and the value stored in the source_file column, so the two cannot drift apart.
adverse_events_csv_path = "/Volumes/workspace/clinicaltrial_data/clinicaltrial/adverse_events.csv"

df_ae = (
    spark.read
    .option("header", True)       # first CSV row supplies the column names
    .option("inferSchema", True)  # column types are inferred from the file contents
    .csv(adverse_events_csv_path)
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("source_file", lit(adverse_events_csv_path))
)

# NOTE(review): mode("append") adds these rows to whatever the table already
# holds, so re-running this cell on the same file loads the same rows again.
# Confirm that is the intended behaviour for this layer.
(
    df_ae.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")  # allow columns not yet in the table to be added
    .saveAsTable("clinical_trial_bronze.adverse_events")
)

In [0]:
# Bronze Drug Dosing
# Read the raw drug-dosing CSV, tag every row with ingestion metadata, and
# append the result to the Delta table clinical_trial_bronze.drug_dosing.

# Imported here so the cell runs on its own (e.g. after a session restart)
# instead of relying on an earlier cell having been executed first.
from pyspark.sql.functions import current_timestamp, lit

# Single definition of the source location: it is both the file that is read
# and the value stored in the source_file column, so the two cannot drift apart.
drug_dosing_csv_path = "/Volumes/workspace/clinicaltrial_data/clinicaltrial/drug_dosing.csv"

df_dosing = (
    spark.read
    # Boolean option values, matching the other bronze ingestion cells
    # (the reader treats True and "true" the same way).
    .option("header", True)       # first CSV row supplies the column names
    .option("inferSchema", True)  # column types are inferred from the file contents
    .csv(drug_dosing_csv_path)
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("source_file", lit(drug_dosing_csv_path))
)

# NOTE(review): mode("append") adds these rows to whatever the table already
# holds, so re-running this cell on the same file loads the same rows again.
# Confirm that is the intended behaviour for this layer.
(
    df_dosing.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")  # allow columns not yet in the table to be added
    .saveAsTable("clinical_trial_bronze.drug_dosing")
)

In [0]:
# Bronze Outcomes
# Read the raw outcomes CSV, tag every row with ingestion metadata, and append
# the result to the Delta table clinical_trial_bronze.outcomes.

# Imported here so the cell runs on its own (e.g. after a session restart)
# instead of relying on an earlier cell having been executed first.
from pyspark.sql.functions import current_timestamp, lit

# Single definition of the source location: it is both the file that is read
# and the value stored in the source_file column, so the two cannot drift apart.
outcomes_csv_path = "/Volumes/workspace/clinicaltrial_data/clinicaltrial/outcomes.csv"

df_outcomes = (
    spark.read
    .option("header", True)       # first CSV row supplies the column names
    .option("inferSchema", True)  # column types are inferred from the file contents
    .csv(outcomes_csv_path)
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("source_file", lit(outcomes_csv_path))
)

# NOTE(review): mode("append") adds these rows to whatever the table already
# holds, so re-running this cell on the same file loads the same rows again.
# Confirm that is the intended behaviour for this layer.
(
    df_outcomes.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")  # allow columns not yet in the table to be added
    .saveAsTable("clinical_trial_bronze.outcomes")
)

Validate the bronze layer

In [0]:
%sql
-- List the tables in clinical_trial_bronze so the result can be compared with
-- the tables written by the ingestion cells.

SHOW TABLES IN clinical_trial_bronze;



In [0]:

%sql
-- Total number of rows currently stored in the bronze patients table.
SELECT
    COUNT(*)
FROM clinical_trial_bronze.patients;



In [0]:
%sql
-- Preview of the bronze patients table: at most 10 rows, no ordering applied.
SELECT *
FROM clinical_trial_bronze.patients
LIMIT 10;


In [0]:
%sql
-- Preview of the bronze adverse-events table: at most 10 rows, no ordering applied.
SELECT *
FROM clinical_trial_bronze.adverse_events
LIMIT 10;


In [0]:
%sql
-- Preview of the bronze drug-dosing table: at most 10 rows, no ordering applied.
SELECT *
FROM clinical_trial_bronze.drug_dosing
LIMIT 10;


In [0]:
%sql
-- Preview of the bronze labs table: at most 10 rows, no ordering applied.
SELECT *
FROM clinical_trial_bronze.labs
LIMIT 10;


In [0]:
%sql
-- Preview of the bronze outcomes table: at most 10 rows, no ordering applied.
SELECT *
FROM clinical_trial_bronze.outcomes
LIMIT 10;


In [0]:
%sql
-- Preview of the bronze visits table: at most 10 rows, no ordering applied.
SELECT *
FROM clinical_trial_bronze.visits
LIMIT 10;
