#### Create Silver Layer (Medallion Architecture) Tables from raw unzipped JSON files for Human Drug Adverse Event dataset

- This notebook is Step 2 of the sample: it creates flattened Silver Layer tables from the raw JSON files downloaded by the first notebook.
- On successful execution of the Notebook you will have three Delta Parquet tables available in the Lakehouse: fda_drug_event, fda_drug_event_patient_drug and fda_drug_event_patient_reaction
- The JSON in the original dataset is deeply nested and not well suited to analytics, so this notebook flattens it by a few levels. A few columns still contain nested JSON; the idea is to build Gold Layer tables on top of these tables based on specific use cases.
- Delta Parquet format is highly compressed so size of files for the three tables is going to be in the range of 15-20 GB
- Once the tables are created, you can delete the raw files if you prefer to save costs.




In [1]:
%%sql

-- Drop the three Silver Layer tables if they already exist.
-- Later cells in this notebook write all three of them again in Delta format.
DROP TABLE IF EXISTS fda_drug_event;
DROP TABLE IF EXISTS fda_drug_event_patient_drug;
DROP TABLE IF EXISTS fda_drug_event_patient_reaction;

StatementMeta(, , -1, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

In [2]:
from pyspark.sql.functions import col, explode_outer
from pyspark.sql.types import ArrayType, StructType

#define the function to flatten json columns in dataframe
def flatten_dataframe(df):
    """Expand every top-level struct column of ``df`` into plain columns.

    While the schema still contains a struct-typed column, the first such
    column is replaced by one column per child field. The remaining columns
    keep their relative order and the child columns are appended after them.
    Each selected column is aliased with dots replaced by underscores, so a
    child ``b`` of struct ``a`` becomes ``a_b``. Structs nested inside structs
    are expanded on later passes. Array columns are left untouched, including
    arrays of structs.

    Parameters:
        df: DataFrame whose struct columns should be expanded.

    Returns:
        A DataFrame with no top-level struct columns. If ``df`` has none to
        begin with, ``df`` itself is returned unchanged.
    """
    while True:
        schema_fields = df.schema.fields

        # Pick the first struct-typed column that is still present, if any.
        struct_field = next(
            (candidate for candidate in schema_fields
             if isinstance(candidate.dataType, StructType)),
            None,
        )
        if struct_field is None:
            return df

        parent = struct_field.name
        kept_paths = [f.name for f in schema_fields if f.name != parent]
        child_paths = [f"{parent}.{child}" for child in struct_field.dataType.fieldNames()]

        # "parent.child" selects the nested field; the alias gives it a flat name.
        df = df.select(
            *[col(path).alias(path.replace(".", "_")) for path in kept_paths + child_paths]
        )

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 6, Finished, Available)

In [3]:
# Read every *.json file under Files/fda_ds_unzipped into a single DataFrame.
# The "multiline" option is switched on; presumably each file is one JSON
# document spread over many lines - verify against the raw files.
df = spark.read.option("multiline", "true").json("Files/fda_ds_unzipped/*.json")
# Print the inferred schema so the nesting can be inspected before flattening.
df.printSchema()

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 7, Finished, Available)

root
 |-- meta: struct (nullable = true)
 |    |-- disclaimer: string (nullable = true)
 |    |-- last_updated: string (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- results: struct (nullable = true)
 |    |    |-- limit: long (nullable = true)
 |    |    |-- skip: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |-- terms: string (nullable = true)
 |-- results: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- authoritynumb: string (nullable = true)
 |    |    |-- companynumb: string (nullable = true)
 |    |    |-- duplicate: string (nullable = true)
 |    |    |-- fulfillexpeditecriteria: string (nullable = true)
 |    |    |-- occurcountry: string (nullable = true)
 |    |    |-- patient: struct (nullable = true)
 |    |    |    |-- drug: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- actiondrug: string (nullable = true)
 |    

In [4]:
from pyspark.sql.functions import explode

# "results" is an array column, so explode() is used to create one row per
# element of the array; the column is replaced in place by the element.
# NOTE(review): df is reassigned here, so this cell is presumably not safe to
# re-run on its own without re-running the read cell first - confirm.
df= df.withColumn("results", explode("results"))
# Drop the file-level "meta" column.
df = df.drop("meta")
df.printSchema()

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 8, Finished, Available)

root
 |-- results: struct (nullable = true)
 |    |-- authoritynumb: string (nullable = true)
 |    |-- companynumb: string (nullable = true)
 |    |-- duplicate: string (nullable = true)
 |    |-- fulfillexpeditecriteria: string (nullable = true)
 |    |-- occurcountry: string (nullable = true)
 |    |-- patient: struct (nullable = true)
 |    |    |-- drug: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- actiondrug: string (nullable = true)
 |    |    |    |    |-- activesubstance: struct (nullable = true)
 |    |    |    |    |    |-- activesubstancename: string (nullable = true)
 |    |    |    |    |-- drugadditional: string (nullable = true)
 |    |    |    |    |-- drugadministrationroute: string (nullable = true)
 |    |    |    |    |-- drugauthorizationnumb: string (nullable = true)
 |    |    |    |    |-- drugbatchnumb: string (nullable = true)
 |    |    |    |    |-- drugcharacterization: string (nullable = true)


In [5]:
#function to help with renaming of columns
def renameColumnPrefix(df, prefix, replaceValue = ""):
    """Replace a leading ``prefix`` in column names with ``replaceValue``.

    Only columns whose name starts with ``prefix`` are renamed, and only that
    leading occurrence is replaced. Later occurrences of the same text inside
    the name are kept. All other columns pass through unchanged, and column
    order is preserved.

    Parameters:
        df: DataFrame whose columns should be renamed.
        prefix: Leading text to look for in each column name.
        replaceValue: Text put in place of the prefix. Defaults to "" which
            simply strips the prefix.

    Returns:
        A new DataFrame selecting every column of ``df`` under its new name.
    """
    def _new_name(name):
        # Slice instead of str.replace so only the leading prefix is touched.
        if name.startswith(prefix):
            return replaceValue + name[len(prefix):]
        return name

    # "name" is used rather than "col" to avoid shadowing pyspark's col().
    return df.select([df[name].alias(_new_name(name)) for name in df.columns])

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 9, Finished, Available)

In [6]:
# Flatten the struct columns using flatten_dataframe (defined in an earlier cell).
# Nested field names are joined with "_", so the resulting columns carry a
# "results_" prefix, as the printed schema shows.
df = flatten_dataframe(df)
df.printSchema()

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 10, Finished, Available)

root
 |-- results_authoritynumb: string (nullable = true)
 |-- results_companynumb: string (nullable = true)
 |-- results_duplicate: string (nullable = true)
 |-- results_fulfillexpeditecriteria: string (nullable = true)
 |-- results_occurcountry: string (nullable = true)
 |-- results_primarysourcecountry: string (nullable = true)
 |-- results_receiptdate: string (nullable = true)
 |-- results_receiptdateformat: string (nullable = true)
 |-- results_receivedate: string (nullable = true)
 |-- results_receivedateformat: string (nullable = true)
 |-- results_reportduplicate: string (nullable = true)
 |-- results_reporttype: string (nullable = true)
 |-- results_safetyreportid: string (nullable = true)
 |-- results_safetyreportversion: string (nullable = true)
 |-- results_serious: string (nullable = true)
 |-- results_seriousnesscongenitalanomali: string (nullable = true)
 |-- results_seriousnessdeath: string (nullable = true)
 |-- results_seriousnessdisabling: string (nullable = true)
 |

In [7]:
# Rename the columns to remove the "results_" prefix that flattening put in
# front of every column name.
df = renameColumnPrefix(df, "results_")
df.printSchema()

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 11, Finished, Available)

root
 |-- authoritynumb: string (nullable = true)
 |-- companynumb: string (nullable = true)
 |-- duplicate: string (nullable = true)
 |-- fulfillexpeditecriteria: string (nullable = true)
 |-- occurcountry: string (nullable = true)
 |-- primarysourcecountry: string (nullable = true)
 |-- receiptdate: string (nullable = true)
 |-- receiptdateformat: string (nullable = true)
 |-- receivedate: string (nullable = true)
 |-- receivedateformat: string (nullable = true)
 |-- reportduplicate: string (nullable = true)
 |-- reporttype: string (nullable = true)
 |-- safetyreportid: string (nullable = true)
 |-- safetyreportversion: string (nullable = true)
 |-- serious: string (nullable = true)
 |-- seriousnesscongenitalanomali: string (nullable = true)
 |-- seriousnessdeath: string (nullable = true)
 |-- seriousnessdisabling: string (nullable = true)
 |-- seriousnesshospitalization: string (nullable = true)
 |-- seriousnesslifethreatening: string (nullable = true)
 |-- seriousnessother: string 

In [8]:
# Build the patient reaction DataFrame in one expression:
#   1. keep the report id together with the "patient_reaction" array,
#   2. explode_outer the array into one row per element,
#   3. flatten the resulting struct into plain columns,
#   4. strip the "patient_reaction_" prefix from the flattened names.
patient_reaction_df = renameColumnPrefix(
    flatten_dataframe(
        df.select("safetyreportid", "patient_reaction").withColumn(
            "patient_reaction", explode_outer("patient_reaction")
        )
    ),
    'patient_reaction_',
)

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 12, Finished, Available)

In [9]:
# Save the patient reaction table in Delta format, overwriting any existing
# data at Tables/fda_drug_event_patient_reaction.
patient_reaction_df.write.mode("overwrite").format("delta").save("Tables/fda_drug_event_patient_reaction")

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 13, Finished, Available)

In [10]:
# Remove the patient_reaction column from df, as its contents have already
# been written to the separate patient reaction table.
df = df.drop("patient_reaction")
df.printSchema()

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 14, Finished, Available)

root
 |-- authoritynumb: string (nullable = true)
 |-- companynumb: string (nullable = true)
 |-- duplicate: string (nullable = true)
 |-- fulfillexpeditecriteria: string (nullable = true)
 |-- occurcountry: string (nullable = true)
 |-- primarysourcecountry: string (nullable = true)
 |-- receiptdate: string (nullable = true)
 |-- receiptdateformat: string (nullable = true)
 |-- receivedate: string (nullable = true)
 |-- receivedateformat: string (nullable = true)
 |-- reportduplicate: string (nullable = true)
 |-- reporttype: string (nullable = true)
 |-- safetyreportid: string (nullable = true)
 |-- safetyreportversion: string (nullable = true)
 |-- serious: string (nullable = true)
 |-- seriousnesscongenitalanomali: string (nullable = true)
 |-- seriousnessdeath: string (nullable = true)
 |-- seriousnessdisabling: string (nullable = true)
 |-- seriousnesshospitalization: string (nullable = true)
 |-- seriousnesslifethreatening: string (nullable = true)
 |-- seriousnessother: string 

In [11]:
# Build the patient drug DataFrame in one expression:
#   1. keep the report id together with the "patient_drug" array,
#   2. explode_outer the array into one row per element,
#   3. flatten the resulting struct into plain columns,
#   4. strip the "patient_drug_" prefix from the flattened names.
patient_drug_df = renameColumnPrefix(
    flatten_dataframe(
        df.select("safetyreportid", "patient_drug").withColumn(
            "patient_drug", explode_outer("patient_drug")
        )
    ),
    'patient_drug_',
)

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 15, Finished, Available)

In [12]:
# Save the patient drug table in Delta format, overwriting any existing data
# at Tables/fda_drug_event_patient_drug.
patient_drug_df.write.mode("overwrite").format("delta").save("Tables/fda_drug_event_patient_drug")

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 16, Finished, Available)

In [13]:
# Drop the patient_drug column from df; its contents have already been
# written to the separate patient drug table, so it is no longer needed here.
df = df.drop("patient_drug")
df.printSchema()

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 17, Finished, Available)

root
 |-- authoritynumb: string (nullable = true)
 |-- companynumb: string (nullable = true)
 |-- duplicate: string (nullable = true)
 |-- fulfillexpeditecriteria: string (nullable = true)
 |-- occurcountry: string (nullable = true)
 |-- primarysourcecountry: string (nullable = true)
 |-- receiptdate: string (nullable = true)
 |-- receiptdateformat: string (nullable = true)
 |-- receivedate: string (nullable = true)
 |-- receivedateformat: string (nullable = true)
 |-- reportduplicate: string (nullable = true)
 |-- reporttype: string (nullable = true)
 |-- safetyreportid: string (nullable = true)
 |-- safetyreportversion: string (nullable = true)
 |-- serious: string (nullable = true)
 |-- seriousnesscongenitalanomali: string (nullable = true)
 |-- seriousnessdeath: string (nullable = true)
 |-- seriousnessdisabling: string (nullable = true)
 |-- seriousnesshospitalization: string (nullable = true)
 |-- seriousnesslifethreatening: string (nullable = true)
 |-- seriousnessother: string 

In [14]:
# Save the fda_drug_event table in Delta format, overwriting any existing
# data at Tables/fda_drug_event.
df.write.mode("overwrite").format("delta").save("Tables/fda_drug_event")

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 18, Finished, Available)

In [15]:
%%sql

-- Row counts for each of the three tables written by this notebook.
select count(*) from fda_drug_event;
select count(*) from fda_drug_event_patient_drug;
select count(*) from fda_drug_event_patient_reaction;

StatementMeta(, , -1, Finished, Available)

<Spark SQL result set with 1 rows and 1 fields>

<Spark SQL result set with 1 rows and 1 fields>

<Spark SQL result set with 1 rows and 1 fields>

In [16]:
%%sql

-- Count the rows of fda_drug_event_patient_drug per medicinalproduct value,
-- listing the products with the most rows first.
select medicinalproduct, count(*) as num_adverse_events 
from fda_drug_event_patient_drug 
group by medicinalproduct
order by num_adverse_events desc

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 22, Finished, Available)

<Spark SQL result set with 1000 rows and 2 fields>

In [17]:
%%sql

-- Show all columns of the patient drug table to inspect the flattened result.
select * from fda_drug_event_patient_drug;

StatementMeta(, 426ccb54-d11f-40de-8c90-65bd4e0de6cf, 23, Finished, Available)

<Spark SQL result set with 1000 rows and 45 fields>