In [28]:
import json
from typing import Dict
import re
import pprint

import csv

In [24]:
# The drug-label dump is split into 11 numbered parts named
# 'drug-label-NNNN-of-0011.json'; build the part names from that pattern
# instead of spelling each one out.
_TOTAL_PARTS = 11
data_files = [
    'drug-label-{:04d}-of-{:04d}.json'.format(part_number, _TOTAL_PARTS)
    for part_number in range(1, _TOTAL_PARTS + 1)
]

def iterate_over_files(callback_fn, files=None):
    """Invoke ``callback_fn`` once for every record in each JSON data file.

    Every file is parsed in full with ``json.load`` and must contain an
    object with a ``"results"`` key holding an iterable of records. Files
    are processed in the order given, and records in file order.

    Args:
        callback_fn: Callable taking a single record. Its return value is
            ignored.
        files: Optional iterable of file paths. When omitted, the
            module-level ``data_files`` list is used.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file is not valid JSON.
        KeyError: If a parsed file has no ``"results"`` key.
    """
    if files is None:
        files = data_files
    for data_file in files:
        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default locale encoding.
        with open(data_file, 'r', encoding='utf-8') as f:
            payload = json.load(f)
        # The file handle is already closed here; only the parsed payload
        # is kept while the callback runs.
        for result in payload['results']:
            callback_fn(result)

### Total record count

In [5]:
# A one-entry dict is used as a mutable counter so the callback can update it
# without needing a ``global`` declaration.
count = dict(count=0)


def get_total_count(result: Dict):
    """Callback: add one to the running total for each record seen."""
    count['count'] = count['count'] + 1


iterate_over_files(get_total_count)
print(count)

{'count': 200315}


### Get all key names in records

In [8]:
# Union of every top-level key that appears in at least one record.
unique_columns = set()


def get_all_columns(result: Dict):
    """Callback: merge this record's top-level keys into ``unique_columns``."""
    # Iterating a dict yields its keys, so no intermediate set is needed.
    unique_columns.update(result)


iterate_over_files(get_all_columns)

# Print the collected key names in alphabetical order, one per line.
for column in sorted(unique_columns):
    print(column)

abuse
abuse_table
accessories
active_ingredient
active_ingredient_table
adverse_reactions
adverse_reactions_table
alarms
alarms_table
animal_pharmacology_and_or_toxicology
animal_pharmacology_and_or_toxicology_table
ask_doctor
ask_doctor_or_pharmacist
ask_doctor_or_pharmacist_table
ask_doctor_table
assembly_or_installation_instructions
calibration_instructions
carcinogenesis_and_mutagenesis_and_impairment_of_fertility
carcinogenesis_and_mutagenesis_and_impairment_of_fertility_table
cleaning
clinical_pharmacology
clinical_pharmacology_table
clinical_studies
clinical_studies_table
compatible_accessories
components
components_table
contraindications
contraindications_table
controlled_substance
dependence
dependence_table
description
description_table
diagram_of_device
disposal_and_waste_handling
do_not_use
do_not_use_table
dosage_and_administration
dosage_and_administration_table
dosage_forms_and_strengths
dosage_forms_and_strengths_table
drug_abuse_and_dependence
drug_abuse_and_dependenc

In [36]:
# Matches an application number that begins with "ANDA" or "NDA", capturing
# the prefix and the (possibly empty) run of digits that follows it.
# Written as a raw string so that ``\d`` reaches the regex engine untouched
# instead of being treated as an (invalid) string escape sequence.
DRUGSATFDA_CONCEPT_ID_RE = re.compile(r"(ANDA|NDA)(\d*)")

In [39]:
# newline="" lets the csv module control line endings itself, as its
# documentation requires; without it rows can be separated by blank lines on
# Windows and newlines embedded in quoted fields may be altered. The explicit
# UTF-8 encoding keeps the output independent of the platform's locale.
# The handle stays open because rows are appended by load_record_in_db; it is
# closed once all records have been written.
data_file = open("fda_data.csv", "w", newline="", encoding="utf-8")
writer = csv.writer(data_file)
# write header
# Commented-out names are columns that are currently excluded from the export.
writer.writerow((
    "drug_id", 
    "spl", 
    # "adverse_reactions", 
    # "alarms", 
    # "boxed_warning",
    # "carcinogenesis_and_mutagenesis_and_impairment_of_fertility",
    "clinical_pharmacology", 
    "contraindications", 
    # "drug_and_or_laboratory_test_interactions",
    # "drug_interactions", 
    # "general_precautions", 
    # "geriatric_use",
    "indications_and_usage", 
    # "nonclinical_toxicology",
    # "nonteratogenic_effects", 
    # "pediatric_use", 
    "pharmacodynamics",
    "pharmacogenomics",
    "pharmacokinetics",
    "purpose", 
    # "teratogenic_effects",
    "use_in_specific_populations", 
    # "warnings", 
    # "warnings_and_cautions", 
    "brand_name",
    "generic_name", 
    "substance_name", 
    "application_number"
))

def _join_values(value) -> str:
    """Flatten one record field into a single CSV cell.

    Lists and tuples are joined with ``"|"``, strings are returned unchanged,
    ``None`` becomes an empty string, and anything else is stringified.
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return "|".join(str(item) for item in value)
    return str(value)


def load_record_in_db(result: Dict):
    """Write one record as a CSV row if it carries an ANDA/NDA application number.

    Records whose concatenated ``openfda.application_number`` values do not
    start with a match for ``DRUGSATFDA_CONCEPT_ID_RE`` are skipped. Kept
    records are written through the module-level ``writer``; the column order
    here has to stay in step with the header row written to the same file.

    Every exported field goes through ``_join_values`` so that list-valued
    fields are uniformly ``"|"``-joined rather than some being written as a
    Python list repr. NOTE(review): the openFDA label reference documents the
    label text fields as arrays of strings - confirm against the data files.

    Args:
        result: One record, as passed by ``iterate_over_files``.
    """
    openfda = result.get("openfda", {})
    application_numbers = openfda.get("application_number", [])
    # Pattern.match anchors at the start, so only the leading characters of
    # the concatenated application numbers decide whether the record is kept.
    if not DRUGSATFDA_CONCEPT_ID_RE.match("".join(application_numbers)):
        return
    drug = (
        result.get("id"),
        _join_values(result.get("spl", "")),
        # _join_values(result.get("adverse_reactions", "")),
        # _join_values(result.get("alarms", "")),
        # _join_values(result.get("boxed_warning", "")),
        # _join_values(result.get("carcinogenesis_and_mutagenesis_and_impairment_of_fertility", "")),
        _join_values(result.get("clinical_pharmacology", "")),
        _join_values(result.get("contraindications", "")),
        # _join_values(result.get("drug_and_or_laboratory_test_interactions", "")),
        # _join_values(result.get("drug_interactions", "")),
        # _join_values(result.get("general_precautions", "")),
        # _join_values(result.get("geriatric_use", "")),
        _join_values(result.get("indications_and_usage", "")),
        # _join_values(result.get("nonclinical_toxicology", "")),
        # _join_values(result.get("nonteratogenic_effects", "")),
        # _join_values(result.get("pediatric_use", "")),
        _join_values(result.get("pharmacodynamics", "")),
        _join_values(result.get("pharmacogenomics", "")),
        _join_values(result.get("pharmacokinetics", "")),
        _join_values(result.get("purpose", "")),
        # _join_values(result.get("teratogenic_effects", "")),
        _join_values(result.get("use_in_specific_populations", "")),
        # _join_values(result.get("warnings", "")),
        # _join_values(result.get("warnings_and_cautions", "")),
        _join_values(openfda.get("brand_name", "")),
        _join_values(openfda.get("generic_name", "")),
        _join_values(openfda.get("substance_name", "")),
        _join_values(application_numbers),
    )
    writer.writerow(drug)
    
# Stream every record through the CSV export. The finally clause makes sure
# the output file is flushed and closed even if reading or writing a record
# raises part-way through.
try:
    iterate_over_files(load_record_in_db)
finally:
    data_file.close()