In [1]:
import os
import pandas as pd
import numpy as np
import json
import requests
import zipfile
import datetime
import re
from tqdm import tqdm
root_path = os.path.join("event")
result_path = os.path.join("event_drug_symptom")

In [6]:
df = pd.read_csv(os.path.join("drug","no_missing_drugs_v2.csv"))
subset = df[df["postings"].str.contains(" ")] # find events with multiple drugs
print(len(subset))

3074136


In [2]:
to_year = {
            '800': 10,          # decade
            '801': 1,           # year
            '802': 1 / 12,      # month
            '803': 1 / (12 * 4),# week
            '804': 1 / 365,     # days
            '805': 1 / 8760,    # hour
        }
decode_sex = {0: 'unknown', 1: "Male", 2: "Female"}

decode_characterization = {
            1: "Suspect (the drug was considered by the reporter to be the cause)",
            2: "Concomitant (the drug was reported as being taken along with the suspect drug)",
            3: "Interacting (the drug was considered by the reporter to have interacted with the suspect drug)"
        }
decode_serious = {
            1: "The adverse event resulted in death, a life threatening condition, hospitalization, disability, \
            congenital anomaly, or other serious condition",
            2: "The adverse event did not result in any of the above"}
decode_death = {
            0: "survive",
            1: "death"
        }
decode_sender = {
            1: "Physician",
            2: "Pharmacist",
            3: "Other health professional",
            4: "Lawyer",
            5: "Consumer or non-health professional",
            0: "unknown"
        }

In [3]:
# from download.py with modification
def download(arg):
    """download file at url and save at dir """
    url, directory = arg
    r = requests.get(url, allow_redirects=True)
    open(directory, 'wb').write(r.content)
    with zipfile.ZipFile(directory, 'r') as zip_ref:
        zip_ref.extractall(os.path.dirname(directory))
    os.remove(directory)

def get_event_links(path):
    """obtain all download links for events from fdc's provided download json"""
    with open(path, 'r') as handle:
        file = json.load(handle)
    events = file['results']["drug"]["event"]
    links = []
    for p in events['partitions']:
        links.append(p['file'])
    return links

def get_ndc_links(path):
    """obtain all download links for events from fdc's provided download json"""
    with open(path, 'r') as handle:
        file = json.load(handle)
    events = file['results']["drug"]["ndc"]
    links = []
    for p in events['partitions']:
        links.append(p['file'])
    return links

links = get_event_links("download.json")
ndc_links = get_ndc_links("download.json")

In [4]:
# Download ndc file
url = ndc_links[0]
infos = url.split("/")[3:] # getting rid of https and website
path = ''
for i in infos:
    path = os.path.join(path, i)
    if i.endswith('.zip'):
        break
    else:
        if not os.path.exists(path):
            os.mkdir(path)
        path = path.strip("\n")
        url = url.strip("\n")
download([url, path])

In [5]:
"""Create a symptom database from symptoms.json"""

symptoms = json.load(open("symptoms.json", "r"))
data = {"id":[], "symptom":[]}
for i, ele in tqdm(enumerate(symptoms)):
    data["id"].append(i)
    data["symptom"].append(ele)
df = pd.DataFrame(data)
df.to_csv(os.path.join("drug","symptoms_dataset.csv"), index=False)

21099it [00:00, 2350481.28it/s]


In [6]:
"""Create datasets for ndc's drug listings and their active ingredients""" 
"""id is associated with the brand_name"""

ndc_data = {"id":[], "brand_name":[], "generic_name":[], "product_type":[], "manufacturer_name":[]}
comp_data = {"id":[], "active_ingredient":[]}

encode_product_type = {"HUMAN OTC DRUG":0, 
                       "HUMAN PRESCRIPTION DRUG":1, 
                       "PLASMA DERIVATIVE":2, 
                       "VACCINE":3, 
                       "BULK INGREDIENT":4, 
                       "DRUG FOR FURTHER PROCESSING":5, 
                       "CELLULAR THERAPY":6, 
                       "LICENSED VACCINE BULK INTERMEDIATE":7,
                       "STANDARDIZED ALLERGENIC":8, "NON-STANDARDIZED ALLERGENIC":9}

with open(os.path.join("drug","ndc", "drug-ndc-0001-of-0001.json")) as ifs:
    ndc = json.load(ifs)
    ndc = ndc["results"] # list of dict
    for i, ele in tqdm(enumerate(ndc)):
        if "generic_name" in ele:
            for name in ele["generic_name"].split(","):
                ndc_data["id"].append(i)
                ndc_data["generic_name"].append(name.strip())
                ndc_data["brand_name"].append(ele["brand_name_base"])
                ndc_data["product_type"].append(encode_product_type[ele["product_type"]])
                if "manufacturer_name" in ele["openfda"]:
                    ndc_data["manufacturer_name"].append(ele["openfda"]["manufacturer_name"][0])
                else:
                    ndc_data["manufacturer_name"].append("__NULL__")
        else:
            ndc_data["id"].append(i)
            ndc_data["generic_name"].append("__NULL__")
            ndc_data["brand_name"].append(ele["brand_name_base"])
            ndc_data["product_type"].append(encode_product_type[ele["product_type"]])
            if "manufacturer_name" in ele["openfda"]:
                ndc_data["manufacturer_name"].append(ele["openfda"]["manufacturer_name"][0])
            else:
                ndc_data["manufacturer_name"].append("__NULL__")
        if "active_ingredients" in ele:
            for comp in ele["active_ingredients"]:
                comp_data["id"].append(i)
                comp_data["active_ingredient"].append(comp["name"])
    ndc_df = pd.DataFrame(ndc_data)
    ndc_df.to_csv(os.path.join("drug","drug_dataset.csv"), index=False)
    comp_df = pd.DataFrame(comp_data)
    comp_df.to_csv(os.path.join("drug","active_ingredient_dataset.csv"), index=False)

126675it [00:00, 460194.93it/s]


In [7]:
"""Drug and Symptom Reverse Lookup"""

class Drug:
    def __init__(self, components=None):
        self.components = set(components) # components are lowercase strings
    def __str__(self):
        return ' '.join([str(x) for x in self.components])
    def __hash__(self):
        return hash(str(self))
    def __len__(self):
        return len(self.components)
    def __list__(self):
        return list(self.components)
    def __eq__(self, other):
        return self.components == other.components

components = {}
drug_to_comp = {}
comp_id = 0
comp_df = pd.read_csv(os.path.join("drug","active_ingredient_dataset.csv"))
for i,row in tqdm(comp_df.iterrows()):
    subset = comp_df.loc[comp_df["id"] == row["id"]]
    comps = subset["active_ingredient"].values
    comps = Drug([x.lower() for x in comps])
    if comps not in components:
        components[comps] = comp_id
        comp_id += 1
    drug_to_comp[row["id"]] = components[comps]
num_drugs = len(components)

#Brand_Name -> component set
drug_vocab = {}
drug_df = pd.read_csv(os.path.join("drug","drug_dataset.csv"))
for i,row in tqdm(drug_df.iterrows()):
    if row["id"] in drug_to_comp:
        try:
            drug_vocab[row["brand_name"].lower()] = drug_to_comp[row["id"]]
        except:
            pass
        try:
            drug_vocab[row["generic_name"].lower()] = drug_to_comp[row["id"]]
        except:
            pass

# Keep track of discovered drugs

missing_vocab = {}
missing_drugs = {"id":[], "brand_name":[]}

# Symptom -> ID
symptom_vocab = {}
symptom_df = pd.read_csv(os.path.join("drug","symptoms_dataset.csv"))
symptom_df['symptom'] = symptom_df['symptom'].str.lower()
for i,row in tqdm(symptom_df.iterrows()):
    symptom_vocab[row["symptom"]] = row["id"]
    
event_id = 0

223302it [02:28, 1505.09it/s]
205302it [00:15, 13315.06it/s]
21099it [00:01, 16386.84it/s]


In [8]:
if not os.path.exists(os.path.join("drug","events")):
    os.mkdir(os.path.join("drug","events"))
    os.mkdir(os.path.join("drug","drugs"))
    os.mkdir(os.path.join("drug","characteristics"))
    os.mkdir(os.path.join("drug","symptoms"))

In [25]:
# Drug Dataset:
    # ID (event)
    # postings list based off drug_dataset
    # missing list based off missing_dataset
# Characterization dataset:
    # ID (event)
    # postings list based off drug_dataset
    # missing list based off missing_dataset
# Symptoms dataset:
    # ID (event)
    # postings list based off symptoms_dataset
    
def func(v):
    # Check if drug in ndc data
    if v in drug_vocab:
        d.append(drug_vocab[v])
        if "drugcharacterization" in ele: 
            c.append(ele["drugcharacterization"])
        else:
            c.append(0)
        return True
    # Check if drug is a known component
    elif Drug([v]) in components:
        d.append(components[Drug([v])])
        if "drugcharacterization" in ele: 
            c.append(ele["drugcharacterization"])
        else:
            c.append(0)
        return True
    return False

#for link_index, url in tqdm(enumerate(links)):
start = 800
end = min(900, len(links))
for link_index in tqdm(range(start, end)):
    url = links[link_index]
    # Download an event file
    infos = url.split("/")[3:] # getting rid of https and website
    path = ''
    for i in infos:
        path = os.path.join(path, i)
        if i.endswith('.zip'):
            break
        else:
            if not os.path.exists(path):
                os.mkdir(path)
            path = path.strip("\n")
            url = url.strip("\n")
    download([url, path])

    folder_name = path.split("\\")[-2]
    file_name = path.split(".")[0]
    split_name = file_name.split("\\")[-1]

    # Grab and decode events
    events = json.load(open(file_name+".json", "r"))
    events = events["results"]

    # Event Dataset:
    event_data = {"ID":[], "sex":[], "age":[], "serious":[], "death":[], "sender":[], "time":[]}
    drug_data = {"ID":[], "postings":[], "missing_imp":[], "missing_not":[]}
    char_data = {"ID":[], "postings":[], "missing_imp":[], "missing_not":[]}
    symp_data = {"ID":[], "postings":[]}

    for event in events:
        patient = event["patient"]

        # Event
        event_data["ID"].append(event_id)
        event_data["serious"].append(event["serious"])

        if "patientsex" in patient:
            event_data["sex"].append(decode_sex[int(patient['patientsex'])])
        else:
            event_data["sex"].append(0)
        try:
            event_data["age"].append(int(patient["patientonsetage"]) * to_year[patient["patientonsetageunit"]])
        except:
            event_data["age"].append(-1)
        if "seriousnessdeath" in event:
            event_data["death"].append(1)
        else:
            event_data["death"].append(0)
        try:
            event_data["sender"].append(decode_sender[int(event["primarysource"]["qualification"])])
        except:
            event_data["sender"].append(0)
        try:
            time = event['receiptdate']
            timestamp = datetime.datetime(int(time[0:4]), int(time[4:6]), int(time[6::]), 0, 0).timestamp()
            event_data["time"].append(timestamp)
        except:
            event_data["time"].append(-1)

        # Drugs & Characteristics
        drug_data["ID"].append(event_id)
        char_data["ID"].append(event_id)
        d = []
        c = []
        
        # Track missing drugs
        # 1. Suspect
        # 2. Concomitant
        # 3. Interacting
        # 0. Not given
        
        # Keep track of suspect / interacting drugs
        m_d_imp = []
        m_c_imp = []
        
        # Versus concomitant / unknown drugs
        m_d_not = []
        m_c_not = []
        for ele in patient["drug"]:
            if "medicinalproduct" in ele:
                if type(ele["medicinalproduct"]) != str:
                    continue
                    
                _drug = ele["medicinalproduct"].lower()
                
                if len(re.findall("(unknown)|(unspecified)|(vitamin)|(inhaler)|(chewable)|(tea)", _drug)) > 0:
                    continue
                
                if not func(_drug):
                    # Pattern 1: Split on (
                    val = _drug.split("(")[0]
                    if func(val):
                        continue
                    
                    # Patern 1.5: Split on -
                    val = _drug.split("-")[0]
                    if func(val):
                        continue
                    
                    # Pattern 2: Grab within ()
                    val = re.findall("\((.+)\)", _drug)
                    if len(val) > 0:
                        if func(val[0]):
                            continue
                    
                    # Pattern 3: Drug + Drug
                    val = _drug.split(" + ")
                    if len(val) == 2:
                        if Drug(val) in components:
                            d.append(components[Drug(val)])
                            if "drugcharacterization" in ele: 
                                c.append(ele["drugcharacterization"])
                            else:
                                c.append(0)
                            continue
                    
                    # Pattern 4: Remove "tablets", \d\d\d ?[um]g
                    val = re.split(" \d", _drug)[0]
                    if func(val):
                        continue
                    
                    val = re.split("( (?:tab)|(?:cap))", _drug)
                    if len(val) > 1:
                        func(val[0])
                        continue
                        
                        
                    # ----- Drug Not Found -----
                    
                    # Check if this missing drug has been seen before
                    if _drug not in missing_vocab:
                        missing_drugs["id"].append(num_drugs)
                        missing_drugs["brand_name"].append(_drug)
                        missing_vocab[_drug] = num_drugs
                        num_drugs += 1
                    
                    if "drugcharacterization" in ele:
                        if ele["drugcharacterization"] == "1" or ele["drugcharacterization"] == "3":
                            m_d_imp.append(missing_vocab[_drug])
                            m_c_imp.append(ele["drugcharacterization"])
                        else:
                            m_d_not.append(missing_vocab[_drug])
                            m_c_not.append(ele["drugcharacterization"])
                    else:
                        m_d_not.append(missing_vocab[_drug])
                        m_c_not.append(0)                    
                
        drug_data["postings"].append(" ".join([str(x) for x in d]))
        char_data["postings"].append(" ".join([str(x) for x in c]))
        drug_data["missing_imp"].append(" ".join([str(x) for x in m_d_imp]))
        char_data["missing_imp"].append(" ".join([str(x) for x in m_c_imp]))
        drug_data["missing_not"].append(" ".join([str(x) for x in m_d_not]))
        char_data["missing_not"].append(" ".join([str(x) for x in m_c_not]))

        # Symptoms
        symp_data["ID"].append(event_id)
        if "reaction" in patient:
            s = []
            for sym in patient["reaction"]:
                if "reactionmeddrapt" in sym:
                    s.append(str(symptom_vocab[sym['reactionmeddrapt'].lower()]))
            symp_data["postings"].append(" ".join(s))
        else:
            symp_data["postings"].append("")

        event_id += 1

    # Remove download
    os.remove(file_name+".json")
    os.rmdir(os.path.join("\\".join(path.split("\\")[0:3])))

    # Write results to file
    if not os.path.exists(os.path.join("drug","events", folder_name)):
        os.mkdir(os.path.join("drug","events", folder_name))
        os.mkdir(os.path.join("drug","drugs", folder_name))
        os.mkdir(os.path.join("drug","characteristics", folder_name))
        os.mkdir(os.path.join("drug","symptoms", folder_name))

    event_df = pd.DataFrame(event_data)
    event_df.to_csv(os.path.join("drug", "events", folder_name, split_name+".csv"), index=False)

    drug_df = pd.DataFrame(drug_data)
    drug_df.to_csv(os.path.join("drug", "drugs", folder_name, split_name+".csv"), index=False)

    char_df = pd.DataFrame(char_data)
    char_df.to_csv(os.path.join("drug", "characteristics", folder_name, split_name+".csv"), index=False)

    symp_df = pd.DataFrame(symp_data)
    symp_df.to_csv(os.path.join("drug", "symptoms", folder_name, split_name+".csv"), index=False)

missing_df = pd.DataFrame(missing_drugs)
missing_df.to_csv(os.path.join("drug","missing_drugs_dataset_"+str(start)+"_"+str(end)+".csv"), index=False)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [12:44<00:00, 15.51s/it]


In [10]:
#missing_df = pd.DataFrame(missing_drugs)
#missing_df = missing_df.drop_duplicates(subset="brand_name")
#print(missing_df.head(20))
print(len(missing_df))
#missing_df.to_csv(os.path.join("drug","missing_drugs_dataset_697.csv"), index=False)

868


In [26]:
print("num missing:", len(missing_drugs["brand_name"]))
print("num events:", event_id)

num missing: 454931
num events: 10432358


In [27]:
df = None
check = False
for folder in tqdm(os.listdir(os.path.join("drug", "drugs"))):
    for file in os.listdir(os.path.join("drug", "drugs", folder)):
        if check:
            df = pd.concat([df, pd.read_csv(os.path.join("drug", "drugs", folder, file))])
        else:
            df = pd.read_csv(os.path.join("drug", "drugs", folder, file))
            check = True

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
100%|██████████████████████████████████████████████████████████████████████████████████| 62/62 [02:03<00:00,  7.05s/it]


In [28]:
df = df.fillna("")
subset = df.loc[df["missing_imp"]==""]
subset = subset.loc[subset["postings"]!=""]
subset = subset[["ID","postings"]]
print(len(subset))
subset.to_csv(os.path.join("drug","no_missing_drugs_v2.csv"), index=False)

7042330


In [32]:
data ={"id":[], "components":[]}
for key in components:
    data["id"].append(components[key])
    data["components"].append(", ".join(key.components))
df = pd.DataFrame.from_dict(data)
df.to_csv("components_dataset.csv",index=False)