In [1]:
import os
import pandas as pd
import numpy as np
import json
import requests
import zipfile
import datetime
import re
from tqdm import tqdm
root_path = os.path.join("event")
result_path = os.path.join("event_drug_symptom")
drug_path = os.path.join("drug_brands")

In [2]:
to_year = {
            '800': 10,          # decade
            '801': 1,           # year
            '802': 1 / 12,      # month
            '803': 1 / (12 * 4),# week
            '804': 1 / 365,     # days
            '805': 1 / 8760,    # hour
        }
decode_sex = {0: 'unknown', 1: "Male", 2: "Female"}

decode_characterization = {
            1: "Suspect (the drug was considered by the reporter to be the cause)",
            2: "Concomitant (the drug was reported as being taken along with the suspect drug)",
            3: "Interacting (the drug was considered by the reporter to have interacted with the suspect drug)"
        }
decode_serious = {
            1: "The adverse event resulted in death, a life threatening condition, hospitalization, disability, \
            congenital anomaly, or other serious condition",
            2: "The adverse event did not result in any of the above"}
decode_death = {
            0: "survive",
            1: "death"
        }
decode_sender = {
            1: "Physician",
            2: "Pharmacist",
            3: "Other health professional",
            4: "Lawyer",
            5: "Consumer or non-health professional",
            0: "unknown"
        }

In [3]:
# from download.py with modification
def download(arg):
    """download file at url and save at dir """
    url, directory = arg
    r = requests.get(url, allow_redirects=True)
    open(directory, 'wb').write(r.content)
    with zipfile.ZipFile(directory, 'r') as zip_ref:
        zip_ref.extractall(os.path.dirname(directory))
    os.remove(directory)

def get_event_links(path):
    """obtain all download links for events from fdc's provided download json"""
    with open(path, 'r') as handle:
        file = json.load(handle)
    events = file['results']["drug"]["event"]
    links = []
    for p in events['partitions']:
        links.append(p['file'])
    return links

def get_ndc_links(path):
    """obtain all download links for events from fdc's provided download json"""
    with open(path, 'r') as handle:
        file = json.load(handle)
    events = file['results']["drug"]["ndc"]
    links = []
    for p in events['partitions']:
        links.append(p['file'])
    return links

links = get_event_links("download.json")
ndc_links = get_ndc_links("download.json")

In [4]:
# Download ndc file
url = ndc_links[0]
infos = url.split("/")[3:] # getting rid of https and website
path = ''
for i in infos:
    path = os.path.join(path, i)
    if i.endswith('.zip'):
        break
    else:
        if not os.path.exists(path):
            os.mkdir(path)
        path = path.strip("\n")
        url = url.strip("\n")
download([url, path])

In [5]:
"""Create a symptom database from symptoms.json"""

symptoms = json.load(open("symptoms.json", "r"))
data = {"id":[], "symptom":[]}
for i, ele in tqdm(enumerate(symptoms)):
    data["id"].append(i)
    data["symptom"].append(ele)
df = pd.DataFrame(data)
df.to_csv(os.path.join("drug","symptoms_dataset.csv"), index=False)

21099it [00:00, 2644344.11it/s]


In [6]:
"""Drug and Symptom Reverse Lookup"""

class Drug:
    def __init__(self, components=None):
        self.components = set(components) # components are lowercase strings
    def __str__(self):
        return ', '.join([str(x) for x in self.components])
    def __hash__(self):
        return hash(str(self))
    def __len__(self):
        return len(self.components)
    def __list__(self):
        return list(self.components)
    def __eq__(self, other):
        return self.components == other.components


# id,brand_id,brand_name,generic_name,components,product_type,manufacturer_name
drug_df = pd.read_csv(os.path.join("medicine_dataset.csv"))
    
# All these drugs have a brand_name---generic_name pair and components
# Data is clean; all lower-case

brand_vocab = {}      # brand_name -> list of ids

for i,row in tqdm(drug_df.iterrows()):
    brand = row["brand_name"]
    brand_id = row["brand_id"]
    brand_vocab[brand] = brand_id
    
num_drugs = len(brand_vocab)

# Keep track of discovered drugs
missing_vocab = {}
missing_drugs = {"id":[], "drug_name":[]}

# Symptom -> ID
symptom_vocab = {}
symptom_df = pd.read_csv(os.path.join("drug","symptoms_dataset.csv"))
symptom_df['symptom'] = symptom_df['symptom'].str.lower()
for i,row in tqdm(symptom_df.iterrows()):
    symptom_vocab[row["symptom"]] = row["id"]
    
event_id = 0

173528it [00:10, 16299.01it/s]
21099it [00:01, 16336.23it/s]


In [7]:
if not os.path.exists(os.path.join(drug_path,"events")):
    os.mkdir(os.path.join(drug_path,"events"))
    os.mkdir(os.path.join(drug_path,"drugs"))
    os.mkdir(os.path.join(drug_path,"characteristics"))
    os.mkdir(os.path.join(drug_path,"symptoms"))

In [8]:
# Drug Dataset:
    # ID (event)
    # postings list based off drug_dataset
    # missing list based off missing_dataset
# Characterization dataset:
    # ID (event)
    # postings list based off drug_dataset
    # missing list based off missing_dataset
# Symptoms dataset:
    # ID (event)
    # postings list based off symptoms_dataset
    
def match(val):
    # Check if drug is a known brand name
    if val in brand_vocab:
        d.append(brand_vocab[val])
        if "drugcharacterization" in ele: 
            d_c.append(ele["drugcharacterization"])
        else:
            d_c.append(0)
        return True

for link_index, url in tqdm(enumerate(links)):
    url = links[link_index]
    # Download an event file
    infos = url.split("/")[3:] # getting rid of https and website
    path = ''
    for i in infos:
        path = os.path.join(path, i)
        if i.endswith('.zip'):
            break
        else:
            if not os.path.exists(path):
                os.mkdir(path)
            path = path.strip("\n")
            url = url.strip("\n")
    download([url, path])

    folder_name = path.split("\\")[-2]
    file_name = path.split(".")[0]
    split_name = file_name.split("\\")[-1]

    # Grab and decode events
    events = json.load(open(file_name+".json", "r"))
    events = events["results"]

    # Event Dataset:
    event_data = {"ID":[], "sex":[], "age":[], "serious":[], "death":[], "sender":[], "time":[]}
    drug_data = {"ID":[], "brands":[], "missing_imp":[], "missing_not":[]}
    char_data = {"ID":[], "brands":[], "missing_imp":[], "missing_not":[]}
    symp_data = {"ID":[], "symptoms":[]}

    for event in events:
        patient = event["patient"]

        # Event
        event_data["ID"].append(event_id)
        event_data["serious"].append(event["serious"])

        if "patientsex" in patient:
            event_data["sex"].append(decode_sex[int(patient['patientsex'])])
        else:
            event_data["sex"].append(0)
        try:
            event_data["age"].append(int(patient["patientonsetage"]) * to_year[patient["patientonsetageunit"]])
        except:
            event_data["age"].append(-1)
        if "seriousnessdeath" in event:
            event_data["death"].append(1)
        else:
            event_data["death"].append(0)
        try:
            event_data["sender"].append(decode_sender[int(event["primarysource"]["qualification"])])
        except:
            event_data["sender"].append(0)
        try:
            time = event['receiptdate']
            timestamp = datetime.datetime(int(time[0:4]), int(time[4:6]), int(time[6::]), 0, 0).timestamp()
            event_data["time"].append(timestamp)
        except:
            event_data["time"].append(-1)

        # Drugs & Characteristics
        drug_data["ID"].append(event_id)
        char_data["ID"].append(event_id)
        d = []
        g = []
        d_c = []
        g_c = []
        
        # Keep track of suspect / interacting drugs
        m_d_imp = []
        m_c_imp = []
        
        # Versus concomitant / unknown drugs
        m_d_not = []
        m_c_not = []
        for ele in patient["drug"]:
            if "medicinalproduct" in ele:
                if type(ele["medicinalproduct"]) != str:
                    continue
                    
                _drug = ele["medicinalproduct"].lower()
                
                if len(re.findall("(unknown)|(unspecified)|(vitamin)|(inhaler)|(chewable)|(tea)", _drug)) > 0:
                    continue
                
                if not match(_drug):
                    # Pattern 1: Split on (
                    val = _drug.split("(")[0]
                    if match(val):
                        continue
                    
                    # Patern 1.5: Split on -
                    val = _drug.split("-")[0]
                    if match(val):
                        continue
                    
                    # Pattern 2: Grab within ()
                    val = re.findall("\((.+)\)", _drug)
                    if len(val) > 0:
                        if match(val[0]):
                            continue
                    
                    # Pattern 3: Drug + Drug
                    val = re.sub(" + ", " and ", _drug)
                    if match(val):
                        continue
                    
                    # Pattern 4: Remove "tablets", \d\d\d ?[um]g
                    val = re.split(" \d", _drug)[0]
                    if match(val):
                        continue
                    
                    val = re.split("( (?:tab)|(?:cap))", _drug)
                    if len(val) > 1:
                        match(val[0])
                        continue
                        
                    # ----- Drug Not Found -----
                    
                    # Check if this missing drug has been seen before
                    if _drug not in missing_vocab:
                        missing_drugs["id"].append(num_drugs)
                        missing_drugs["drug_name"].append(_drug)
                        missing_vocab[_drug] = num_drugs
                        num_drugs += 1
                    
                    if "drugcharacterization" in ele:
                        if ele["drugcharacterization"] == "1" or ele["drugcharacterization"] == "3":
                            m_d_imp.append(missing_vocab[_drug])
                            m_c_imp.append(ele["drugcharacterization"])
                        else:
                            m_d_not.append(missing_vocab[_drug])
                            m_c_not.append(ele["drugcharacterization"])
                    else:
                        m_d_not.append(missing_vocab[_drug])
                        m_c_not.append(0)                    
                
        drug_data["brands"].append(" ".join([str(x) for x in d]))
        char_data["brands"].append(" ".join([str(x) for x in d_c]))
        drug_data["missing_imp"].append(" ".join([str(x) for x in m_d_imp]))
        char_data["missing_imp"].append(" ".join([str(x) for x in m_c_imp]))
        drug_data["missing_not"].append(" ".join([str(x) for x in m_d_not]))
        char_data["missing_not"].append(" ".join([str(x) for x in m_c_not]))

        # Symptoms
        symp_data["ID"].append(event_id)
        if "reaction" in patient:
            s = []
            for sym in patient["reaction"]:
                if "reactionmeddrapt" in sym:
                    s.append(str(symptom_vocab[sym['reactionmeddrapt'].lower()]))
            symp_data["symptoms"].append(" ".join(s))
        else:
            symp_data["symptoms"].append("")

        event_id += 1

    # Remove download
    os.remove(file_name+".json")
    os.rmdir(os.path.join("\\".join(path.split("\\")[0:3])))

    # Write results to file
    if not os.path.exists(os.path.join(drug_path,"events", folder_name)):
        os.mkdir(os.path.join(drug_path,"events", folder_name))
        os.mkdir(os.path.join(drug_path,"drugs", folder_name))
        os.mkdir(os.path.join(drug_path,"characteristics", folder_name))
        os.mkdir(os.path.join(drug_path,"symptoms", folder_name))

    event_df = pd.DataFrame(event_data)
    event_df.to_csv(os.path.join(drug_path, "events", folder_name, split_name+".csv"), index=False)

    drug_df = pd.DataFrame(drug_data)
    drug_df.to_csv(os.path.join(drug_path, "drugs", folder_name, split_name+".csv"), index=False)

    char_df = pd.DataFrame(char_data)
    char_df.to_csv(os.path.join(drug_path, "characteristics", folder_name, split_name+".csv"), index=False)

    symp_df = pd.DataFrame(symp_data)
    symp_df.to_csv(os.path.join(drug_path, "symptoms", folder_name, split_name+".csv"), index=False)

missing_df = pd.DataFrame(missing_drugs)
missing_df.to_csv(os.path.join(drug_path,"missing_drugs_dataset.csv"), index=False)

900it [2:18:08, 19.93s/it]


In [9]:
print("num missing:", len(missing_drugs["drug_name"]))
print("num events:", event_id)

num missing: 477831
num events: 10432358


In [10]:
df = None
check = False
for folder in tqdm(os.listdir(os.path.join(drug_path, "drugs"))):
    for file in os.listdir(os.path.join(drug_path, "drugs", folder)):
        if check:
            df = pd.concat([df, pd.read_csv(os.path.join(drug_path, "drugs", folder, file))])
        else:
            df = pd.read_csv(os.path.join(drug_path, "drugs", folder, file))
            check = True

100%|██████████████████████████████████████████████████████████████████████████████████| 62/62 [01:59<00:00,  5.44s/it]


In [11]:
df = df.fillna("")
subset = df.loc[df["missing_imp"]==""]
subset = subset[subset["brands"]!=""]
subset = subset[subset["brands"].str.contains(" ")]
subset = subset[["ID","brands"]]
print(len(subset))
subset.to_csv(os.path.join(drug_path,"no_missing_drugs.csv"), index=False)

2742121


In [12]:
drug_count = {}
for i, row in tqdm(subset.iterrows()):
    b = row["brands"].split(" ")
    for ele in b:
        if ele not in drug_count:
            drug_count[ele] = 0
        drug_count[ele] += 1
        
counts = {"id":[], "count":[]}
for ele in tqdm(drug_count):
    counts["id"].append(ele)
    counts["count"].append(drug_count[ele])
sup_df = pd.DataFrame.from_dict(counts)

2742121it [02:41, 16967.95it/s]
100%|█████████████████████████████████████████████████████████████████████████| 5521/5521 [00:00<00:00, 1845306.59it/s]


In [18]:
print(len(sup_df))
print(len(sup_df.loc[sup_df["count"]>len(sup_df)*0.1]))

5521
1415


In [19]:
vocab = {}
basket_size = {}
minsup = sup_df.loc[sup_df["count"]>len(sup_df)*0.1]
#minsup = sup_df.loc[sup_df["count"]>10000]
for i, row in minsup.iterrows():
    vocab[row["id"]] = row["count"]
out_data = {"ID":[], "brands":[]}

#for i, row in tqdm(subset.iterrows()):
#    b = row["brands"].split(" ")
#    if len(b) not in basket_size:
#        basket_size[len(b)] = 0
#    basket_size[len(b)] += 1
    
for i, row in tqdm(subset.iterrows()):
    b = row["brands"].split(" ")
    #if basket_size[len(b)] > 1000:
    #    continue
    check = True
    for ele in b:
        if ele not in vocab:
            check = False
            break
    if check:
        out_data["ID"].append(row["ID"])
        out_data["brands"].append(row["brands"])
test_df = pd.DataFrame.from_dict(out_data)
print(len(test_df))
test_df.to_csv(os.path.join(drug_path,"no_missing_drugs_minsup_1.csv"), index=False)

2742121it [03:05, 14749.17it/s]


2523823
