This notebook pulls adverse-event reports from the FDA Adverse Event Reporting System (FAERS) via the openFDA API. Each FAERS report usually contains:

- Patient information: age, sex, weight, country.
- Drug information: the name of the drug taken, dose (if reported), route (oral, IV, etc.), when it was started/stopped.
- Adverse events: side effects, negative reactions, whether the patient had to be hospitalized, disability, or even death.
- Outcome: categories like recovered, ongoing, hospitalization, life-threatening, death.
- Reporter: can be doctors, pharmacists, patients, or manufacturers.


Limitations
- Reporting is voluntary for patients and healthcare providers (though mandatory for manufacturers).
- Not every side effect is reported (underreporting).
- The data is not proof that the drug caused the event — just that it happened after the drug was used.
- Dosage information isn’t always filled in.
- The data are skewed toward adverse outcomes (there are no “drug worked well” reports), so they are best suited to studying drug safety, not efficacy.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)]()

In [None]:
# https://colab.research.google.com/github/zhimingkuang/Harvard-AM-115/blob/main/05_model_fitting/fit_yeast_ode.ipynb

## Option 1: Pull every drug each patient was taking, not just the target drug

In [None]:
import requests, pandas as pd, time

# openFDA drug adverse-event endpoint queried by fetch_page().
BASE = "https://api.fda.gov/drug/event.json"
# Reports requested per call (sent as the `limit` query parameter).
# NOTE(review): openFDA's documented ceiling for `limit` may be higher than 100 — confirm.
PAGE_SIZE = 100  # page size per request
# Upper bound on the `skip` offset paged through by collect(); adjust as needed.
# NOTE(review): the original note cited an API limit of ~2400/day — verify
# against openFDA's current rate-limit documentation.
MAX_RECORDS = 1000  # adjust as needed

# Target drug: pembrolizumab (brand name Keytruda), noted by the author as a
# first-line NSCLC drug. Used as the search phrase for the report query.
DRUG_TERM = "PEMBROLIZUMAB"


def fetch_page(skip, drug_term=None):
    """Fetch one page of adverse-event reports from the openFDA endpoint.

    Args:
        skip: Pagination offset, sent as the ``skip`` query parameter.
        drug_term: Optional drug name. When truthy, the query is restricted to
            reports whose ``patient.drug.medicinalproduct`` field matches it
            as a quoted phrase; otherwise no ``search`` filter is sent.

    Returns:
        The list stored under ``"results"`` in the JSON response, or an empty
        list when that key is absent or the API reports no matching records.

    Raises:
        requests.HTTPError: For any error status other than 404.
    """
    params = {"limit": PAGE_SIZE, "skip": skip}
    if drug_term:
        params["search"] = f'patient.drug.medicinalproduct:"{drug_term}"'
    r = requests.get(BASE, params=params, timeout=60)
    # openFDA signals "no matching records" with HTTP 404 instead of an empty
    # result list. Treat it as an empty page so the caller's "stop on empty
    # page" logic ends the run cleanly instead of losing everything to an
    # exception.
    # NOTE(review): this also appears to be the response when `skip` runs past
    # the last match — confirm against the openFDA docs.
    if r.status_code == 404:
        return []
    r.raise_for_status()
    return r.json().get("results", [])


def flatten(results):
    """Flatten raw report dicts into one row per (report, drug) pair.

    Every drug listed on a report yields its own row; the report-level and
    patient-level fields (including the death flag and the reaction list) are
    repeated on each of those rows. Reports without any drug entry yield no
    rows.

    Args:
        results: Iterable of report dicts as returned under ``"results"`` by
            the API.

    Returns:
        A list of dicts, one per drug entry, with the keys ``patientid``,
        ``age``, ``age_unit``, ``sex``, ``weight``, ``drug``, ``indication``,
        ``dose_text``, ``route``, ``start_date``, ``end_date``,
        ``outcome_dead``, ``reactions``, ``receivedate`` and ``country``.
        Missing source fields come through as ``None``.
    """
    rows = []
    for item in results:
        # `patient` (and its sub-lists) may be absent *or* explicitly null;
        # `.get(key, default)` only covers the first case, so normalise with `or`.
        patient = item.get("patient") or {}
        drugs = patient.get("drug") or []
        reactions = patient.get("reaction") or []

        # Dead if the report carries the death seriousness flag or a death date.
        death_flag = (item.get("seriousnessdeath") == "1") or (
            (patient.get("patientdeath") or {}).get("patientdeathdate") is not None
        )

        # The reaction terms are the same for every drug on this report, so
        # extract them once; malformed (non-dict) entries are skipped.
        reaction_terms = [
            rx.get("reactionmeddrapt") for rx in reactions if isinstance(rx, dict)
        ]

        for d in drugs:
            rows.append(
                {
                    # --- Patient ---
                    "patientid": item.get("safetyreportid"),
                    "age": patient.get("patientonsetage"),
                    "age_unit": patient.get("patientonsetageunit"),
                    "sex": patient.get("patientsex"),  # 1=Male, 2=Female
                    "weight": patient.get("patientweight"),
                    # --- Drug info ---
                    "drug": d.get("medicinalproduct"),
                    "indication": d.get("drugindication"),
                    "dose_text": d.get("drugdosagetext"),
                    "route": d.get("drugadministrationroute"),
                    "start_date": d.get("drugstartdate"),
                    "end_date": d.get("drugenddate"),
                    # --- Outcome (focus: death yes/no) ---
                    "outcome_dead": death_flag,
                    # --- Reactions (list of terms) ---
                    # Fresh copy per row so rows never share a mutable list.
                    "reactions": list(reaction_terms),
                    # --- Metadata ---
                    "receivedate": item.get("receivedate"),
                    "country": item.get("occurcountry"),
                }
            )
    return rows


def collect(max_records=1000, drug_term=None):
    """Page through the API and return all flattened rows as a DataFrame.

    Pages of ``PAGE_SIZE`` reports are fetched with increasing ``skip`` until
    the offset reaches ``max_records``, a page comes back empty, or a page
    comes back short (fewer than ``PAGE_SIZE`` reports).

    Args:
        max_records: Upper bound on the ``skip`` offset. It counts *reports*,
            not output rows, and is effectively rounded up to a whole number
            of pages.
        drug_term: Optional drug name forwarded to ``fetch_page``.

    Returns:
        A ``pandas.DataFrame`` built from the rows produced by ``flatten``.
        Because each report expands to one row per drug, it can hold more
        rows than ``max_records``.
    """
    all_rows, skip = [], 0
    while skip < max_records:
        res = fetch_page(skip, drug_term)
        if not res:
            break
        all_rows.extend(flatten(res))
        if len(res) < PAGE_SIZE:
            # fetch_page always asks for PAGE_SIZE reports, so a short page is
            # the last one. Stopping here avoids a pointless extra request
            # (and sleep) past the end of the result set.
            break
        skip += PAGE_SIZE
        time.sleep(0.25)  # polite pause
    return pd.DataFrame(all_rows)

In [None]:
# --- Run ---
# Page through up to MAX_RECORDS reports that mention DRUG_TERM and flatten them
# into one row per (report, drug) pair.
df = collect(MAX_RECORDS, DRUG_TERM)
# Optional export and quick preview, left disabled by the author:
# df.to_csv("faers_keytruda_death.csv", index=False)
# print(f"Saved {len(df)} rows. Example rows:")
# print(df.head(5))

In [None]:
# Display the first 200 rows of the flattened table.
df.head(200)

Unnamed: 0,patientid,age,age_unit,sex,weight,drug,indication,dose_text,route,start_date,end_date,outcome_dead,reactions,receivedate,country
0,10222779,73,801,1,78.9,TORASEMIDE,PRODUCT USED FOR UNKNOWN INDICATION,2-2-0,048,,,True,[Cardiomyopathy],20140606,DE
1,10222779,73,801,1,78.9,SPIRONOLACTONE.,PRODUCT USED FOR UNKNOWN INDICATION,1-0-0,065,,,True,[Cardiomyopathy],20140606,DE
2,10222779,73,801,1,78.9,RAMIPRIL.,PRODUCT USED FOR UNKNOWN INDICATION,1-0-1,048,20071101,,True,[Cardiomyopathy],20140606,DE
3,10222779,73,801,1,78.9,ASPIRIN.,PRODUCT USED FOR UNKNOWN INDICATION,0-1-0,048,20071101,,True,[Cardiomyopathy],20140606,DE
4,10222779,73,801,1,78.9,HEPARIN,PRODUCT USED FOR UNKNOWN INDICATION,"800 IU, QH",065,,,True,[Cardiomyopathy],20140606,DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,10333509,55,801,2,50.2,ZOLPIDEM,ANXIETY,"10 MG, QD,FORMATION:PILL",048,20110221,,False,"[Headache, Meningitis aseptic]",20140722,BE
196,10333509,55,801,2,50.2,ACETAMINOPHEN.,PREMEDICATION,TOTAL DAILY DOSE 1G. FREQUENCY: OTHER,048,20130926,,False,"[Headache, Meningitis aseptic]",20140722,BE
197,10333509,55,801,2,50.2,ALBUTEROL.,ASTHMA,TOTAL DAILY DOSE 100 MICROGRAM. FREQUENCY: AS ...,055,200512,,False,"[Headache, Meningitis aseptic]",20140722,BE
198,10333509,55,801,2,50.2,BUDESONIDE.,ASTHMA,"FORMULATION: INHALANT; 500 MICROGRAM, BID",055,20131203,,False,"[Headache, Meningitis aseptic]",20140722,BE


## Option 2: Pull only the drug that we are targeting

In [None]:
import requests, pandas as pd, time

# openFDA drug adverse-event endpoint queried by fetch_page().
BASE = "https://api.fda.gov/drug/event.json"
# Reports requested per call (sent as the `limit` query parameter).
PAGE_SIZE = 100
# Upper bound on the `skip` offset paged through by collect().
MAX_RECORDS = 1000
# Names matched (as upper-case substrings) by is_target_drug().
DRUG_TERM = "PEMBROLIZUMAB"  # Keytruda generic
BRAND_TERM = "KEYTRUDA"  # brand name (optional)

# When True, flatten() drops drug entries whose drugcharacterization is not "1".
ONLY_SUSPECT = True  # keep only suspect drugs (drugcharacterization == "1")


def fetch_page(skip, drug_term=None):
    """Fetch one page of adverse-event reports from the openFDA endpoint.

    Args:
        skip: Pagination offset, sent as the ``skip`` query parameter.
        drug_term: Optional drug name. When truthy, the query is restricted to
            reports whose ``patient.drug.medicinalproduct`` field matches it
            as a quoted phrase; otherwise no ``search`` filter is sent.

    Returns:
        The list stored under ``"results"`` in the JSON response, or an empty
        list when that key is absent or the API reports no matching records.

    Raises:
        requests.HTTPError: For any error status other than 404.
    """
    params = {"limit": PAGE_SIZE, "skip": skip}
    if drug_term:
        # quotes -> phrase match; this fetches reports mentioning the drug anywhere
        params["search"] = f'patient.drug.medicinalproduct:"{drug_term}"'
    r = requests.get(BASE, params=params, timeout=60)
    # openFDA signals "no matching records" with HTTP 404 instead of an empty
    # result list. Treat it as an empty page so the caller's "stop on empty
    # page" logic ends the run cleanly instead of losing everything to an
    # exception.
    # NOTE(review): this also appears to be the response when `skip` runs past
    # the last match — confirm against the openFDA docs.
    if r.status_code == 404:
        return []
    r.raise_for_status()
    return r.json().get("results", [])


def is_target_drug(name: str) -> bool:
    """Tell whether *name* refers to the drug under study.

    The name is stripped of surrounding whitespace and upper-cased, then
    checked for ``DRUG_TERM`` or ``BRAND_TERM`` as a substring. Anything that
    is not a string is never a match.
    """
    if isinstance(name, str):
        normalized = name.strip().upper()
        if DRUG_TERM in normalized:
            return True
        return BRAND_TERM in normalized
    return False


def flatten(results):
    """Build one flat row per target-drug entry found in the raw reports.

    Only drug entries whose product name passes ``is_target_drug`` are kept.
    When ``ONLY_SUSPECT`` is truthy, the entry must also have
    ``drugcharacterization == "1"``. Report-level and patient-level fields are
    repeated on every row produced for that report.

    Args:
        results: Iterable of report dicts as returned under ``"results"``.

    Returns:
        A list of row dicts; empty when nothing qualifies.
    """
    rows = []
    for report in results:
        patient = report.get("patient", {}) or {}
        drug_entries = patient.get("drug", []) or []
        reaction_entries = patient.get("reaction", []) or []

        # Dead when the seriousness flag says so, otherwise when a death date
        # is recorded for the patient.
        if report.get("seriousnessdeath") == "1":
            died = True
        else:
            death_info = patient.get("patientdeath") or {}
            died = death_info.get("patientdeathdate") is not None

        for entry in drug_entries:
            product = (entry.get("medicinalproduct") or "").upper()
            keep = is_target_drug(product) and (
                not ONLY_SUSPECT or entry.get("drugcharacterization") == "1"
            )
            if not keep:
                continue

            # Patient-level columns first, so the column order stays stable.
            row = {
                "patientid": report.get("safetyreportid"),
                "age": patient.get("patientonsetage"),
                "age_unit": patient.get("patientonsetageunit"),
                "sex": patient.get("patientsex"),  # 1=Male, 2=Female, 0/UNK
                "weight": patient.get("patientweight"),
            }
            # Drug-level columns.
            row["drug"] = entry.get("medicinalproduct")
            row["drugcharacterization"] = entry.get("drugcharacterization")  # 1=suspect
            row["indication"] = entry.get("drugindication")
            row["dose_text"] = entry.get("drugdosagetext")
            row["route"] = entry.get("drugadministrationroute")
            row["start_date"] = entry.get("drugstartdate")
            row["end_date"] = entry.get("drugenddate")
            # Outcome, reactions, and report metadata.
            row["outcome_dead"] = died
            row["reactions"] = [
                reaction.get("reactionmeddrapt")
                for reaction in reaction_entries
                if isinstance(reaction, dict)
            ]
            row["receivedate"] = report.get("receivedate")
            row["country"] = report.get("occurcountry")
            rows.append(row)
    return rows


def collect(max_records=1000, drug_term=None):
    """Page through the API and return all flattened rows as a DataFrame.

    Pages of ``PAGE_SIZE`` reports are fetched with increasing ``skip`` until
    the offset reaches ``max_records``, a page comes back empty, or a page
    comes back short (fewer than ``PAGE_SIZE`` reports).

    Args:
        max_records: Upper bound on the ``skip`` offset. It counts *reports*,
            not output rows, and is effectively rounded up to a whole number
            of pages.
        drug_term: Optional drug name forwarded to ``fetch_page``.

    Returns:
        A ``pandas.DataFrame`` built from the rows produced by ``flatten``.
        The row count depends on how many drug entries survive flattening and
        need not equal the number of reports fetched.
    """
    all_rows, skip = [], 0
    while skip < max_records:
        res = fetch_page(skip, drug_term)
        if not res:
            break
        all_rows.extend(flatten(res))
        if len(res) < PAGE_SIZE:
            # fetch_page always asks for PAGE_SIZE reports, so a short page is
            # the last one. Stopping here avoids a pointless extra request
            # (and sleep) past the end of the result set.
            break
        skip += PAGE_SIZE
        time.sleep(0.25)
    return pd.DataFrame(all_rows)

In [None]:
# run
# Rebuild df with the Option 2 pipeline (target-drug rows only), then print the
# ten most frequent spellings found in the "drug" column.
df = collect(MAX_RECORDS, DRUG_TERM)
print(df["drug"].value_counts().head(10))

drug
PEMBROLIZUMAB.                   1616
PEMBROLIZUMAB 100MG/4ML MERCK       2
Name: count, dtype: int64


In [None]:
# Display the first rows of the filtered table (head() defaults to five rows).
df.head()

Unnamed: 0,patientid,age,age_unit,sex,weight,drug,drugcharacterization,indication,dose_text,route,start_date,end_date,outcome_dead,reactions,receivedate,country
0,10222779,73,801,1,78.9,PEMBROLIZUMAB.,1,METASTATIC MALIGNANT MELANOMA,UNK,65,20150406,,True,[Cardiomyopathy],20140606,DE
1,10329852,83,801,2,69.9,PEMBROLIZUMAB.,1,,"10 MG/KG, QOW",42,20140325,20140325.0,False,"[Hypothyroidism, Adrenal insufficiency]",20140721,DE
2,10329852,83,801,2,69.9,PEMBROLIZUMAB.,1,,"10 MG/KG, QOW",42,20140715,20140715.0,False,"[Hypothyroidism, Adrenal insufficiency]",20140721,DE
3,10329852,83,801,2,69.9,PEMBROLIZUMAB.,1,MALIGNANT MELANOMA,"10 MG/KG, QOW",42,20140212,20140311.0,False,"[Hypothyroidism, Adrenal insufficiency]",20140721,DE
4,10329852,83,801,2,69.9,PEMBROLIZUMAB.,1,,"10 MG/KG, QOW",42,20140423,20140423.0,False,"[Hypothyroidism, Adrenal insufficiency]",20140721,DE
