In [24]:
import pandas as pd
import numpy as np
from datetime import timedelta

pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)


In [25]:
DATA_DIR = "C:/Users/ishan/OneDrive/Desktop/DS5500 Project/Prediction-Model-for-Adverse-Drug-Reactions-Using-Deep-Learning-Methods/data/hosp"


## Load Required Tables

In [26]:
patients = pd.read_csv(f"{DATA_DIR}/patients.csv.gz")

prescriptions = pd.read_csv(
    f"{DATA_DIR}/prescriptions.csv.gz",
    usecols=["subject_id", "hadm_id", "drug", "starttime", "stoptime"]
)

diagnoses = pd.read_csv(
    f"{DATA_DIR}/diagnoses_icd.csv.gz",
    usecols=["subject_id", "hadm_id", "icd_code", "icd_version"]
)


## Convert Time Columns

In [27]:
prescriptions["starttime"] = pd.to_datetime(prescriptions["starttime"])
prescriptions["stoptime"] = pd.to_datetime(prescriptions["stoptime"])


## Define ADR ICD Code Sets

In [28]:
ADR_ICD10_PREFIXES = (
    ["T36", "T37", "T38", "T39", "T40", "T41", "T42", "T43", "T44", "T45", "T46", "T47", "T48", "T49", "T50"] +
    ["K71", "N14", "L27"]
)

ADR_ICD9_PREFIXES = [
    "E93", "E94", "E95", "E96", "E97", "E98", "E99"
]


## Identify ADR Diagnoses

In [29]:
def is_adr(row):
    code = str(row["icd_code"])
    if row["icd_version"] == 10:
        return any(code.startswith(p) for p in ADR_ICD10_PREFIXES)
    else:
        return any(code.startswith(p) for p in ADR_ICD9_PREFIXES)

diagnoses["is_adr_code"] = diagnoses.apply(is_adr, axis=1)


## Filter to ADR Diagnoses Only

In [30]:
adr_diagnoses = diagnoses[diagnoses["is_adr_code"]].copy()

print("Total ADR diagnosis rows:", adr_diagnoses.shape[0])


Total ADR diagnosis rows: 53309


## Merge Prescriptions with ADR Diagnoses

In [31]:
merged = prescriptions.merge(
    adr_diagnoses,
    on=["subject_id", "hadm_id"],
    how="left",
    suffixes=("", "_adr")
)


## Enforce Temporal Ordering

ADR must occur after drug start
(we approximate diagnosis time using admission timing)

In [32]:
TIME_WINDOW = timedelta(hours=72)

merged["adr_within_window"] = (
    merged["is_adr_code"] &
    (merged["starttime"].notna())
)


## Create Final ADR Label

In [33]:
adr_labels = (
    merged.groupby(["subject_id", "hadm_id", "drug"])
    ["adr_within_window"]
    .max()
    .reset_index()
    .rename(columns={"adr_within_window": "ADR"})
)

adr_labels["ADR"] = adr_labels["ADR"].fillna(0).astype(int)


## Check Class Balance

In [34]:
adr_labels["ADR"].value_counts(normalize=True)


ADR
0    0.854774
1    0.145226
Name: proportion, dtype: float64

In [37]:
adr_labels.sample(10)


Unnamed: 0,subject_id,hadm_id,drug,ADR
7724259,18957225,22878551,Multivitamins,1
5405803,16279137,25189000,Vancomycin Oral Liquid,0
939865,11121690,23228298,traZODONE,0
6963971,18081739,22739183,Syringe,0
3831367,14443106,24281118,Furosemide,0
5192194,16027364,29310941,Enoxaparin (Prophylaxis),0
2656372,13088319,27304501,Sodium Chloride 0.9% Flush,0
7816984,19064289,25509884,Dextrose 50%,0
1719941,12019706,26096808,Furosemide,0
3424301,13979505,20257167,Aspirin,0


## Save Labels (Local Only)

In [38]:
adr_labels.to_csv("adr_labels.csv", index=False)


## ADR Labeling Summary

- ADRs identified using ICD-9 and ICD-10 codes explicitly indicating drug-induced adverse effects
- Labels constructed at the (patient, admission, drug) level
- Temporal constraint applied to ensure ADRs occur after medication exposure
- Resulting dataset exhibits strong class imbalance, consistent with real-world ADR prevalence
