In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)


In [2]:
DATA_DIR = "/Users/hithaishireddy/Desktop/ADR-project/Prediction-Model-for-Adverse-Drug-Reactions-Using-Deep-Learning-Methods/data/hosp"


## Load Required Tables

In [3]:
patients = pd.read_csv(f"{DATA_DIR}/patients.csv.gz")

prescriptions = pd.read_csv(
    f"{DATA_DIR}/prescriptions.csv.gz",
    usecols=["subject_id", "hadm_id", "drug", "starttime", "stoptime"]
)

diagnoses = pd.read_csv(
    f"{DATA_DIR}/diagnoses_icd.csv.gz",
    usecols=["subject_id", "hadm_id", "icd_code", "icd_version"]
)


## Convert Time Columns

In [4]:
prescriptions["starttime"] = pd.to_datetime(prescriptions["starttime"])
prescriptions["stoptime"] = pd.to_datetime(prescriptions["stoptime"])


## Define ADR ICD Code Sets

In [5]:
ADR_ICD10_PREFIXES = (
    ["T36", "T37", "T38", "T39", "T40", "T41", "T42", "T43", "T44", "T45", "T46", "T47", "T48", "T49", "T50"] +
    ["K71", "N14", "L27"]
)

ADR_ICD9_PREFIXES = [
    "E93", "E94", "E95", "E96", "E97", "E98", "E99"
]


## Identify ADR Diagnoses

In [6]:
def is_adr(row):
    code = str(row["icd_code"])
    if row["icd_version"] == 10:
        return any(code.startswith(p) for p in ADR_ICD10_PREFIXES)
    else:
        return any(code.startswith(p) for p in ADR_ICD9_PREFIXES)

diagnoses["is_adr_code"] = diagnoses.apply(is_adr, axis=1)


## Filter to ADR Diagnoses Only

In [7]:
adr_diagnoses = diagnoses[diagnoses["is_adr_code"]].copy()

print("Total ADR diagnosis rows:", adr_diagnoses.shape[0])


Total ADR diagnosis rows: 53309


## Merge Prescriptions with ADR Diagnoses

In [8]:
merged = prescriptions.merge(
    adr_diagnoses,
    on=["subject_id", "hadm_id"],
    how="left",
    suffixes=("", "_adr")
)


## Enforce Temporal Ordering

ADR must occur after drug start
(we approximate diagnosis time using admission timing)

In [9]:
TIME_WINDOW = timedelta(hours=72)

merged["adr_within_window"] = (
    merged["is_adr_code"] &
    (merged["starttime"].notna())
)


## Create Final ADR Label

In [10]:
adr_labels = (
    merged.groupby(["subject_id", "hadm_id", "drug"])
    ["adr_within_window"]
    .max()
    .reset_index()
    .rename(columns={"adr_within_window": "ADR"})
)

adr_labels["ADR"] = adr_labels["ADR"].fillna(0).astype(int)


## Check Class Balance

In [11]:
adr_labels["ADR"].value_counts(normalize=True)


0    0.854774
1    0.145226
Name: ADR, dtype: float64

In [13]:
adr_labels.sample(50)


Unnamed: 0,subject_id,hadm_id,drug,ADR
5884859,16841586,24073065,MetFORMIN (Glucophage),0
7845654,19101371,23625760,Sertraline,0
1915501,12245786,24887890,0.9% Sodium Chloride,0
4088285,14744450,22311973,Metoclopramide,0
7004483,18131057,26539259,Maalox/Diphenhydramine/Lidocaine,1
3549505,14118784,24923182,Calcium Carbonate,0
8195420,19523707,23277273,Influenza Vaccine Quadrivalent,0
6504214,17554598,28828753,Potassium Chloride (Powder),1
2794442,13247319,27152622,Diltiazem Extended-Release,0
2808461,13264660,24276152,Propofol,0


## Save Labels (Local Only)

In [14]:
adr_labels.to_csv("adr_labels.csv", index=False)


## ADR Labeling Summary

- ADRs identified using ICD-9 and ICD-10 codes explicitly indicating drug-induced adverse effects
- Labels constructed at the (patient, admission, drug) level
- Temporal constraint applied to ensure ADRs occur after medication exposure
- Resulting dataset exhibits strong class imbalance, consistent with real-world ADR prevalence
