In [32]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

def generate_synthetic_ehr_realistic(
    n_patients=5000,
    seed=42,
    start_date=datetime(2021,1,1),
    target_short_gap_base=0.04,   # baseline prob of short gap
    target_short_gap_slope=0.055, # how strongly risk pushes toward short gap
):
    """
    Generate a linked set of synthetic EHR tables (patients, admissions, labs,
    medications, 30-day readmission labels, diagnosis lookup).

    Parameters:
      n_patients: number of patients to simulate; each gets at least one admission.
      seed: seeds both a local numpy Generator and the *global* ``random`` module
            (``random.seed(seed)``), so calling this function resets the global
            ``random`` state as a side effect.
      start_date: datetime from which each patient's timeline is offset.
      target_short_gap_base: intercept of the linear risk → short-gap probability map.
      target_short_gap_slope: slope of the linear risk → short-gap probability map.

    Returns (a 6-tuple of DataFrames, in this order):
      patients_df: patient_id, age, sex, chronic_conditions
      admissions_df: admission_id, patient_id, admission_date, discharge_date,
                     primary_diagnosis, admission_type, los, risk_score,
                     diagnosis_id, readmitted_within_30d
                     (sorted by patient_id, admission_date)
      labs_df: lab_id, admission_id, lab_test, lab_value, lab_timestamp
      meds_df: med_id, admission_id, drug, dose, start_date, end_date
      readmissions_df: admission_id, readmitted_within_30d
      diagnosis_df: diagnosis_id, primary_diagnosis (distinct pairs seen in admissions_df)

    Note on label semantics: the short/long gap drawn for an admission is the gap
    between the *previous* discharge and *this* admission's start. The
    ``readmitted_within_30d`` flag of an admission is computed from the gap to the
    patient's next admission, so it is driven by the draw made while generating
    that next admission.
    """

    rng = np.random.default_rng(seed)
    random.seed(seed)

    # --------------------------
    # Reference vocabularies
    # --------------------------
    chronic_pool = ["CHF","COPD","Diabetes","CKD","Hypertension","Asthma","Cancer","Stroke"]
    dx_all = ["CHF","COPD","Pneumonia","Sepsis","MI","GI Bleed","Stroke","CKD","UTI","Elective Surgery","Minor Trauma"]
    adm_types = ["Emergency","Urgent","Elective"]

    # Labs emitted for every admission (values shift by dx/severity).
    # LAB_BASE maps lab name -> (mean, standard deviation) of the baseline normal draw.
    LABS = ["BNP","Creatinine","WBC","CRP","Hemoglobin"]
    LAB_BASE = {
        "BNP":        (100, 50),
        "Creatinine": (1.0, 0.3),
        "WBC":        (7.0, 2.0),
        "CRP":        (4.0, 3.0),
        "Hemoglobin": (13.5, 1.8),
    }

    # Medication choices keyed by diagnosis.
    # Diagnoses without an entry (e.g. "Cancer") fall back to ["Paracetamol"] in simulate_meds.
    DRUGS = {
        "CHF": ["Furosemide","Lisinopril","Metoprolol","Spironolactone"],
        "COPD": ["Albuterol","Prednisone","Tiotropium"],
        "Pneumonia": ["Ceftriaxone","Azithromycin","Amoxicillin"],
        "Sepsis": ["Piperacillin/Tazobactam","Vancomycin","Meropenem"],
        "MI": ["Aspirin","Atorvastatin","Clopidogrel","Metoprolol"],
        "GI Bleed": ["Pantoprazole","Octreotide"],
        "Stroke": ["Aspirin","Atorvastatin"],
        "CKD": ["Epoetin","Calcium acetate","Lisinopril"],
        "UTI": ["Ciprofloxacin","Nitrofurantoin"],
        "Elective Surgery": ["Paracetamol","Ibuprofen"],
        "Minor Trauma": ["Paracetamol","Ibuprofen"]
    }

    # Diagnosis continuity map for short-gap (likely-related) readmissions.
    # "Cancer" (under "GI Bleed") is not in dx_all, so it can only ever appear
    # as a follow-up diagnosis, never as a fresh one.
    RELATED_DX = {
        "CHF": ["CHF","Pneumonia","CKD"],
        "COPD": ["COPD","Pneumonia"],
        "Pneumonia": ["Pneumonia","COPD","CHF"],
        "Sepsis": ["Sepsis","Pneumonia","CKD"],
        "MI": ["MI","CHF","Stroke"],
        "GI Bleed": ["GI Bleed","Cancer"],
        "Stroke": ["Stroke","MI"],
        "CKD": ["CKD","CHF"],
        "UTI": ["UTI","Sepsis"],
        "Elective Surgery": ["Elective Surgery"],
        "Minor Trauma": ["Minor Trauma"],
    }

    # --------------------------
    # Helper functions
    # --------------------------
    def choose_initial_dx(rng):
        """Draw a diagnosis from dx_all using fixed, normalized weights."""
        # weights are positional: one entry per element of dx_all
        weights = np.array([0.12,0.10,0.12,0.08,0.08,0.06,0.07,0.07,0.10,0.12,0.08], dtype=float)
        weights /= weights.sum()
        return rng.choice(dx_all, p=weights)

    def choose_next_dx(prev_dx, risk, short_gap):
        """Pick a diagnosis given a reference diagnosis, a risk score and the short-gap flag."""
        if short_gap and prev_dx in RELATED_DX:
            # Heavily favor same/related diagnoses on short-gap readmissions
            # (prev_dx is added twice more on top of its entry in RELATED_DX).
            candidates = RELATED_DX[prev_dx] + [prev_dx]*2
            return random.choice(candidates)
        # Otherwise mild continuity bias; its probability grows with risk, capped at 0.6
        if prev_dx in RELATED_DX and random.random() < min(0.25 + 0.05*risk, 0.6):
            return random.choice(RELATED_DX[prev_dx] + [prev_dx])
        # Fresh unrelated dx
        return choose_initial_dx(rng)

    def choose_adm_type(prev_type, risk, short_gap):
        """Pick an admission type; prev_type is accepted but not used by the current logic."""
        if short_gap or risk >= 6:
            # Early/acute readmission likely emergent/urgent
            return random.choices(["Emergency","Urgent"], weights=[0.7,0.3])[0]
        # For longer gaps, elective more plausible
        return random.choices(adm_types, weights=[0.4,0.2,0.4])[0]

    def simulate_labs(dx, severity):
        """
        Draw one value per lab in LABS from its baseline normal distribution, then
        apply diagnosis-specific shifts scaled by severity and clamp to a lower bound.
        Returns dict {lab_name: float value}.
        """
        out = {}
        for lab in LABS:
            mu, sd = LAB_BASE[lab]
            val = rng.normal(mu, sd)

            # diagnosis-specific shifts, scaled by severity
            if dx == "CHF" and lab == "BNP":
                val += 400 * (0.7 + severity) + rng.normal(0, 40*severity)
            if dx in ["Sepsis","CKD"] and lab == "Creatinine":
                val += 1.2 * (0.6 + 0.8*severity)
            if dx in ["Pneumonia","Sepsis","COPD","UTI"] and lab == "WBC":
                val += 4.0 * (0.6 + 0.8*severity)
            if dx in ["Pneumonia","Sepsis"] and lab == "CRP":
                val += 10.0 * (0.6 + 0.9*severity)
            if dx in ["GI Bleed","Cancer"] and lab == "Hemoglobin":
                val -= 2.0 * (0.6 + 0.8*severity)

            # lower-bound clamps so values never go non-physiological
            if lab == "Creatinine": val = max(0.4, val)
            if lab == "Hemoglobin": val = max(5.0, val)
            if lab == "BNP":        val = max(10.0, val)
            if lab == "WBC":        val = max(1.0, val)
            if lab == "CRP":        val = max(0.1, val)

            out[lab] = float(val)
        return out

    def simulate_meds(dx, severity):
        """Sample a list of distinct drugs for dx; higher severity allows a larger sample."""
        pool = DRUGS.get(dx, ["Paracetamol"])
        # more severe → more meds, up to len(pool)
        k = 1 + int(min(len(pool)-1, rng.integers(0, 2 + int(2*severity))))
        return random.sample(pool, k=k) if len(pool) >= k else pool

    def risk_score(age, chronic_list, dx, adm_type, labs_dict, meds_list):
        """
        Additive risk score (not a probability); it is later mapped to a
        short-gap probability by short_gap_probability. Never returns less than 0.1.
        """
        r = 0.5  # base

        # Age
        if age >= 80: r += 2.0
        elif age >= 70: r += 1.2
        elif age >= 65: r += 0.6

        # Chronic conditions
        chronic_set = set(chronic_list)
        if "CHF" in chronic_set: r += 1.2
        if "CKD" in chronic_set: r += 1.0
        if "COPD" in chronic_set: r += 0.8
        if "Diabetes" in chronic_set: r += 0.5
        if "Hypertension" in chronic_set: r += 0.3
        if "Cancer" in chronic_set: r += 0.7
        if "Stroke" in chronic_set: r += 0.6
        if "Asthma" in chronic_set: r += 0.3

        # Diagnosis
        if dx in ["CHF","Sepsis","Pneumonia","MI","CKD","Stroke"]: r += 1.4
        elif dx in ["GI Bleed","COPD","UTI"]:                      r += 0.9
        elif dx in ["Elective Surgery","Minor Trauma"]:            r += 0.0  # low additional risk

        # Admission type
        if adm_type == "Emergency": r += 1.0
        elif adm_type == "Urgent":  r += 0.5
        # elective adds nothing

        # Labs (abnormality increases risk); defaults make a missing lab count as normal
        if labs_dict.get("BNP", 0) > 400:          r += 1.0
        if labs_dict.get("Creatinine", 0) > 2.0:   r += 0.9
        if labs_dict.get("WBC", 0) > 12.0:         r += 0.7
        if labs_dict.get("CRP", 0) > 15.0:         r += 0.5
        if labs_dict.get("Hemoglobin", 99) < 10.0: r += 0.5

        # Medications (proxy for severity/complexity)
        if any(m in meds_list for m in ["Vancomycin","Piperacillin/Tazobactam","Meropenem"]): r += 1.1
        if "Furosemide" in meds_list: r += 0.6
        # No list in DRUGS contains "Insulin", so this branch is not reached by this generator.
        if "Insulin" in meds_list:    r += 0.3

        return max(r, 0.1)

    def short_gap_probability(risk):
        """
        Map risk score → probability of a short gap via base + slope * risk,
        clipped to [0.02, 0.65]. The defaults were chosen by the author to target
        roughly 15% short gaps overall (not verified here).
        """
        p = target_short_gap_base + target_short_gap_slope * risk
        return float(np.clip(p, 0.02, 0.65))  # cap for sanity

    # --------------------------
    # Storage
    # --------------------------
    patients = []
    admissions = []
    labs_rows = []
    meds_rows = []

    adm_id_counter = 1
    lab_id_counter = 1
    med_id_counter = 1

    # --------------------------
    # Generate patients
    # --------------------------
    for i in range(1, n_patients+1):
        patient_id = f"P{i:05d}"
        age = int(rng.integers(18, 95))
        sex = random.choice(["M","F"])
        n_chronic = int(max(0, rng.poisson(1)))
        chronic = random.sample(chronic_pool, k=min(n_chronic, len(chronic_pool)))
        patients.append((patient_id, age, sex, ", ".join(chronic)))

        # How many admissions? (always at least one)
        n_adm = int(max(1, rng.poisson(1.6) + 1))

        # Seed an initial timeline point; the first admission starts one gap after it
        last_discharge = start_date + timedelta(days=int(rng.integers(0, 365)))

        prev_dx = None
        prev_type = None

        for _ in range(n_adm):
            # Preliminary dx/type with soft continuity from the previous admission.
            prelim_dx = prev_dx if (prev_dx and random.random() < 0.35) else choose_initial_dx(rng)
            prelim_type = prev_type if (prev_type and random.random() < 0.25) else random.choice(adm_types)

            # Severity proxy to shape labs/meds (mild random + elective tends lower)
            base_sev = rng.uniform(0.2, 1.0)
            if prelim_type == "Elective":
                base_sev *= 0.6
            if prelim_dx in ["Sepsis","CHF","Pneumonia","MI","CKD"]:
                base_sev *= 1.15

            # Create labs & meds for THIS admission (based on prelim dx & severity)
            labs_dict = simulate_labs(prelim_dx, base_sev)
            meds_list = simulate_meds(prelim_dx, base_sev)

            # Compute risk from patient + this admission context
            r = risk_score(age, chronic, prelim_dx, prelim_type, labs_dict, meds_list)
            p_short = short_gap_probability(r)

            # Draw whether the gap *leading into this admission* is short
            # (it is applied to last_discharge → adm_start below).
            short_gap = (random.random() < p_short)

            # Finalize dx & adm type with stronger continuity if short_gap
            # (only when the patient already has a previous admission).
            primary_dx = choose_next_dx(prelim_dx, r, short_gap) if prev_dx else prelim_dx
            adm_type   = choose_adm_type(prelim_type, r, short_gap) if prev_type else prelim_type

            # If the dx changed, regenerate labs/meds/risk for the final dx. Note that
            # short_gap is redrawn as well, so the flag that decides the gap may differ
            # from the one used above to choose primary_dx / adm_type.
            if primary_dx != prelim_dx:
                labs_dict = simulate_labs(primary_dx, base_sev)
                meds_list = simulate_meds(primary_dx, base_sev)
                r = risk_score(age, chronic, primary_dx, adm_type, labs_dict, meds_list)
                p_short = short_gap_probability(r)
                short_gap = (random.random() < p_short)

            # Determine actual gap and admission timing
            if short_gap:
                gap_days = int(rng.integers(3, 26))     # 3–25 days → counts as a 30d readmission
            else:
                gap_days = int(rng.integers(40, 240))   # long interval (40–239 days)

            adm_start = last_discharge + timedelta(days=gap_days)

            # Length of stay: 2–6 base days, +1 for Emergency/Urgent, +0/1 random
            base_los = rng.integers(2, 7)
            los = int(max(1, base_los + (1 if adm_type in ["Emergency","Urgent"] else 0) + rng.integers(0,2)))
            adm_end = adm_start + timedelta(days=los)

            # Save admission row
            admission_id = f"A{adm_id_counter:07d}"
            admissions.append((admission_id, patient_id, adm_start, adm_end, primary_dx, adm_type, los, r))
            adm_id_counter += 1

            # Save labs (one row per lab, timestamped within the first 24h of admission)
            for lab in LABS:
                labs_rows.append((
                    f"L{lab_id_counter:07d}",
                    admission_id,
                    lab,
                    float(round(labs_dict[lab], 3)),
                    adm_start + timedelta(hours=int(rng.integers(0, 24)))
                ))
                lab_id_counter += 1

            # Save meds (start within first 24h; duration 0..max(1, los//2) days)
            for drug in meds_list:
                sd = adm_start + timedelta(hours=int(rng.integers(0, 24)))
                ed = sd + timedelta(days=int(rng.integers(0, max(1, los//2)+1)))
                meds_rows.append((
                    f"M{med_id_counter:07d}",
                    admission_id,
                    drug,
                    random.choice(["5mg","10mg","20mg","50mg"]),
                    sd,
                    ed
                ))
                med_id_counter += 1

            # Advance timeline
            last_discharge = adm_end
            prev_dx = primary_dx
            prev_type = adm_type

    # --------------------------
    # Build DataFrames
    # --------------------------
    patients_df = pd.DataFrame(patients, columns=["patient_id","age","sex","chronic_conditions"])
    admissions_df = pd.DataFrame(admissions, columns=[
        "admission_id","patient_id","admission_date","discharge_date",
        "primary_diagnosis","admission_type","los","risk_score"
    ])
    labs_df = pd.DataFrame(labs_rows, columns=[
        "lab_id","admission_id","lab_test","lab_value","lab_timestamp"
    ])
    meds_df = pd.DataFrame(meds_rows, columns=[
        "med_id","admission_id","drug","dose","start_date","end_date"
    ])

    # Every diagnosis that can become a primary_diagnosis needs an id here.
    # "Cancer" is reachable through RELATED_DX["GI Bleed"]; without an entry,
    # Series.map would leave diagnosis_id as NaN for those admissions.
    diagnosis_map = {
        "CHF": "D001",
        "COPD": "D002",
        "Pneumonia": "D003",
        "Sepsis": "D004",
        "MI": "D005",
        "GI Bleed": "D006",
        "Stroke": "D007",
        "CKD": "D008",
        "UTI": "D009",
        "Elective Surgery": "D010",
        "Minor Trauma": "D011",
        "Cancer": "D012",
    }
    admissions_df['diagnosis_id'] = admissions_df['primary_diagnosis'].map(diagnosis_map)
    diagnosis_df = admissions_df[['diagnosis_id','primary_diagnosis']].drop_duplicates().reset_index(drop=True)

    # --------------------------
    # Compute TRUE 30d readmission labels from the sequence
    # --------------------------
    # Make the datetime dtype explicit so the .dt accessor below does not depend
    # on the DataFrame constructor's type inference.
    admissions_df["admission_date"] = pd.to_datetime(admissions_df["admission_date"])
    admissions_df["discharge_date"] = pd.to_datetime(admissions_df["discharge_date"])

    admissions_df = admissions_df.sort_values(["patient_id","admission_date"]).reset_index(drop=True)
    next_adm = admissions_df.groupby("patient_id")["admission_date"].shift(-1)
    # Whole days from this discharge to the patient's next admission (NaN for the last one)
    days_to_next = (next_adm - admissions_df["discharge_date"]).dt.days
    # between() is inclusive on both ends and evaluates to False for NaN
    admissions_df["readmitted_within_30d"] = days_to_next.between(0, 30).astype(int)

    readmissions_df = admissions_df[["admission_id","readmitted_within_30d"]].copy()


    return patients_df, admissions_df, labs_df, meds_df, readmissions_df, diagnosis_df




In [33]:

# Build all six synthetic tables for a 5000-patient cohort (default seed).
(
    patients_df,
    admissions_df,
    labs_df,
    meds_df,
    readmissions_df,
    diagnosis_df,
) = generate_synthetic_ehr_realistic(n_patients=5000)


In [43]:
# Display the diagnosis lookup table (one row per distinct diagnosis_id / primary_diagnosis pair).
diagnosis_df

Unnamed: 0,diagnosis_id,primary_diagnosis
0,D011,Minor Trauma
1,D001,CHF
2,D008,CKD
3,D010,Elective Surgery
4,D006,GI Bleed
5,D005,MI
6,D007,Stroke
7,D002,COPD
8,D003,Pneumonia
9,D004,Sepsis


In [44]:
# Write each table to its own CSV in the working directory, without the index column.
_csv_exports = [
    ("patients.csv", patients_df),
    ("admissions.csv", admissions_df),
    ("labs.csv", labs_df),
    ("medications.csv", meds_df),
    ("readmissions.csv", readmissions_df),
    ("diagnosis.csv", diagnosis_df),
]
for _csv_name, _frame in _csv_exports:
    _frame.to_csv(_csv_name, index=False)