Generate synthetic patient data

In [0]:
# Generate patient data

import pandas as pd
import numpy as np
from faker import Faker

# Seed BOTH random sources. Faker keeps its own generator, so seeding NumPy
# alone leaves the Faker-produced values (enrollment dates here, and anything
# else drawn from `fake`) different on every run.
SEED = 42
fake = Faker()
Faker.seed(SEED)
np.random.seed(SEED)

N_PATIENTS = 10000

# -------------------------
# 1. PATIENTS (MESSY)
# -------------------------
# Build one record per patient. Several fields are dirty on purpose so that
# downstream cleaning has something to fix:
#   * sex        - mixed labels/casing ("M", "male", "Female") and blanks
#   * age        - 90% plausible (40-84), otherwise -5, 150 or missing
#   * ethnicity  - may be missing (None)
#   * status     - same value in two casings ("Withdrawn" / "WITHDRAWN")
patients = []
for i in range(N_PATIENTS):
    p = {
        "patient_id": f"P{i+1:05d}",
        "trial_id": "CT-ONC-001",
        # randint's upper bound is exclusive, so sites run SITE01..SITE30.
        "site_id": f"SITE{np.random.randint(1,31):02d}",
        "sex": np.random.choice(["M", "F", "male", "Female", ""]),
        "age": np.random.choice(
            [np.random.randint(40, 85), -5, 150, None],
            p=[0.9, 0.03, 0.03, 0.04]
        ),
        "ethnicity": np.random.choice(
            ["White", "Black", "Asian", "Hispanic", "Other", None]
        ),
        # The window ends at "today", so dates still shift with the run date
        # even when the seed is fixed.
        "enrollment_date": fake.date_between("-18M", "today"),
        "treatment_arm": np.random.choice(
            ["Drug_A", "Drug_B", "Placebo"], p=[0.4, 0.4, 0.2]
        ),
        "baseline_biomarker": round(np.random.normal(70, 20), 1),
        "status": np.random.choice(
            ["Active", "Completed", "Withdrawn", "WITHDRAWN"]
        )
    }
    patients.append(p)

df_patients = pd.DataFrame(patients)

# Inject duplicates: append a 10% sample of the rows back onto the frame, so
# those patients appear twice. random_state pins which rows are repeated.
df_patients = pd.concat(
    [df_patients, df_patients.sample(frac=0.1, random_state=1)],
    ignore_index=True
)

df_patients.to_csv("/Volumes/workspace/clinicaltrial_data/clinicaltrial/patients/patients_messy.csv", index=False)

In [0]:
# Generate Visits Data

# -------------------------
# 2. VISITS (MESSY)
# -------------------------
# Visit labels and their sampling weights. The inconsistent "week4" spelling
# and the blank label are part of the intentional mess.
visit_types = ["Baseline", "Week_4", "Week_8", "Week_12", "Week_16", "week4", ""]
visit_type_weights = [0.7, 0.1, 0.05, 0.05, 0.05, 0.03, 0.02]

# df_patients still contains the injected duplicate rows, so a duplicated
# patient is walked twice and produces repeated visit_id values.
visits = []
patient_columns = zip(df_patients["patient_id"], df_patients["enrollment_date"])
for patient_id, enrollment_date in patient_columns:
    first_visit = pd.to_datetime(enrollment_date)
    n_visits = np.random.randint(3, 6)  # upper bound exclusive: 3 to 5 visits
    for visit_no in range(n_visits):
        # Visits are spaced 28 days apart, starting on the enrollment date.
        # The label is drawn independently of the visit number.
        visit_record = {
            "visit_id": f"V{patient_id}_{visit_no}",
            "patient_id": patient_id,
            "visit_date": first_visit + pd.Timedelta(days=28 * visit_no),
            "visit_type": np.random.choice(visit_types, p=visit_type_weights),
        }
        visits.append(visit_record)

df_visits = pd.DataFrame(visits)
df_visits.to_csv("/Volumes/workspace/clinicaltrial_data/clinicaltrial/visits/visits_messy.csv", index=False)

In [0]:
# Generate LABS data

# -------------------------
# 3. LABS (MESSY)
# -------------------------
# Two liver-enzyme results (ALT and AST) are emitted for every visit row.
labs = []
visit_columns = zip(df_visits["patient_id"], df_visits["visit_date"])
for patient_id, visit_date in visit_columns:
    for test in ("ALT", "AST"):
        lab_id = fake.uuid4()
        # Test name is randomly upper- or lower-cased (inconsistent casing).
        test_label = np.random.choice([test, test.lower()])
        # 90% of results are a plausible reading; the rest are junk values.
        # NOTE(review): mixing numbers with "ERROR" likely makes NumPy coerce
        # every candidate to a string before sampling - confirm if the
        # in-memory dtype of lab_value matters.
        plausible_reading = round(np.random.normal(40, 15), 1)
        lab_value = np.random.choice(
            [plausible_reading, "ERROR", 9999, -100],
            p=[0.9, 0.03, 0.04, 0.03],
        )
        labs.append({
            "lab_id": lab_id,
            "patient_id": patient_id,
            "visit_date": visit_date,
            "lab_test": test_label,
            "lab_value": lab_value,
            "unit": "U/L",
            "normal_low": 10,
            "normal_high": 40,
        })

df_labs = pd.DataFrame(labs)
df_labs.to_csv("/Volumes/workspace/clinicaltrial_data/clinicaltrial/labs/labs_messy.csv", index=False)

In [0]:
# Generate Adverse Events data

# -------------------------
# 4. ADVERSE EVENTS (MESSY)
# -------------------------
ae_terms = ["Nausea", "Fatigue", "Headache", "Elevated ALT", "Rash"]

# A fixed 25% sample of patient rows gets between 1 and 3 adverse events each.
ae_cohort = df_patients.sample(frac=0.25, random_state=2)

aes = []
for patient_id, enrollment_date in zip(ae_cohort["patient_id"], ae_cohort["enrollment_date"]):
    event_count = np.random.randint(1, 4)
    enrolled_on = pd.to_datetime(enrollment_date)
    for _ in range(event_count):
        onset = enrolled_on + pd.Timedelta(days=np.random.randint(10, 120))
        # Duration may be negative (-5..19 days), so some events end before
        # they start - deliberately broken dates.
        resolution = onset + pd.Timedelta(days=np.random.randint(-5, 20))
        event = {
            "ae_id": fake.uuid4(),
            "patient_id": patient_id,
            "ae_term": np.random.choice(ae_terms),
            "ae_start_date": onset,
            "ae_end_date": resolution,
            # 0 and 6 are sampled alongside grades 1-5.
            "severity": np.random.choice([1, 2, 3, 4, 5, 0, 6]),
            "serious": np.random.choice([True, False]),
            "related_to_drug": np.random.choice([True, False]),
        }
        aes.append(event)

df_ae = pd.DataFrame(aes)
df_ae.to_csv("/Volumes/workspace/clinicaltrial_data/clinicaltrial/adverse_events/adverse_events_messy.csv", index=False)

In [0]:
# Generate Drug dosing data

# -------------------------
# 5. DRUG DOSING (MESSY)
# -------------------------
# Placebo patients receive no dosing records, so drop them up front.
treated_patients = df_patients[df_patients["treatment_arm"] != "Placebo"]

doses = []
treated_columns = zip(
    treated_patients["patient_id"],
    treated_patients["treatment_arm"],
    treated_patients["enrollment_date"],
)
for patient_id, arm, enrollment_date in treated_columns:
    first_dose_day = pd.to_datetime(enrollment_date)
    # 16 weekly doses, the first one on the enrollment date.
    for week in range(16):
        dose_record = {
            "dose_id": fake.uuid4(),
            "patient_id": patient_id,
            "dose_date": first_dose_day + pd.Timedelta(days=7 * week),
            "drug": arm,
            # Mostly 100 mg; 0 and the negative -50 are injected bad values.
            "dose_mg": np.random.choice([100, 50, 0, -50], p=[0.75, 0.1, 0.1, 0.05]),
            # Status labels are inconsistently cased on purpose.
            "dose_status": np.random.choice(["Taken", "taken", "MISSED", "unknown"]),
        }
        doses.append(dose_record)

df_dosing = pd.DataFrame(doses)
df_dosing.to_csv("/Volumes/workspace/clinicaltrial_data/clinicaltrial/drug_dosing/drug_dosing_messy.csv", index=False)

In [0]:
# Generate Outcomes

# -------------------------
# 6. OUTCOMES (MESSY)
# -------------------------
# Response codes to sample from; "UNK" and the blank entry are the messy ones.
responses = ["PD", "SD", "PR", "CR", "UNK", ""]

# One outcome row per row of df_patients (duplicated patient rows included).
outcomes = []
for patient_id, enrollment_date in zip(df_patients["patient_id"], df_patients["enrollment_date"]):
    enrolled_on = pd.to_datetime(enrollment_date)
    best_response = np.random.choice(responses)
    progressed_on = enrolled_on + pd.Timedelta(days=np.random.randint(30, 300))

    # About 20% of rows get a death date, placed 30-199 days after progression.
    if np.random.rand() < 0.2:
        died_on = progressed_on + pd.Timedelta(days=np.random.randint(30, 200))
    else:
        died_on = None

    responded_on = enrolled_on + pd.Timedelta(days=np.random.randint(30, 120))
    # About 10% of rows have the progression date blanked out, independently
    # of whether a death date was derived from it.
    hide_progression = np.random.rand() < 0.1

    outcomes.append({
        "patient_id": patient_id,
        "best_response": best_response,
        "response_date": responded_on,
        "progression_date": None if hide_progression else progressed_on,
        "death_date": died_on,
        "censored": died_on is None,
    })

df_outcomes = pd.DataFrame(outcomes)
df_outcomes.to_csv("/Volumes/workspace/clinicaltrial_data/clinicaltrial/outcomes/outcomes_messy.csv", index=False)

print("Messy clinical trial CSVs generated (10k+ patients).")
