Generate synthetic patient data

In [0]:
import pandas as pd
import numpy as np
from faker import Faker

# Seed BOTH random sources. NumPy drives every categorical/numeric field,
# while Faker uses its own generator for enrollment dates; seeding only
# NumPy would leave the dates (and everything derived from them) different
# on every run.
SEED = 42
fake = Faker()
Faker.seed(SEED)
np.random.seed(SEED)

# Number of patients
N_PATIENTS = 10000

# Generate Patients Table
# One row per patient, all enrolled in the single trial "CT-ONC-001".
patients = []
treatment_arms = ["Drug_A", "Drug_B", "Placebo"]
ethnicities = ["White", "Black", "Asian", "Hispanic", "Other"]
statuses = ["Active", "Completed", "Withdrawn"]

for i in range(N_PATIENTS):
    patients.append({
        # Zero-padded, 1-based identifier: P00001, P00002, ...
        "patient_id": f"P{i+1:05d}",
        "trial_id": "CT-ONC-001",
        # randint's upper bound is exclusive, so sites are SITE01..SITE20.
        "site_id": f"SITE{np.random.randint(1,21):02d}",
        "sex": np.random.choice(["M", "F"]),
        # Ages 40..84 inclusive (upper bound exclusive).
        "age": np.random.randint(40, 85),
        "ethnicity": np.random.choice(ethnicities),
        # Window is relative to the run date: anywhere in the past year.
        "enrollment_date": fake.date_between(start_date="-1y", end_date="today"),
        # 40% Drug_A, 40% Drug_B, 20% Placebo.
        "treatment_arm": np.random.choice(treatment_arms, p=[0.4,0.4,0.2]),
        # Normal(mean=70, sd=15), rounded to one decimal place.
        "baseline_biomarker": round(np.random.normal(70, 15),1),
        # 60% Active, 30% Completed, 10% Withdrawn.
        "status": np.random.choice(statuses, p=[0.6,0.3,0.1])
    })

df_patients = pd.DataFrame(patients)


In [0]:
# Generate Visits Table (5 visits per patient)
# Visits are scheduled every 28 days, with the first one falling on the
# patient's enrollment date.
visit_types = ["Baseline", "Week_4", "Week_8", "Week_12", "Week_16"]
visits = []

for patient in df_patients.itertuples(index=False):
    enrolled_on = pd.to_datetime(patient.enrollment_date)
    # Reuse the numeric part of the patient id (drop the leading "P").
    id_stem = patient.patient_id[1:]
    visits.extend(
        {
            "visit_id": f"V{id_stem}_{seq}",
            "patient_id": patient.patient_id,
            "visit_date": (enrolled_on + pd.Timedelta(days=(seq - 1) * 28)).date(),
            "visit_type": label,
        }
        for seq, label in enumerate(visit_types, start=1)
    )

df_visits = pd.DataFrame(visits)

In [0]:
# Generate Labs Table (ALT and AST for each visit)
# Each (visit, test) pair gets one result drawn from Normal(mean=40, sd=10),
# rounded to one decimal and floored at 0 so no negative value is emitted.
lab_tests = ["ALT", "AST"]

labs = [
    {
        # Visit id without its leading "V", suffixed with the test name.
        "lab_id": f"L{visit.visit_id[1:]}_{test}",
        "patient_id": visit.patient_id,
        "visit_date": visit.visit_date,
        "lab_test": test,
        "lab_value": max(0, round(np.random.normal(40, 10), 1)),
        "unit": "U/L",
        "normal_low": 10,
        "normal_high": 40,
    }
    for visit in df_visits.itertuples(index=False)
    for test in lab_tests
]

df_labs = pd.DataFrame(labs)

In [0]:
# Generate Adverse Events Table (20% of patients)
# A fixed-seed 20% sample of patients each receives 1 to 3 events.
ae_terms = ["Nausea", "Fatigue", "Headache", "Elevated ALT", "Rash"]
aes = []

ae_patients = df_patients.sample(frac=0.2, random_state=42)
for subject in ae_patients.itertuples(index=False):
    event_count = np.random.randint(1, 4)  # 1 to 3 events
    enrolled_on = pd.to_datetime(subject.enrollment_date)
    # Numeric part of the patient id (leading "P" removed).
    id_stem = subject.patient_id[1:]
    for seq in range(1, event_count + 1):
        # Onset 14..119 days after enrollment; duration 1..14 days.
        onset = enrolled_on + pd.Timedelta(days=np.random.randint(14, 120))
        resolved = onset + pd.Timedelta(days=np.random.randint(1, 15))
        record = {
            "ae_id": f"AE{id_stem}_{seq}",
            "patient_id": subject.patient_id,
            "ae_term": np.random.choice(ae_terms),
            "ae_start_date": onset.date(),
            "ae_end_date": resolved.date(),
            # Severity 1..4 (upper bound exclusive).
            "severity": np.random.randint(1, 5),
            # 20% of events are flagged serious.
            "serious": np.random.choice([True, False], p=[0.2, 0.8]),
            "related_to_drug": np.random.choice([True, False]),
        }
        aes.append(record)

df_aes = pd.DataFrame(aes)

In [0]:
# Save CSVs
# Every table goes to the same volume directory; keeping the directory in one
# place avoids path typos such as a doubled "/" in an individual file path.
OUTPUT_DIR = "/Volumes/workspace/clinicaltrial_data/clinicaltrial"

# Output file name -> DataFrame to write (insertion order is the write order).
csv_tables = {
    "patients.csv": df_patients,
    "visits.csv": df_visits,
    "labs.csv": df_labs,
    "adverse_events.csv": df_aes,
}

for csv_name, table in csv_tables.items():
    # index=False: the row index carries no information for these tables.
    table.to_csv(f"{OUTPUT_DIR}/{csv_name}", index=False)

print(f"CSV files generated: {', '.join(csv_tables)}")
