Generate drug_dosing file

In [0]:
import pandas as pd
import numpy as np

# Generate drug_dosing.csv: one row per scheduled weekly dose for every
# patient who is not on the Placebo arm.

np.random.seed(42)  # fixed seed so re-running the cell reproduces the same file

# Column order of drug_dosing.csv. It is passed to the DataFrame constructor so
# the file still gets a header row when no dose rows are generated (empty
# patients file, or every patient on Placebo).
DOSING_COLUMNS = ["dose_id", "patient_id", "dose_date", "drug", "dose_mg", "dose_status"]

# Number of weekly doses scheduled per treated patient.
N_DOSING_WEEKS = 16


def _draw_dose():
    """Draw one compliance outcome and return ``(dose_status, dose_mg)``.

    Consumes exactly one ``np.random.rand()`` value: below 0.1 the dose is
    "Missed" (0 mg), below 0.2 it is "Reduced" (50 mg), otherwise "Taken"
    (100 mg).
    """
    rand = np.random.rand()
    if rand < 0.1:
        return "Missed", 0
    if rand < 0.2:
        return "Reduced", 50
    return "Taken", 100


# Load patients
df_patients = pd.read_csv("/Volumes/workspace/clinicaltrial_data/clinicaltrial/patients.csv", parse_dates=["enrollment_date"])

doses = []

# Iterate the three needed columns directly instead of iterrows(), which
# builds a Series object for every row.
for patient_id, drug, start_date in zip(
    df_patients["patient_id"],
    df_patients["treatment_arm"],
    df_patients["enrollment_date"],
):
    # Placebo patients receive no drug, so they get no dosing rows.
    if drug == "Placebo":
        continue

    # Weekly dosing, the first dose falling on the enrollment date.
    for week in range(N_DOSING_WEEKS):
        dose_date = start_date + pd.Timedelta(days=7 * week)
        status, dose = _draw_dose()

        doses.append({
            # patient_id is sliced from index 1 to drop its first character;
            # this assumes patient_id is a string.
            "dose_id": f"D{patient_id[1:]}_{week + 1}",
            "patient_id": patient_id,
            "dose_date": dose_date.date(),
            "drug": drug,
            "dose_mg": dose,
            "dose_status": status,
        })

df_dosing = pd.DataFrame(doses, columns=DOSING_COLUMNS)
df_dosing.to_csv("/Volumes/workspace/clinicaltrial_data/clinicaltrial/drug_dosing.csv", index=False)

print("Generated drug_dosing.csv")


Generate Outcomes file


In [0]:
import pandas as pd
import numpy as np

# Generate outcomes.csv: one row per patient with a best response, a response
# date, a progression date and an optional death date.

np.random.seed(42)  # fixed seed so re-running the cell reproduces the same file

# Column order of outcomes.csv. It is passed to the DataFrame constructor so
# the file still gets a header row when the patients file has no rows.
OUTCOME_COLUMNS = [
    "patient_id",
    "best_response",
    "response_date",
    "progression_date",
    "death_date",
    "censored",
]

df_patients = pd.read_csv("/Volumes/workspace/clinicaltrial_data/clinicaltrial/patients.csv", parse_dates=["enrollment_date"])

outcomes = []

# Probability of each best response per treatment arm; the positions line up
# with the entries of `responses` below. An arm that is not listed here raises
# KeyError inside the loop.
response_probs = {
    "Drug_A": [0.1, 0.2, 0.4, 0.3],    # PD, SD, PR, CR
    "Drug_B": [0.15, 0.25, 0.4, 0.2],
    "Placebo": [0.4, 0.4, 0.15, 0.05]
}

responses = ["PD", "SD", "PR", "CR"]

# Iterate the three needed columns directly instead of iterrows(), which
# builds a Series object for every row.
for patient_id, arm, enroll in zip(
    df_patients["patient_id"],
    df_patients["treatment_arm"],
    df_patients["enrollment_date"],
):
    # The order of the random draws below (choice, randint, randint, rand,
    # optional randint) determines the seeded output; keep it when editing.

    # Best response
    best_response = np.random.choice(responses, p=response_probs[arm])
    # NOTE(review): response_date is drawn independently of progression_date,
    # so for "PD" and "SD" it can fall after the progression date. Confirm
    # whether downstream consumers need response_date <= progression_date.
    response_date = enroll + pd.Timedelta(days=np.random.randint(30, 120))

    # Progression: the day-offset window from enrollment depends on the
    # best response (upper bounds are exclusive).
    if best_response == "PD":
        progression_date = enroll + pd.Timedelta(days=np.random.randint(30, 90))
    elif best_response == "SD":
        progression_date = enroll + pd.Timedelta(days=np.random.randint(90, 180))
    else:
        progression_date = enroll + pd.Timedelta(days=np.random.randint(180, 360))

    # Survival: death probability is 0.25 on Placebo and 0.15 on any other arm.
    death_probability = 0.25 if arm == "Placebo" else 0.15
    death_flag = np.random.rand() < death_probability

    if death_flag:
        # Death is placed 30-179 days after the progression date.
        death_date = (progression_date + pd.Timedelta(days=np.random.randint(30, 180))).date()
        censored = False
    else:
        # No death generated: the date stays empty in the CSV and the
        # patient is flagged as censored.
        death_date = None
        censored = True

    outcomes.append({
        "patient_id": patient_id,
        "best_response": best_response,
        "response_date": response_date.date(),
        "progression_date": progression_date.date(),
        "death_date": death_date,
        "censored": censored,
    })

df_outcomes = pd.DataFrame(outcomes, columns=OUTCOME_COLUMNS)
df_outcomes.to_csv("/Volumes/workspace/clinicaltrial_data/clinicaltrial/outcomes.csv", index=False)

print("Generated outcomes.csv")
