In [1]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(30)

# Number of patients to generate
num_patients = 400

# Generate synthetic patient data from independent uniform random draws.
# NOTE: the entries below are evaluated top to bottom, which is the order in
# which the seeded generator is consumed; reordering them changes the
# generated dataset.
# NOTE: np.random.randint excludes its upper bound, so every symptom score is
# an integer from 1 to 9 inclusive.
synthetic_data = {
    "Patient ID": [f"P{str(i+1).zfill(3)}" for i in range(num_patients)],
    "Baseline Pain": np.random.randint(1, 10, num_patients),
    "Baseline Urgency": np.random.randint(1, 10, num_patients),
    "Baseline Frequency": np.random.randint(1, 10, num_patients),
    "Treatment Pain": np.random.randint(1, 10, num_patients),
    "Treatment Urgency": np.random.randint(1, 10, num_patients),
    "Treatment Frequency": np.random.randint(1, 10, num_patients),
    # 0 = control (probability 0.4), 1 = treated (probability 0.6).
    # Status 1 is the group that receives a Treatment Time below.
    "Treatment Status": np.random.choice([0, 1], num_patients, p=[0.4, 0.6]),
}

# Define possible treatment times (presumably in months -- TODO confirm)
possible_treatment_times = [0, 3, 6]

# Assign Treatment Time only to treated patients (status == 1); every other
# patient gets NaN. Exactly one random draw is made per treated patient, in
# patient order, so the seeded sequence stays reproducible.
synthetic_data["Treatment Time"] = [
    np.random.choice(possible_treatment_times) if status == 1 else np.nan
    for status in synthetic_data["Treatment Status"]
]

# Generate additional covariates for 3- and 6-month treatment times
# NaN if the treatment time doesn't reach 3 or 6 months
def generate_treatment_covariates(treatment_time, feature_name):
    """Return the follow-up scores for one patient as ``[3 mos, 6 mos]``.

    For each follow-up point (3, then 6) a random integer score from 1 to 9
    is drawn when ``treatment_time`` is not NaN and is at least that
    follow-up point; otherwise the slot is NaN.

    Args:
        treatment_time: Numeric treatment time for the patient, or NaN.
        feature_name: Label of the symptom being generated. It is not used
            by the body; it only documents the call site.

    Returns:
        A two-element list ordered ``[3-month value, 6-month value]``.
    """
    has_no_treatment_time = np.isnan(treatment_time)
    follow_up_scores = []
    for follow_up_point in (3, 6):
        if has_no_treatment_time or treatment_time < follow_up_point:
            # Follow-up point not reached (or no treatment time at all).
            follow_up_scores.append(np.nan)
        else:
            # Upper bound is exclusive, so this draws from 1..9.
            follow_up_scores.append(np.random.randint(1, 10))
    return follow_up_scores

# Add the 3- and 6-month follow-up columns for Pain, Urgency, and Frequency.
# Symptoms are processed in this fixed order and each pass walks the patients
# in order, so the seeded random draws happen in a reproducible sequence.
follow_up_columns = (
    ("Pain", "Treatment Pain (3 mos)", "Treatment Pain (6 mos)"),
    ("Urgency", "Treatment Urgency (3 mos)", "Treatment Urgency (6 mos)"),
    ("Frequency", "Treatment Frequency (3 mos)", "Treatment Frequency (6 mos)"),
)
for symptom, three_month_column, six_month_column in follow_up_columns:
    per_patient_scores = [
        generate_treatment_covariates(treatment_time, symptom)
        for treatment_time in synthetic_data["Treatment Time"]
    ]
    # Transpose the per-patient two-element results into two column tuples.
    three_month_scores, six_month_scores = zip(*per_patient_scores)
    synthetic_data[three_month_column] = three_month_scores
    synthetic_data[six_month_column] = six_month_scores

# Assemble all generated columns into a single table
synthetic_df = pd.DataFrame(synthetic_data)

# Persist the table as CSV, without the row index
synthetic_df.to_csv("synthetic_patient_data.csv", index=False)

# Show the first ten rows of the generated dataset
first_rows = synthetic_df.head(10)
print(first_rows)


  Patient ID  Baseline Pain  Baseline Urgency  Baseline Frequency  \
0       P001              6                 1                   9   
1       P002              6                 5                   5   
2       P003              5                 2                   7   
3       P004              8                 6                   5   
4       P005              3                 4                   3   
5       P006              6                 7                   6   
6       P007              2                 8                   2   
7       P008              4                 2                   4   
8       P009              8                 7                   1   
9       P010              8                 4                   6   

   Treatment Pain  Treatment Urgency  Treatment Frequency  Treatment Status  \
0               6                  1                    3                 0   
1               5                  4                    7                 0   
2  