In [7]:
import pandas as pd
import numpy as np
from scipy.stats import skewnorm, multinomial

np.random.seed(28)

# ================================================================
# 1. Realistic Patient Characteristics (Integer Values Only)
# ================================================================
num_patients = 400

# Generate correlated baseline symptoms using integer-valued multivariate normal
cov_matrix = [[1.0, 0.6, 0.5],
              [0.6, 1.0, 0.4],
              [0.5, 0.4, 1.0]]

# Generate and discretize baseline symptoms
baseline_symptoms = np.random.multivariate_normal(
    mean=[4.5, 5.0, 3.8],
    cov=cov_matrix,
    size=num_patients
)
baseline_symptoms = np.clip(np.round(baseline_symptoms), 1, 9).astype(int)

# ================================================================
# 2. Treatment Assignment Model (Integer-Compatible)
# ================================================================
def logistic_prob(symptoms):
    """Integer-friendly treatment probability calculation"""
    linear = 0.3*symptoms[:,0] + 0.4*symptoms[:,1] + 0.2*symptoms[:,2] - 6.5
    return 1 / (1 + np.exp(-linear))

treatment_probs = logistic_prob(baseline_symptoms)
treatment_status = np.random.binomial(1, treatment_probs)

# ================================================================
# 3. Treatment Time Model (Integer Months Only)
# ================================================================
treatment_time = np.full(num_patients, -1, dtype=int)  # -1 indicates untreated
treated_idx = np.where(treatment_status == 1)[0]

# Generate treatment times (0=immediate, 3=3 months, 6=6 months)
time_probs = multinomial.rvs(1, [0.4, 0.4, 0.2], size=len(treated_idx))
treatment_time[treated_idx] = np.argmax(time_probs, axis=1) * 3

# ================================================================
# 4. Post-Treatment Outcomes (Integer Values Only)
# ================================================================
def generate_post_treatment(base_value, treatment_effect):
    """Integer outcome generator with diminishing returns"""
    effect = np.round(treatment_effect * (1 - np.random.beta(2, 5)))
    return np.clip(base_value - effect + np.random.randint(-1, 2), 1, 9)

# Initialize outcome columns with integers
outcome_columns = ['Treatment Pain', 'Treatment Urgency', 'Treatment Frequency',
                   'Treatment Pain (3 mos)', 'Treatment Pain (6 mos)',
                   'Treatment Urgency (3 mos)', 'Treatment Urgency (6 mos)',
                   'Treatment Frequency (3 mos)', 'Treatment Frequency (6 mos)']

outcomes = pd.DataFrame(-1, columns=outcome_columns, index=range(num_patients), dtype=int)

for idx in range(num_patients):
    if treatment_status[idx] == 1:
        # Immediate treatment effect
        outcomes.loc[idx, 'Treatment Pain'] = generate_post_treatment(baseline_symptoms[idx, 0], 2)
        outcomes.loc[idx, 'Treatment Urgency'] = generate_post_treatment(baseline_symptoms[idx, 1], 2)
        outcomes.loc[idx, 'Treatment Frequency'] = generate_post_treatment(baseline_symptoms[idx, 2], 1)
        
        # Time-dependent outcomes
        t_time = treatment_time[idx]
        if t_time <= 3:
            outcomes.loc[idx, 'Treatment Pain (3 mos)'] = generate_post_treatment(
                outcomes.loc[idx, 'Treatment Pain'], 1
            )
            outcomes.loc[idx, 'Treatment Urgency (3 mos)'] = generate_post_treatment(
                outcomes.loc[idx, 'Treatment Urgency'], 1
            )
            outcomes.loc[idx, 'Treatment Frequency (3 mos)'] = generate_post_treatment(
                outcomes.loc[idx, 'Treatment Frequency'], 1
            )
            
        if t_time <= 6:
            outcomes.loc[idx, 'Treatment Pain (6 mos)'] = generate_post_treatment(
                outcomes.loc[idx, 'Treatment Pain (3 mos)'], 1
            )
            outcomes.loc[idx, 'Treatment Urgency (6 mos)'] = generate_post_treatment(
                outcomes.loc[idx, 'Treatment Urgency (3 mos)'], 1
            )
            outcomes.loc[idx, 'Treatment Frequency (6 mos)'] = generate_post_treatment(
                outcomes.loc[idx, 'Treatment Frequency (3 mos)'], 1
            )

# ================================================================
# 5. Control Group Outcomes (natural progression) - FIXED
# ================================================================
control_idx = np.where(treatment_status == 0)[0]

# Create mapping from column names to symptom indices
symptom_map = {
    'Treatment Pain': 0,
    'Treatment Urgency': 1,
    'Treatment Frequency': 2,
    'Treatment Pain (3 mos)': 0,
    'Treatment Urgency (3 mos)': 1,
    'Treatment Frequency (3 mos)': 2,
    'Treatment Pain (6 mos)': 0,
    'Treatment Urgency (6 mos)': 1,
    'Treatment Frequency (6 mos)': 2
}

for col in outcome_columns:
    if 'Treatment' in col:
        symptom_idx = symptom_map[col]
        base_values = baseline_symptoms[control_idx, symptom_idx]
        
        # Add integer noise (-1, 0, +1)
        outcomes.loc[control_idx, col] = np.clip(
            base_values + np.random.randint(-1, 2, len(control_idx)),
            1, 9
        )

# ================================================================
# 6. Final Dataset Assembly with Integer Handling
# ================================================================
synthetic_df = pd.DataFrame({
    'Patient ID': [f"P{str(i+1).zfill(3)}" for i in range(num_patients)],
    'Baseline Pain': baseline_symptoms[:, 0],
    'Baseline Urgency': baseline_symptoms[:, 1],
    'Baseline Frequency': baseline_symptoms[:, 2],
    'Treatment Status': treatment_status,
    'Treatment Time': treatment_time
}).join(outcomes.replace(-1, np.nan))  # Convert -1 to NaN for missing values

# Introduce missingness in follow-ups (maintain integer type)
for col in ['Treatment Pain (3 mos)', 'Treatment Pain (6 mos)',
            'Treatment Urgency (3 mos)', 'Treatment Urgency (6 mos)',
            'Treatment Frequency (3 mos)', 'Treatment Frequency (6 mos)']:
    mask = np.random.choice([True, False], size=num_patients, p=[0.1, 0.9])
    synthetic_df.loc[mask, col] = np.nan

# Save to CSV
synthetic_df.to_csv("integer_synthetic_data.csv", index=False)

# Display sample
print("Sample of synthetic dataset:")
print(synthetic_df.head(10).to_markdown(index=False))
print("\nDataset summary:")
print(synthetic_df.describe().to_markdown())

Sample of synthetic dataset:


ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.