# Healthcare Synthetic Data Example

Generate privacy-safe synthetic patient records for research and development.

In [None]:
import numpy as np
import pandas as pd

from genesis import SyntheticGenerator, PrivacyConfig, QualityEvaluator, Constraint

## Create Sample Patient Records

In [None]:
# Build a reproducible, purely simulated patient cohort.
# NOTE: every draw below uses NumPy's global random stream, so the ORDER of
# the np.random.* calls determines the generated values -- keep it stable.
np.random.seed(42)
n = 2000


def _draw_diagnosis(p_diabetes, p_hypertension):
    """Sample one primary diagnosis label.

    Diabetes is tried first; the hypertension draw only happens when the
    diabetes draw fails, so diabetes takes priority when both could apply.
    """
    if np.random.random() < p_diabetes:
        return 'Diabetes'
    if np.random.random() < p_hypertension:
        return 'Hypertension'
    return 'Healthy'


# Demographics: ages ~ N(55, 18), clamped to 18-95 and truncated to ints.
raw_age = np.random.normal(55, 18, n)
age = raw_age.clip(18, 95).astype(int)
gender = np.random.choice(['Male', 'Female'], n)

# Vitals: a linear trend in age plus Gaussian noise (drawn in this order).
systolic_noise = np.random.normal(0, 15, n)
diastolic_noise = np.random.normal(0, 10, n)
heart_rate_noise = np.random.normal(0, 12, n)
systolic_bp = 110 + age * 0.5 + systolic_noise
diastolic_bp = 70 + age * 0.2 + diastolic_noise
heart_rate = 80 - age * 0.1 + heart_rate_noise

# Lab values: glucose has an exponential right tail above a floor of 90;
# cholesterol rises with age.
glucose_excess = np.random.exponential(20, n)
cholesterol_noise = np.random.normal(0, 30, n)
glucose = 90 + glucose_excess
cholesterol = 180 + age * 0.8 + cholesterol_noise

# Per-patient diagnosis probabilities: a 5% baseline plus additive bumps for
# each risk factor that is present.
diabetes_prob = (
    0.05
    + np.where(glucose > 126, 0.3, 0.0)
    + np.where(age > 60, 0.1, 0.0)
)
hypertension_prob = (
    0.05
    + np.where(systolic_bp > 140, 0.3, 0.0)
    + np.where(age > 50, 0.1, 0.0)
)

conditions = [
    _draw_diagnosis(p_diab, p_hyper)
    for p_diab, p_hyper in zip(diabetes_prob, hypertension_prob)
]

# Assemble the table; measurements are rounded to the stored precision.
patient_columns = {
    'age': age,
    'gender': gender,
    'systolic_bp': systolic_bp.round(0),
    'diastolic_bp': diastolic_bp.round(0),
    'heart_rate': heart_rate.round(0),
    'glucose': glucose.round(1),
    'cholesterol': cholesterol.round(0),
    'primary_diagnosis': conditions,
}
patients = pd.DataFrame(patient_columns)

print(f"Patient records: {len(patients)}")
patients.describe()

## Configure Privacy Settings

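Healthcare data requires strict privacy protection. The configuration below combines differential privacy (a smaller epsilon means a stronger guarantee) with k-anonymity and l-diversity settings over the quasi-identifiers `age` and `gender`, treating `primary_diagnosis` as the sensitive attribute.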

In [None]:
# Gather the privacy parameters in one mapping so each can be annotated,
# then pass them to PrivacyConfig as keyword arguments.
privacy_settings = {
    'privacy_level': 'high',
    'enable_differential_privacy': True,
    'epsilon': 0.5,  # Strong privacy guarantee
    'k_anonymity': 10,
    'l_diversity': 3,
    # Column whose values must stay protected.
    'sensitive_columns': ['primary_diagnosis'],
    # Columns that could identify a patient when combined.
    'quasi_identifiers': ['age', 'gender'],
}
privacy_config = PrivacyConfig(**privacy_settings)

# Echo the two headline parameters back from the config object.
print(
    "Privacy Configuration:",
    f"  Epsilon: {privacy_config.epsilon}",
    f"  K-Anonymity: {privacy_config.k_anonymity}",
    sep="\n",
)

## Define Medical Constraints

In [None]:
# (column, lower bound, upper bound) for the first four bounded columns.
leading_bounds = (
    ('age', 18, 100),
    ('systolic_bp', 70, 200),
    ('diastolic_bp', 40, 130),
    ('heart_rate', 40, 150),
)

constraints = [
    Constraint.range(column, lower, upper)
    for column, lower, upper in leading_bounds
]
# Glucose only gets a positivity constraint; appended here so the list keeps
# its original ordering (glucose before cholesterol).
constraints.append(Constraint.positive('glucose'))
constraints.append(Constraint.range('cholesterol', 100, 400))

print(f"Defined {len(constraints)} medical constraints")

## Generate Synthetic Patients

In [None]:
# Columns passed to the generator as discrete (non-numeric) features.
discrete_cols = ['gender', 'primary_diagnosis']

# Create the generator with the 'ctgan' method and the privacy settings
# defined earlier.
generator = SyntheticGenerator(method='ctgan', privacy=privacy_config)

# Fit on the real cohort, passing the discrete columns and the medical
# constraints.
fit_options = {
    'discrete_columns': discrete_cols,
    'constraints': constraints,
}
generator.fit(patients, **fit_options)

# Request 2000 synthetic records.
synthetic_patients = generator.generate(n_samples=2000)

print(f"Synthetic patients: {len(synthetic_patients)}")
synthetic_patients.describe()

## Quality Evaluation

In [None]:
# Evaluate the synthetic table against the real one, with the diagnosis
# column as the evaluation target.
evaluator = QualityEvaluator(patients, synthetic_patients)
report = evaluator.evaluate(target_column='primary_diagnosis')

# Render the report's text summary once, then display it.
summary_text = report.summary()
print(summary_text)

## Verify Medical Plausibility

In [None]:
# Spot-check three constrained columns by printing their observed extremes.
synthetic_age = synthetic_patients['age']
synthetic_systolic = synthetic_patients['systolic_bp']
synthetic_glucose = synthetic_patients['glucose']

print("Constraint Verification:")
print(f"  Age range: {synthetic_age.min()} - {synthetic_age.max()}")
print(f"  Systolic BP range: {synthetic_systolic.min():.0f} - {synthetic_systolic.max():.0f}")
print(f"  Glucose min: {synthetic_glucose.min():.1f}")

# Compare the diagnosis mix (as proportions) of the real and synthetic data.
print("\nDiagnosis Distribution:")
for heading, frame in (("Real:", patients), ("\nSynthetic:", synthetic_patients)):
    print(heading)
    print(frame['primary_diagnosis'].value_counts(normalize=True))

## Privacy Verification

In [None]:
from genesis.privacy.anonymity import check_k_anonymity, check_l_diversity

# Check k-anonymity on the synthetic table, using age and gender as the
# quasi-identifiers and a target of k=10.
k_check_args = {
    'quasi_identifiers': ['age', 'gender'],
    'k': 10,
}
k_result = check_k_anonymity(synthetic_patients, **k_check_args)
k_satisfied = k_result['satisfies_k']
k_achieved = k_result['achieved_k']
print(f"K-Anonymity (k=10): {k_satisfied}")
print(f"Achieved k: {k_achieved}")

# Check l-diversity of the diagnosis column over the same quasi-identifiers,
# with a target of l=3.
l_check_args = {
    'quasi_identifiers': ['age', 'gender'],
    'sensitive_column': 'primary_diagnosis',
    'l': 3,
}
l_result = check_l_diversity(synthetic_patients, **l_check_args)
l_satisfied = l_result['satisfies_l']
l_achieved = l_result['achieved_l']
print(f"\nL-Diversity (l=3): {l_satisfied}")
print(f"Achieved l: {l_achieved}")

## Export for Research

In [None]:
# Output locations, relative to the current working directory.
synthetic_csv_path = 'synthetic_patients.csv'
quality_report_path = 'patient_data_quality_report.html'

# Write the synthetic records without the DataFrame index column.
synthetic_patients.to_csv(synthetic_csv_path, index=False)

# Write the quality report produced in the evaluation step as HTML.
report.save_html(quality_report_path)

print("Files saved successfully!")