In [1]:
# =============================================================================
# 1. HEART DISEASE (Cleveland) - Already exists
# =============================================================================
# Loaded straight from the UCI archive. The file ships without a header row and
# marks missing entries with "?", so the column names are supplied explicitly
# and "?" is parsed as NaN.

import pandas as pd

HEART_DISEASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# Field names in file order; the last field (`num`) is the diagnosis target.
cols = [
    "age",  # Age in years
    "sex",  # Sex (1=male, 0=female)
    "cp",  # Chest pain type (1-4)
    "trestbps",  # Resting blood pressure (mm Hg)
    "chol",  # Serum cholesterol (mg/dl)
    "fbs",  # Fasting blood sugar > 120 mg/dl (1=true, 0=false)
    "restecg",  # Resting ECG results (0-2)
    "thalach",  # Maximum heart rate achieved
    "exang",  # Exercise induced angina (1=yes, 0=no)
    "oldpeak",  # ST depression induced by exercise
    "slope",  # Slope of peak exercise ST segment
    "ca",  # Number of major vessels (0-3) colored by fluoroscopy
    "thal",  # Thalassemia (3=normal, 6=fixed defect, 7=reversible defect)
    "num",  # Target: Heart disease diagnosis (0=no disease, 1-4=disease)
]

# Name the columns at parse time instead of relabelling the frame afterwards.
data = pd.read_csv(HEART_DISEASE_URL, header=None, names=cols, na_values="?")

print(f"Heart Disease: {data.shape[0]} rows, {data.shape[1]} columns")
data.head()


Heart Disease: 303 rows, 14 columns


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [2]:
# =============================================================================
# 2. BREAST CANCER WISCONSIN (Original)
# =============================================================================
# URL: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
# Source: University of Wisconsin Hospitals, Madison (Dr. William H. Wolberg)
# Instances: 699 | Features: 10 | Missing values: 16 (marked as "?")

BREAST_CANCER_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# Names for the raw fields, in file order (the leading id is dropped below).
breast_cancer_cols = [
    "id",  # Sample code number (id)
    "clump_thickness",  # Clump Thickness (1-10)
    "uniformity_cell_size",  # Uniformity of Cell Size (1-10)
    "uniformity_cell_shape",  # Uniformity of Cell Shape (1-10)
    "marginal_adhesion",  # Marginal Adhesion (1-10)
    "single_epithelial_size",  # Single Epithelial Cell Size (1-10)
    "bare_nuclei",  # Bare Nuclei (1-10)
    "bland_chromatin",  # Bland Chromatin (1-10)
    "normal_nucleoli",  # Normal Nucleoli (1-10)
    "mitoses",  # Mitoses (1-10)
    "class",  # Class (2=benign, 4=malignant)
]

# Headerless file with "?" parsed as NaN; the sample id is discarded right
# after loading so only the cytology scores and the class label remain.
breast_cancer = pd.read_csv(
    BREAST_CANCER_URL,
    header=None,
    names=breast_cancer_cols,
    na_values="?",
).drop(columns=["id"])

print(
    f"Breast Cancer Wisconsin: {breast_cancer.shape[0]} rows, {breast_cancer.shape[1]} columns"
)
breast_cancer.head()


Breast Cancer Wisconsin: 699 rows, 10 columns


Unnamed: 0,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2


In [3]:
# =============================================================================
# 3. PIMA INDIANS DIABETES
# =============================================================================
# Original UCI: https://archive.ics.uci.edu/dataset/34/diabetes
# Direct CSV: https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv
# Source: National Institute of Diabetes and Digestive and Kidney Diseases
# Instances: 768 | Features: 8 | All patients: Pima Indian females, age >= 21
#
# NOTE(review): the first rows show 0 for insulin and skin_thickness; these
# presumably encode missing measurements rather than real readings - confirm
# before relying on per-institution means of those columns.

PIMA_DIABETES_URL = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# Field names in file order; `outcome` (last) is the class label.
pima_cols = [
    "pregnancies",  # Number of times pregnant
    "glucose",  # Plasma glucose concentration (2hr oral glucose tolerance test)
    "blood_pressure",  # Diastolic blood pressure (mm Hg)
    "skin_thickness",  # Triceps skin fold thickness (mm)
    "insulin",  # 2-Hour serum insulin (mu U/ml)
    "bmi",  # Body mass index (weight in kg/(height in m)^2)
    "diabetes_pedigree",  # Diabetes pedigree function
    "age",  # Age (years)
    "outcome",  # Class variable (0=no diabetes, 1=diabetes)
]

# The CSV has no header row, so names are attached while parsing.
pima_diabetes = pd.read_csv(PIMA_DIABETES_URL, header=None, names=pima_cols)

print(
    f"Pima Indians Diabetes: {pima_diabetes.shape[0]} rows, {pima_diabetes.shape[1]} columns"
)
pima_diabetes.head()


Pima Indians Diabetes: 768 rows, 9 columns


Unnamed: 0,pregnancies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# =============================================================================
# 4. LIVER DISORDERS (BUPA)
# =============================================================================
# URL: https://archive.ics.uci.edu/ml/machine-learning-databases/liver-disorders/bupa.data
# Source: BUPA Medical Research Ltd.
# Instances: 345 | Features: 6 | All patients: Male

LIVER_DISORDERS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/liver-disorders/bupa.data"

# Field names in file order: five blood tests, alcohol intake, then the
# dataset's split flag.
liver_cols = [
    "mcv",  # Mean corpuscular volume
    "alkphos",  # Alkaline phosphatase
    "sgpt",  # Alanine aminotransferase (ALT)
    "sgot",  # Aspartate aminotransferase (AST)
    "gammagt",  # Gamma-glutamyl transpeptidase
    "drinks",  # Number of half-pint equivalents of alcoholic beverages drunk per day
    # NOTE(review): `selector` is a split flag coded 1/2, not a diagnosis -
    # check the UCI dataset notes before treating it as an outcome.
    "selector",  # Field used to split data into two sets (1 or 2)
]

# The file has no header row, so names are attached while parsing.
liver_disorders = pd.read_csv(LIVER_DISORDERS_URL, header=None, names=liver_cols)

print(
    f"Liver Disorders (BUPA): {liver_disorders.shape[0]} rows, {liver_disorders.shape[1]} columns"
)
liver_disorders.head()


Liver Disorders (BUPA): 345 rows, 7 columns


Unnamed: 0,mcv,alkphos,sgpt,sgot,gammagt,drinks,selector
0,85,92,45,27,31,0.0,1
1,85,64,59,32,23,0.0,2
2,86,54,33,16,54,0.0,2
3,91,78,34,24,36,0.0,2
4,87,70,12,28,10,0.0,2


In [5]:
# =============================================================================
# 5. HEPATITIS C VIRUS (HCV) DATA
# =============================================================================
# UCI: https://archive.ics.uci.edu/dataset/571/hcv-data
# Source: UCI ML Repository (Hoffmann et al.)
# Instances: 615 | Features: 12 | Contains missing values

# Newer UCI datasets are fetched through the ucimlrepo package
# pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

hcv_data = fetch_ucirepo(id=571)

# Put the feature columns and the target column side by side in one frame.
hcv = pd.concat(
    objs=[hcv_data.data.features, hcv_data.data.targets],
    axis="columns",
)

# Column names come from ucimlrepo; the recorded run of this cell returned:
# Age, Sex, ALB, ALP, AST, BIL, CHE, CHOL, CREA, CGT, PROT, ALT, Category
# (note the spelling "CGT" and that the target `Category` is the last column).
# Category holds string labels such as "0=Blood Donor"; the UCI classes are
# Blood Donor, Hepatitis, Fibrosis, Cirrhosis.

print(f"HCV Data: {hcv.shape[0]} rows, {hcv.shape[1]} columns")
print(f"Columns: {list(hcv.columns)}")
hcv.head()


HCV Data: 615 rows, 13 columns
Columns: ['Age', 'Sex', 'ALB', 'ALP', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'CGT', 'PROT', 'ALT', 'Category']


Unnamed: 0,Age,Sex,ALB,ALP,AST,BIL,CHE,CHOL,CREA,CGT,PROT,ALT,Category
0,32,m,38.5,52.5,22.1,7.5,6.93,3.23,106.0,12.1,69.0,7.7,0=Blood Donor
1,32,m,38.5,70.3,24.7,3.9,11.17,4.8,74.0,15.6,76.5,18.0,0=Blood Donor
2,32,m,46.9,74.7,52.6,6.1,8.84,5.2,86.0,33.2,79.3,36.2,0=Blood Donor
3,32,m,43.2,52.0,22.6,18.9,7.33,4.74,80.0,33.8,75.7,30.6,0=Blood Donor
4,32,m,39.2,74.1,24.8,9.6,9.15,4.32,76.0,29.9,68.7,32.6,0=Blood Donor


In [6]:
# =============================================================================
# 6. EARLY STAGE DIABETES RISK PREDICTION
# =============================================================================
# UCI: https://archive.ics.uci.edu/dataset/529/early-stage-diabetes-risk-prediction
# Source: Sylhet Diabetes Hospital, Bangladesh
# Instances: 520 | Features: 16 | No missing values

# Fetched through ucimlrepo (fetch_ucirepo is imported in the HCV cell above).
diabetes_risk = fetch_ucirepo(id=529)

# Put the feature columns and the target column side by side in one frame.
early_diabetes = pd.concat(
    objs=[diabetes_risk.data.features, diabetes_risk.data.targets],
    axis="columns",
)

# Columns (from UCI documentation):
# age, gender, polyuria, polydipsia, sudden_weight_loss, weakness,
# polyphagia, genital_thrush, visual_blurring, itching, irritability,
# delayed_healing, partial_paresis, muscle_stiffness, alopecia, obesity, class
# The target `class` holds the strings "Positive" / "Negative".

print(
    f"Early Stage Diabetes: {early_diabetes.shape[0]} rows, {early_diabetes.shape[1]} columns"
)
print(f"Columns: {list(early_diabetes.columns)}")
early_diabetes.head()


Early Stage Diabetes: 520 rows, 17 columns
Columns: ['age', 'gender', 'polyuria', 'polydipsia', 'sudden_weight_loss', 'weakness', 'polyphagia', 'genital_thrush', 'visual_blurring', 'itching', 'irritability', 'delayed_healing', 'partial_paresis', 'muscle_stiffness', 'alopecia', 'obesity', 'class']


Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [7]:
# =============================================================================
# AGGREGATE TO INSTITUTION LEVEL AND SAVE
# =============================================================================
# This transforms patient-level data into institution-level data suitable for
# benchmarking research. Each institution becomes one row with aggregate stats.
#
# The cell is safe to re-run: target recoding is skipped for columns that are
# already 0/1, and the output directory is created when it does not exist.

import os

from src.data import aggregate_to_institutions

# Number of synthetic institutions to create
N_INSTITUTIONS = 30  # More institutions = better for clustering experiments

# Seed forwarded to aggregate_to_institutions for every dataset
SEED = 42

# Directory (relative to the working directory) receiving one CSV per dataset
OUTPUT_DIR = "data"

# Split strategies: random, stratified, clustered
SPLIT_STRATEGY = "random"
# Optional overrides for stratified/clustered strategies
STRATIFY_COL = None  # Defaults to target_col when using stratified
CLUSTER_COL = None  # Use schema-specific values when using clustered


def _binarize_target(frame, column, is_positive):
    """Recode ``frame[column]`` to 0/1 in place, at most once.

    Args:
        frame: DataFrame whose target column is rewritten in place.
        column: Name of the target column.
        is_positive: Callable taking the raw column (a Series) and returning a
            boolean (or 0/1) Series marking the positive class.

    A column that already contains only 0/1 is left untouched. Without this
    guard, re-running the cell would apply the raw-code test (e.g. ``== 4`` or
    ``== "Positive"``) to already recoded values and collapse the target to a
    single class.
    """
    values = frame[column]
    if values.isin([0, 1]).all():
        return
    frame[column] = is_positive(values).astype(int)


# Preprocess targets that need binarization. The shared frames are updated in
# place, so the schema entries below see the recoded columns.
# Heart disease: 0 = no disease, 1-4 = disease -> binarize to 0/1
_binarize_target(data, "num", lambda s: s > 0)

# Breast cancer: 2 = benign, 4 = malignant -> map to 0/1
_binarize_target(breast_cancer, "class", lambda s: s == 4)

# Liver disorders: selector is coded 1/2 -> map to 0/1 (1 = selector value 2)
# so its aggregate is a proportion like the other targets. Presumably
# target_rate is the mean of the target column - TODO confirm in src.data.
# NOTE(review): selector is the dataset's split flag, not a diagnosis; confirm
# it is really the intended target (a threshold on `drinks` may be meant).
_binarize_target(liver_disorders, "selector", lambda s: s == 2)

# HCV: Convert Category to numeric (is_diseased: 0=Blood Donor, 1=any disease).
# Any label containing "Blood Donor" counts as 0.
_binarize_target(
    hcv,
    "Category",
    lambda s: s.apply(lambda x: 0 if "Blood Donor" in str(x) else 1),
)

# Early diabetes: Positive/Negative -> 1/0
_binarize_target(early_diabetes, "class", lambda s: s == "Positive")

# Define schema for each dataset: (numeric_cols, categorical_cols, target_col)
dataset_schemas = {
    "heart_disease": {
        "df": data,
        "numeric": ["age", "trestbps", "chol", "thalach", "oldpeak"],
        "categorical": ["sex", "cp", "fbs", "restecg", "exang", "slope"],
        "target": "num",  # Binarized above: 0 = no disease, 1 = disease
        "cluster_col": "age",
    },
    "breast_cancer": {
        "df": breast_cancer,
        "numeric": [
            "clump_thickness",
            "uniformity_cell_size",
            "uniformity_cell_shape",
            "marginal_adhesion",
            "single_epithelial_size",
            "bare_nuclei",
            "bland_chromatin",
            "normal_nucleoli",
            "mitoses",
        ],
        "categorical": [],
        "target": "class",  # Will become malignancy_rate (% of class=4)
        "cluster_col": "clump_thickness",
    },
    "pima_diabetes": {
        "df": pima_diabetes,
        "numeric": [
            "pregnancies",
            "glucose",
            "blood_pressure",
            "skin_thickness",
            "insulin",
            "bmi",
            "diabetes_pedigree",
            "age",
        ],
        "categorical": [],
        "target": "outcome",  # diabetes_rate (% with diabetes)
        "cluster_col": "glucose",
    },
    "liver_disorders": {
        "df": liver_disorders,
        "numeric": ["mcv", "alkphos", "sgpt", "sgot", "gammagt", "drinks"],
        "categorical": [],
        "target": "selector",  # Recoded above to 0/1; treated as prevalence
        "cluster_col": "mcv",
    },
    "hcv": {
        "df": hcv,
        "numeric": [
            "Age",
            "ALB",
            "ALP",
            "AST",
            "BIL",
            "CHE",
            "CHOL",
            "CREA",
            "CGT",
            "PROT",
            "ALT",
        ],
        "categorical": ["Sex"],
        "target": "Category",  # Binarized above
        "cluster_col": "Age",
    },
    "early_diabetes": {
        "df": early_diabetes,
        "numeric": ["age"],
        "categorical": [
            "gender",
            "polyuria",
            "polydipsia",
            "sudden_weight_loss",
            "weakness",
            "polyphagia",
            "genital_thrush",
            "visual_blurring",
            "itching",
            "irritability",
            "delayed_healing",
            "partial_paresis",
            "muscle_stiffness",
            "alopecia",
            "obesity",
        ],
        "target": "class",  # Binarized above
        "cluster_col": "age",
    },
}

# to_csv does not create missing parent directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("=" * 60)
print(f"Creating {N_INSTITUTIONS} synthetic institutions per dataset")
print("=" * 60)

for name, schema in dataset_schemas.items():
    # Work on a copy so the aggregation helper cannot alter the shared frame.
    df = schema["df"].copy()

    # Aggregate to institution level
    inst_df = aggregate_to_institutions(
        df=df,
        n_institutions=N_INSTITUTIONS,
        numeric_cols=schema["numeric"],
        categorical_cols=schema["categorical"],
        target_col=schema["target"],
        seed=SEED,
        split_strategy=SPLIT_STRATEGY,
        stratify_col=schema.get("stratify_col", STRATIFY_COL),
        cluster_col=schema.get("cluster_col", CLUSTER_COL),
    )

    # Save to CSV
    inst_df.to_csv(f"{OUTPUT_DIR}/{name}.csv", index=False)

    print(f"\n{name}:")
    print(f"  Patients: {len(df)} -> Institutions: {len(inst_df)}")
    print(f"  Columns: {list(inst_df.columns)}")
    print(
        f"  Target rate range: {inst_df['target_rate'].min():.2f} - {inst_df['target_rate'].max():.2f}"
    )


Creating 30 synthetic institutions per dataset

heart_disease:
  Patients: 303 -> Institutions: 30
  Columns: ['institution_id', 'n_patients', 'age_mean', 'age_std', 'trestbps_mean', 'trestbps_std', 'chol_mean', 'chol_std', 'thalach_mean', 'thalach_std', 'oldpeak_mean', 'oldpeak_std', 'sex_mode', 'cp_mode', 'fbs_mode', 'restecg_mode', 'exang_mode', 'slope_mode', 'target_rate', 'target_std']
  Target rate range: 0.18 - 0.70

breast_cancer:
  Patients: 699 -> Institutions: 30
  Columns: ['institution_id', 'n_patients', 'clump_thickness_mean', 'clump_thickness_std', 'uniformity_cell_size_mean', 'uniformity_cell_size_std', 'uniformity_cell_shape_mean', 'uniformity_cell_shape_std', 'marginal_adhesion_mean', 'marginal_adhesion_std', 'single_epithelial_size_mean', 'single_epithelial_size_std', 'bare_nuclei_mean', 'bare_nuclei_std', 'bland_chromatin_mean', 'bland_chromatin_std', 'normal_nucleoli_mean', 'normal_nucleoli_std', 'mitoses_mean', 'mitoses_std', 'target_rate', 'target_std']
  Target 