In [None]:
# build_app_data.py
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.manifold import SpectralEmbedding  # or UMAP if you prefer

# =========================
# 0. CONFIG – EDIT THESE
# =========================

OUTPUT_JSON = Path("snf-data.json")

# TODO: point these to your actual objects / files

# 1) Patient index & metadata
#    meta must have index = patient_id, columns: "age", "sex" (0/1 or 'M'/'F'), and some risk variable.
def load_meta(
    path: "str | Path" = "/Users/harisreedeth/Desktop/D/personal/ProjectMAIP/data/01_processed/support_preprocessed_clean.csv",
    id_col: str = "eid",
) -> pd.DataFrame:
    """Load the patient metadata table and index it by patient ID.

    Args:
        path: CSV file to read. The default is the author's local processed
            file; pass your own path to run the script elsewhere.
        id_col: Name of the column holding the patient identifier. It is
            moved out of the columns and becomes the index.

    Returns:
        The CSV contents as a DataFrame indexed by ``id_col``.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        KeyError: if ``id_col`` is not a column of the CSV.
    """
    meta = pd.read_csv(path)
    return meta.set_index(id_col)

# 2) View matrices (C, P, S) – rows = patients (same index as meta)
def load_views(
    c_path: "str | Path" = "/Users/harisreedeth/Desktop/D/personal/ProjectMAIP/data/01_processed/C_view.csv",
    p_path: "str | Path" = "/Users/harisreedeth/Desktop/D/personal/ProjectMAIP/data/01_processed/P_view_scaled.csv",
    s_path: "str | Path" = "/Users/harisreedeth/Desktop/D/personal/ProjectMAIP/data/01_processed/S_view.csv",
    id_col: "str | None" = "eid",
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Load the three view matrices and restrict them to their shared rows.

    Args:
        c_path: CSV for the comorbidity view (C).
        p_path: CSV for the physiology view (P).
        s_path: CSV for the socio-contextual view (S).
        id_col: Patient-identifier column. When a CSV contains it, that
            column becomes the index, so rows are matched by patient ID and
            the ID is not treated as a feature. A CSV without the column
            keeps its positional index.

    Returns:
        ``(C, P, S)``, each reduced to the index labels present in all three
        views.
    """

    def _read_view(path) -> pd.DataFrame:
        # Index by patient ID when available; otherwise fall back to the
        # default positional index produced by read_csv.
        view = pd.read_csv(path)
        if id_col is not None and id_col in view.columns:
            view = view.set_index(id_col)
        return view

    C = _read_view(c_path)  # comorbidity
    P = _read_view(p_path)  # physiology
    S = _read_view(s_path)  # socio-contextual

    # Keep only the rows that every view has.
    common_index = C.index.intersection(P.index).intersection(S.index)
    return C.loc[common_index], P.loc[common_index], S.loc[common_index]

# 3) Fused similarity (NxN) and SNF cluster labels
def load_snf_results(
    common_ids: pd.Index,
    w_path: "str | Path" = "W_fused.npy",
    labels_path: "str | Path" = "snf_labels.npy",
) -> tuple[np.ndarray, np.ndarray]:
    """Load the fused similarity matrix and the cluster labels.

    No reordering is performed: both arrays are assumed to have been saved
    in the same patient order as ``common_ids``. If they were not, the
    caller needs a patient_id -> row mapping to reindex them.

    Args:
        common_ids: Patient IDs the arrays must line up with; only its
            length is checked here.
        w_path: ``.npy`` file holding the fused affinity matrix.
        labels_path: ``.npy`` file holding one cluster label per patient.

    Returns:
        ``(W_fused, labels)`` as loaded from disk.

    Raises:
        ValueError: if ``W_fused`` is not a square 2-D matrix, or if either
            array's length differs from ``len(common_ids)``.
    """
    W_fused = np.load(w_path)
    labels = np.load(labels_path)
    n_patients = len(common_ids)

    # Explicit raises rather than assert: asserts vanish under `python -O`.
    if W_fused.ndim != 2 or W_fused.shape[0] != W_fused.shape[1]:
        raise ValueError(f"W_fused must be a square matrix, got shape {W_fused.shape}")
    if W_fused.shape[0] != n_patients:
        raise ValueError(
            f"W_fused rows != number of patients ({W_fused.shape[0]} vs {n_patients})"
        )
    if labels.shape[0] != n_patients:
        raise ValueError(
            f"labels length != number of patients ({labels.shape[0]} vs {n_patients})"
        )
    return W_fused, labels


# =========================
# 1. HELPER FUNCTIONS
# =========================

def minmax(x: np.ndarray) -> np.ndarray:
    """Rescale ``x`` linearly so its smallest value maps to 0 and largest to 1.

    NaNs are ignored when finding the range and stay NaN in the output.

    Args:
        x: Numeric array or any array-like (list, Series, ...).

    Returns:
        A new float array with the same shape as ``x``. If all non-NaN
        values are equal (zero range) the result is all zeros; empty input
        yields an empty array.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        # nanmin/nanmax raise on zero-size input; there is nothing to scale.
        return np.zeros_like(x)
    mn = np.nanmin(x)
    mx = np.nanmax(x)
    if mx <= mn:
        # Constant input: avoid dividing by zero.
        return np.zeros_like(x)
    return (x - mn) / (mx - mn)


def make_embedding(W_fused: np.ndarray, n_components: int = 2) -> np.ndarray:
    """
    Project patients into a low-dimensional space from the fused similarity matrix.

    ``W_fused`` is passed to SpectralEmbedding as a precomputed affinity
    matrix. The random seed is fixed at 42. With the default
    ``n_components=2`` the result has one (x, y) row per patient.
    """
    embedder = SpectralEmbedding(
        affinity="precomputed",
        random_state=42,
        n_components=n_components,
    )
    coords = embedder.fit_transform(W_fused)
    return coords  # one row per patient, n_components columns


def make_view_scores(C: pd.DataFrame, P: pd.DataFrame, S: pd.DataFrame) -> np.ndarray:
    """
    Collapse each view into a single 0–1 "burden" score per patient.

    First-cut scoring (refine later as needed):
    - C: row sum of the comorbidity columns (assumed 0/1 flags or counts).
    - P: row mean of the column-wise z-scores (higher is taken as worse).
    - S: row sum of the socio-contextual columns.

    Each raw score is min-max scaled across patients. The returned array has
    one row per patient and three columns in the order [c, p, s].
    """
    # Physiology: standardise every column (population std, with a small
    # epsilon so constant columns do not divide by zero), then average.
    col_mean = P.mean()
    col_spread = P.std(ddof=0) + 1e-6
    physiology_raw = ((P - col_mean) / col_spread).mean(axis=1).to_numpy()

    # Comorbidity and socio-context: plain row totals.
    comorbidity_raw = C.sum(axis=1).to_numpy()
    social_raw = S.sum(axis=1).to_numpy()

    scaled_columns = [
        minmax(raw) for raw in (comorbidity_raw, physiology_raw, social_raw)
    ]
    return np.column_stack(scaled_columns)  # shape (N, 3)


def make_primary_condition(C: pd.DataFrame, none_label: str = "None") -> list[str]:
    """
    Create a simple 'primaryCondition' label per patient.

    For each row, the label is the name of the comorbidity column holding
    the largest value; ties go to the left-most column. A row whose largest
    value is not positive (e.g. an all-zero row in a binary matrix) has no
    condition to report and gets ``none_label`` instead of the first column
    name. NaN entries are never selected.

    You may want to replace this with something smarter (e.g. APACHE
    primary diagnosis).

    Args:
        C: Comorbidity view, one row per patient, numeric columns.
        none_label: Label used when a patient has no positive entry, or
            when ``C`` has no columns at all.

    Returns:
        One label per row of ``C``, in row order.
    """
    values = C.to_numpy(dtype=float)
    n_rows, n_cols = values.shape
    if n_cols == 0:
        # argmax over zero columns would raise; nothing can be primary.
        return [none_label] * n_rows

    # Push NaNs below every real value so argmax never lands on them.
    values = np.where(np.isnan(values), -np.inf, values)
    best_col = values.argmax(axis=1)
    has_condition = values.max(axis=1) > 0

    col_names = [str(name) for name in C.columns]
    return [
        col_names[j] if present else none_label
        for j, present in zip(best_col, has_condition)
    ]


def make_risk_score(meta: pd.DataFrame) -> np.ndarray:
    """
    Produce the 0–1 riskScore shown in the UI.

    The source column is chosen in this order of preference:
    1. "pred_mortality_1y" – a predicted 1-year mortality probability,
    2. "sofa" – an existing severity score,
    3. "age" – a crude proxy, used only so that something appears on screen.

    Whichever column is picked is min-max rescaled across patients.
    """
    for candidate in ("pred_mortality_1y", "sofa"):
        if candidate in meta.columns:
            source_col = candidate
            break
    else:
        # Neither preferred column exists: fall back to the age-based proxy.
        source_col = "age"

    return minmax(meta[source_col].to_numpy())


# =========================
# 2. MAIN: BUILD ARRAYS
# =========================

def build_app_data() -> dict:
    # 1) Load metadata and views, align indices
    meta = load_meta()
    C, P, S = load_views()

    # Ensure common index across meta and views
    common_index = meta.index.intersection(C.index).intersection(P.index).intersection(S.index)
    meta = meta.loc[common_index]
    C = C.loc[common_index]
    P = P.loc[common_index]
    S = S.loc[common_index]

    # 2) Load SNF fused matrix and labels in the same order
    W_fused, labels = load_snf_results(common_index)
    N = len(common_index)

    # 3) Arrays we need
    ids = common_index.astype(str).tolist()        # patient IDs as strings
    clusters = labels.astype(int)                  # SNF cluster labels
    embedding = make_embedding(W_fused, n_components=2)  # (N, 2)
    view_profile = make_view_scores(C, P, S)              # (N, 3)

    primary_condition = make_primary_condition(C)         # list[str]
    age = meta["age"].to_numpy().astype(int)              # age (or whatever your column is named)
    # map sex to 'M'/'F'
    if "sex" in meta.columns:
        sex_raw = meta["sex"]
        # adapt mapping to your coding scheme
        sex = np.where((sex_raw == 1) | (sex_raw == "F") | (sex_raw == "Female"), "F", "M")
    else:
        sex = np.array(["M"] * N)  # placeholder if you truly don't have sex (not ideal)

    risk_score = make_risk_score(meta)                   # 0–1

    # =========================
    # 3. Build Patient objects
    # =========================

    patients = []
    for i, pid in enumerate(ids):
        patients.append({
            "id": pid,
            "cluster": int(clusters[i]),
            "embedding": {
                "x": float(embedding[i, 0]),
                "y": float(embedding[i, 1]),
            },
            "attributes": {
                "age": int(age[i]),
                "sex": str(sex[i]),
                "riskScore": float(risk_score[i]),
                "primaryCondition": str(primary_condition[i]),
            },
            "profile": {
                "c": float(view_profile[i, 0]),
                "p": float(view_profile[i, 1]),
                "s": float(view_profile[i, 2]),
            },
        })

    # =========================
    # 4. Build ClusterProfile
    # =========================

    unique_clusters = np.unique(clusters)
    cluster_profiles = []
    for k in unique_clusters:
        mask = (clusters == k)
        count = int(mask.sum())

        if count == 0:
            continue

        # avg of view scores within cluster
        avg_c = float(view_profile[mask, 0].mean())
        avg_p = float(view_profile[mask, 1].mean())
        avg_s = float(view_profile[mask, 2].mean())
        avg_risk = float(risk_score[mask].mean())

        # basic automatic label based on dominant view
        max_val = max(("c", avg_c), ("p", avg_p), ("s", avg_s), key=lambda t: t[1])[0]
        if max_val == "c":
            name = f"Cluster {k} – Chronic burden"
            description = "Multimorbid, chronic-dominant profile."
            dom_feats = ["Diabetes", "CHF", "Polypharmacy"]
        elif max_val == "p":
            name = f"Cluster {k} – Acute physiology"
            description = "Acute physiological instability."
            dom_feats = ["Tachycardia", "Hypoxia", "Elevated Lactate"]
        else:
            name = f"Cluster {k} – Context burden"
            description = "High socio-economic deprivation."
            dom_feats = ["Housing instability", "Low income", "Social isolation"]

        cluster_profiles.append({
            "id": int(k),
            "name": name,
            "count": count,
            "description": description,
            "averages": {
                "c": avg_c,
                "p": avg_p,
                "s": avg_s,
                "risk": avg_risk,


In [None]:
import json
import numpy as np

# Example placeholders – replace with your real arrays
# ids: list[str], clusters: np.ndarray shape (n,)
# embedding: np.ndarray shape (n,2)
# view_profile: np.ndarray shape (n,3) columns = [c,p,s]
# primary_condition, age, sex, risk_score: arrays or lists

# Coerce the inputs to arrays once so the boolean-mask indexing below works
# even if plain lists were supplied. The label array gets its own name so the
# list of cluster summaries built further down cannot overwrite it.
cluster_labels = np.asarray(clusters).astype(int)
profile_arr = np.asarray(view_profile, dtype=float)
risk_arr = np.asarray(risk_score, dtype=float)
sex_arr = np.asarray(sex)

# Codes treated as female, matching the mapping used in build_app_data;
# everything else is written as "M". Adapt to your coding.
FEMALE_CODES = (1, "F", "Female")

# One record per patient, in the order of `ids`.
patients = [
    {
        "id": str(pid),
        "cluster": int(cluster_labels[i]),
        "embedding": {
            "x": float(embedding[i, 0]),
            "y": float(embedding[i, 1]),
        },
        "attributes": {
            "age": int(age[i]),
            "sex": "F" if sex_arr[i] in FEMALE_CODES else "M",
            "riskScore": float(risk_arr[i]),
            "primaryCondition": str(primary_condition[i]),
        },
        "profile": {
            "c": float(profile_arr[i, 0]),
            "p": float(profile_arr[i, 1]),
            "s": float(profile_arr[i, 2]),
        },
    }
    for i, pid in enumerate(ids)
]

# Simple rule-based text per dominant view; refine later.
VIEW_SUMMARIES = {
    "c": (
        "Multimorbid, chronic-dominant profile.",
        ["Diabetes", "CHF", "Polypharmacy"],
    ),
    "p": (
        "Acute physiological instability.",
        ["Tachycardia", "Hypoxia", "Elevated Lactate"],
    ),
    "s": (
        "High socio-economic deprivation.",
        ["Housing instability", "Low income", "Social isolation"],
    ),
}

# Build clusters summary – one entry per distinct SNF label, ascending.
cluster_profiles = []
for k in np.unique(cluster_labels):
    mask = cluster_labels == k

    averages = {
        "c": float(profile_arr[mask, 0].mean()),
        "p": float(profile_arr[mask, 1].mean()),
        "s": float(profile_arr[mask, 2].mean()),
    }
    # Dominant view = highest average score; ties resolve in c, p, s order.
    dominant_view = max(averages, key=averages.get)
    desc, dom_feats = VIEW_SUMMARIES[dominant_view]
    averages["risk"] = float(risk_arr[mask].mean())

    cluster_profiles.append({
        "id": int(k),
        "name": f"Phenotype {chr(65 + int(k))}",  # label 0 -> "A", 1 -> "B", ...
        "count": int(mask.sum()),
        "description": desc,
        "averages": averages,
        "dominantFeatures": list(dom_feats),
    })

app_data = {"patients": patients, "clusters": cluster_profiles}

# NOTE: json.dump writes NaN values as bare `NaN` tokens, which strict JSON
# parsers reject; make sure the arrays above contain no NaNs.
with open("snf-data.json", "w", encoding="utf-8") as f:
    json.dump(app_data, f, indent=2)
