# Lateness Analysis - Knife Start Delay: Charts 


In [None]:
# Imports & Config 
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# ---- Input file and the two columns every chart depends on ----
DATA_PATH = r"C:/Users/egtay/Downloads/Telegram Desktop/Final_Cleaned_Dataset_OPTIC_7.csv"
VALUE_COL = "KNIFE_START_DELAY"
LATE_COL = "Is_Late"
TOP_K = 10

# Dropdown labels, in display order. Most labels are also the dataframe column
# they read from; the exceptions are listed in _DERIVED_COLUMNS.
_DROPDOWN_LABELS = (
    "Delay_Category",
    "LOCATION",
    "ROOM",
    "EQUIPMENT ",
    "EMERGENCY_PRIORITY",
    "DISCIPLINE",
    "ANESTHESIA",
    "ADMISSION_CLASS_TYPE",
    "ADMISSION_WARD",
    "ADMISSION_BED",
    "AOH",
    "BLOOD",
    "IMPLANT",
    "CANCER_INDICATOR",
    "TRAUMA_INDICATOR",
)

# Labels backed by a column that preprocessing derives rather than a raw column.
_DERIVED_COLUMNS = {
    "EQUIPMENT ": "__EQUIPMENT_SPLIT__",
    "IMPLANT": "__IMPLANT_BIN__",
}

# Dropdown menu items: display label -> dataframe column
CATEGORY_MAP = {
    label: _DERIVED_COLUMNS.get(label, label) for label in _DROPDOWN_LABELS
}

# Delay bins for the clustered bar (% within late cases): the finite edges
# below plus one open-ended bin above the last edge.
_BIN_EDGES = (0, 30, 60, 90, 120)
BINS = [*_BIN_EDGES, np.inf]
BIN_LABELS = [f"{lo}-{hi}" for lo, hi in zip(_BIN_EDGES, _BIN_EDGES[1:])]
BIN_LABELS.append(f"{_BIN_EDGES[-1]}+")


In [None]:
# Load & Preprocess (single pass)
df = pd.read_csv(DATA_PATH)

# The delay column is coerced to numbers (unparseable entries become NaN) and
# rows without a usable delay are removed, since every visual is delay-linked.
df[VALUE_COL] = pd.to_numeric(df[VALUE_COL], errors="coerce")
df = df.dropna(subset=[VALUE_COL]).copy()

# The late flag always ends up as an integer column: an absent column is
# created as all zeros, and unparseable entries count as 0.
if LATE_COL in df.columns:
    late_flags = pd.to_numeric(df[LATE_COL], errors="coerce").fillna(0)
else:
    late_flags = pd.Series(0, index=df.index)
df[LATE_COL] = late_flags.astype(int)

# EQUIPMENT explode (if present) -> df_eq with __EQUIPMENT_SPLIT__
#
# df_eq holds one row per (case, equipment item): the comma-separated
# EQUIPMENT cell is split, each item is stripped, and placeholder tokens
# ("", "nan", "None", "NULL") are discarded. Cases left with no item keep a
# single row whose __EQUIPMENT_SPLIT__ is NaN.
if "EQUIPMENT" in df.columns:
    eq = (
        df["EQUIPMENT"]
        .astype(str)
        .fillna("")
        .str.split(",")
        .explode()          # keeps the originating row's index label
        .str.strip()
    )
    eq = eq[~eq.isin(["", "nan", "None", "NULL"])].rename("EQUIPMENT_x")

    # Join on the row index rather than merging on OPERATION_ID: a key merge
    # cross-joins equipment between rows that share (or lack) an id and needs
    # the OPERATION_ID column to exist, whereas the exploded index identifies
    # the source row exactly.
    df_eq = df.join(eq, how="left").reset_index(drop=True)
    df_eq["__EQUIPMENT_SPLIT__"] = df_eq["EQUIPMENT_x"]
else:
    df_eq = df.copy()
    df_eq["__EQUIPMENT_SPLIT__"] = np.nan

# IMPLANT bin (0 vs non-0) -> df["__IMPLANT_BIN__"]
if "IMPLANT" in df.columns:
    def _to_implant_bin(s):
        """Classify one stringified IMPLANT value.

        Returns "No implant" for a numeric zero, "Has implant" for any other
        number, "Has implant (non-0)" for non-numeric text, and NaN when the
        value is a missing-data placeholder so it is not counted as an implant.
        """
        text = str(s).strip()
        # Same placeholder tokens the rest of the notebook treats as missing.
        if text in ("", "nan", "None", "NULL"):
            return np.nan
        try:
            v = float(text)
        except (TypeError, ValueError):
            return "Has implant (non-0)"
        if np.isnan(v):
            # Other spellings of NaN parse as a float but carry no information.
            return np.nan
        return "No implant" if v == 0 else "Has implant"
    df["__IMPLANT_BIN__"] = df["IMPLANT"].astype(str).map(_to_implant_bin)
else:
    df["__IMPLANT_BIN__"] = np.nan


In [None]:
# Shared Helpers
def get_working_df(label: str) -> tuple[pd.DataFrame, str]:
    """Resolve a dropdown label to a fresh working frame and its column name.

    The equipment label reads from the exploded ``df_eq``; every other label
    reads from ``df``. When the column exists it is normalised to stripped
    strings, with blank/"nan"/"None"/"NULL" entries shown as "Unspecified".
    The returned frame is always a copy, so callers may modify it freely.

    Raises:
        KeyError: if ``label`` is not a key of ``CATEGORY_MAP``.
    """
    col = CATEGORY_MAP[label]
    source = df if col != "__EQUIPMENT_SPLIT__" else df_eq
    working = source.copy()

    if col not in working.columns:
        return working, col

    cleaned = working[col].astype(str).str.strip()
    cleaned = cleaned.replace({"": np.nan, "nan": np.nan, "None": np.nan, "NULL": np.nan})
    working[col] = cleaned.fillna("Unspecified")
    return working, col

def select_top_by_volume(w: pd.DataFrame, col: str, k: int = TOP_K) -> pd.DataFrame:
    """Keep only the rows whose ``col`` value is among the ``k`` most frequent.

    If ``col`` has ``k`` or fewer distinct values the input frame is returned
    unchanged (the same object, not a copy).
    """
    frequency = w[col].value_counts()
    if len(frequency) <= k:
        return w
    top_levels = frequency.index[:k]
    return w[w[col].isin(top_levels)]

def make_clustered_pivots(label: str):
    """Build the tables behind the clustered "% of late cases" bar chart.

    Only rows flagged late (``LATE_COL == 1``) are used, restricted to the
    Top-K category levels by volume, and each row is assigned to a delay bin.

    Args:
        label: dropdown label, a key of ``CATEGORY_MAP``.

    Returns:
        ``(levels, pivot_pct, pivot_cnt, pivot_med)`` where ``levels`` lists
        the category levels ordered by late-case count (descending) and each
        pivot is indexed by level with one column per entry of ``BIN_LABELS``:
        percentage of the level's late cases, case count, and median delay
        (rounded to 1 decimal; NaN for empty bins). When the column is missing
        or there are no late cases, ``levels`` is ``[]`` and the pivots are
        empty frames.
    """
    w, col = get_working_df(label)
    if col not in w.columns or w.empty:
        return [], pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Only late cases for this visual
    late = w[w[LATE_COL] == 1]
    if late.empty:
        return [], pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Copy after filtering: select_top_by_volume may hand back a slice, and
    # adding a column to a slice triggers pandas' SettingWithCopyWarning.
    late = select_top_by_volume(late, col, TOP_K).copy()
    late["Delay_Bin"] = pd.cut(
        late[VALUE_COL], bins=BINS, labels=BIN_LABELS, right=False, include_lowest=True
    )

    per_bin = late.groupby([col, "Delay_Bin"], observed=False)[VALUE_COL]
    agg = per_bin.agg(count="count", median="median").reset_index()

    # Denominator is every late case of the level, whichever bin it fell in.
    totals = late.groupby(col, observed=False)[VALUE_COL].count().rename("total").reset_index()
    agg = agg.merge(totals, on=col, how="left")
    agg["pct"] = agg["count"] / agg["total"] * 100.0

    order = totals.sort_values("total", ascending=False)[col].astype(str).tolist()

    def _pivot(values: str) -> pd.DataFrame:
        """Level x delay-bin table of one aggregated measure, in display order."""
        table = agg.pivot(index=col, columns="Delay_Bin", values=values)
        return table.reindex(index=order, columns=BIN_LABELS)

    pivot_pct = _pivot("pct").fillna(0.0)
    pivot_cnt = _pivot("count").fillna(0).astype(int)
    pivot_med = _pivot("median").round(1)
    return order, pivot_pct, pivot_cnt, pivot_med

def make_median_agg(label: str) -> pd.DataFrame:
    """Summarise VALUE_COL per category level for the median bar chart.

    Levels are limited to the Top-K by volume. The result has the category
    column (as strings), ``median_delay`` and ``count``, sorted by
    ``median_delay`` descending. If the category column does not exist, an
    empty frame with those three columns is returned.
    """
    w, col = get_working_df(label)
    if col not in w.columns:
        return pd.DataFrame(columns=[col, "median_delay", "count"])

    top = select_top_by_volume(w, col, TOP_K)
    per_level = top.groupby(col, dropna=False)[VALUE_COL]
    summary = per_level.agg(median_delay="median", count="count").reset_index()
    summary = summary.sort_values("median_delay", ascending=False)
    summary[col] = summary[col].astype(str)
    return summary

def _to_dt(s):
    """Parse ``s`` as datetimes (unparseable entries become NaT); pass ``None`` through."""
    if s is None:
        return None
    return pd.to_datetime(s, errors="coerce")

def _dur_mins(t1, t0):
    """Return ``t1 - t0`` in minutes, or ``None`` when either timestamp series is absent."""
    if t0 is None or t1 is None:
        return None
    elapsed = t1 - t0
    return elapsed.dt.total_seconds() / 60.0

def build_stage_duration_frame(base_df: pd.DataFrame) -> pd.DataFrame:
    """Derive per-case stage durations (minutes) plus an Emergency/Elective split.

    Each stage is the difference between two timestamp columns of ``base_df``.
    A stage is omitted when either of its timestamp columns is absent. The two
    "...→End" stages end at the OR-cleanup timestamp when that column exists
    and otherwise at the ACTUAL_EXIT_RECOVERY_TIME column. Durations below -5
    minutes are replaced with NaN.

    The ``__SPLIT__`` column is taken from EMERGENCY_PRIORITY if present, else
    from CASE_STATUS, else it is the constant "All".
    """
    timestamp_sources = [
        ("RECEPTION_IN", "ACTUAL_RECEPTION_IN_TIME"),
        ("ENTER_OR", "ACTUAL_ENTER_OR_TIME"),
        ("INDUCTION", "ACTUAL_ANAESTHESIA_INDUCTION"),
        ("SURG_PREP", "ACTUAL_SURGERY_PREP_TIME"),
        ("KNIFE", "ACTUAL_KNIFE_TO_SKIN_TIME"),
        ("CLOSURE", "ACTUAL_SKIN_CLOSURE"),
        ("REVERSAL", "ACTUAL_PATIENT_REVERSAL_TIME"),
        ("EXIT_OR", "ACTUAL_EXIT_OR_TIME"),
        ("RECOVERY_IN", "ACTUAL_EXIT_RECOVERY_TIME"),
        ("OR_CLEANUP", "ACTUAL_OR_CLEANUP_TIME"),
    ]
    # DataFrame.get yields None for an absent column, which _to_dt passes through.
    ts = {key: _to_dt(base_df.get(source)) for key, source in timestamp_sources}

    # End-of-case marker shared by the two "...→End" stages.
    end_marker = ts["OR_CLEANUP"] if ts["OR_CLEANUP"] is not None else ts["RECOVERY_IN"]

    # (stage label, start timestamps, finish timestamps), in output column order.
    intervals = [
        ("Reception→Enter OR", ts["RECEPTION_IN"], ts["ENTER_OR"]),
        ("Enter OR→Induction", ts["ENTER_OR"], ts["INDUCTION"]),
        ("Induction→Prep", ts["INDUCTION"], ts["SURG_PREP"]),
        ("Prep→Knife", ts["SURG_PREP"], ts["KNIFE"]),
        ("Procedure (Knife→Closure)", ts["KNIFE"], ts["CLOSURE"]),
        ("Reversal (Closure→Exit)", ts["CLOSURE"], ts["EXIT_OR"]),
        ("Cleanup (Exit→End)", ts["EXIT_OR"], end_marker),
        ("Setup (Enter→Knife)", ts["ENTER_OR"], ts["KNIFE"]),
        ("Post-proc (Closure→End)", ts["CLOSURE"], end_marker),
    ]

    durations = {}
    for stage_name, start, finish in intervals:
        minutes = _dur_mins(finish, start)
        if minutes is not None:
            durations[stage_name] = minutes
    stage_df = pd.DataFrame(durations)

    # Blank out durations more negative than -5 minutes.
    for stage_name in stage_df.columns:
        too_negative = stage_df[stage_name] < -5
        stage_df.loc[too_negative, stage_name] = np.nan

    if "EMERGENCY_PRIORITY" in base_df.columns:
        priority = base_df["EMERGENCY_PRIORITY"].astype(str).str.lower()
        is_emergency = priority.isin(["emergency", "1", "true", "yes"])
        stage_df["__SPLIT__"] = np.where(is_emergency, "Emergency", "Elective")
    elif "CASE_STATUS" in base_df.columns:
        status = base_df["CASE_STATUS"].astype(str).str.lower()
        stage_df["__SPLIT__"] = np.where(status.str.contains("emerg"), "Emergency", "Elective")
    else:
        stage_df["__SPLIT__"] = "All"
    return stage_df

def corr_matrix(stage_df: pd.DataFrame, how="pearson", subset="All") -> pd.DataFrame:
    """Correlate the stage-duration columns, optionally within one split.

    Args:
        stage_df: frame of duration columns plus a ``__SPLIT__`` column.
        how: correlation method passed to ``DataFrame.corr``.
        subset: ``"All"`` for every row, otherwise the ``__SPLIT__`` value to keep.

    Returns:
        The square correlation matrix over all columns except ``__SPLIT__``.
        With fewer than two rows the result is a frame with those columns and
        no rows, so ``result.empty`` is True and callers can detect it.
    """
    sdf = stage_df if subset == "All" else stage_df[stage_df["__SPLIT__"] == subset]
    num_cols = [c for c in sdf.columns if c != "__SPLIT__"]
    sdf = sdf[num_cols]
    if len(sdf) < 2:
        # DataFrame([], columns=..., index=...) would build an all-NaN square
        # frame whose .empty is False; omit the index to get a truly empty one.
        return pd.DataFrame(columns=num_cols, dtype=float)
    return sdf.corr(method=how)


In [None]:
# Chart 1 — Stage Duration Correlation Heatmap (Pearson/Spearman)
stage_df = build_stage_duration_frame(df)

# The per-group views are offered only when __SPLIT__ has more than one value.
splits = ["All"] + (["Elective", "Emergency"] if stage_df["__SPLIT__"].nunique() > 1 else [])
methods = ["pearson", "spearman"]

# Precompute correlation matrices
mats = {(s, m): corr_matrix(stage_df, how=m, subset=s) for s in splits for m in methods}
default = mats[(splits[0], methods[0])]
if default.empty:
    raise ValueError("Not enough data to compute correlations. Check timestamp columns and data quality.")

order = list(default.columns)


def mat_to_z_text(m):
    """Return (z, text) for a correlation matrix aligned to ``order``.

    ``z`` is the float value grid; ``text`` holds the values rounded to 3
    decimals as strings, blank where the correlation is undefined (NaN).
    """
    # Cast to float so rounding also works on an object-dtype (all-NaN) matrix.
    m = m.reindex(index=order, columns=order).astype(float)
    z = m.values
    text = np.where(np.isnan(z), "", np.round(z, 3).astype(str))
    return z, text


def build_annotations(text_matrix):
    """One text annotation per heatmap cell, rows/columns following ``order``."""
    return [
        dict(x=xlab, y=ylab, text=str(text_matrix[i][j]),
             showarrow=False, font=dict(size=11))
        for i, ylab in enumerate(order)
        for j, xlab in enumerate(order)
    ]


# helper to add annotations
def add_annotations(fig, text_matrix):
    """Set the hover template and replace the figure's cell annotations."""
    fig.update_traces(hovertemplate="x: %{x}<br>y: %{y}<br>corr: %{z:.3f}<extra></extra>")
    fig.update_layout(annotations=build_annotations(text_matrix))


z0, text0 = mat_to_z_text(default)

fig1 = go.Figure(data=go.Heatmap(
    z=z0, x=order, y=order, colorscale="RdBu", zmin=-1, zmax=1,
    colorbar=dict(title="Correlation")
))
add_annotations(fig1, text0)

buttons3 = []
for s in splits:
    for m in methods:
        mat = mats[(s, m)]
        if mat.empty:
            z = np.zeros((len(order), len(order)))
            text = np.full_like(z, "", dtype=object)
        else:
            z, text = mat_to_z_text(mat)
        buttons3.append(dict(
            label=f"{s} • {m.capitalize()}",
            method="update",
            args=[
                {"z": [z]},
                {
                    "title": f"Stage Duration Correlation Heatmap — {s} • {m.capitalize()}",
                    # Swap the cell labels together with z; otherwise the
                    # numbers printed on the cells stay those of the first view.
                    "annotations": build_annotations(text),
                },
            ]
        ))

fig1.update_layout(
    title=dict(
        text="Stage Duration Correlation Heatmap",
        x=0.05, xanchor="left",
        y=1.0, yanchor="top",
        pad=dict(t=10, b=20)
    ),
    margin=dict(t=110, l=80, r=60, b=80),
    updatemenus=[dict(
        buttons=buttons3,
        direction="down",
        x=1.23, xanchor="left",
        y=1.0,  yanchor="top",
        showactive=True,
        bgcolor="rgba(255,255,255,0.9)",
        pad=dict(t=8, r=8)
    )],
    xaxis=dict(side="top", tickangle=-45, tickfont=dict(size=10), automargin=True),
    yaxis=dict(autorange="reversed", tickfont=dict(size=10), automargin=True),
    height=700
)

fig1.show()

In [None]:
# Chart 2: Clustered Column % of Late Cases by Delay Range (Top-K)
#
# Every category contributes one Bar trace per delay bin to a single figure;
# the dropdown switches categories by toggling trace visibility with a
# per-category boolean mask over all traces.
fig2 = go.Figure()
buttons1, masks1 = [], []
trace_offset = 0
total_traces = 0

precomp = {}
for label in CATEGORY_MAP.keys():
    levels, p_pct, p_cnt, p_med = make_clustered_pivots(label)
    precomp[label] = (levels, p_pct, p_cnt, p_med)
    if not levels:
        # No late data for this category: it owns no traces, so its mask
        # (padded below) hides everything.
        masks1.append([False] * trace_offset)
        continue

    # add one Bar per delay bin
    for b in BIN_LABELS:
        y = p_pct[b].values
        c = p_cnt[b].values
        m = p_med[b].values
        # Empty bins have no median; show a blank instead of "nan" on hover.
        med_disp = np.where(np.isnan(m), "", np.round(m, 1))
        fig2.add_trace(go.Bar(
            x=levels, y=y, name=b,
            customdata=np.c_[c, med_disp],
            hovertemplate=(
                "Delayed Minutes (Knife-to-Skin): <b>" + b + "</b><br>"
                "%{x}<br>"
                "Pct late: %{y:.1f}%<br>"
                "Cases: %{customdata[0]}<br>"
                "Median: %{customdata[1]} min<extra></extra>"
            )
        ))
    n_new = len(BIN_LABELS)
    masks1.append([False] * trace_offset + [True] * n_new)
    trace_offset += n_new

# Masks were built while traces were still being added; pad each to full length.
total_traces = trace_offset
masks1 = [m + [False] * (total_traces - len(m)) for m in masks1]

# Dropdown buttons
for label, mask in zip(CATEGORY_MAP.keys(), masks1):
    buttons1.append(dict(
        label=label,
        method="update",
        args=[
            {"visible": mask},
            {
                "title": f"Late Cases — % by Delay Range for {label} (Top {TOP_K} by volume)",
                "yaxis": {"title": "% of Late Cases"},
                "xaxis": {"title": label},
                "barmode": "group",
                "legend": {"title": {"text": "Delayed Minutes (Knife-to-Skin)"}},
            },
        ]
    ))

# Initial view: the first category.
if masks1:
    init_mask = masks1[0]
    for i, tr in enumerate(fig2.data):
        tr.visible = init_mask[i]

first_label = list(CATEGORY_MAP.keys())[0]
fig2.update_layout(
    title=f"Late Cases — % by Delay Range for {first_label} (Top {TOP_K} by volume)",
    # Match what the dropdown sets, so the first view also has an x-axis title.
    xaxis_title=first_label,
    yaxis_title="% of Late Cases",
    barmode="group",
    legend_title_text="Delayed Minutes (Knife-to-Skin)",
    updatemenus=[dict(
        type="dropdown",
        x=1.0, xanchor="right",
        y=1.15, yanchor="top",
        buttons=buttons1,
        showactive=True
    )],
    margin=dict(l=60, r=30, t=70, b=80)
)

fig2.show()

In [None]:
# Chart 3 — Median KNIFE_START_DELAY by Category (Top-K with dynamic y-range)
def _median_y_range(agg):
    """Y-axis range for one category's median bars.

    The lower bound is 0 unless a median is negative (then 10% below the
    minimum); the upper bound is 10% above the maximum but never under 100.
    An empty table gets the fixed range [0, 100].
    """
    if agg.empty:
        return [0, 100]
    y_min, y_max = agg["median_delay"].min(), agg["median_delay"].max()
    lower = 0 if y_min >= 0 else y_min - 0.1 * abs(y_min)
    upper = max(100, y_max + 0.1 * abs(y_max))
    return [lower, upper]


fig3 = go.Figure()
buttons2, masks2 = [], []
trace_offset = 0
aggs = {}

# One Bar trace per category; the dropdown shows exactly one of them.
for label in CATEGORY_MAP.keys():
    agg = make_median_agg(label)
    aggs[label] = agg
    if agg.empty:
        masks2.append([False] * trace_offset)
        continue

    cat_col = CATEGORY_MAP[label]
    fig3.add_trace(go.Bar(
        x=agg[cat_col],
        y=agg["median_delay"],
        customdata=np.c_[agg["count"]],
        text=agg["median_delay"].round(1).astype(str),
        textposition="outside",
        hovertemplate=(
            f"<b>{label}</b>: %{{x}}<br>"
            "Median: %{y:.1f} min<br>"
            "Cases: %{customdata[0]}<extra></extra>"
        ),
        name=label
    ))
    masks2.append([False] * trace_offset + [True])
    trace_offset += 1

# Pad every mask to the final trace count.
total2 = trace_offset
masks2 = [m + [False] * (total2 - len(m)) for m in masks2]

for label, mask in zip(CATEGORY_MAP.keys(), masks2):
    buttons2.append(dict(
        label=label,
        method="update",
        args=[
            {"visible": mask},
            {
                "title": f"Median KNIFE_START_DELAY by {label} (Top {TOP_K})",
                "yaxis": {"range": _median_y_range(aggs[label])},
                "xaxis": {"title": label},
            },
        ]
    ))

# init: show first label
if masks2:
    init_mask = masks2[0]
    for i, tr in enumerate(fig3.data):
        tr.visible = init_mask[i]

# default y range according to first label
first_label = list(CATEGORY_MAP.keys())[0]
fig3.update_yaxes(range=_median_y_range(aggs[first_label]))

fig3.update_layout(
    title=f"Median KNIFE_START_DELAY by {first_label} (Top {TOP_K})",
    # Match what the dropdown sets, so the first view also has an x-axis title.
    xaxis_title=first_label,
    yaxis_title="Median KNIFE_START_DELAY (minutes)",
    bargap=0.25,
    updatemenus=[dict(
        type="dropdown",
        x=1.0, xanchor="right",
        y=1.15, yanchor="top",
        buttons=buttons2,
        showactive=True
    )],
    margin=dict(l=60, r=30, t=70, b=80)
)

fig3.show()


In [None]:
# Chart 4 — Box Plot of KNIFE_START_DELAY by Category (Top-K, Dropdown)
import plotly.express as px


def _box_y_range(delays):
    """Y-axis range spanning the 1st-99th percentile of ``delays`` with 10% padding.

    The lower bound is additionally floored at -50.
    """
    q01, q99 = delays.quantile([0.01, 0.99])
    lower = max(-50, q01 - 0.1 * abs(q01))
    upper = q99 + 0.1 * abs(q99)
    return [lower, upper]


fig4 = go.Figure()
buttons4, masks4 = [], []
trace_offset = 0
precomp_box = {}

# color palette
palette = px.colors.qualitative.Plotly + px.colors.qualitative.D3 + px.colors.qualitative.Set2

for label in CATEGORY_MAP.keys():
    w, col = get_working_df(label)
    if col not in w.columns:
        masks4.append([False] * trace_offset)
        continue

    # Limit to top-K and fix order
    counts = w[col].value_counts()
    keep = counts.iloc[:TOP_K].index.tolist()
    w = w[w[col].isin(keep)].copy()
    if w.empty:
        masks4.append([False] * trace_offset)
        continue
    w[col] = pd.Categorical(w[col], categories=keep, ordered=True)

    # assign one color per level
    color_map = {cat: palette[i % len(palette)] for i, cat in enumerate(keep)}

    # One Box trace per level so each level gets its own colour.
    for cat in keep:
        subset = w[w[col] == cat]
        fig4.add_trace(
            go.Box(
                x=[cat] * len(subset),
                y=subset[VALUE_COL],
                boxpoints="outliers",
                marker=dict(color=color_map[cat], opacity=0.7),
                line=dict(color=color_map[cat], width=1),
                name=str(cat),
                width=0.6,
                hovertemplate=f"<b>{label}</b>: {cat}<br>Delay: %{{y:.1f}} min<extra></extra>",
            )
        )

    # track number of traces added
    n_new = len(keep)
    masks4.append([False] * trace_offset + [True] * n_new)
    trace_offset += n_new
    precomp_box[label] = {"df": w, "order": keep}

# pad masks
total4 = trace_offset
masks4 = [m + [False] * (total4 - len(m)) for m in masks4]

# dropdown logic
for label, mask in zip(CATEGORY_MAP.keys(), masks4):
    info = precomp_box.get(label)
    if info is not None:
        # Named level_order (not order) so the global `order` that Chart 1's
        # helper functions read is not overwritten by this cell.
        level_order = info["order"]
        yr = _box_y_range(info["df"][VALUE_COL])
        xargs = {"title": label, "categoryorder": "array", "categoryarray": level_order}
    else:
        yr = [0, 100]
        xargs = {"title": label}

    buttons4.append(dict(
        label=label,
        method="update",
        args=[
            {"visible": mask},
            {
                "title": f"Distribution of KNIFE_START_DELAY by {label} (Top {TOP_K})",
                "yaxis": {"title": "KNIFE_START_DELAY (minutes)", "range": yr},
                "xaxis": xargs,
            },
        ]
    ))

# initialize with first label visible
if masks4:
    init_mask = masks4[0]
    for i, tr in enumerate(fig4.data):
        tr.visible = init_mask[i]

# initial axes
first_label = list(CATEGORY_MAP.keys())[0]
if precomp_box.get(first_label):
    info0 = precomp_box[first_label]
    w0, order0 = info0["df"], info0["order"]
    fig4.update_xaxes(title_text=first_label, categoryorder="array", categoryarray=order0)
    fig4.update_yaxes(title_text="KNIFE_START_DELAY (minutes)", range=_box_y_range(w0[VALUE_COL]))
else:
    fig4.update_yaxes(title_text="KNIFE_START_DELAY (minutes)", range=[0, 100])

fig4.update_layout(
    title=f"Distribution of KNIFE_START_DELAY by {first_label} (Top {TOP_K})",
    boxmode="group",
    margin=dict(l=60, r=30, t=70, b=80),
    updatemenus=[dict(
        type="dropdown",
        x=1.0, xanchor="right",
        y=1.15, yanchor="top",
        buttons=buttons4,
        showactive=True
    )],
)

fig4.show()