In [4]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots


In [5]:
import json, pandas as pd, os

# Required clean dataset from Phase 2
DATA_PATH = r"C:\Users\habib\OneDrive\المستندات\Graduation Project\GRAD-proj-DEPI\DS1\Data\Clean\phase2_final_model_ready.csv"
assert os.path.exists(DATA_PATH), "Clean dataset missing. Run Phase 2 first."
df = pd.read_csv(DATA_PATH)

In [6]:
SPEC_PATH = "Data/Clean/model_spec.json"
if os.path.exists(SPEC_PATH):
    with open(SPEC_PATH) as f:
        spec = json.load(f)
    FEATURES_NUM = spec["features_num"]
    FEATURES_CAT = spec["features_cat"]
    FEATURES_BIN = spec["features_bin"]
    TARGET = spec["target"]
else:
    # fallback if spec not saved yet
    FEATURES_NUM = ["Age (years)","Resting BP (mm Hg)","Cholesterol (mg/dl)",
                    "Max Heart Rate (bpm)","ST Depression (oldpeak)"]
    FEATURES_CAT = ["Chest Pain Type","Resting ECG","ST Slope","Thalassemia","Major Vessels (0–3)"]
    FEATURES_BIN = ["Fasting Blood Sugar","Exercise Angina",
                    "Fasting Blood Sugar Missing","Exercise Angina Missing"]
    TARGET = "Heart Disease Class (0–4)"

print("Loaded Phase 2 clean data:", DATA_PATH, "\nShape:", df.shape)
print("Using spec:", FEATURES_NUM, FEATURES_CAT, FEATURES_BIN, TARGET)

Loaded Phase 2 clean data: C:\Users\habib\OneDrive\المستندات\Graduation Project\GRAD-proj-DEPI\DS1\Data\Clean\phase2_final_model_ready.csv 
Shape: (918, 15)
Using spec: ['Age (years)', 'Resting BP (mm Hg)', 'Cholesterol (mg/dl)', 'Max Heart Rate (bpm)', 'ST Depression (oldpeak)'] ['Chest Pain Type', 'Resting ECG', 'ST Slope', 'Thalassemia', 'Major Vessels (0–3)'] ['Fasting Blood Sugar', 'Exercise Angina', 'Fasting Blood Sugar Missing', 'Exercise Angina Missing'] Heart Disease Class (0–4)


In [7]:
import os, pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

plt.rcParams.update({
    "figure.facecolor":"white","axes.facecolor":"white","axes.edgecolor":"#222",
    "axes.labelcolor":"#222","xtick.color":"#222","ytick.color":"#222",
    "font.size":11,"axes.grid":True,"grid.color":"#eee","axes.titleweight":"bold"
})


TGT = "Heart Disease Class (0–4)"
NUM = ["Age (years)", "Resting BP (mm Hg)", "Cholesterol (mg/dl)",
       "Max Heart Rate (bpm)", "ST Depression (oldpeak)"]
CAT = ["Chest Pain Type","Resting ECG","ST Slope","Thalassemia","Major Vessels (0–3)"]

# Visualization copy: exclude Unknown in charts for clarity (kept in df for modeling)
df_viz = df.copy()
for c in CAT:
    if c in df_viz.columns:
        df_viz = df_viz[df_viz[c].astype(str) != "Unknown"]

print("EDA data (visuals) shape:", df_viz.shape)


EDA data (visuals) shape: (299, 15)


**A) Univariate analysis**
> Numeric: histograms to see shape and outliers.

> Categorical: count plots.

In [8]:
heart_counts = df_viz[TGT].value_counts().sort_index().reset_index()
heart_counts.columns = ['Heart Disease Type', 'Count']
heart_counts['Percent'] = (heart_counts['Count']/heart_counts['Count'].sum()*100).round(1)

fig = px.pie(
    heart_counts,
    names='Heart Disease Type',
    values='Count',
    title='Heart Disease Type Distribution',
    color='Heart Disease Type',
    color_discrete_sequence=px.colors.sequential.Viridis,
    hole=0.35
)
fig.update_traces(
    textposition='outside',
    texttemplate="%{label}: %{percent:.1%}",
    hovertemplate="Class %{label}<br>Count %{value}<br>% %{percent:.1%}<extra></extra>",
    pull=[0.03]*len(heart_counts)
)
fig.update_layout(showlegend=False, paper_bgcolor="white", plot_bgcolor="white")
fig.show()

In [18]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

TGT = "Heart Disease Class (0–4)"
NUM = ["Age (years)", "Resting BP (mm Hg)", "Cholesterol (mg/dl)",
       "Max Heart Rate (bpm)", "ST Depression (oldpeak)"]
palette = {0:"#2F4B7C",1:"#F95D6A",2:"#00A7A7",3:"#FFB000",4:"#6A4C93"}

rows, cols = 2, 3
fig = make_subplots(rows=rows, cols=cols, subplot_titles=NUM)

def add_numeric_panel(fig, col, r, c):
    # Density-like smooth histogram per class, stacked for clarity
    classes = sorted(df_viz[TGT].dropna().unique())
    ymax = 0
    for k in classes:
        s = df_viz.loc[df_viz[TGT]==k, col].dropna()
        if s.empty: continue
        hist = np.histogram(s, bins=30, density=False)
        x = (hist[1][1:] + hist[1][:-1]) / 2
        y = hist[0]
        ymax = max(ymax, y.max() if len(y) else 0)
        fig.add_trace(go.Scatter(
            x=x, y=y, mode="lines", line=dict(color=palette[int(k)], width=2),
            name=f"Class {int(k)}", hovertemplate=f"{col}=%{{x}}<br>Count=%{{y}}<extra></extra>"
        ), row=r, col=c)
    # Global median and range annotation
    s_all = df_viz[col].dropna()
    if not s_all.empty:
        med = s_all.median()
        fig.add_shape(type="line", x0=med, x1=med, y0=0, y1=ymax*1.05,
                      line=dict(color="#A0AEC0", width=2, dash="dash"), row=r, col=c)
        fig.add_annotation(text=f"Median {med:.1f}", x=med, y=ymax*1.08,
                           xref=f"x{'' if (r-1)*cols+c==1 else (r-1)*cols+c}",
                           yref=f"y{'' if (r-1)*cols+c==1 else (r-1)*cols+c}",
                           showarrow=False, font=dict(size=10, color="#333"))

    fig.update_xaxes(title_text=col, row=r, col=c)
    fig.update_yaxes(title_text="Count", row=r, col=c)

r=c=1
for i, col in enumerate(NUM):
    add_numeric_panel(fig, col, r, c)
    c += 1
    if c > cols: c, r = 1, r+1

fig.update_layout(height=720, width=1020, title_text="Target vs Numeric",
                  template="plotly_white", legend_title="Class")
fig.show()


In [10]:


unknown_label = "Unknown (not recorded)"  # matches your display label
cat_cols = ['ST Slope', 'Thalassemia', 'Resting ECG', 'Chest Pain Type', 'Major Vessels (0–3)']

removed_counts = {}

for col in cat_cols:
    if col in df_viz.columns:
        # Normalize values: map raw 'Unknown' to display label and treat NaN as Unknown for filtering
        s = df_viz[col].astype(str)
        s = s.replace({"Unknown": unknown_label})
        s = s.fillna(unknown_label)
        # Count rows to be removed
        to_remove = (s == unknown_label)
        removed_counts[col] = int(to_remove.sum())
        # Apply filter
        df_viz = df_viz.loc[~to_remove].copy()
    else:
        removed_counts[col] = 0

print("Rows kept for visualization:", df_viz.shape[0])
print("Removed per column (Unknowns):", removed_counts)

Rows kept for visualization: 299
Removed per column (Unknowns): {'ST Slope': 0, 'Thalassemia': 0, 'Resting ECG': 0, 'Chest Pain Type': 0, 'Major Vessels (0–3)': 0}


In [11]:
fig = make_subplots(rows=3, cols=2, subplot_titles=cat_cols)
palette = ['#4c78a8', '#f58518', '#54a24b', '#e45756', '#72b7b2']  # consistent pleasant colors

for i, col in enumerate(cat_cols):
    row = (i // 2) + 1
    col_pos = (i % 2) + 1
    data = (df_viz[col].astype(str)
            .value_counts()
            .sort_values(ascending=False)
            .reset_index())
    data.columns = [col, 'Count']
    # add percentages for labels
    total = data['Count'].sum()
    data['Percent'] = (data['Count']/total*100).round(1)
    bar = go.Bar(
        x=data[col].astype(str),
        y=data['Count'],
        text=[f"{p}%" for p in data['Percent']],
        textposition='outside',
        name=col,
        marker=dict(color=palette[i % len(palette)])
    )
    fig.add_trace(bar, row=row, col=col_pos)

fig.update_layout(
    height=1000, width=980,
    title_text="Categorical Features — Counts and Percentages (Unknown excluded in visuals)",
    showlegend=False, template='plotly_white', bargap=0.35
)
fig.update_xaxes(tickangle=30)
fig.show()


**B) Bivariate analysis (vs. target)**
> Continuous vs target: boxplots per class (clearer than overlaid histograms).

> Categorical vs target: grouped count plots.

In [12]:
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go

target_col = 'Heart Disease Class (0–4)'

cont_cols = ['Age (years)', 'Resting BP (mm Hg)', 'Cholesterol (mg/dl)',
             'Max Heart Rate (bpm)', 'ST Depression (oldpeak)']

for c in cont_cols:
    if c in df_viz.columns:
        df_viz[c] = pd.to_numeric(df_viz[c], errors='coerce')

fig = make_subplots(rows=2, cols=3, subplot_titles=cont_cols)

row, col = 1, 1
for c in cont_cols:
    if c in df_viz.columns:
        box = go.Box(
            x=df_viz[target_col],
            y=df_viz[c],
            name=c,
            boxmean=True,
            hovertemplate=f"<b>Heart Disease Class</b>: %{{x}}<br><b>{c}</b>: %{{y}}<extra></extra>",
            marker_color='rgba(99, 110, 250, 0.7)'
        )
        fig.add_trace(box, row=row, col=col)
        col += 1
        if col > 3:
            col = 1
            row += 1

fig.update_layout(
    height=900,
    width=1100,
    title_text="Continuous Features vs Heart Disease Class ",
    template="plotly_white",
    showlegend=False
)
fig.show()


In [13]:
cat_cols = ['Chest Pain Type', 'Resting ECG', 'ST Slope', 'Thalassemia', 'Major Vessels (0–3)']

fig2 = make_subplots(rows=3, cols=2, subplot_titles=cat_cols)

row, col = 1, 1
for c in cat_cols:
    if c in df_viz.columns:
        data = df_viz.groupby([c, target_col]).size().reset_index(name='Count')
        bar = go.Bar(
            x=data[c],
            y=data['Count'],
            name=c,
            hovertemplate=f"<b>{c}</b>: %{{x}}<br>Heart Disease Class: %{{customdata}}<br>Count: %{{y}}<extra></extra>",    
            customdata=data[target_col],
            marker=dict(color='rgba(239,85,59,0.8)')
        )
        fig2.add_trace(bar, row=row, col=col)
        col += 1
        if col > 2:
            col = 1
            row += 1

fig2.update_layout(
    height=1200,
    width=1000,
    title_text="Categorical Features vs Heart Disease Class (Interactive Grouped Bars)",
    template="plotly_white",
    showlegend=False
)
fig2.show()


In [14]:
bin_cols = ['Fasting Blood Sugar', 'Exercise Angina']

fig3 = make_subplots(rows=1, cols=2, subplot_titles=bin_cols)

for i, c in enumerate(bin_cols, start=1):
    if c in df_viz.columns:
        data = df_viz.groupby([c, target_col]).size().reset_index(name='Count')
        fig3.add_trace(
            go.Bar(
                x=data[c],
                y=data['Count'],
                customdata=data[target_col],
                hovertemplate=f"<b>{c}</b>: %{{x}}<br>Heart Disease Class: %{{customdata}}<br>Count: %{{y}}<extra></extra>",
                marker=dict(color='rgba(0, 204, 150, 0.7)')
            ),
            row=1, col=i
        )

fig3.update_layout(
    height=500,
    width=900,
    title_text="Binary Features vs Heart Disease Class (Interactive)",
    template="plotly_white",
    showlegend=False
)
fig3.show()

**C) Correlation heatmap (numeric only)**

In [15]:

NUMERIC_FOR_CORR = ["Age (years)","Resting BP (mm Hg)","Cholesterol (mg/dl)",
                    "Max Heart Rate (bpm)","ST Depression (oldpeak)"]

corr = df[NUMERIC_FOR_CORR].corr(numeric_only=True)
fig = px.imshow(corr, color_continuous_scale="Viridis", origin="lower",
                text_auto=True, aspect="auto",
                title="Correlation Map (Cleaned Numeric Features)")
fig.update_layout(plot_bgcolor="white", paper_bgcolor="white")
fig.show()


In [16]:
import os, time
os.makedirs("Reports/figs", exist_ok=True)
ts = time.strftime("%Y%m%d_%H%M%S")
plt.savefig(f"Reports/figs/bivariate_boxplots_{ts}.png", dpi=220, bbox_inches="tight")
print("Saved:", f"Reports/figs/bivariate_boxplots_{ts}.png")


Saved: Reports/figs/bivariate_boxplots_20251104_094755.png


<Figure size 640x480 with 0 Axes>

In [19]:
import os, time, numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

os.makedirs("Reports/html", exist_ok=True)
ts = time.strftime("%Y%m%d_%H%M%S")

try:
    fig_bp
except NameError:
    # Rebuild a minimal version of the premium boxplot panel
    TGT = "Heart Disease Class (0–4)"
    NUM = ["Age (years)","Resting BP (mm Hg)","Cholesterol (mg/dl)",
           "Max Heart Rate (bpm)","ST Depression (oldpeak)"]
    cls_pal = {0:"#2F4B7C",1:"#F95D6A",2:"#00A7A7",3:"#FFB000",4:"#6A4C93"}
    rows, cols = 2, 3
    fig_bp = make_subplots(rows=rows, cols=cols, subplot_titles=NUM)
    r=c=1
    for i, col in enumerate(NUM):
        for k in sorted(df_viz[TGT].dropna().unique()):
            s = df_viz.loc[df_viz[TGT]==k, col].dropna()
            if s.empty: continue
            fig_bp.add_trace(go.Box(
                y=s, name=f"{int(k)}", marker_color=cls_pal[int(k)],
                boxmean=True, boxpoints="outliers", jitter=0, whiskerwidth=0.8,
                hovertemplate="Class %{name}<br>"+col+": %{y:.1f}<extra></extra>"
            ), row=r, col=c)
        fig_bp.update_xaxes(title_text="Class", row=r, col=c)
        fig_bp.update_yaxes(title_text=col, row=r, col=c)
        c += 1
        if c > cols: c, r = 1, r+1
    fig_bp.update_layout(height=740, width=1020, template="plotly_white",
                         title_text="Continuous vs Target — Boxplots", boxmode="group")

# Save HTML (no Kaleido needed)
out = f"Reports/html/plotly_boxplots_{ts}.html"
fig_bp.write_html(out, include_plotlyjs="cdn", full_html=True)
print("Saved:", out)


Saved: Reports/html/plotly_boxplots_20251104_095049.html


In [21]:
import plotly.express as px
import pandas as pd, numpy as np, time
from pathlib import Path

# 0) Ensure df_viz exists (Unknown excluded for visuals)
# If not already defined:
# df_viz = df.copy()
# for col in ["Chest Pain Type","Resting ECG","ST Slope","Thalassemia","Major Vessels (0–3)"]:
#     df_viz = df_viz[df_viz[col].astype(str) != "Unknown"]

TGT = "Heart Disease Class (0–4)"
cls_pal = {0:"#2F4B7C", 1:"#F95D6A", 2:"#00A7A7", 3:"#FFB000", 4:"#6A4C93"}

def cat_spotlight_ranked(df_in, col):
    dfi = df_in.copy()
    dfi[col] = dfi[col].astype(str)
    dfi[TGT] = dfi[TGT].astype(int)
    tbl = (dfi.groupby([col, TGT]).size()
           .reset_index(name="Count"))
    totals = (tbl.groupby(col, as_index=False)["Count"]
              .sum().rename(columns={"Count":"Total"}))
    tbl = tbl.merge(totals, on=col, how="left")
    order = totals.sort_values("Total", ascending=False)[col].tolist()
    tbl["Percent"] = (tbl["Count"]/tbl.groupby(col)["Count"].transform("sum")*100).round(1)
    fig = px.bar(
        tbl, x=col, y="Count", color=TGT, barmode="group",
        category_orders={col: order}, color_discrete_map=cls_pal,
        title=f"{col} by {TGT} — Ranked"
    )
    fig.update_traces(
        customdata=tbl["Percent"],
        hovertemplate=f"{col}=%{{x}}<br>Class=%{{marker.color}}"
                      f"<br>Count=%{{y}} (%{{customdata}}%)<extra></extra>"
    )
    fig.update_layout(plot_bgcolor="white", paper_bgcolor="white",
                      bargap=0.35, xaxis_title=col, yaxis_title="Count")
    return fig

# 1) Ensure fig_bp exists; if not, rebuild a minimal version
try:
    fig_bp
except NameError:
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    NUM = ["Age (years)","Resting BP (mm Hg)","Cholesterol (mg/dl)",
           "Max Heart Rate (bpm)","ST Depression (oldpeak)"]
    rows, cols = 2, 3
    fig_bp = make_subplots(rows=rows, cols=cols, subplot_titles=NUM)
    r=c=1
    for i, col in enumerate(NUM):
        for k in sorted(df_viz[TGT].dropna().unique()):
            s = df_viz.loc[df_viz[TGT]==k, col].dropna()
            if s.empty: continue
            fig_bp.add_trace(go.Box(
                y=s, name=f"{int(k)}", marker_color=cls_pal[int(k)],
                boxmean=True, boxpoints="outliers", jitter=0, whiskerwidth=0.8,
                hovertemplate="Class %{name}<br>"+col+": %{y:.1f}<extra></extra>"
            ), row=r, col=c)
        fig_bp.update_xaxes(title_text="Class", row=r, col=c)
        fig_bp.update_yaxes(title_text=col, row=r, col=c)
        c += 1
        if c > cols: c, r = 1, r+1
    fig_bp.update_layout(height=740, width=1020, template="plotly_white",
                         title_text="Continuous vs Target — Boxplots", boxmode="group")

# 2) Build dashboard
Path("Reports/html").mkdir(parents=True, exist_ok=True)
ts = time.strftime("%Y%m%d_%H%M%S")
html_path = f"Reports/html/phase3_bivariate_dashboard_{ts}.html"

sections = []
sections.append(fig_bp.to_html(include_plotlyjs="cdn", full_html=False))
for c in ["Chest Pain Type","Resting ECG","ST Slope","Thalassemia","Major Vessels (0–3)"]:
    sections.append(cat_spotlight_ranked(df_viz, c).to_html(include_plotlyjs=False, full_html=False))

with open(html_path, "w", encoding="utf-8") as f:
    f.write("<html><head><meta charset='utf-8'><title>Phase 3 — Bivariate</title></head><body>")
    f.write("<h2 style='font-family:Arial;margin:8px 0 16px;'>Phase 3 — Bivariate Analysis</h2>")
    for s in sections: f.write(s)
    f.write("</body></html>")

print("Saved dashboard:", html_path)


Saved dashboard: Reports/html/phase3_bivariate_dashboard_20251104_095214.html
