In [1]:
import numpy as np
import pandas as pd

In [2]:
# Raw GitHub locations of the two CSV files of the dataset.
TRAIN_CSV_URL = "https://raw.githubusercontent.com/khanabdullah9/Heart-Disease-ds/refs/heads/master/train.csv"
TEST_CSV_URL = "https://raw.githubusercontent.com/khanabdullah9/Heart-Disease-ds/refs/heads/master/test.csv"

# Index 0 -> train.csv, index 1 -> test.csv.
paths = [TRAIN_CSV_URL, TEST_CSV_URL]

In [None]:
# Download the first CSV listed in `paths` (train.csv) and display its
# (rows, columns) as the cell output.
train_csv_url = paths[0]
data = pd.read_csv(train_csv_url)
data.shape

In [None]:
def create_age_group(age):
    """Map a numeric age to its age-bracket label.

    Brackets are contiguous: a fractional age such as 20.5 is placed in the
    next bracket up ("21-30") instead of falling between two integer-bounded
    ranges and producing no label.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    str or None
        One of "< 20", "21-30", "31-40", ..., "91-100"; ``None`` when ``age``
        is negative, greater than 100, or NaN.
    """
    # NaN fails every comparison, so it is rejected here together with
    # negative ages and ages above 100.
    if not 0 <= age <= 100:
        return None

    # (inclusive upper bound, label) in ascending order; the first bound that
    # covers `age` wins. The "< 20" label is kept unchanged even though the
    # bracket includes 20 itself.
    brackets = (
        (20, "< 20"),
        (30, "21-30"),
        (40, "31-40"),
        (50, "41-50"),
        (60, "51-60"),
        (70, "61-70"),
        (80, "71-80"),
        (90, "81-90"),
        (100, "91-100"),
    )
    for upper_bound, label in brackets:
        if age <= upper_bound:
            return label
    # Not reached: the range guard above guarantees a matching bracket.

def data_cleaning(raw_data):
    """Return a copy of ``raw_data`` with two derived columns added.

    * ``age_group`` - label produced by ``create_age_group`` for each ``age``.
    * ``class`` - "Negative" / "Positive" for ``diagnosed_diabetes`` values
      0 / 1 (any other value has no entry in the mapping and becomes NaN).

    The input frame itself is not modified.
    """
    diagnosis_labels = {0: "Negative", 1: "Positive"}

    cleaned = raw_data.copy()

    # feature engineering
    cleaned["age_group"] = cleaned["age"].apply(create_age_group)
    cleaned["class"] = cleaned["diagnosed_diabetes"].map(diagnosis_labels)

    return cleaned

In [None]:
# Derive the enriched frame (adds `age_group` and `class`) and display its
# (rows, columns) as the cell output.
df = data_cleaning(raw_data=data)
df.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def plot_pie_distribution(df: pd.DataFrame, feature: str, figsize = (3,3), title = ""):
    """Show one pie chart of the value frequencies of ``feature``.

    Each slice is annotated with its percentage; the category names are
    listed in a legend anchored at the right edge of the axes rather than
    written on the slices.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing the column ``feature``.
    feature : str
        Column whose value counts are plotted.
    figsize : tuple, optional
        Figure size passed to ``plt.figure``.
    title : str, optional
        Chart title; an empty string selects ``"Distribution of <feature>"``.
    """
    counts = df[feature].value_counts()

    # Fall back to a generic heading when the caller did not supply one.
    if title == "":
        title = f'Distribution of {feature}'

    plt.figure(figsize=figsize)

    # Only the wedge artists are needed afterwards (as legend handles).
    wedges, _, _ = plt.pie(
        counts.values,
        autopct='%1.1f%%',
        textprops={'fontsize': 12},
        wedgeprops={'edgecolor': 'black'},
    )

    plt.legend(
        wedges,
        counts.index.tolist(),
        title=feature.title(),
        loc="center left",
        bbox_to_anchor=(1, 0, 0.5, 1),
    )

    plt.title(title, fontsize=14)
    plt.axis('equal')  # equal aspect ratio keeps the pie circular
    plt.show()

def plot_multiple_pie_charts(df: pd.DataFrame, features: list[str], dim, title = ""):
    """Draw one pie chart per feature on a grid of subplots.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing every column named in ``features``.
    features : list[str]
        Columns to plot; each pie shows the share of every value returned by
        ``value_counts`` for that column.
    dim : tuple
        Grid shape as ``(rows, columns)``. It must provide at least
        ``len(features)`` cells, otherwise indexing the axes raises
        ``IndexError``.
    title : str, optional
        Figure-level title.
    """
    num_rows, num_cols = dim[0], dim[1]

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a 1x1 grid, where subplots would otherwise return a bare Axes.
    fig, axes = plt.subplots(num_rows, num_cols,
                             figsize=(5 * num_cols, 5 * num_rows),
                             squeeze=False)
    fig.suptitle(title, fontsize=14, fontweight="bold")

    axes = axes.flatten()

    for i, feature in enumerate(features):
        category_counts = df[feature].value_counts()

        axes[i].pie(
            category_counts.values,
            labels=category_counts.index.tolist(),
            autopct='%1.1f%%'
        )

        axes[i].set_title(f'Distribution of {feature.title()}', fontsize=14)
        axes[i].axis('equal')  # equal aspect ratio keeps the pie circular

    # Remove grid cells that received no chart so they do not render as
    # empty frames.
    for unused_ax in axes[len(features):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()


In [None]:
# Split the cleaned frame by the diagnosis flag.
diagnosis_flag = df["diagnosed_diabetes"]
patients = df.loc[diagnosis_flag == 1]      # tested positive
non_patients = df.loc[diagnosis_flag == 0]  # tested negative

In [None]:
# Categorical columns shown for the non-diagnosed group, one pie per column
# on a 2x3 grid.
categorical_features = [
    "gender", "ethnicity", "education_level",
    "income_level", "smoking_status", "employment_status",
]
plot_multiple_pie_charts(
    non_patients,
    categorical_features,
    dim=(2, 3),
    title="People NOT diagnosed with diabetes",
)

def plot_box(feature, df):
    """Show a box plot of ``feature`` for each value of the ``class`` column.

    Parameters
    ----------
    feature : str
        Numeric column of ``df`` drawn on the y-axis.
    df : pd.DataFrame
        Data containing ``feature`` and a ``class`` column.
    """
    # NOTE: sns.set changes seaborn/matplotlib defaults globally, so the
    # 4x4 figure size and seaborn theme also apply to figures created after
    # this call.
    sns.set(rc={'figure.figsize': (4, 4)})

    ax = sns.boxplot(data=df, x='class', y=feature)
    # The class names on the ticks are self-explanatory, so the x-axis label
    # is blanked.
    ax.set(xlabel="", ylabel=feature)

    plt.show()

In [None]:
# Weekly alcohol consumption, compared between the two diagnosis classes.
plot_box(feature="alcohol_consumption_per_week", df=df)

In [None]:
def plot_multiple_boxplots_grid(df: pd.DataFrame, features: list[str], dim):
    """Draw one box plot per feature, split by ``class``, on a subplot grid.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing a ``class`` column and every column in ``features``.
    features : list[str]
        Numeric columns to plot, one subplot each.
    dim : tuple
        Grid shape as ``(rows, columns)``. It must provide at least
        ``len(features)`` cells, otherwise indexing the axes raises
        ``IndexError``.
    """
    num_features = len(features)
    num_rows, num_cols = dim[0], dim[1]

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a 1x1 grid, where subplots would otherwise return a bare Axes.
    fig, axes = plt.subplots(num_rows, num_cols,
                             figsize=(5 * num_cols, 5 * num_rows),
                             squeeze=False)

    # Flatten the axes array for easy iteration
    axes = axes.flatten()

    for i, feature in enumerate(features):
        sns.boxplot(
            x='class',
            y=feature,
            data=df,
            ax=axes[i]
        )

        axes[i].set_xlabel("Class")
        axes[i].set_ylabel(feature.replace('_', ' ').title())  # Clean up label
        axes[i].set_title(f'Distribution of {feature.title()} by result')

    # Remove any unused subplots
    for j in range(num_features, num_rows * num_cols):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()

In [None]:
# Numeric measurements compared between the two classes on a 2x2 grid.
features_to_plot = [
    "physical_activity_minutes_per_week",
    "bmi",
    "heart_rate",
    "cholesterol_total",
]
plot_multiple_boxplots_grid(df, features=features_to_plot, dim=(2, 2))

In [None]:
def plot_waffle_chart(feature, df, Waffle):
    """Draw a waffle chart of how many rows fall into each category of ``feature``.

    Parameters
    ----------
    feature : str
        Categorical column of ``df`` to summarise.
    df : pd.DataFrame
        Data containing ``feature``.
    Waffle : type
        Figure class handed to ``plt.figure(FigureClass=...)``; the caller
        passes ``pywaffle.Waffle``.
    """
    column = df[feature]
    row_counts = column.value_counts()

    # Number of ROWS per category, keeping categories in order of first
    # appearance. Missing values are skipped: they never compare equal to a
    # label, so they could only contribute an empty entry.
    values = {
        label: int(row_counts.loc[label])
        for label in column.dropna().unique()
    }

    plt.figure(
        FigureClass=Waffle,
        rows=15,
        columns=20,
        values=values,
        legend={"bbox_to_anchor": (1.5, 1)}
    )
    plt.show()

In [None]:
from pywaffle import Waffle

# Share of each age bracket across the whole cleaned dataset.
plot_waffle_chart('age_group', df, Waffle)