Reading and Importing Datasets

In [1]:
import pandas as pd
from io import StringIO
from pathlib import Path

# Labels for the 76 fields of one raw record, in file order. _load_raw_uci
# assigns them positionally, so the order here must not change.
RAW_COLUMN_NAMES = [
    "id", "ccf", "age", "sex", "painloc", "painexer", "relrest", "pncaden",
    "cp", "trestbps", "htn", "chol", "smoke", "cigs", "years", "fbs",
    "dm", "famhist", "restecg", "ekgmo", "ekgday", "ekgyr", "dig", "prop",
    "nitr", "pro", "diuretic", "proto", "thaldur", "thaltime", "met", "thalach",
    "thalrest", "tpeakbps", "tpeakbpd", "dummy", "trestbpd", "exang", "xhypo", "oldpeak",
    "slope", "rldv5", "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm",
    "exeref", "exerwm", "thal", "thalsev", "thalpul", "earlobe", "cmo", "cday",
    "cyr", "num", "lmt", "ladprox", "laddist", "diag", "cxmain", "ramus",
    "om1", "om2", "rcaprox", "rcadist", "lvx1", "lvx2", "lvx3", "lvx4",
    "lvf", "cathef", "junk", "name",
]

# Labels for the 14 comma-separated fields of the processed files, in file
# order; passed to pandas.read_csv as the header by _load_processed_uci.
PROCESSED_COLUMN_NAMES = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num",
]

# Carriage return, tab and NUL: replaced by spaces before the raw file is
# tokenized.
CONTROL_CHARS = ("\r", "\t", "\x00")


def _load_raw_uci(text: str, path: str) -> pd.DataFrame:
    """Return a DataFrame from the raw space-delimited heart dataset.

    The text is treated as one whitespace-separated token stream that is cut
    into consecutive records of ``len(RAW_COLUMN_NAMES)`` tokens, so a single
    record may span several physical lines.

    Args:
        text: Decoded contents of the raw data file.
        path: Source location; only used in the trailing-token warning.

    Returns:
        One row per complete record. Every column except the last ("name")
        is converted to a numeric dtype; tokens that cannot be parsed and
        values equal to -9 become NaN.
    """
    sanitized = text
    for ch in CONTROL_CHARS:
        sanitized = sanitized.replace(ch, " ")

    # str.split() with no separator never yields empty strings, so no
    # additional filtering is required.
    tokens = sanitized.split()
    record_length = len(RAW_COLUMN_NAMES)
    record_count, remainder = divmod(len(tokens), record_length)
    if remainder:
        print(
            f"Warning: {path} contains {remainder} trailing tokens that will be ignored"
        )

    # Only whole records are kept; the incomplete tail (if any) is dropped.
    usable = record_count * record_length
    rows = [
        tokens[start : start + record_length]
        for start in range(0, usable, record_length)
    ]

    df = pd.DataFrame(rows, columns=RAW_COLUMN_NAMES)
    numeric_cols = RAW_COLUMN_NAMES[:-1]  # every column except the dummy name string
    numeric = df[numeric_cols].apply(pd.to_numeric, errors="coerce")
    # Replace the -9 marker with NaN rather than pd.NA: pd.NA cannot be stored
    # in a NumPy numeric column, so using it would silently turn every
    # affected column into object dtype.
    df[numeric_cols] = numeric.replace(-9, float("nan"))
    return df


def _load_processed_uci(text: str, path: str) -> pd.DataFrame:
    """Return a DataFrame from the processed comma-delimited heart dataset.

    Args:
        text: Decoded contents of the processed data file (no header row).
        path: Source location; accepted for signature parity with
            ``_load_raw_uci`` and not used here.

    Returns:
        A frame with the ``PROCESSED_COLUMN_NAMES`` columns, all numeric.
        "?" fields, unparseable values and values equal to -9 become NaN.
    """
    df = pd.read_csv(
        StringIO(text),
        header=None,
        names=PROCESSED_COLUMN_NAMES,
        na_values=["?"],
    )
    df = df.apply(pd.to_numeric, errors="coerce")
    # Replace the -9 marker with NaN rather than pd.NA: pd.NA cannot be stored
    # in a NumPy numeric column, so using it would silently turn every
    # affected column into object dtype.
    return df.replace(-9, float("nan"))


def load_uci_heart_dataset(path: str) -> pd.DataFrame:
    """Load a UCI heart file, choosing the parser from the file's content.

    The file is read as bytes and decoded as Latin-1. If the first line that
    is not blank contains a comma, the processed (comma-delimited) loader is
    used; otherwise the raw (whitespace-delimited) loader is used.

    Args:
        path: Location of the data file.

    Returns:
        The DataFrame produced by the selected loader.
    """
    raw_bytes = Path(path).read_bytes()
    text = raw_bytes.decode("latin1", errors="ignore")

    # Find the first line with visible content; stays "" for a blank file.
    probe = ""
    for candidate in text.splitlines():
        if candidate.strip():
            probe = candidate
            break

    loader = _load_processed_uci if "," in probe else _load_raw_uci
    return loader(text, path)


# Load the four processed data files, keyed by a short dataset label.
datasets = {
    label: load_uci_heart_dataset(filename)
    for label, filename in (
        ("cleveland", "processed.cleveland.data"),
        ("hungarian", "processed.hungarian.data"),
        ("switzerland", "processed.switzerland.data"),
        ("long_beach_va", "processed.va.data"),
    )
}

# Print the dimensions of each loaded table.
for name in datasets:
    frame = datasets[name]
    print(f"{name}: {frame.shape[0]} rows × {frame.shape[1]} columns")

# Expose each table under its own variable name as well.
(
    cleveland_df,
    hungarian_df,
    switzerland_df,
    long_beach_va_df,
) = (
    datasets[key]
    for key in ("cleveland", "hungarian", "switzerland", "long_beach_va")
)

cleveland: 303 rows × 14 columns
hungarian: 294 rows × 14 columns
switzerland: 123 rows × 14 columns
long_beach_va: 200 rows × 14 columns
