In [7]:
# Setup path and import
import sys
from pathlib import Path
sys.path.append(str(Path().resolve().parent))  

from src.preprocessing import preprocess_image
import pandas as pd
from tqdm import tqdm


Generate labels.csv from CBIS-DDSM metadata

This cell consolidates and standardizes metadata from the CBIS-DDSM dataset.
It merges the mass and calcification case-description files (train and test sets),
encodes the 'pathology' column into binary labels (1 = MALIGNANT, 0 = BENIGN,
including BENIGN_WITHOUT_CALLBACK), and builds each image path in the form
`jpeg/<patient_id>/<image file path>`. It also renames selected columns for
consistency and exports a clean metadata CSV for use in preprocessing and model training.

Output:
    data/cibs-ddsm/metadata/labels.csv

Expected Downstream Use:
    - Used in 01_preprocessing.ipynb to locate, enhance, and resize mammograms
    - Used to associate labels and metadata with image files for CNN training

In [28]:
import pandas as pd
from pathlib import Path

# Directory holding the CBIS-DDSM case-description CSVs; labels.csv is written
# next to them.
meta_dir = Path("../data/cibs-ddsm/metadata")
output_path = meta_dir / "labels.csv"

# Binary target for every recognised 'pathology' value. Both benign variants
# are encoded as 0, which is what the earlier catch-all `else 0` produced for
# them; anything outside this table is now rejected instead of being silently
# treated as benign.
PATHOLOGY_TO_LABEL = {
    "MALIGNANT": 1,
    "BENIGN": 0,
    "BENIGN_WITHOUT_CALLBACK": 0,
}


def encode_pathology(pathology: pd.Series) -> pd.Series:
    """Encode a 'pathology' column as binary labels (1 = malignant, 0 = benign).

    Text values are stripped and upper-cased before the lookup so that
    harmless formatting differences do not change the label.

    Args:
        pathology: Series of pathology strings taken from the case CSVs.

    Returns:
        Integer Series aligned with ``pathology`` containing 0 or 1.

    Raises:
        ValueError: If any entry is missing or is not a key of
            ``PATHOLOGY_TO_LABEL``.
    """
    # Only strings are normalised; NaN and other non-text values pass through
    # untouched so they show up as unmapped below.
    normalized = pathology.map(
        lambda value: value.strip().upper() if isinstance(value, str) else value
    )
    labels = normalized.map(PATHOLOGY_TO_LABEL)  # unknown keys become NaN

    unmapped = labels.isna()
    if unmapped.any():
        bad_values = sorted(normalized[unmapped].astype(str).unique())
        raise ValueError(
            f"{int(unmapped.sum())} row(s) have an unrecognised pathology value: {bad_values}"
        )
    return labels.astype(int)


# Load files
mass_train = pd.read_csv(meta_dir / "mass_case_description_train_set.csv")
mass_test = pd.read_csv(meta_dir / "mass_case_description_test_set.csv")
calc_train = pd.read_csv(meta_dir / "calc_case_description_train_set.csv")
calc_test = pd.read_csv(meta_dir / "calc_case_description_test_set.csv")

# Merge all four case tables into one frame with a fresh 0..n-1 index.
df = pd.concat([mass_train, mass_test, calc_train, calc_test], ignore_index=True)

# Label encoding (fails loudly on missing or unexpected pathology values).
df["label"] = encode_pathology(df["pathology"])

# Build image path: "jpeg/<patient_id>/<image file path>".
# NOTE(review): the 'image file path' values are used verbatim; confirm that
# they match the on-disk JPEG layout expected by the preprocessing step.
df["image_file_path"] = "jpeg/" + df["patient_id"] + "/" + df["image file path"]

# Rename columns for consistency (snake_case, shorter names).
df = df.rename(columns={
    "abnormality type": "abnormality_type",
    "image view": "view",
    "left or right breast": "laterality"
})

# Final column selection: only these columns are exported.
df_final = df[["image_file_path", "label", "abnormality_type", "view", "laterality", "patient_id"]]
df_final.to_csv(output_path, index=False)

print(f"✅ labels.csv created at: {output_path}")

✅ labels.csv created at: ../data/cibs-ddsm/metadata/labels.csv
