# Preprocessing 

## 1.0 Setting up path and generating labels

In [3]:
# Setup path and import
import sys
from pathlib import Path
sys.path.append(str(Path().resolve().parent))  

from src.preprocessing import preprocess_image
import pandas as pd
from tqdm import tqdm

### Generate labels.csv from CBIS-DDSM metadata

This script consolidates and standardizes metadata from the CBIS-DDSM dataset.
It merges mass and calcification case files (train and test sets), encodes the
'pathology' column into binary labels (0 = BENIGN, 1 = MALIGNANT), and constructs
full relative paths to the JPEG image files. It also renames selected columns for
consistency and exports a clean metadata CSV for use in preprocessing and model training.

Output:
    data/cibs-ddsm/metadata/labels.csv

Expected Downstream Use:
    - Used in 01_preprocessing.ipynb to locate, enhance, and resize mammograms
    - Used to associate labels and metadata with image files for CNN training

In [4]:
import pandas as pd
from pathlib import Path

meta_dir = Path("../data/cibs-ddsm/metadata")
output_path = meta_dir / "labels.csv"

# Load files
mass_train = pd.read_csv(meta_dir / "mass_case_description_train_set.csv")
mass_test = pd.read_csv(meta_dir / "mass_case_description_test_set.csv")
calc_train = pd.read_csv(meta_dir / "calc_case_description_train_set.csv")
calc_test = pd.read_csv(meta_dir / "calc_case_description_test_set.csv")

# Merge all
df = pd.concat([mass_train, mass_test, calc_train, calc_test], ignore_index=True)

# Label encoding
df["label"] = df["pathology"].map(lambda x: 1 if x == "MALIGNANT" else 0)

# Build image path
df["image_file_path"] = "jpeg/" + df["patient_id"] + "/" + df["image file path"]

# Rename columns for consistency
df = df.rename(columns={
    "abnormality type": "abnormality_type",
    "image view": "view",
    "left or right breast": "laterality"
})

# Final column selection
df_final = df[["image_file_path", "label", "abnormality_type", "view", "laterality", "patient_id"]]
df_final.to_csv(output_path, index=False)

print(f"✅ labels.csv created at: {output_path}")

✅ labels.csv created at: ../data/cibs-ddsm/metadata/labels.csv


In [17]:
labels_df = pd.read_csv("../data/cibs-ddsm/metadata/labels.csv")
labels_df.head()

Unnamed: 0,image_file_path,label,abnormality_type,view,laterality,patient_id
0,P_00001/Mass-Training_P_00001_LEFT_CC/1.3.6.1....,1,mass,CC,LEFT,P_00001
1,P_00001/Mass-Training_P_00001_LEFT_MLO/1.3.6.1...,1,mass,MLO,LEFT,P_00001
2,P_00004/Mass-Training_P_00004_LEFT_CC/1.3.6.1....,0,mass,CC,LEFT,P_00004
3,P_00004/Mass-Training_P_00004_LEFT_MLO/1.3.6.1...,0,mass,MLO,LEFT,P_00004
4,P_00004/Mass-Training_P_00004_RIGHT_MLO/1.3.6....,0,mass,MLO,RIGHT,P_00004


### Matching image file name to metadata

In [25]:
from pathlib import Path
import pandas as pd

labels_df = pd.read_csv("../data/cibs-ddsm/metadata/labels.csv")

# Extract last directory (DICOM UID folder)
labels_df["dicom_uid"] = labels_df["image_file_path"].apply(lambda x: Path(x).parent.name)

#### Prepare DataFrame for file path matching

In [26]:
image_dir = Path("../data/cibs-ddsm/raw")
jpg_paths = list(image_dir.glob("*/**/*.jpg"))

image_df = pd.DataFrame({
    "jpg_path": jpg_paths,
    "dicom_uid": [p.parent.name for p in jpg_paths]
})

#### Merge metadata with matched image paths

In [28]:
merged = labels_df.merge(image_df, on="dicom_uid", how="inner")
print(f"Matched {len(merged)} out of {len(labels_df)} label entries")

Matched 3568 out of 3568 label entries


In [29]:
merged = merged.drop(columns=["image_file_path"])  # remove old path
merged = merged.rename(columns={"jpg_path": "image_file_path"})
merged.to_csv("../data/cibs-ddsm/metadata/labels_resolved.csv", index=False)

### Cleaned labelled file 

In [31]:
labels_df = pd.read_csv("../data/cibs-ddsm/metadata/labels_resolved.csv")

In [34]:
labels_df.sample(5)

Unnamed: 0,label,abnormality_type,view,laterality,patient_id,dicom_uid,image_file_path
256,1,mass,MLO,LEFT,P_00383,1.3.6.1.4.1.9590.100.1.2.559685247119091113207...,../data/cibs-ddsm/raw/1.3.6.1.4.1.9590.100.1.2...
3140,0,calcification,MLO,RIGHT,P_01864,1.3.6.1.4.1.9590.100.1.2.135264260013637742110...,../data/cibs-ddsm/raw/1.3.6.1.4.1.9590.100.1.2...
2824,0,calcification,MLO,LEFT,P_01437,1.3.6.1.4.1.9590.100.1.2.413719783912934165841...,../data/cibs-ddsm/raw/1.3.6.1.4.1.9590.100.1.2...
2997,0,calcification,MLO,RIGHT,P_01691,1.3.6.1.4.1.9590.100.1.2.339103214011000787523...,../data/cibs-ddsm/raw/1.3.6.1.4.1.9590.100.1.2...
1719,0,calcification,MLO,LEFT,P_00011,1.3.6.1.4.1.9590.100.1.2.121177287111311333525...,../data/cibs-ddsm/raw/1.3.6.1.4.1.9590.100.1.2...


In [35]:
from pathlib import Path

labels_df["exists"] = labels_df["image_file_path"].apply(lambda x: Path(x).exists())
print(f"{labels_df['exists'].sum()} / {len(labels_df)} images found")

3568 / 3568 images found


-------

## 2.0 Image Preprocessing 

Create output directory for processed images

In [36]:
processed_dir = Path("../data/cibs-ddsm/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

Define Preprocessing Function

In [40]:
import cv2
from tqdm import tqdm

def preprocess_image(img_path, target_size=(224, 224)):
    img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
    if img is None:
        return None
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    img = clahe.apply(img)
    img = cv2.resize(img, target_size)
    return img

Process and save images

In [41]:
processed_dir = Path("../data/cibs-ddsm/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

for _, row in tqdm(labels_df.iterrows(), total=len(labels_df)):
    img = preprocess_image(row["image_file_path"])
    if img is not None:
        out_path = processed_dir / (Path(row["image_file_path"]).name)
        cv2.imwrite(str(out_path), img)

100%|██████████████████████████████████████████████████████████████████████████████████████| 3568/3568 [01:52<00:00, 31.76it/s]


In [43]:
labels_df["processed_path"] = labels_df["image_file_path"].apply(
    lambda x: str(processed_dir / Path(x).name)
)
labels_df.to_csv("../data/cibs-ddsm/metadata/meta.csv", index=False)
print("Saved updated meta.csv with processed image paths.")

Saved updated meta.csv with processed image paths.
