In [None]:
import hashlib
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom
from PIL import Image, ImageEnhance
from tqdm import tqdm

In [None]:
# Dataset locations. The defaults are the original local paths; set the
# CBIS_DDSM_ROOT / CBIS_DDSM_OUT environment variables to run the notebook on
# another machine without editing this cell.
ROOT_DIR = os.environ.get(
    "CBIS_DDSM_ROOT", r"D:\manifest-ZkhPvrLo5216730872708713142\CBIS-DDSM"
)
# The DICOM search starts from the same directory that holds the CSV files.
RAW_DIR = ROOT_DIR
# Destination for the converted images.
OUT_DIR = os.environ.get(
    "CBIS_DDSM_OUT", r"D:\manifest-ZkhPvrLo5216730872708713142\CBIS-DDSM_processed"
)

os.makedirs(OUT_DIR, exist_ok=True)

In [None]:
# Read the four case-description tables that sit next to the images.
calc_train, calc_test, mass_train, mass_test = (
    pd.read_csv(os.path.join(ROOT_DIR, csv_name))
    for csv_name in (
        "calc_case_description_train_set.csv",
        "calc_case_description_test_set.csv",
        "mass_case_description_train_set.csv",
        "mass_case_description_test_set.csv",
    )
)

# Stack them into one frame that keeps only the label and the image reference,
# with the pathology text lower-cased and stripped of surrounding whitespace.
df = (
    pd.concat([calc_train, calc_test, mass_train, mass_test], ignore_index=True)
    .loc[:, ["pathology", "image file path"]]
    .rename(columns={"image file path": "image_file"})
    .assign(pathology=lambda frame: frame["pathology"].str.lower().str.strip())
)
df.head()


In [None]:
# Collapse the pathology labels into the two target classes.
LABEL_MAP = {
    "malignant": "malignant",
    "benign": "benign",
    "benign_without_callback": "benign",
}

# A pathology value with no entry in LABEL_MAP maps to NaN; those rows are dropped.
df = df.assign(**{"class": df["pathology"].map(LABEL_MAP)}).dropna(subset=["class"])
df.head()


In [None]:
# Every file below RAW_DIR whose name ends in ".dcm" (case-insensitive),
# in the order os.walk yields them.
dicom_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(RAW_DIR)
    for name in names
    if name.lower().endswith(".dcm")
]

len(dicom_files)


In [None]:
# Map each DICOM's bare file name to its full path. When two files share a
# name, the one that appears later in dicom_files replaces the earlier entry.
dicom_lookup = dict(zip(map(os.path.basename, dicom_files), dicom_files))


In [None]:
# Target classes; each one gets its own folder under OUT_DIR.
CLASSES = ["benign", "malignant"]

for class_name in CLASSES:
    class_dir = os.path.join(OUT_DIR, class_name)
    os.makedirs(class_dir, exist_ok=True)


In [None]:
def load_dicom(path):
    """Read a DICOM file and return its pixels min-max scaled to uint8.

    The pixel array is shifted so its minimum is 0 and, unless the image is
    constant, divided by its maximum before being mapped onto 0-255. A
    constant image therefore comes back as all zeros.
    """
    pixels = pydicom.dcmread(path).pixel_array.astype(np.float32)
    pixels = pixels - pixels.min()
    peak = pixels.max()
    if peak > 0:
        pixels = pixels / peak
    # The float -> uint8 cast truncates toward zero; it does not round.
    return (pixels * 255).astype(np.uint8)


In [None]:
def resize_img(arr, size=(224, 224)):
    """Convert a pixel array to a PIL image and resize it.

    Parameters
    ----------
    arr : array accepted by ``Image.fromarray`` (``load_dicom`` returns a
        uint8 array).
    size : (width, height) of the result in pixels. Defaults to 224 x 224,
        the size that was previously hard-coded.

    Returns
    -------
    PIL.Image.Image resized with Pillow's default resampling filter. The
    aspect ratio of the input is not preserved.
    """
    return Image.fromarray(arr).resize(tuple(size))


In [None]:
def augment(img):
    """Return a list of randomly perturbed copies of ``img``.

    The list always holds a rotated copy (whole-degree angle in [-10, 10])
    followed by a brightness-adjusted copy (factor in [0.8, 1.2]). With
    probability about one half a horizontally flipped copy is placed in
    front of them, so the result has two or three images. Each transform is
    applied to the original image; they are not chained.
    """
    variants = []

    if random.random() > 0.5:
        variants.append(img.transpose(Image.FLIP_LEFT_RIGHT))

    angle = random.randint(-10, 10)
    variants.append(img.rotate(angle))

    brightness = ImageEnhance.Brightness(img)
    factor = random.uniform(0.8, 1.2)
    variants.append(brightness.enhance(factor))

    return variants


In [None]:
# Number of augmentation rounds run for each source image, keyed by class.
AUG_PER_IMAGE = dict(malignant=3, benign=1)


In [None]:
# Convert every labelled row to a resized PNG plus its augmented variants.
#
# The CSV column feeding `image_file` is called "image file path", while
# `dicom_lookup` is keyed by bare file name, so a path-valued entry never
# matched and the row was silently skipped. The helpers below resolve such
# entries and give them a flat, collision-free output name.

AUG_SEED = 42  # fixed so that re-running this cell regenerates the same augmentations


def _build_path_indexes(paths, base_dir):
    """Index `paths` by their location relative to `base_dir`.

    Returns `(by_relpath, by_top_dir)`: the first maps each file's
    forward-slash relative path to its entry in `paths`; the second maps each
    top-level folder name to the list of files found anywhere beneath it.
    """
    by_relpath, by_top_dir = {}, {}
    for path in paths:
        rel = os.path.relpath(path, base_dir).replace(os.sep, "/")
        by_relpath[rel] = path
        if "/" in rel:
            by_top_dir.setdefault(rel.split("/", 1)[0], []).append(path)
    return by_relpath, by_top_dir


def _resolve_dicom(image_file, by_name, by_relpath, by_top_dir):
    """Return the DICOM path for one CSV entry, or None if nothing matches unambiguously."""
    # 1. Bare file name: the behaviour of the original lookup.
    if image_file in by_name:
        return by_name[image_file]
    # 2. Path relative to RAW_DIR, with either separator style.
    rel = image_file.replace("\\", "/")
    if rel in by_relpath:
        return by_relpath[rel]
    # 3. Only the leading folder of the entry exists on disk. Accept it when
    #    exactly one DICOM lives under that folder, since the choice is then
    #    unambiguous.
    #    NOTE(review): assumes the first path component identifies one image in
    #    your download layout - TODO confirm against the dataset documentation.
    if "/" in rel:
        candidates = by_top_dir.get(rel.split("/", 1)[0], ())
        if len(candidates) == 1:
            return candidates[0]
    return None


def _output_stem(image_file):
    """File-name stem (no extension) for the PNGs written for `image_file`.

    A bare file name keeps its own stem. An entry with directory components
    is flattened to `<first folder>_<file stem>_<digest>`, where the digest is
    taken over the whole entry, so distinct entries cannot overwrite each
    other inside the flat class folder and the name stays short.
    """
    rel = image_file.replace("\\", "/")
    stem = os.path.splitext(rel)[0]  # extension removed regardless of its case
    if "/" not in stem:
        return stem
    parts = stem.split("/")
    digest = hashlib.sha1(rel.encode("utf-8")).hexdigest()[:10]
    return f"{parts[0]}_{parts[-1]}_{digest}"


random.seed(AUG_SEED)
by_relpath, by_top_dir = _build_path_indexes(dicom_files, RAW_DIR)

processed = 0
skipped = 0

for _, row in tqdm(df.iterrows(), total=len(df)):
    fname = row["image_file"]
    cls = row["class"]

    src = None
    if isinstance(fname, str):  # a missing value is not a string and cannot match
        fname = fname.strip()
        src = _resolve_dicom(fname, dicom_lookup, by_relpath, by_top_dir)
    if src is None:
        skipped += 1
        continue

    img = resize_img(load_dicom(src))

    class_dir = os.path.join(OUT_DIR, cls)
    stem = _output_stem(fname)
    img.save(os.path.join(class_dir, stem + ".png"))

    # One augment() call per round; each call yields two or three images.
    for i in range(AUG_PER_IMAGE[cls]):
        for j, aug_img in enumerate(augment(img)):
            aug_img.save(os.path.join(class_dir, f"{stem}_aug{i}_{j}.png"))

    processed += 1

if skipped:
    print(f"{skipped} of {len(df)} rows had no matching DICOM under {RAW_DIR}")

processed


In [None]:
# Preview one converted image as a sanity check.
sample_folder = os.path.join(OUT_DIR, "malignant")
# Sorted so the same file is shown on every run; entries that are not PNGs
# are ignored.
sample_files = sorted(
    name for name in os.listdir(sample_folder) if name.lower().endswith(".png")
)

if sample_files:
    sample_file = sample_files[0]
    # The context manager closes the file handle once the pixels are drawn.
    with Image.open(os.path.join(sample_folder, sample_file)) as img:
        fig, ax = plt.subplots()
        ax.imshow(img, cmap="gray")
    ax.set_title(sample_file)
    ax.axis("off")
    plt.show()
else:
    # Previously an empty folder surfaced as an IndexError from the [0] lookup.
    print(f"No PNG files found in {sample_folder}")
