# Data Preparation

In the prior section, we identified a few structural concerns worth addressing before any quality or exploratory analyses take place. Here, we extract the relevant task-specific information from the CBIS-DDSM case and DICOM datasets and integrate the data into a single, combined full mammogram dataset.

Our process will take four steps: 
1. Combine the calcification and mass training and test sets into a single full mammogram dataset,
2. Add DICOM image file paths to the *series* metadata,
3. Extract the *DICOM* image metadata and merge it with the case data from #1. 
4. Create the Dataset object, our access to the CBIS-DDSM data.

The full dataset will have a few upgrades that will facilitate the analysis, detection, and classification tasks:
1. A mammogram ID, consisting of abnormality type, fileset (train/test), patient_id, breast laterality, and view will uniquely identify each full mammogram image.
2. A Boolean target variable, 'cancer', will be added. It is True for MALIGNANT cases and False for both BENIGN and BENIGN_WITHOUT_CALLBACK cases, combining the two benign pathologies into a single value.
3. The Dataset will be a self-explanatory API for analysis, exploration, experimentation, and visualization.

Alright.

## Case Dataset Integration
The following code cells will integrate all case data into a single file.

In [1]:
import os
# Notebook bootstrap: when the current working directory path contains 'jbook',
# move three directory levels up (presumably to the project root, so that the
# relative "data/..." paths used in the cells below resolve -- verify).
if "jbook" in os.getcwd():
    os.chdir(os.path.abspath("../../.."))
from typing import Union
from glob import glob

import dask
import numpy as np
import pandas as pd
import pydicom

from bcd.dal.file import IOService
from bcd.utils.file import getsize
from bcd.utils.profile import profiler
from bcd.data_prep.base import DataPrep
from bcd.data_prep.case import CasePrep
from bcd.data_prep.series import SeriesPrep
from bcd.data_prep.cbis import CBISPrep

In [2]:
# %load -r 39-173 bcd/data_prep/case.py
class CasePrep(DataPrep):
    """Performs Case metadata preparation.

    Combines the calcification and mass training and test case files into a
    single case dataset, adds a mammogram id and a Boolean target, and saves
    the result as a single csv case file.

    Args:
        calc_train_fp, calc_test_fp, mass_train_fp, mass_test_fp (str): The file paths to the
            calcification and mass training and test sets.
        case_fp (str): Path to the combined output case file.
        force (bool): Whether to force execution if output already exists. Default is False.
    """

    def __init__(
        self,
        calc_train_fp: str,
        calc_test_fp: str,
        mass_train_fp: str,
        mass_test_fp: str,
        case_fp: str,
        force: bool = False,
    ) -> None:
        super().__init__()
        self._calc_train_fp = calc_train_fp
        self._calc_test_fp = calc_test_fp
        self._mass_train_fp = mass_train_fp
        self._mass_test_fp = mass_test_fp
        self._case_fp = case_fp
        self._force = force

    def prep(self) -> pd.DataFrame:
        """Combines training and test cases into a single csv case file.

        Returns:
            pd.DataFrame: The combined case dataset. When the output file already
                exists and force is False, the dataset is read from that file
                rather than rebuilt.
        """
        # Reuse the existing output unless a rebuild is forced.
        if not self._force and os.path.exists(self._case_fp):
            return pd.read_csv(self._case_fp)

        # Merge all case data into a single DataFrame
        df_cases = self._merge_cases()

        # Name the breast side column 'laterality', after the DICOM attribute.
        # This has to happen before the mammogram id is assigned, because the id
        # is built from the 'laterality' column.
        # NOTE(review): assumes the raw case files name this column
        # 'left or right breast' -- confirm against the csv headers. The rename
        # is a no-op if the column is already called 'laterality'.
        df_cases = df_cases.rename(columns={"left_or_right_breast": "laterality"})

        # Set morphological features to NA as appropriate: calcification
        # features do not apply to masses, and mass features do not apply
        # to calcifications.
        not_applicable = {
            "mass": ("calc_type", "calc_distribution"),
            "calcification": ("mass_shape", "mass_margins"),
        }
        for abnormality_type, features in not_applicable.items():
            mask = df_cases["abnormality_type"] == abnormality_type
            for feature in features:
                df_cases.loc[mask, feature] = "NOT APPLICABLE"

        # Assign the mammogram id.
        df_cases = self._assign_mmg_id(df=df_cases)

        # Create the Boolean target corresponding to pathology: True for
        # MALIGNANT, False for every other pathology value.
        df_cases["cancer"] = df_cases["pathology"] == "MALIGNANT"

        # Drop the filename columns.
        columns_to_drop = [
            "image_file_path",
            "cropped_image_file_path",
            "ROI_mask_file_path",
        ]
        df_cases = df_cases.drop(columns=columns_to_drop)

        self._save(df=df_cases, filepath=self._case_fp)

        return df_cases

    def _merge_cases(self) -> pd.DataFrame:
        """Combines mass and calcification train and test files into a single file.

        Returns:
            pd.DataFrame: The four case files stacked row-wise, with a 'fileset'
                column ('training' or 'test') and a fresh 0..n-1 index.
        """
        sources = [
            (self._calc_train_fp, "training"),
            (self._calc_test_fp, "test"),
            (self._mass_train_fp, "training"),
            (self._mass_test_fp, "test"),
        ]

        frames = []
        for filepath, fileset in sources:
            # Absolute paths are a pre-emptive measure in case
            # jupyter book can't access the relative path.
            df = pd.read_csv(os.path.abspath(filepath))
            df["fileset"] = fileset
            # Standardize column names with underscores in place of spaces.
            frames.append(self._format_column_names(df=df))

        # ignore_index avoids duplicate index labels carried over from the
        # individual files, matching the RangeIndex obtained when the saved
        # csv is read back.
        return pd.concat(frames, axis=0, ignore_index=True)

    def _assign_mmg_id(self, df: pd.DataFrame) -> pd.DataFrame:
        """Assign a mammogram id to each observation.

        The id has the form
        <Abnormality type (first 4 chars)>-<Fileset>_<patient_id>_<LATERALITY>_<VIEW>,
        e.g. 'Mass-Test_P_01825_RIGHT_MLO'.

        Args:
            df (pd.DataFrame): Case data containing the abnormality_type, fileset,
                patient_id, laterality and image_view columns. Modified in place.

        Returns:
            pd.DataFrame: The same DataFrame with the 'mmg_id' column added.
        """
        df["mmg_id"] = (
            df["abnormality_type"].str[:4].str.capitalize()
            + "-"
            + df["fileset"].str.capitalize()
            + "_"
            + df["patient_id"]
            + "_"
            + df["laterality"].str.upper()
            + "_"
            + df["image_view"].str.upper()
        )

        return df

    def _format_column_names(self, df: pd.DataFrame) -> pd.DataFrame:
        """Replaces spaces in column names with underscores.

        Args:
            df (pd.DataFrame): DataFrame whose columns are renamed in place.

        Returns:
            pd.DataFrame: The same DataFrame with the renamed columns.
        """
        df.columns = [colname.replace(" ", "_") for colname in df.columns]
        return df



In [7]:

# Raw case description files for the calcification and mass abnormalities.
calc_train = "data/meta/0_raw/calc_case_description_train_set.csv"
calc_test = "data/meta/0_raw/calc_case_description_test_set.csv"
mass_train = "data/meta/0_raw/mass_case_description_train_set.csv"
mass_test = "data/meta/0_raw/mass_case_description_test_set.csv"

# Destination of the combined case dataset.
case_fp = "data/meta/1_interim/cases.csv"

# With force=False, an existing combined case file is read back rather than rebuilt.
cp = CasePrep(
    calc_train_fp=calc_train,
    calc_test_fp=calc_test,
    mass_train_fp=mass_train,
    mass_test_fp=mass_test,
    case_fp=case_fp,
    force=False,
)
cases = cp.prep()
cases.info()
cases.sample(n=5, random_state=55)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3568 entries, 0 to 3567
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   patient_id         3568 non-null   object
 1   breast_density     3568 non-null   int64 
 2   laterality         3568 non-null   object
 3   image_view         3568 non-null   object
 4   abnormality_id     3568 non-null   int64 
 5   abnormality_type   3568 non-null   object
 6   calc_type          3544 non-null   object
 7   calc_distribution  3129 non-null   object
 8   assessment         3568 non-null   int64 
 9   pathology          3568 non-null   object
 10  subtlety           3568 non-null   int64 
 11  fileset            3568 non-null   object
 12  mass_shape         3564 non-null   object
 13  mass_margins       3508 non-null   object
 14  mmg_id             3568 non-null   object
 15  cancer             3568 non-null   bool  
dtypes: bool(1), int64(4), object(11)
memory us

Unnamed: 0,patient_id,breast_density,laterality,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,subtlety,fileset,mass_shape,mass_margins,mmg_id,cancer
3563,P_01825,2,RIGHT,MLO,1,mass,NOT APPLICABLE,NOT APPLICABLE,3,BENIGN_WITHOUT_CALLBACK,3,test,LOBULATED,MICROLOBULATED,Mass-Test_P_01825_RIGHT_MLO,False
17,P_00008,1,RIGHT,MLO,3,calcification,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3,training,NOT APPLICABLE,NOT APPLICABLE,Calc-Training_P_00008_RIGHT_MLO,False
3533,P_01690,3,LEFT,CC,1,mass,NOT APPLICABLE,NOT APPLICABLE,0,BENIGN,4,test,OVAL,OBSCURED,Mass-Test_P_01690_LEFT_CC,False
2677,P_01162,3,LEFT,MLO,1,mass,NOT APPLICABLE,NOT APPLICABLE,4,BENIGN,4,training,ROUND,CIRCUMSCRIBED,Mass-Training_P_01162_LEFT_MLO,False
3030,P_01656,2,LEFT,MLO,1,mass,NOT APPLICABLE,NOT APPLICABLE,5,MALIGNANT,5,training,OVAL,CIRCUMSCRIBED,Mass-Training_P_01656_LEFT_MLO,True


The dataset above has both mass and calcification training and test data, as well as a mammogram id, 'mmg_id', and a Boolean target, 'cancer'.

## Series Metadata
Next, we add filepaths to the series metadata.

In [None]:
# %load -r 31-88 bcd/data_prep/series.py
# ------------------------------------------------------------------------------------------------ #
class SeriesPrep(DataPrep):
    """Adds filepaths to the Series dataset

    Expands the series metadata to one row per DICOM image file and records
    the path of each file in a 'filepath' column.

    Args:
        filepath (str): Path to the DICOM series metadata.
        series_filepath (str): Path for the results.
        force (bool): Whether to force execution if output already exists. Default is False.
    """

    # Directory that the 'file_location' values in the series metadata are relative to.
    __BASEDIR = "data/image/0_raw/"

    def __init__(
        self,
        filepath: str,
        series_filepath: str,
        force: bool = False,
    ) -> None:
        super().__init__()
        self._filepath = os.path.abspath(filepath)
        self._series_filepath = os.path.abspath(series_filepath)
        self._force = force

    @profiler
    def prep(self) -> pd.DataFrame:
        """Adds the DICOM image filepaths to the series metadata.

        Returns:
            pd.DataFrame: Series metadata with one row per DICOM image file and a
                'filepath' column. When the output file already exists and force
                is False, the data are read from that file rather than rebuilt.
        """

        if self._force or not os.path.exists(self._series_filepath):
            # Reads the series metadata that contains subject, series, and
            # file location information
            studies = IOService.read(self._filepath)

            # Add filepaths to the study data first to avoid batch
            # operation exceptions with dask.
            studies = self._get_filepaths(studies=studies)

            df = pd.DataFrame(data=studies)

            self._save(df=df, filepath=self._series_filepath)

            return df

        return pd.read_csv(self._series_filepath)

    def _get_filepaths(self, studies: pd.DataFrame) -> list:
        """Adds filepaths to the studies dataframe

        Args:
            studies (pd.DataFrame): Series metadata with a 'file_location' column
                holding the directory of each series.

        Returns:
            list: One pd.Series per .dcm file found, each a copy of the series
                row with the file's path stored under 'filepath'.
        """
        studies_filepaths = []
        for _, row in studies.iterrows():
            location = row["file_location"].replace("./", "")
            pattern = os.path.join(self.__BASEDIR, location) + "/*.dcm"
            # glob returns files in arbitrary order; sort for reproducible output.
            for file in sorted(glob(pattern)):
                # Copy the row for each file. Appending the same Series object
                # repeatedly would leave every entry of a multi-image series
                # pointing at the last file assigned.
                image_row = row.copy()
                image_row["filepath"] = file
                studies_filepaths.append(image_row)
        return studies_filepaths


In [8]:
# fpi: raw series metadata (input); fpo: series metadata with filepaths (output).
fpi = "data/meta/0_raw/metadata.csv"
fpo = "data/meta/3_final/series.csv"

# With force=False, an existing output file is read back rather than rebuilt.
sp = SeriesPrep(
    filepath=fpi,
    series_filepath=fpo,
    force=False,
)
series = sp.prep()
series.info()
series.sample(n=5, random_state=55)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10239 entries, 0 to 10238
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   series_uid            10239 non-null  object
 1   collection            10239 non-null  object
 2   data_description_uri  10239 non-null  object
 3   subject_id            10239 non-null  object
 4   study_uid             10239 non-null  object
 5   study_date            10239 non-null  object
 6   series_description    10239 non-null  object
 7   modality              10239 non-null  object
 8   sop_class_name        10239 non-null  object
 9   sop_class_uid         10239 non-null  object
 10  number_of_images      10239 non-null  int64 
 11  file_size             10239 non-null  object
 12  file_location         10239 non-null  object
 13  download_timestamp    10239 non-null  object
 14  filepath              10239 non-null  object
dtypes: int64(1), object(14)
memory usage

Unnamed: 0,series_uid,collection,data_description_uri,subject_id,study_uid,study_date,series_description,modality,sop_class_name,sop_class_uid,number_of_images,file_size,file_location,download_timestamp,filepath
752,1.3.6.1.4.1.9590.100.1.2.118805243010527642836...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_01419_LEFT_CC_1,1.3.6.1.4.1.9590.100.1.2.267657522710744920625...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,18.04 MB,./CBIS-DDSM/Calc-Test_P_01419_LEFT_CC_1/08-29-...,2023-05-24T03:09:56.273,data/image/0_raw/CBIS-DDSM/Calc-Test_P_01419_L...
8883,1.3.6.1.4.1.9590.100.1.2.160255505411776231122...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Training_P_00260_RIGHT_CC_1,1.3.6.1.4.1.9590.100.1.2.404192837610226889722...,09-06-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,22.08 MB,./CBIS-DDSM/Calc-Training_P_00260_RIGHT_CC_1/0...,2023-05-24T07:00:54.02,data/image/0_raw/CBIS-DDSM/Calc-Training_P_002...
6889,1.3.6.1.4.1.9590.100.1.2.190371998012909216226...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Mass-Test_P_01697_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.420210647711670485715...,10-04-2016,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,24.01 MB,./CBIS-DDSM/Mass-Test_P_01697_LEFT_MLO_1/10-04...,2023-05-24T06:07:39.287,data/image/0_raw/CBIS-DDSM/Mass-Test_P_01697_L...
1810,1.3.6.1.4.1.9590.100.1.2.109486979812564135735...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00180_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.363307813117337481178...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14.43 MB,./CBIS-DDSM/Calc-Test_P_00180_LEFT_MLO_1/08-29...,2023-05-24T03:43:49.606,data/image/0_raw/CBIS-DDSM/Calc-Test_P_00180_L...
1182,1.3.6.1.4.1.9590.100.1.2.223222662511824177432...,CBIS-DDSM,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00390_RIGHT_MLO_1,1.3.6.1.4.1.9590.100.1.2.366591994512355436005...,08-29-2017,ROI mask images,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,13.60 MB,./CBIS-DDSM/Calc-Test_P_00390_RIGHT_MLO_1/08-2...,2023-05-24T03:24:41.338,data/image/0_raw/CBIS-DDSM/Calc-Test_P_00390_R...


Full filepaths have been added for all 10,239 images in the CBIS-DDSM.

## DICOM Image Metadata

Next, we extract the DICOM data described in {numref}`dicom_image_metadata` and merge that with the case data.

```{table} DICOM Image Metadata
:name: dicom_image_metadata

| # | Name                       | Description                                                                              |
|---|----------------------------|------------------------------------------------------------------------------------------|
| 1 | bit_depth                  | Number of bits used to define each pixel                                                 |
| 2 | rows                       | Number of pixel rows in the image                                                        |
| 3 | cols                       | Number of pixel columns in the image                                                     |
| 4 | aspect_ratio               | Ratio of width to height in image                                                        |
| 5 | size                       | Product of width and height in image                                                     |
| 6 | min_pixel_value            | Minimum pixel value                                                                      |
| 7 | max_pixel_value            | Maximum pixel value                                                                      |
| 8 | mean_pixel_value           | Average pixel value                                                                      |
| 9 | std_pixel_value            | Standard deviation of pixel values                                                       |
```


In [None]:
# %load -r 35-143 bcd/data_prep/cbis.py
class CBISPrep(DataPrep):
    """Extracts DICOM data and integrates it with a single Case dataset staged for quality assessment.

    Iterates through the full mammogram series, extracting image and pixel
    data and statistics from each DICOM file as a dask delayed task, then
    combines the data with the case dataset.

    Args:
        case_filepath (str): Path to the case dataset.
        series_filepath (str): Path to the series dataset.
        cbis_filepath (str): Path to the combined case dicom dataset.
        force (bool): Whether to force execution if output already exists. Default is False.
    """

    def __init__(
        self,
        case_filepath: str,
        series_filepath: str,
        cbis_filepath: str,
        force: bool = False,
    ) -> None:
        super().__init__()
        self._case_filepath = os.path.abspath(case_filepath)
        self._series_filepath = os.path.abspath(series_filepath)
        self._cbis_filepath = os.path.abspath(cbis_filepath)
        self._force = force

    @profiler
    def prep(self) -> pd.DataFrame:
        """Extracts image metadata from the DICOM image files and merges it with the cases.

        Returns:
            pd.DataFrame: The case data joined with the DICOM image metadata on
                'mmg_id'. When the output file already exists and force is False,
                the data are read from that file rather than rebuilt.
        """

        if self._force or not os.path.exists(self._cbis_filepath):
            # Read the case data and the series metadata; the latter holds the
            # subject and file location of every image.
            cases = IOService.read(self._case_filepath)
            series = IOService.read(self._series_filepath)

            # Obtain the full mammogram images
            series = series.loc[series["series_description"] == "full mammogram images"]

            # Graph of work is created and executed lazily at compute time.
            tasks = [
                dask.delayed(self._extract_data)(study)
                for _, study in series.iterrows()
            ]

            # Compute the results and convert to dataframe
            results = dask.compute(*tasks)
            df = pd.DataFrame(data=results)

            # Merge the data with the case dataset
            df = cases.merge(df, on="mmg_id", how="inner")

            self._save(df=df, filepath=self._cbis_filepath)

            return df

        return IOService.read(self._cbis_filepath)

    def _extract_data(self, study: pd.Series) -> dict:
        """Reads study and dicom data from a file.

        Args:
            study (pd.Series): A series metadata row providing 'filepath' and
                'subject_id'.

        Returns:
            dict: The mammogram id, image dimensions, file size, pixel statistics
                and filepath for the image.
        """

        dcm = pydicom.dcmread(study["filepath"])
        img = dcm.pixel_array

        # The smallest/largest pixel value header attributes are optional in
        # DICOM. When a file lacks them, compute the values from the pixel
        # data so one such file does not abort the whole computation.
        min_pixel_value = getattr(dcm, "SmallestImagePixelValue", None)
        if min_pixel_value is None:
            min_pixel_value = int(img.min())
        max_pixel_value = getattr(dcm, "LargestImagePixelValue", None)
        if max_pixel_value is None:
            max_pixel_value = int(img.max())

        d = {}
        # The first five underscore-separated tokens of the subject id form the
        # mammogram id, e.g. 'Mass-Test_P_01825_RIGHT_MLO'.
        d["mmg_id"] = "_".join(study["subject_id"].split("_")[0:5])
        d["bit_depth"] = dcm.BitsStored
        d["rows"], d["cols"] = img.shape
        d["aspect_ratio"] = d["cols"] / d["rows"]
        d["size"] = d["rows"] * d["cols"]
        d["file_size"] = getsize(study["filepath"])
        d["min_pixel_value"] = min_pixel_value
        d["max_pixel_value"] = max_pixel_value
        d["mean_pixel_value"] = np.mean(img)
        d["std_pixel_value"] = np.std(img)
        d["filepath"] = study["filepath"]

        return d


In [10]:
# Input and output file paths.
# NOTE: `cases` and `series` held DataFrames in the earlier cells; from here on
# they are rebound to file paths, and `cbis` is rebound from a path to the result.
cases = "data/meta/1_interim/cases.csv"
series = "data/meta/3_final/series.csv"
cbis = "data/meta/2_staged/cbis.csv"

# With force=False, an existing combined dataset is read back rather than rebuilt.
cp = CBISPrep(
    case_filepath=cases,
    series_filepath=series,
    cbis_filepath=cbis,
    force=False,
)
cbis = cp.prep()
cbis.info()
cbis.sample(n=5, random_state=55)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3568 entries, 0 to 3567
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   patient_id         3568 non-null   object 
 1   breast_density     3568 non-null   int64  
 2   laterality         3568 non-null   object 
 3   image_view         3568 non-null   object 
 4   abnormality_id     3568 non-null   int64  
 5   abnormality_type   3568 non-null   object 
 6   calc_type          3544 non-null   object 
 7   calc_distribution  3129 non-null   object 
 8   assessment         3568 non-null   int64  
 9   pathology          3568 non-null   object 
 10  subtlety           3568 non-null   int64  
 11  fileset            3568 non-null   object 
 12  mass_shape         3564 non-null   object 
 13  mass_margins       3508 non-null   object 
 14  mmg_id             3568 non-null   object 
 15  cancer             3568 non-null   bool   
 16  bit_depth          3568 

Unnamed: 0,patient_id,breast_density,laterality,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,...,rows,cols,aspect_ratio,size,file_size,min_pixel_value,max_pixel_value,mean_pixel_value,std_pixel_value,filepath
3563,P_01825,2,RIGHT,MLO,1,mass,NOT APPLICABLE,NOT APPLICABLE,3,BENIGN_WITHOUT_CALLBACK,...,4520,2888,0.64,13053760,24.9 MB,0,61031,12100.82,15102.12,data/image/0_raw/CBIS-DDSM/Mass-Test_P_01825_R...
17,P_00008,1,RIGHT,MLO,3,calcification,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,...,4576,3048,0.67,13947648,26.6 MB,0,65535,16262.07,15051.22,data/image/0_raw/CBIS-DDSM/Calc-Training_P_000...
3533,P_01690,3,LEFT,CC,1,mass,NOT APPLICABLE,NOT APPLICABLE,0,BENIGN,...,5386,3706,0.69,19960516,38.07 MB,0,65535,12696.9,14839.88,data/image/0_raw/CBIS-DDSM/Mass-Test_P_01690_L...
2677,P_01162,3,LEFT,MLO,1,mass,NOT APPLICABLE,NOT APPLICABLE,4,BENIGN,...,5911,4096,0.69,24211456,46.18 MB,0,65535,13687.71,15241.62,data/image/0_raw/CBIS-DDSM/Mass-Training_P_011...
3030,P_01656,2,LEFT,MLO,1,mass,NOT APPLICABLE,NOT APPLICABLE,5,MALIGNANT,...,4736,2656,0.56,12578816,23.99 MB,0,65535,18227.27,16527.84,data/image/0_raw/CBIS-DDSM/Mass-Training_P_016...


We have all case information along with the DICOM image metadata in a single dataset. 

Finally, we integrate the data into a Dataset for quality assessment and exploratory data analysis.