# Data Preparation Part 1: Structural Upgrade

In the prior section, we identified a few structural concerns worth addressing before any quality or exploratory analysis efforts take place. Here, we apply a few upgrades that should streamline the data quality analysis in terms of structural consistency. Concretely, we will:
1. Combine the training and test data for calcifications and masses into a single dataset,
2. Clean up the variable name inconsistencies,
3. Simplify the indexing with a mammogram identifier linked to cases and images, 
4. Extract image data and statistics from the DICOM dataset, 
5. Add a Boolean target variable which is True for malignancy and False otherwise,
6. Integrate both case and image data into a single Dataset with quality and exploratory analysis, visualization, statistical testing, and experimentation capability.


## Case Dataset Upgrades
We'll start with the case data. The following code cells implement the upgrades to the case dataset.


In [1]:
import os
# When the current working directory path contains 'jbook', move three
# directory levels up before running the rest of the notebook.
# NOTE(review): presumably this lands on the project root so that the
# relative "data/..." paths used below resolve — confirm against the repo layout.
if 'jbook' in os.getcwd():
    os.chdir(os.path.abspath(os.path.join("../../..")))
from typing import Union
from glob import glob

import dask
import numpy as np
import pandas as pd
import pydicom

from bcd.dal.file import IOService
from bcd.utils.file import getsize
from bcd.utils.profile import profiler
from bcd.data_prep.base import DataPrep
from bcd.data_prep.case import CasePrep
from bcd.data_prep.dicom import DicomPrep

In [2]:
# %load -r 38-187 bcd/data_prep/case.py
# ------------------------------------------------------------------------------------------------ #
class CasePrep(DataPrep):
    """Performs Case metadata preparation.

    Combines the calcification and mass training and test case files into a
    single csv case file. Along the way it marks the morphology columns that
    do not apply to an abnormality type, assigns a mammogram id, adds a
    boolean ``cancer`` target, drops the file path columns, and renames
    ``left_or_right_breast`` to ``laterality``.

    Args:
        calc_train_fp, calc_test_fp, mass_train_fp, mass_test_fp (str): The file paths to the
            calcification and mass training and test sets.
        case_fp (str): Path to output calcification and mass datasets.
        force (bool): Whether to force execution if output already exists. Default is False.
    """

    def __init__(
        self,
        calc_train_fp: str,
        calc_test_fp: str,
        mass_train_fp: str,
        mass_test_fp: str,
        case_fp: str,
        force: bool = False,
    ) -> None:
        super().__init__()
        self._calc_train_fp = calc_train_fp
        self._calc_test_fp = calc_test_fp
        self._mass_train_fp = mass_train_fp
        self._mass_test_fp = mass_test_fp
        self._case_fp = case_fp
        self._force = force

    def prep(self) -> pd.DataFrame:
        """Combines training and test cases into a single csv case file.

        Returns:
            pd.DataFrame: The prepared case data. If the output file already
            exists and ``force`` is False, nothing is rebuilt and the existing
            file is read back with ``pd.read_csv`` instead.
        """
        # Resolve the output path once and use it everywhere. Taking the
        # dirname of the absolute path also keeps makedirs from failing when
        # case_fp is a bare filename (dirname would otherwise be "").
        case_fp = os.path.abspath(self._case_fp)
        os.makedirs(os.path.dirname(case_fp), exist_ok=True)

        if not self._force and os.path.exists(case_fp):
            return pd.read_csv(case_fp)

        # Merge all case data into a single DataFrame
        df_cases = self._merge_cases(
            calc_train_fp=self._calc_train_fp,
            calc_test_fp=self._calc_test_fp,
            mass_train_fp=self._mass_train_fp,
            mass_test_fp=self._mass_test_fp,
        )

        # Calcification morphology does not apply to masses and vice versa;
        # mark those cells explicitly so they are not mistaken for missing data.
        not_applicable = {
            "mass": ("calc_type", "calc_distribution"),
            "calcification": ("mass_shape", "mass_margins"),
        }
        for abnormality_type, columns in not_applicable.items():
            mask = df_cases["abnormality_type"] == abnormality_type
            for column in columns:
                df_cases.loc[mask, column] = "NOT APPLICABLE"

        # Assign the mammogram id.
        df_cases = self._assign_mmg_id(df=df_cases)

        # Add the boolean target: True for MALIGNANT pathology, False for
        # every other pathology value. Assigned positionally because the row
        # labels of the merged frame repeat across the source files.
        df_cases["cancer"] = (df_cases["pathology"] == "MALIGNANT").to_numpy()

        # Drop the filename columns.
        columns_to_drop = [
            "image_file_path",
            "cropped_image_file_path",
            "ROI_mask_file_path",
        ]
        df_cases = df_cases.drop(columns=columns_to_drop)

        # Change left_or_right_breast to laterality, the DICOM attribute
        df_cases = df_cases.rename(columns={"left_or_right_breast": "laterality"})

        self._save(df=df_cases, filepath=case_fp)

        return df_cases

    def _merge_cases(
        self,
        calc_train_fp: str,
        calc_test_fp: str,
        mass_train_fp: str,
        mass_test_fp: str,
    ) -> pd.DataFrame:
        """Combines mass and calcification train and test files into a single file.

        Each file is read, tagged with a ``fileset`` column ("training" or
        "test"), has the spaces in its column names replaced with underscores,
        and is then concatenated row-wise in the order calc train, calc test,
        mass train, mass test.

        Returns:
            pd.DataFrame: The concatenated cases. The row labels of the source
            files are kept as-is, so labels repeat across files.
        """
        sources = (
            (calc_train_fp, "training"),
            (calc_test_fp, "test"),
            (mass_train_fp, "training"),
            (mass_test_fp, "test"),
        )

        frames = []
        for filepath, fileset in sources:
            # Absolute paths are a pre-emptive measure in case
            # jupyter book can't access the relative path.
            df = pd.read_csv(os.path.abspath(filepath))
            # Add the fileset so that we can distinguish training and test data.
            df["fileset"] = fileset
            frames.append(self._format_column_names(df=df))

        return pd.concat(frames, axis=0)

    def _assign_mmg_id(self, df: pd.DataFrame) -> pd.DataFrame:
        """Assign a mammogram id to each observation.

        The id has the form ``<Abno>-<Fileset>_<patient_id>_<SIDE>_<VIEW>``,
        e.g. ``Mass-Test_P_01825_RIGHT_MLO``. The column is added to ``df``
        in place and the same DataFrame is returned.
        """
        # First four letters of the abnormality type: "Calc" / "Mass".
        abnormality = df["abnormality_type"].map(lambda value: value[:4].capitalize())
        fileset = df["fileset"].map(str.capitalize)
        side = df["left_or_right_breast"].map(str.upper)
        view = df["image_view"].map(str.upper)

        df["mmg_id"] = (
            abnormality + "-" + fileset + "_" + df["patient_id"] + "_" + side + "_" + view
        )

        return df

    def _format_column_names(self, df: pd.DataFrame) -> pd.DataFrame:
        """Replaces spaces in column names with underscores.

        The columns are renamed in place and the same DataFrame is returned.
        """
        df.columns = [colname.replace(" ", "_") for colname in df.columns]
        return df



In [3]:
# Raw case description files for calcifications and masses.
calc_train = "data/meta/0_raw/calc_case_description_train_set.csv"
calc_test = "data/meta/0_raw/calc_case_description_test_set.csv"
mass_train = "data/meta/0_raw/mass_case_description_train_set.csv"
mass_test = "data/meta/0_raw/mass_case_description_test_set.csv"

# Destination of the combined case file.
case_fp = "data/meta/1_interim/cases_staged.csv"

# force=True rebuilds the staged case file even if it already exists.
cp = CasePrep(
    calc_train_fp=calc_train,
    calc_test_fp=calc_test,
    mass_train_fp=mass_train,
    mass_test_fp=mass_test,
    case_fp=case_fp,
    force=True,
)
cases = cp.prep()

<class 'pandas.core.frame.DataFrame'>
Index: 3568 entries, 0 to 377
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   patient_id         3568 non-null   object
 1   breast_density     3568 non-null   int64 
 2   laterality         3568 non-null   object
 3   image_view         3568 non-null   object
 4   abnormality_id     3568 non-null   int64 
 5   abnormality_type   3568 non-null   object
 6   calc_type          3544 non-null   object
 7   calc_distribution  3129 non-null   object
 8   assessment         3568 non-null   int64 
 9   pathology          3568 non-null   object
 10  subtlety           3568 non-null   int64 
 11  fileset            3568 non-null   object
 12  mass_shape         3564 non-null   object
 13  mass_margins       3508 non-null   object
 14  mmg_id             3568 non-null   object
 15  cancer             3568 non-null   bool  
dtypes: bool(1), int64(4), object(11)
memory usage: 4

Unnamed: 0,patient_id,breast_density,laterality,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,subtlety,fileset,mass_shape,mass_margins,mmg_id,cancer
373,P_01825,2,RIGHT,MLO,1,mass,NOT APPLICABLE,NOT APPLICABLE,3,BENIGN_WITHOUT_CALLBACK,3,test,LOBULATED,MICROLOBULATED,Mass-Test_P_01825_RIGHT_MLO,False
17,P_00008,1,RIGHT,MLO,3,calcification,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3,training,NOT APPLICABLE,NOT APPLICABLE,Calc-Training_P_00008_RIGHT_MLO,False
343,P_01690,3,LEFT,CC,1,mass,NOT APPLICABLE,NOT APPLICABLE,0,BENIGN,4,test,OVAL,OBSCURED,Mass-Test_P_01690_LEFT_CC,False
805,P_01162,3,LEFT,MLO,1,mass,NOT APPLICABLE,NOT APPLICABLE,4,BENIGN,4,training,ROUND,CIRCUMSCRIBED,Mass-Training_P_01162_LEFT_MLO,False
1158,P_01656,2,LEFT,MLO,1,mass,NOT APPLICABLE,NOT APPLICABLE,5,MALIGNANT,5,training,OVAL,CIRCUMSCRIBED,Mass-Training_P_01656_LEFT_MLO,True


## DICOM Image Metadata

Next, we extract the DICOM data and create the imaging metadata as described in {numref}`dicom_image_metadata`.

```{table} DICOM Image Metadata
:name: dicom_image_metadata

| #  | Name                       | Description                                                                              |
|----|----------------------------|------------------------------------------------------------------------------------------|
| 1  | patient_id                 | The patient identifier matching that from the case data.                                 |
| 2  | subject_id                 | Composite identifier containing abnormality type, side, image view, and   abnormality id |
| 3  | abnormality_type           | Type of abnormality, either 'calc' or 'mass'.                                            |
| 4  | laterality                 | Left or right breast                                                                     |
| 5  | image_view                 | Either CC or MLO                                                                         |
| 6  | fileset                    | Either "training" or "test"                                                              |
| 7  | series_description         | Either "full mammogram images", "cropped images", or   "ROI mask images"                 |
| 8  | photometric_interpretation | DICOM Attribute indicating how pixels are displayed                                      |
| 9  | bit_depth                  | Number of bits used to define each pixel                                                 |
| 10 | rows                       | Number of pixel rows in the image                                                        |
| 11 | cols                       | Number of pixel columns in the image                                                     |
| 12 | aspect_ratio               | Ratio of width to height in image                                                        |
| 13 | size                       | Product of width and height in image                                                     |
| 14 | min_pixel_value            | Minimum pixel value                                                                      |
| 15 | max_pixel_value            | Maximum pixel value                                                                      |
| 16 | mean_pixel_value           | Average pixel value                                                                      |
| 17 | std_pixel_value            | Standard deviation of pixel values                                                       |
| 18 | filepath                   | Path to image                                                                            |
| 19 | mmg_id                     | Mammogram id                                                                             |
```
The following code cell produces the dataset.



In [5]:
# %load -r 36-143 bcd/data_prep/dicom.py
class DicomPrep(DataPrep):
    """Performs extraction of the DICOM data.

    Reads the DICOM series metadata, resolves every ``.dcm`` file belonging to
    each series, and extracts subject, series, file location and pixel
    statistics for each file via dask delayed tasks. The results are combined
    into a DataFrame and saved.

    Args:
        filepath (str): Path to the DICOM series metadata.
        dicom_filepath (str): Path for the results.
        skip_list (list): List of filepaths to skip.
        force (bool): Whether to force execution if output already exists. Default is False.
    """

    __BASEDIR = "data/image/0_raw/"

    def __init__(
        self,
        filepath: str,
        dicom_filepath: str,
        skip_list: Union[list, None] = None,
        force: bool = False,
    ) -> None:
        super().__init__()
        self._filepath = os.path.abspath(filepath)
        self._dicom_filepath = os.path.abspath(dicom_filepath)
        # NOTE(review): skip_list is stored but not consulted by any method
        # of this class, so no file is currently skipped — confirm the
        # intended matching rule before wiring it in.
        self._skip_list = skip_list
        self._force = force

    @profiler
    def prep(self) -> pd.DataFrame:
        """Extracts image metadata from the DICOM image files.

        Returns:
            pd.DataFrame: One row per DICOM file. If the output file already
            exists and ``force`` is False, nothing is extracted and the
            existing file is read back with ``pd.read_csv`` instead.
        """
        if not self._force and os.path.exists(self._dicom_filepath):
            return pd.read_csv(self._dicom_filepath)

        # Reads the series metadata that contains subject, series, and
        # file location information
        studies = IOService.read(self._filepath)

        # Add filepaths to the study data first to avoid batch
        # operation exceptions with dask.
        studies = self._get_filepaths(studies=studies)

        # Graph of work is created here and executed lazily at compute time.
        tasks = [dask.delayed(self._extract_data)(study) for study in studies]

        # Compute the results and convert to dataframe
        results = dask.compute(*tasks)
        df = pd.DataFrame(data=results)

        self._save(df=df, filepath=self._dicom_filepath)

        return df

    def _get_filepaths(self, studies: pd.DataFrame) -> list:
        """Expands the series metadata to one record per DICOM file.

        Args:
            studies (pd.DataFrame): Series metadata with a ``file_location``
                column holding each series directory relative to the image
                base directory (a leading "./" is removed).

        Returns:
            list: One ``pd.Series`` per ``.dcm`` file found, carrying the
            series metadata plus a ``filepath`` entry for that file.
        """
        studies_filepaths = []
        for _, row in studies.iterrows():
            location = row["file_location"].replace("./", "")
            directory = os.path.join(self.__BASEDIR, location)
            for file in glob(directory + "/*.dcm"):
                # Copy the row for every file. Appending the same Series
                # object repeatedly would leave all records of a multi-file
                # series pointing at the last file found.
                study = row.copy()
                study["filepath"] = file
                studies_filepaths.append(study)
        return studies_filepaths

    def _extract_data(self, study: pd.Series) -> dict:
        """Reads study and dicom data from a file.

        Args:
            study (pd.Series): A record produced by ``_get_filepaths`` with
                ``subject_id``, ``series_description`` and ``filepath`` entries.

        Returns:
            dict: The image metadata and pixel statistics for the file.
        """
        # Parse the study id. Only the first five underscore-separated parts
        # are used: "<Abtype>-<Fileset>", the two parts of the patient id,
        # the laterality and the image view.
        studyid = study["subject_id"].split("_")[0:5]
        abtype, fileset = studyid[0].split("-")

        # Extract the DICOM data
        dcm = pydicom.dcmread(study["filepath"])
        img = dcm.pixel_array

        # Unpacking into two values requires a 2-D pixel array.
        rows, cols = img.shape

        return {
            "patient_id": "_".join(studyid[1:3]),
            "subject_id": study["subject_id"],
            "abnormality_type": abtype.lower(),
            "laterality": studyid[3],
            "image_view": studyid[4],
            "fileset": fileset.lower(),
            "series_description": study["series_description"],
            "photometric_interpretation": dcm.PhotometricInterpretation,
            "bit_depth": dcm.BitsStored,
            "rows": rows,
            "cols": cols,
            "aspect_ratio": cols / rows,
            "size": rows * cols,
            "file_size": getsize(study["filepath"]),
            "min_pixel_value": dcm.SmallestImagePixelValue,
            "max_pixel_value": dcm.LargestImagePixelValue,
            "mean_pixel_value": np.mean(img),
            "std_pixel_value": np.std(img),
            "filepath": study["filepath"],
            # The mammogram id is the parsed study id without any trailing
            # parts of the subject id.
            "mmg_id": "_".join(studyid),
        }


In [7]:
# Series metadata is the input; the extracted DICOM metadata is written to
# (or, with force=False, reused from) the interim csv file.
series = "data/meta/0_raw/metadata.csv"
p = DicomPrep(
    filepath=series,
    dicom_filepath="data/meta/1_interim/dicom.csv",
    force=False,
)
dicom = p.prep()

Finally, we integrate the data into a Dataset for quality assessment and exploratory data analysis.