# CBIS-DDSM Data Transformation
Three sets of transformations will be performed for three separate purposes. 

The CBIS-DDSM has 45 calcification types, 9 calcification distributions, 20 mass shapes, and 19 mass margins. Many of these are compound categories, in that two or more categories are combined. For instance, calcification type 'ROUND_AND_REGULAR-PUNCTATE-AMORPHOUS' indicates three different types: 'ROUND_AND_REGULAR', 'PUNCTATE', and 'AMORPHOUS'. Segregating these compound categories into separate classes drastically reduces the number of categories to analyze. More importantly, it aligns our data and the analyses with the common morphological taxonomy. Our first task, then, is to expand the clean dataset with dummy-encoded morphological categories.

Second, machine learning models, such as those we will use during the exploratory analysis, require numeric input. Here, we'll dummy encode the nominal variables in the mass and calcification datasets separately, thereby restricting each dataset to the features appropriate for that abnormality type. 

Third, during image analysis, we'll need a single data set for both image and case metadata. A DICOM-CASE dataset containing case and DICOM data will be created for downstream image analysis. 

In all, five datasets will be created. 
1. Calcification Training Set
2. Calcification Test Set
3. Mass Training Set
4. Mass Test Set
5. DICOM-CASE Dataset

To avoid any potential bias introduced by separating the compound categories, the four training and test sets will encode the original (compound) category values.

In [1]:
import os
if 'jbook' in os.getcwd():
    os.chdir(os.path.abspath(os.path.join("../..")))

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
pd.options.display.max_columns = 99

In [2]:
# Input files: the cleaned case metadata, the staged DICOM metadata, and the
# cross-reference that links each case_id to a DICOM series_uid.
FP_CLEAN = "data/clean/cases.csv"
FP_DICOM = "data/staged/dicom.csv"
FP_XREF = "data/staged/case_series_xref.csv"
# Output: DICOM metadata merged with case metadata.
FP_DICOM_CASE = "data/clean/dicom_case.csv"
# Output: case data extended with one indicator column per morphology value.
FP_PREPROCESSED = "data/preprocessed/cases.csv"
# Outputs: dummy-encoded calcification and mass datasets, split by fileset.
FP_COOKED_CALC_TRAIN = "data/cooked/calc_train.csv"
FP_COOKED_CALC_TEST = "data/cooked/calc_test.csv"
FP_COOKED_MASS_TRAIN = "data/cooked/mass_train.csv"
FP_COOKED_MASS_TEST = "data/cooked/mass_test.csv"


In [3]:
# Elementary (non-compound) morphology vocabularies. Each entry is searched
# for inside the corresponding case column to build indicator features.
CALC_TYPES = [
    "AMORPHOUS",
    "COARSE",
    "DYSTROPHIC",
    "EGGSHELL",
    "FINE_LINEAR_BRANCHING",
    "LARGE_RODLIKE",
    "LUCENT_CENTERED",
    "MILK_OF_CALCIUM",
    "PLEOMORPHIC",
    "PUNCTATE",
    "ROUND_AND_REGULAR",
    "SKIN",
    "VASCULAR",
]
CALC_DISTRIBUTIONS = [
    "CLUSTERED",
    "LINEAR",
    "REGIONAL",
    "DIFFUSELY_SCATTERED",
    "SEGMENTAL",
]
MASS_SHAPES = [
    "IRREGULAR",
    "ARCHITECTURAL_DISTORTION",
    "OVAL",
    "LYMPH_NODE",
    "LOBULATED",
    "FOCAL_ASYMMETRIC_DENSITY",
    "ROUND",
    "ASYMMETRIC_BREAST_TISSUE",
]
MASS_MARGINS = [
    "SPICULATED",
    "ILL_DEFINED",
    "CIRCUMSCRIBED",
    "OBSCURED",
    "MICROLOBULATED",
]

# Maps each morphology column to the prefix used for its indicator columns
# and the list of elementary values to look for in that column.
MORPHOLOGY = {
    'calc_type': {'prefix': 'CT', 'values': CALC_TYPES},
    'calc_distribution': {'prefix': 'CD', 'values': CALC_DISTRIBUTIONS},
    'mass_shape': {'prefix': 'MS', 'values': MASS_SHAPES},
    'mass_margins': {'prefix': 'MM', 'values': MASS_MARGINS},
}



## DICOM-CASE Dataset

Load the data.

In [4]:
# Read the case metadata (dfc), DICOM metadata (dfd) and the case/series
# cross-reference (dfx), in that order.
dfc, dfd, dfx = (pd.read_csv(source) for source in (FP_CLEAN, FP_DICOM, FP_XREF))

Remove redundant columns.

In [5]:
# Case columns left out of the merge below.
cols_to_ignore = ['patient_id', 'left_or_right_breast', 'image_view']
# Keep every remaining case column, preserving the original column order.
cols = dfc.columns[~dfc.columns.isin(cols_to_ignore)].tolist()
dfc2 = dfc.loc[:, cols]

Merge the datasets.

In [6]:
# Attach a case_id to every DICOM row through the cross-reference, then pull
# in the case metadata for that case_id. Both joins are left joins, so every
# DICOM row is retained.
dfdc = (
    dfd
    .merge(dfx[['case_id', 'series_uid']], on='series_uid', how='left')
    .merge(dfc2, on='case_id', how='left')
)
dfdc.to_csv(FP_DICOM_CASE, index=False)

## Morphological Feature Encoding

In [20]:
def encode_column(df: pd.DataFrame, prefix: str, col: str, value: str) -> pd.DataFrame:
    """Add a 0/1 indicator column flagging rows whose ``col`` text contains ``value``.

    The new column is named ``<prefix>_<value>``. A row is set to 1 when
    ``value`` occurs anywhere in the string stored in ``col`` (so compound
    categories such as ``'A-B'`` are flagged for both ``'A'`` and ``'B'``),
    and to 0 otherwise.

    Args:
        df: Frame to extend. It is modified in place.
        prefix: Prefix for the name of the new indicator column.
        col: Name of the string column to search.
        value: Literal text to look for in ``col``.

    Returns:
        The same ``df`` object, with the indicator column added.
    """
    newcol = f"{prefix}_{value}"
    # regex=False: match the category name literally rather than as a pattern.
    # na=False: a missing value is "no match"; otherwise the NaN returned by
    # str.contains is truthy inside np.where and the row would be flagged 1.
    matches = df[col].str.contains(value, regex=False, na=False)
    df[newcol] = np.where(matches, 1, 0)
    return df

In [21]:
def encode_dataset(df: pd.DataFrame, morphology: dict) -> pd.DataFrame:
    """Add one indicator column per morphology value listed in ``morphology``.

    Args:
        df: Frame holding the morphology columns named by the keys of
            ``morphology``.
        morphology: Maps a column name to a dict with a ``'prefix'`` (used to
            name the new columns) and ``'values'`` (the values to encode).

    Returns:
        The frame returned by the final ``encode_column`` call.
    """
    # Flatten the nested mapping into (column, prefix, value) triples so the
    # encoding itself is a single flat loop.
    triples = (
        (column, spec['prefix'], item)
        for column, spec in morphology.items()
        for item in spec['values']
    )
    for column, tag, item in triples:
        df = encode_column(df=df, prefix=tag, col=column, value=item)
    return df

In [22]:
df = pd.read_csv(FP_CLEAN)
# Recode the target: rows where cancer is True become 1, all others 0.
df['cancer'] = df['cancer'].eq(True).astype(int)

# Encode a copy so that df itself keeps only the original columns; the
# dataset splits further down are built from df, not from df_enc.
os.makedirs(os.path.dirname(FP_PREPROCESSED), exist_ok=True)
df_enc = encode_dataset(df=df.copy(), morphology=MORPHOLOGY)
df_enc.to_csv(FP_PREPROCESSED, index=False)
df.head()

Unnamed: 0,patient_id,breast_density,left_or_right_breast,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,subtlety,fileset,mass_shape,mass_margins,case_id,cancer
0,P_00005,3,RIGHT,CC,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,train,NOT APPLICABLE,NOT APPLICABLE,P_00005_RIGHT_calcification_CC_1,1
1,P_00005,3,RIGHT,MLO,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,train,NOT APPLICABLE,NOT APPLICABLE,P_00005_RIGHT_calcification_MLO_1,1
2,P_00007,4,LEFT,CC,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,train,NOT APPLICABLE,NOT APPLICABLE,P_00007_LEFT_calcification_CC_1,0
3,P_00007,4,LEFT,MLO,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,train,NOT APPLICABLE,NOT APPLICABLE,P_00007_LEFT_calcification_MLO_1,0
4,P_00008,1,LEFT,CC,1,calcification,AMORPHOUS,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3,train,NOT APPLICABLE,NOT APPLICABLE,P_00008_LEFT_calcification_CC_1,0


## Full Dataset Encoding

In [23]:
# Columns kept for calcification cases. The list order is also the column
# order of the selected frame.
CALC_VARIATES = [
    "left_or_right_breast",
    "image_view",
    "calc_type",
    "calc_distribution",
    "fileset",
    "cancer",
    "breast_density",
    "subtlety",
]

# Columns kept for mass cases: identical to the calcification list except
# that the two morphology columns are the mass-specific ones.
MASS_VARIATES = [
    "left_or_right_breast",
    "image_view",
    "mass_shape",
    "mass_margins",
    "fileset",
    "cancer",
    "breast_density",
    "subtlety",
]

In [24]:
# Split the cases by abnormality type, keeping only the columns relevant to
# each type.
df_calc = df.loc[df['abnormality_type'].eq('calcification'), CALC_VARIATES]
df_mass = df.loc[df['abnormality_type'].eq('mass'), MASS_VARIATES]
df_calc.info()


<class 'pandas.core.frame.DataFrame'>
Index: 1872 entries, 0 to 1871
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   left_or_right_breast  1872 non-null   object
 1   image_view            1872 non-null   object
 2   calc_type             1872 non-null   object
 3   calc_distribution     1872 non-null   object
 4   fileset               1872 non-null   object
 5   cancer                1872 non-null   int64 
 6   breast_density        1872 non-null   int64 
 7   subtlety              1872 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 131.6+ KB


In [25]:
# Dummy encode before splitting so that the train and test frames end up with
# the same set of indicator columns. The prefixes are matched positionally to
# the columns that get_dummies encodes.
df_calc = pd.get_dummies(df_calc, prefix=['LR','IV', 'CT', 'CD', "FS"], dtype=float)
df_mass = pd.get_dummies(df_mass, prefix=['LR','IV', 'MS', 'MM', "FS"], dtype=float)

# Use the fileset indicators to separate train from test, then discard them.
df_calc_train = df_calc[df_calc['FS_train'].eq(1)].drop(columns=['FS_test', 'FS_train'])
df_calc_test = df_calc[df_calc['FS_test'].eq(1)].drop(columns=['FS_test', 'FS_train'])

df_mass_train = df_mass[df_mass['FS_train'].eq(1)].drop(columns=['FS_test', 'FS_train'])
df_mass_test = df_mass[df_mass['FS_test'].eq(1)].drop(columns=['FS_test', 'FS_train'])


<class 'pandas.core.frame.DataFrame'>
Index: 1316 entries, 1872 to 3187
Data columns (total 46 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   cancer                                       1316 non-null   int64  
 1   breast_density                               1316 non-null   int64  
 2   subtlety                                     1316 non-null   int64  
 3   LR_LEFT                                      1316 non-null   float64
 4   LR_RIGHT                                     1316 non-null   float64
 5   IV_CC                                        1316 non-null   float64
 6   IV_MLO                                       1316 non-null   float64
 7   MS_ARCHITECTURAL_DISTORTION                  1316 non-null   float64
 8   MS_ASYMMETRIC_BREAST_TISSUE                  1316 non-null   float64
 9   MS_FOCAL_ASYMMETRIC_DENSITY                  1316 non-null   float64
 10  MS

In [27]:
# Make sure the output directory exists, then write each encoded dataset to
# its destination without the index column.
os.makedirs(os.path.dirname(FP_COOKED_CALC_TRAIN), exist_ok=True)
for frame, filepath in (
    (df_calc_train, FP_COOKED_CALC_TRAIN),
    (df_calc_test, FP_COOKED_CALC_TEST),
    (df_mass_train, FP_COOKED_MASS_TRAIN),
    (df_mass_test, FP_COOKED_MASS_TEST),
):
    frame.to_csv(filepath, index=False)