# CBIS-DDSM Data Transformation
Our aim here is to prepare the dataset for the modeling that will be conducted as part of the multivariate exploratory data analysis.

The CBIS-DDSM has 45 calcification types, 9 calcification distributions, 20 mass shapes, and 19 mass margins, many of which are compound categories, in that two or more categories are combined. For instance, calcification type 'ROUND_AND_REGULAR-PUNCTATE-AMORPHOUS' indicates three different types: 'ROUND_AND_REGULAR', 'PUNCTATE', and 'AMORPHOUS'. Segregating these compound categories into separate categories will drastically reduce the number of categories to analyze. More importantly, it aligns our data and the analyses with the common morphological taxonomy. So, task one is to extract the unary morphological categories from the compound classifications.  

Once the unary categories are extracted, all nominal variables will be dummy encoded as binary indicator variables taking values in {0, 1}. Then, all model variables will be standardized to zero mean and unit variance.

In [1]:
import os
# When the notebook is executed from somewhere under a 'jbook' directory,
# step the working directory up two levels (presumably to the project root,
# so that the relative data/ paths used later resolve -- confirm).
if 'jbook' in os.getcwd():
    os.chdir(os.path.abspath("../.."))

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
pd.options.display.max_columns = 99

In [2]:
# Input: the cleaned case data that this notebook reads.
FP_CASES_CLEAN = "data/clean/cases.csv"
# Output: the encoded dataset that this notebook writes.
FP_CASES_COOKED = "data/cooked/cases.csv"

In [3]:
# Unary (non-compound) morphological categories. The order of each list sets
# the order in which the corresponding indicator columns are created.
CALC_TYPES = [
    "AMORPHOUS",
    "COARSE",
    "DYSTROPHIC",
    "EGGSHELL",
    "FINE_LINEAR_BRANCHING",
    "LARGE_RODLIKE",
    "LUCENT_CENTERED",
    "MILK_OF_CALCIUM",
    "PLEOMORPHIC",
    "PUNCTATE",
    "ROUND_AND_REGULAR",
    "SKIN",
    "VASCULAR",
]
CALC_DISTRIBUTIONS = [
    "CLUSTERED",
    "LINEAR",
    "REGIONAL",
    "DIFFUSELY_SCATTERED",
    "SEGMENTAL",
]
MASS_SHAPES = [
    "IRREGULAR",
    "ARCHITECTURAL_DISTORTION",
    "OVAL",
    "LYMPH_NODE",
    "LOBULATED",
    "FOCAL_ASYMMETRIC_DENSITY",
    "ROUND",
    "ASYMMETRIC_BREAST_TISSUE",
]
MASS_MARGINS = [
    "SPICULATED",
    "ILL_DEFINED",
    "CIRCUMSCRIBED",
    "OBSCURED",
    "MICROLOBULATED",
]

# Encoding plan: for each nominal column, the prefix given to its indicator
# columns and the category values that each get an indicator.
ENC_VARS = dict(
    abnormality_type=dict(prefix='AT', values=['calcification', 'mass']),
    left_or_right_breast=dict(prefix='LR', values=['LEFT', 'RIGHT']),
    image_view=dict(prefix='IV', values=['CC', 'MLO']),
    calc_type=dict(prefix='CT', values=CALC_TYPES),
    calc_distribution=dict(prefix='CD', values=CALC_DISTRIBUTIONS),
    mass_shape=dict(prefix='MS', values=MASS_SHAPES),
    mass_margins=dict(prefix='MM', values=MASS_MARGINS),
)


## Feature Encoding

In [4]:
def encode_column(df, prefix, col, value):
    """Add a 0/1 indicator column flagging rows whose ``col`` contains ``value``.

    The new column is named ``<prefix>_<value>`` and is written directly into
    ``df``. Matching is by literal substring, so a compound label such as
    'ROUND_AND_REGULAR-PUNCTATE-AMORPHOUS' is flagged for every category it
    contains. Missing entries are encoded as 0.

    Args:
        df: DataFrame holding the string column to encode; modified in place.
        prefix: Short tag prepended to the name of the new column.
        col: Name of the existing string column to search.
        value: Category label to look for.

    Returns:
        The same DataFrame object, with the indicator column added.
    """
    newcol = prefix + '_' + value
    # regex=False: treat the label as plain text rather than a pattern.
    # na=False: a missing entry would otherwise yield NaN, which np.where
    # treats as truthy and would wrongly encode as 1.
    mask = df[col].str.contains(value, regex=False, na=False)
    df[newcol] = np.where(mask, 1, 0)
    return df

In [5]:
def encode_dataset(df: pd.DataFrame, enc_vars: dict) -> pd.DataFrame:
    """Add indicator columns for every (column, category) pair in ``enc_vars``.

    Args:
        df: DataFrame containing the columns named by the keys of ``enc_vars``.
        enc_vars: Maps each column name to a dict holding a 'prefix' for the
            new column names and the list of category 'values' to encode.

    Returns:
        The DataFrame returned by the final ``encode_column`` call, or ``df``
        itself when there is nothing to encode.
    """
    # Flatten the nested plan into (column, prefix, category) triples, keeping
    # the original column-then-category order.
    plan = (
        (column, spec['prefix'], category)
        for column, spec in enc_vars.items()
        for category in spec['values']
    )
    for column, tag, category in plan:
        df = encode_column(df=df, prefix=tag, col=column, value=category)
    return df

In [6]:
# Load the cleaned cases.
# NOTE(review): the displayed result includes an 'Unnamed: 0' column, which
# suggests the input CSV was saved with its index -- confirm whether that
# column should be carried into the encoded data.
df = pd.read_csv(FP_CASES_CLEAN)
# Recode the boolean target as 1 where True and 0 otherwise.
df['cancer'] = np.where(df['cancer'].eq(True), 1, 0)
# Add the indicator columns to a copy, leaving `df` without them.
df_enc = encode_dataset(enc_vars=ENC_VARS, df=df.copy())
# Create the output directory if needed, then write the encoded data
# without the row index.
os.makedirs(os.path.dirname(FP_CASES_COOKED), exist_ok=True)
df_enc.to_csv(FP_CASES_COOKED, index=False)
# Show the first rows as the cell output.
df_enc.head()

Unnamed: 0,patient_id,breast_density,left_or_right_breast,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,subtlety,fileset,mass_shape,mass_margins,case_id,cancer,AT_calcification,AT_mass,LR_LEFT,LR_RIGHT,IV_CC,IV_MLO,CT_AMORPHOUS,CT_COARSE,CT_DYSTROPHIC,CT_EGGSHELL,CT_FINE_LINEAR_BRANCHING,CT_LARGE_RODLIKE,CT_LUCENT_CENTERED,CT_MILK_OF_CALCIUM,CT_PLEOMORPHIC,CT_PUNCTATE,CT_ROUND_AND_REGULAR,CT_SKIN,CT_VASCULAR,CD_CLUSTERED,CD_LINEAR,CD_REGIONAL,CD_DIFFUSELY_SCATTERED,CD_SEGMENTAL,MS_IRREGULAR,MS_ARCHITECTURAL_DISTORTION,MS_OVAL,MS_LYMPH_NODE,MS_LOBULATED,MS_FOCAL_ASYMMETRIC_DENSITY,MS_ROUND,MS_ASYMMETRIC_BREAST_TISSUE,MM_SPICULATED,MM_ILL_DEFINED,MM_CIRCUMSCRIBED,MM_OBSCURED,MM_MICROLOBULATED
0,P_00005,3,RIGHT,CC,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,train,NOT APPLICABLE,NOT APPLICABLE,P_00005_RIGHT_calcification_CC_1,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,P_00005,3,RIGHT,MLO,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,train,NOT APPLICABLE,NOT APPLICABLE,P_00005_RIGHT_calcification_MLO_1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,P_00007,4,LEFT,CC,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,train,NOT APPLICABLE,NOT APPLICABLE,P_00007_LEFT_calcification_CC_1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,P_00007,4,LEFT,MLO,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,train,NOT APPLICABLE,NOT APPLICABLE,P_00007_LEFT_calcification_MLO_1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,P_00008,1,LEFT,CC,1,calcification,AMORPHOUS,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3,train,NOT APPLICABLE,NOT APPLICABLE,P_00008_LEFT_calcification_CC_1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
