# CBIS-DDSM Feature Engineering
The CBIS-DDSM has 45 calcification types, 9 calcification distributions, 20 mass shapes, and 19 mass margins. Many of these are compound categories, in which two or more categories are combined. For instance, the calcification type 'ROUND_AND_REGULAR-PUNCTATE-AMORPHOUS' indicates three different types: 'ROUND_AND_REGULAR', 'PUNCTATE', and 'AMORPHOUS'. This section will separate these compound categories into separate groups, one-hot encode them, and persist the data in separate datasets for downstream analysis.

In [1]:
# Notebook setup: imports, working-directory adjustment, and display options.
import os
# When the current directory path contains 'jbook', move two levels up
# (presumably so the relative data paths used below resolve — confirm).
if 'jbook' in os.getcwd():
    # os.path.join with a single argument returns it unchanged, i.e. "../..".
    os.chdir(os.path.abspath(os.path.join("../..")))

import pandas as pd
import numpy as np
# Raise the pandas display limit to 99 columns.
pd.options.display.max_columns = 99

In [2]:
# CSV file the cases are read from.
FP_IN = "data/clean/cases.csv"
# CSV file the encoded cases are written to.
FP_OUT = "data/clean/cases_enc.csv"

In [3]:
# Individual (non-compound) category names searched for in each morphology column.
CALC_TYPES = [
    "AMORPHOUS",
    "COARSE",
    "DYSTROPHIC",
    "EGGSHELL",
    "FINE_LINEAR_BRANCHING",
    "LARGE_RODLIKE",
    "LUCENT_CENTERED",
    "MILK_OF_CALCIUM",
    "PLEOMORPHIC",
    "PUNCTATE",
    "ROUND_AND_REGULAR",
    "SKIN",
    "VASCULAR",
]
CALC_DISTRIBUTIONS = [
    "CLUSTERED",
    "LINEAR",
    "REGIONAL",
    "DIFFUSELY_SCATTERED",
    "SEGMENTAL",
]
MASS_SHAPES = [
    "IRREGULAR",
    "ARCHITECTURAL_DISTORTION",
    "OVAL",
    "LYMPH_NODE",
    "LOBULATED",
    "FOCAL_ASYMMETRIC_DENSITY",
    "ROUND",
    "ASYMMETRIC_BREAST_TISSUE",
]
MASS_MARGINS = [
    "SPICULATED",
    "ILL_DEFINED",
    "CIRCUMSCRIBED",
    "OBSCURED",
    "MICROLOBULATED",
]

# Maps each source column to the prefix used for its indicator columns and the
# category names to encode for it.
MORPHOLOGY = {
    'calc_type': {
        'prefix': 'CT',
        'values': CALC_TYPES,
    },
    'calc_distribution': {
        'prefix': 'CD',
        'values': CALC_DISTRIBUTIONS,
    },
    'mass_shape': {
        'prefix': 'MS',
        'values': MASS_SHAPES,
    },
    'mass_margins': {
        'prefix': 'MM',
        'values': MASS_MARGINS,
    },
}

In [4]:
def encode_column(df, prefix, col, value):
    """Add a 0/1 indicator column flagging rows whose ``col`` text contains ``value``.

    The new column is named ``prefix + '_' + value``. It is added to ``df`` in
    place, and the same frame is returned.

    Args:
        df: DataFrame holding the string column ``col``.
        prefix: Short tag prepended to the new column's name.
        col: Name of the column to search.
        value: Category name to look for, matched as a literal substring.

    Returns:
        ``df`` with the additional indicator column.

    Note:
        Matching is by substring, so a value that is contained in a longer
        category name also flags rows holding the longer name.
    """
    newcol = prefix + '_' + value
    # regex=False: category names are literal text, not patterns.
    # na=False: missing entries must encode as 0. Without it they can come back
    # as NaN, which np.where treats as truthy and would encode as 1.
    mask = df[col].str.contains(value, regex=False, na=False)
    df[newcol] = np.where(mask, 1, 0)
    return df

In [5]:
def encode_dataset(df: pd.DataFrame, morphology: dict) -> pd.DataFrame:
    """Add one indicator column per configured category to ``df``.

    Args:
        df: Frame containing every column named as a key of ``morphology``.
        morphology: Mapping of column name to a dict with a ``'prefix'`` string
            and a ``'values'`` iterable of category names.

    Returns:
        The frame returned by the final ``encode_column`` call, or ``df``
        unchanged when there is nothing to encode.
    """
    # Lazily flatten the nested configuration into (column, prefix, category)
    # triples so the encoding itself is a single flat loop.
    jobs = (
        (column_name, spec['prefix'], category)
        for column_name, spec in morphology.items()
        for category in spec['values']
    )
    for column_name, tag, category in jobs:
        df = encode_column(df=df, prefix=tag, col=column_name, value=category)
    return df

In [6]:
# Read the cases from FP_IN, add the indicator columns configured in
# MORPHOLOGY, inspect the result, and write it to FP_OUT.
df = pd.read_csv(FP_IN)
df = encode_dataset(df=df, morphology=MORPHOLOGY)
df.info()
# NOTE(review): the sampled rows are neither assigned nor printed here —
# confirm they are intended only for notebook display.
df.sample(10)
# index=False leaves the row index out of the written file.
df.to_csv(FP_OUT, index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3566 entries, 0 to 3565
Data columns (total 47 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   patient_id                   3566 non-null   object
 1   breast_density               3566 non-null   int64 
 2   left_or_right_breast         3566 non-null   object
 3   image_view                   3566 non-null   object
 4   abnormality_id               3566 non-null   int64 
 5   abnormality_type             3566 non-null   object
 6   calc_type                    3566 non-null   object
 7   calc_distribution            3566 non-null   object
 8   assessment                   3566 non-null   int64 
 9   pathology                    3566 non-null   object
 10  subtlety                     3566 non-null   int64 
 11  fileset                      3566 non-null   object
 12  mass_shape                   3566 non-null   object
 13  mass_margins                 3566

Unnamed: 0,patient_id,breast_density,left_or_right_breast,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,subtlety,fileset,mass_shape,mass_margins,case_id,cancer,CT_AMORPHOUS,CT_COARSE,CT_DYSTROPHIC,CT_EGGSHELL,CT_FINE_LINEAR_BRANCHING,CT_LARGE_RODLIKE,CT_LUCENT_CENTERED,CT_MILK_OF_CALCIUM,CT_PLEOMORPHIC,CT_PUNCTATE,CT_ROUND_AND_REGULAR,CT_SKIN,CT_VASCULAR,CD_CLUSTERED,CD_LINEAR,CD_REGIONAL,CD_DIFFUSELY_SCATTERED,CD_SEGMENTAL,MS_IRREGULAR,MS_ARCHITECTURAL_DISTORTION,MS_OVAL,MS_LYMPH_NODE,MS_LOBULATED,MS_FOCAL_ASYMMETRIC_DENSITY,MS_ROUND,MS_ASYMMETRIC_BREAST_TISSUE,MM_SPICULATED,MM_ILL_DEFINED,MM_CIRCUMSCRIBED,MM_OBSCURED,MM_MICROLOBULATED
2815,P_01356,2,LEFT,CC,1,mass,NOT APPLICABLE,NOT APPLICABLE,5,MALIGNANT,5,train,IRREGULAR,SPICULATED,P_01356_LEFT_mass_CC_1,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2435,P_00802,2,LEFT,MLO,2,mass,NOT APPLICABLE,NOT APPLICABLE,5,MALIGNANT,5,train,IRREGULAR,SPICULATED,P_00802_LEFT_mass_MLO_2,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
3201,P_00116,2,RIGHT,CC,1,mass,NOT APPLICABLE,NOT APPLICABLE,5,MALIGNANT,5,test,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,P_00116_RIGHT_mass_CC_1,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0
1468,P_02195,4,RIGHT,MLO,1,calcification,PLEOMORPHIC,CLUSTERED,0,MALIGNANT,4,train,NOT APPLICABLE,NOT APPLICABLE,P_02195_RIGHT_calcification_MLO_1,True,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
506,P_00635,3,LEFT,MLO,1,calcification,PUNCTATE,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,4,train,NOT APPLICABLE,NOT APPLICABLE,P_00635_LEFT_calcification_MLO_1,False,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1602,P_00344,4,LEFT,CC,1,calcification,PLEOMORPHIC,CLUSTERED,4,BENIGN,5,test,NOT APPLICABLE,NOT APPLICABLE,P_00344_LEFT_calcification_CC_1,False,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3296,P_00481,2,RIGHT,CC,1,mass,NOT APPLICABLE,NOT APPLICABLE,4,MALIGNANT,4,test,ROUND,SPICULATED,P_00481_RIGHT_mass_CC_1,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
119,P_00121,2,LEFT,MLO,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,MALIGNANT,3,train,NOT APPLICABLE,NOT APPLICABLE,P_00121_LEFT_calcification_MLO_1,True,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
811,P_01040,3,RIGHT,MLO,1,calcification,PLEOMORPHIC,CLUSTERED,4,BENIGN,2,train,NOT APPLICABLE,NOT APPLICABLE,P_01040_RIGHT_calcification_MLO_1,False,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
259,P_00380,2,RIGHT,MLO,1,calcification,FINE_LINEAR_BRANCHING,CLUSTERED,4,BENIGN,5,train,NOT APPLICABLE,NOT APPLICABLE,P_00380_RIGHT_calcification_MLO_1,False,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
