In [1]:
import os
if 'jbook' in os.getcwd():
    os.chdir(os.path.abspath(os.path.join("../../..")))

import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from studioai.preprocessing.encode import RankFrequencyEncoder

from bcd.data_prep.clean import CBISImputer
from bcd.data.dataset import CBISDataset

pd.options.display.max_rows = 999

In [2]:
FP_STAGED = "data/meta/2_staged/cbis.csv"
FP_CLEAN = "data/meta/3_clean/cbis.csv"

In [3]:
df = pd.read_csv(FP_STAGED)
df_orig = df.copy()

In [4]:
# Set invalid values for breast_density to NA
df['breast_density'] = df['breast_density'].replace(0, np.NAN)

In [5]:
# Set case and mass data to NOT APPLICABLE where appropriate.
df['subtlety'] = df['subtlety'].replace(0, np.NAN)

In [6]:
df.loc[df['calc_type'] == 'LUCENT_CENTER', 'calc_type'] = 'LUCENT_CENTERED'
df.loc[df['calc_type'] == 'ROUND_AND_REGULAR-LUCENT_CENTER-DYSTROPHIC', 'calc_type'] = 'ROUND_AND_REGULAR-LUCENT_CENTERED-DYSTROPHIC'
df.loc[df['calc_type'] == 'PUNCTATE-LUCENT_CENTER', 'calc_type'] = 'PUNCTATE-LUCENT_CENTERED'
df.loc[df['calc_type'] == 'VASCULAR-COARSE-LUCENT_CENTER-ROUND_AND_REGULAR-PUNCTATE', 'calc_type'] = 'VASCULAR-COARSE-LUCENT_CENTERED-ROUND_AND_REGULAR-PUNCTATE'
df.loc[df['calc_type'] == 'ROUND_AND_REGULAR-LUCENT_CENTER', 'calc_type'] = 'ROUND_AND_REGULAR-LUCENT_CENTERED'
df.loc[df['calc_type'] == 'LUCENT_CENTER-PUNCTATE', 'calc_type'] = 'LUCENT_CENTERED-PUNCTATE'
df.loc[df['calc_type'] == 'COARSE-ROUND_AND_REGULAR-LUCENT_CENTER', 'calc_type'] = 'COARSE-ROUND_AND_REGULAR-LUCENT_CENTERED'
df.loc[df['calc_type'] == 'ROUND_AND_REGULAR-LUCENT_CENTER-PUNCTATE', 'calc_type'] = 'ROUND_AND_REGULAR-LUCENT_CENTERED-PUNCTATE'
df.loc[df['calc_type'] == 'COARSE-LUCENT_CENTER', 'calc_type'] = 'COARSE-LUCENT_CENTERED'
df.loc[df['calc_type'] == 'PLEOMORPHIC-PLEOMORPHIC', 'calc_type'] = 'PLEOMORPHIC'

In [7]:
# Grab rows with missing data
null_mask = df.isnull().any(axis=1)
df_missing = df[null_mask]
msg = f"There are {df_missing.shape[0]} rows (approximately {round(df_missing.shape[0] / df_orig.shape[0] * 100,1)}% of the dataset) with missing data in the dataset."
print(msg)

There are 527 rows (approximately 14.8% of the dataset) with missing data in the dataset.


In [8]:
# %load -r 37-119 bcd/data_prep/clean.py
class CBISImputer:
    """Imputes the missing values in the case dataset using Multiple Imputation by Chained Equations

    Args:
        max_iter (int): Maximum number of imputation rounds to perform before returning
        the imputations computed during the final round.
        initial_strategy (str): Which strategy to use to initialize the missing values.
            Valid values include: {'mean', 'median', 'most_frequent', 'constant'},
            default=most_frequent'
        random_state (int): The seed of the pseudo random number generator to use.

    """

    def __init__(
        self,
        max_iter: int = 50,
        initial_strategy: str = "most_frequent",
        random_state: int = None,
    ) -> None:
        self._max_iter = max_iter
        self._initial_strategy = initial_strategy
        self._random_state = random_state
        self._encoded_values = {}
        self._dtypes = None
        self._enc = None
        self._imp = None

    def fit(self, df: pd.DataFrame) -> CBISImputer:
        """Fits the data to the imputer

        Instantiates the encoder, encodes the data and creates a
        map of columns to valid encoded values. We capture these
        values in order to map imputed values
        back to valid values before we inverse transform.

        Args:
            df (pd.DataFrame): Imputed DataFrame
        """
        self._dtypes = df.dtypes.astype(str).replace("0", "object").to_dict()
        self._enc = RankFrequencyEncoder()
        df_enc = self._enc.fit_transform(df=df)
        self._extract_encoded_values(df=df_enc)

        # Get complete cases for imputer training (fit)
        df_enc_complete = df_enc.dropna(axis=0)

        self._imp = IterativeImputer(
            max_iter=self._max_iter,
            initial_strategy=self._initial_strategy,
            random_state=self._random_state,
        )
        self._imp.fit(X=df_enc_complete.values)
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Performs the imputation and returns the imputed DataFrame

        Args:
            df (pd.DataFrame): Imputed DataFrame

        """
        df_enc = self._enc.transform(df=df)
        imp = self._imp.transform(X=df_enc.values)
        df_imp = pd.DataFrame(data=imp, columns=df.columns)
        df_imp = self._map_imputed_values(df=df_imp)
        df_imp = self._enc.inverse_transform(df=df_imp)
        df_imp = df_imp.astype(self._dtypes)
        return df_imp

    def _extract_encoded_values(self, df: pd.DataFrame) -> None:
        """Creates a dictionary of valid values by column."""
        for col in df.columns:
            valid = df[col].dropna()
            self._encoded_values[col] = valid.unique()

    def _map_imputed_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """Maps values to valid values (used after imputation)"""
        for col in df.columns:
            values = np.array(sorted(self._encoded_values[col]))
            df[col] = df[col].apply(lambda x: values[np.argmin(np.abs(x - values))])
        return df

In [9]:
imp = CBISImputer(random_state=5)
_ = imp.fit(df=df)
df_clean = imp.transform(df=df)

In [10]:
os.makedirs(os.path.dirname(FP_CLEAN), exist_ok=True)
df_clean.to_csv(FP_CLEAN, index=False)

In [11]:
sample_cases = df_missing['mmg_id'].sample(5)
df_missing.loc[df_missing['mmg_id'].isin(sample_cases)]
df_clean.loc[df_clean['mmg_id'].isin(sample_cases)]

Unnamed: 0,patient_id,breast_density,laterality,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,...,rows,cols,aspect_ratio,size,file_size,min_pixel_value,max_pixel_value,mean_pixel_value,std_pixel_value,filepath
315,P_00452,2.0,RIGHT,MLO,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,...,4592,3104,0.68,14253568,28508238,0,65535,12872.63,16989.45,data/image/0_raw/CBIS-DDSM/Calc-Training_P_004...
316,P_00452,2.0,RIGHT,MLO,2,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,...,4592,3104,0.68,14253568,28508238,0,65535,12872.63,16989.45,data/image/0_raw/CBIS-DDSM/Calc-Training_P_004...
317,P_00452,2.0,RIGHT,MLO,3,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,...,4592,3104,0.68,14253568,28508238,0,65535,12872.63,16989.45,data/image/0_raw/CBIS-DDSM/Calc-Training_P_004...
785,P_01003,1.0,LEFT,CC,1,calcification,COARSE,,2,BENIGN_WITHOUT_CALLBACK,...,4520,3056,0.68,13813120,27627336,0,65535,10926.29,15954.34,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
859,P_01099,2.0,LEFT,MLO,1,calcification,PUNCTATE,,2,BENIGN_WITHOUT_CALLBACK,...,5672,4064,0.72,23051008,46103114,0,65535,11203.17,14284.05,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
860,P_01099,2.0,LEFT,MLO,2,calcification,PUNCTATE,,2,BENIGN_WITHOUT_CALLBACK,...,5672,4064,0.72,23051008,46103114,0,65535,11203.17,14284.05,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
861,P_01099,2.0,LEFT,MLO,3,calcification,PUNCTATE,,2,BENIGN_WITHOUT_CALLBACK,...,5672,4064,0.72,23051008,46103114,0,65535,11203.17,14284.05,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
862,P_01099,2.0,LEFT,MLO,4,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,...,5672,4064,0.72,23051008,46103114,0,65535,11203.17,14284.05,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
863,P_01099,2.0,RIGHT,CC,1,calcification,LUCENT_CENTERED,,2,BENIGN_WITHOUT_CALLBACK,...,5672,3960,0.7,22461120,44923336,0,65535,10095.96,13431.61,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
864,P_01099,2.0,RIGHT,CC,2,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,...,5672,3960,0.7,22461120,44923336,0,65535,10095.96,13431.61,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...


Unnamed: 0,patient_id,breast_density,laterality,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,...,rows,cols,aspect_ratio,size,file_size,min_pixel_value,max_pixel_value,mean_pixel_value,std_pixel_value,filepath
315,P_00452,2.0,RIGHT,MLO,1,calcification,VASCULAR,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4592,3104,1.0,14253568,28508238,0,65535,12873.0,16989.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_004...
316,P_00452,2.0,RIGHT,MLO,2,calcification,VASCULAR,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4592,3104,1.0,14253568,28508238,0,65535,12873.0,16989.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_004...
317,P_00452,2.0,RIGHT,MLO,3,calcification,VASCULAR,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4592,3104,1.0,14253568,28508238,0,65535,12873.0,16989.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_004...
785,P_01003,1.0,LEFT,CC,1,calcification,COARSE,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4520,3056,1.0,13813120,27627336,0,65535,10926.0,15954.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
859,P_01099,2.0,LEFT,MLO,1,calcification,PUNCTATE,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,5672,4064,1.0,23051008,46103114,0,65535,11203.0,14284.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
860,P_01099,2.0,LEFT,MLO,2,calcification,PUNCTATE,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,5672,4064,1.0,23051008,46103114,0,65535,11203.0,14284.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
861,P_01099,2.0,LEFT,MLO,3,calcification,PUNCTATE,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,5672,4064,1.0,23051008,46103114,0,65535,11203.0,14284.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
862,P_01099,2.0,LEFT,MLO,4,calcification,VASCULAR,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,5672,4064,1.0,23051008,46103114,0,65535,11203.0,14284.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
863,P_01099,2.0,RIGHT,CC,1,calcification,LUCENT_CENTERED,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,5672,3960,1.0,22461120,44923336,0,65535,10096.0,13432.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...
864,P_01099,2.0,RIGHT,CC,2,calcification,VASCULAR,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,5672,3960,1.0,22461120,44923336,0,65535,10096.0,13432.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_010...


In [12]:
ds = CBISDataset(filepath=FP_CLEAN)
dqcc = ds.dqa.analyze_completeness()
print(dqcc.summary)



                          Completeness                          
                         Dataset | CBIS-DDSM
                         Records | 3568
                Complete Records | 3568
             Record Completeness | 1.0
                     Data Values | 96336
            Complete Data Values | 96336
         Data Value Completeness | 1.0




In [13]:
dqcu = ds.dqa.analyze_uniqueness()
print(dqcu.summary)



                           Uniqueness                           
                         Dataset | CBIS-DDSM
                         Records | 3568
                  Unique Records | 3568
               Record Uniqueness | 1.0
                     Data Values | 96336
              Unique Data Values | 19864
           Data Value Uniqueness | 0.206




In [14]:
dqcv = ds.dqa.analyze_validity()
print(dqcv.summary)



                            Validity                            
                         Dataset | CBIS-DDSM
                         Records | 3568
                   Valid Records | 3568
                 Record Validity | 1.0
                     Data Values | 96336
               Valid Data Values | 96336
             Data Value Validity | 1.0


