In [1]:
import os
# When the kernel starts somewhere under a path containing 'jbook', move the
# working directory three levels up so the relative data/ paths used in this
# notebook resolve (presumably to the project root -- confirm against the
# repository layout).
if 'jbook' in os.getcwd():
    os.chdir(os.path.abspath("../../.."))

import pandas as pd
import numpy as np

from bcd.data_prep.cbis import CBISImputer
from bcd.data.dataset import CBISDataset

# Allow up to 999 rows to be rendered before pandas truncates a displayed frame.
pd.set_option("display.max_rows", 999)

In [2]:
# Input: staged CBIS metadata CSV that this notebook reads with pd.read_csv.
FP_STAGED: str = "data/meta/2_staged/cbis.csv"
# Output: destination of the cleaned CSV; it is written with DataFrame.to_csv
# and later passed to CBISDataset for the data-quality checks.
FP_CLEAN: str = "data/meta/3_clean/cbis.csv"

In [3]:
# Read the staged metadata once. df_orig stays untouched as the reference
# snapshot (its row count is the denominator of the missing-data percentage);
# df is the independent working copy that the cleaning cells modify.
df_orig = pd.read_csv(FP_STAGED)
df = df_orig.copy()

In [4]:
# Set invalid values for breast_density to NA: a recorded value of 0 is
# replaced with NaN so it is counted as missing by the cells that follow.
# np.nan is the canonical spelling; the upper-case np.NAN alias is a legacy
# name that NumPy 2.x no longer provides.
df['breast_density'] = df['breast_density'].replace(0, np.nan)

In [5]:
# Set invalid values for subtlety to NA: a recorded value of 0 is replaced
# with NaN, the same treatment applied to breast_density.
# np.nan is the canonical spelling; the upper-case np.NAN alias is a legacy
# name that NumPy 2.x no longer provides.
df['subtlety'] = df['subtlety'].replace(0, np.nan)

In [6]:
# Normalise inconsistent calc_type labels in one pass.
# Keys are the exact labels found in the staged data, values are the corrected
# labels: 'LUCENT_CENTER' becomes 'LUCENT_CENTERED' inside each listed
# compound label, and the duplicated 'PLEOMORPHIC-PLEOMORPHIC' collapses to
# 'PLEOMORPHIC'. Only whole-cell exact matches are replaced; any other value
# (including NaN) is left as it is. No corrected label is itself a key, so the
# result does not depend on the order of the entries.
CALC_TYPE_FIXES = {
    'LUCENT_CENTER': 'LUCENT_CENTERED',
    'ROUND_AND_REGULAR-LUCENT_CENTER-DYSTROPHIC': 'ROUND_AND_REGULAR-LUCENT_CENTERED-DYSTROPHIC',
    'PUNCTATE-LUCENT_CENTER': 'PUNCTATE-LUCENT_CENTERED',
    'VASCULAR-COARSE-LUCENT_CENTER-ROUND_AND_REGULAR-PUNCTATE': 'VASCULAR-COARSE-LUCENT_CENTERED-ROUND_AND_REGULAR-PUNCTATE',
    'ROUND_AND_REGULAR-LUCENT_CENTER': 'ROUND_AND_REGULAR-LUCENT_CENTERED',
    'LUCENT_CENTER-PUNCTATE': 'LUCENT_CENTERED-PUNCTATE',
    'COARSE-ROUND_AND_REGULAR-LUCENT_CENTER': 'COARSE-ROUND_AND_REGULAR-LUCENT_CENTERED',
    'ROUND_AND_REGULAR-LUCENT_CENTER-PUNCTATE': 'ROUND_AND_REGULAR-LUCENT_CENTERED-PUNCTATE',
    'COARSE-LUCENT_CENTER': 'COARSE-LUCENT_CENTERED',
    'PLEOMORPHIC-PLEOMORPHIC': 'PLEOMORPHIC',
}
df['calc_type'] = df['calc_type'].replace(CALC_TYPE_FIXES)

In [7]:
# Flag every row holding at least one missing value, keep those rows in
# df_missing, and report their share of the originally loaded dataset.
null_mask = df.isna().any(axis="columns")
df_missing = df.loc[null_mask]
n_missing = len(df_missing)
pct_missing = round(n_missing / len(df_orig) * 100, 1)
msg = f"There are {n_missing} rows (approximately {pct_missing}% of the dataset) with missing data in the dataset."
print(msg)

There are 527 rows (approximately 14.8% of the dataset) with missing data in the dataset.


In [8]:
# Build the project's CBISImputer with a fixed random_state, fit it on the
# working frame, then transform that same frame into df_clean.
# (Presumably the fixed random_state makes the imputation repeatable --
# confirm in bcd.data_prep.cbis.)
# NOTE(review): in the sample output recorded further down, columns that were
# not missing also differ after transform (aspect_ratio 0.67 -> 1.0, pixel
# means/stds rounded to whole numbers). It looks like the transform rounds
# numeric columns -- confirm whether that is intended.
imp = CBISImputer(random_state=5)
imp.fit(df=df)
df_clean = imp.transform(df=df)

<bcd.data_prep.cbis.CBISImputer at 0x7f952693cd60>

In [9]:
# Make sure the destination folder exists, then persist the cleaned frame
# without writing the pandas index as a column.
clean_dir = os.path.dirname(FP_CLEAN)
os.makedirs(clean_dir, exist_ok=True)
df_clean.to_csv(FP_CLEAN, index=False)

In [10]:
# Draw five mmg_id values from the rows that had missing data and show the
# matching rows before (df_missing) and after (df_clean) imputation.
# random_state pins the draw so a fresh Restart & Run All shows the same
# cases; 5 matches the seed given to the imputer.
sample_cases = df_missing['mmg_id'].sample(5, random_state=5)
# Display both frames explicitly: a bare expression that is not the last
# statement of a cell is not rendered under IPython's default settings.
display(
    df_missing.loc[df_missing['mmg_id'].isin(sample_cases)],
    df_clean.loc[df_clean['mmg_id'].isin(sample_cases)],
)

Unnamed: 0,patient_id,breast_density,laterality,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,...,rows,cols,aspect_ratio,size,file_size,min_pixel_value,max_pixel_value,mean_pixel_value,std_pixel_value,filepath
105,P_00112,3.0,RIGHT,CC,1,calcification,ROUND_AND_REGULAR-EGGSHELL,,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,0.67,14155776,28312648,0,65535,8303.77,14536.42,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
106,P_00112,3.0,RIGHT,CC,2,calcification,ROUND_AND_REGULAR-EGGSHELL,,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,0.67,14155776,28312648,0,65535,8303.77,14536.42,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
107,P_00112,3.0,RIGHT,CC,3,calcification,ROUND_AND_REGULAR-EGGSHELL,,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,0.67,14155776,28312648,0,65535,8303.77,14536.42,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
108,P_00112,3.0,RIGHT,CC,4,calcification,ROUND_AND_REGULAR-EGGSHELL,,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,0.67,14155776,28312648,0,65535,8303.77,14536.42,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
109,P_00112,3.0,RIGHT,CC,5,calcification,ROUND_AND_REGULAR-EGGSHELL,,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,0.67,14155776,28312648,0,65535,8303.77,14536.42,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
110,P_00112,3.0,RIGHT,CC,6,calcification,ROUND_AND_REGULAR-EGGSHELL,,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,0.67,14155776,28312648,0,65535,8303.77,14536.42,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
313,P_00452,2.0,RIGHT,CC,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,...,4624,3096,0.67,14315904,28632904,0,65535,12295.14,16808.86,data/image/0_raw/CBIS-DDSM/Calc-Training_P_004...
314,P_00452,2.0,RIGHT,CC,2,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,...,4624,3096,0.67,14315904,28632904,0,65535,12295.14,16808.86,data/image/0_raw/CBIS-DDSM/Calc-Training_P_004...
665,P_00840,4.0,RIGHT,CC,1,calcification,LUCENT_CENTERED,,2,BENIGN_WITHOUT_CALLBACK,...,4624,3064,0.66,14167936,28336968,0,65535,9452.57,16601.18,data/image/0_raw/CBIS-DDSM/Calc-Training_P_008...
1267,P_01628,3.0,RIGHT,CC,1,calcification,LUCENT_CENTERED,,2,BENIGN_WITHOUT_CALLBACK,...,5528,4112,0.74,22731136,45463368,0,65535,6273.16,13742.66,data/image/0_raw/CBIS-DDSM/Calc-Training_P_016...


Unnamed: 0,patient_id,breast_density,laterality,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,...,rows,cols,aspect_ratio,size,file_size,min_pixel_value,max_pixel_value,mean_pixel_value,std_pixel_value,filepath
105,P_00112,3.0,RIGHT,CC,1,calcification,ROUND_AND_REGULAR-EGGSHELL,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,1.0,14155776,28312648,0,65535,8304.0,14536.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
106,P_00112,3.0,RIGHT,CC,2,calcification,ROUND_AND_REGULAR-EGGSHELL,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,1.0,14155776,28312648,0,65535,8304.0,14536.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
107,P_00112,3.0,RIGHT,CC,3,calcification,ROUND_AND_REGULAR-EGGSHELL,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,1.0,14155776,28312648,0,65535,8304.0,14536.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
108,P_00112,3.0,RIGHT,CC,4,calcification,ROUND_AND_REGULAR-EGGSHELL,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,1.0,14155776,28312648,0,65535,8304.0,14536.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
109,P_00112,3.0,RIGHT,CC,5,calcification,ROUND_AND_REGULAR-EGGSHELL,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,1.0,14155776,28312648,0,65535,8304.0,14536.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
110,P_00112,3.0,RIGHT,CC,6,calcification,ROUND_AND_REGULAR-EGGSHELL,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4608,3072,1.0,14155776,28312648,0,65535,8304.0,14536.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_001...
313,P_00452,2.0,RIGHT,CC,1,calcification,VASCULAR,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4624,3096,1.0,14315904,28632904,0,65535,12295.0,16809.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_004...
314,P_00452,2.0,RIGHT,CC,2,calcification,VASCULAR,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4624,3096,1.0,14315904,28632904,0,65535,12295.0,16809.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_004...
665,P_00840,4.0,RIGHT,CC,1,calcification,LUCENT_CENTERED,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,4624,3064,1.0,14167936,28336968,0,65535,9453.0,16601.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_008...
1267,P_01628,3.0,RIGHT,CC,1,calcification,LUCENT_CENTERED,SEGMENTAL,2,BENIGN_WITHOUT_CALLBACK,...,5528,4112,1.0,22731136,45463368,0,65535,6273.0,13743.0,data/image/0_raw/CBIS-DDSM/Calc-Training_P_016...


In [11]:
# Build a CBISDataset from the cleaned CSV path and print the summary of its
# completeness analysis (the share of records and data values with no
# missing entries, per the printed report).
ds = CBISDataset(filepath=FP_CLEAN)
dqcc = ds.dqa.analyze_completeness()
print(dqcc.summary)



                          Completeness                          
                         Dataset | CBIS-DDSM
                         Records | 3568
                Complete Records | 3568
             Record Completeness | 1.0
                     Data Values | 96336
            Complete Data Values | 96336
         Data Value Completeness | 1.0




In [12]:
# Run the uniqueness analysis on the same dataset object and print its summary.
dqcu = ds.dqa.analyze_uniqueness()
print(dqcu.summary)



                           Uniqueness                           
                         Dataset | CBIS-DDSM
                         Records | 3568
                  Unique Records | 3568
               Record Uniqueness | 1.0
                     Data Values | 96336
              Unique Data Values | 19864
           Data Value Uniqueness | 0.206




In [13]:
# Run the validity analysis on the same dataset object and print its summary.
dqcv = ds.dqa.analyze_validity()
print(dqcv.summary)



                            Validity                            
                         Dataset | CBIS-DDSM
                         Records | 3568
                   Valid Records | 3568
                 Record Validity | 1.0
                     Data Values | 96336
               Valid Data Values | 96336
             Data Value Validity | 1.0


