In [1]:
import os
# When the working directory path contains 'jbook', move three directory
# levels up so that the relative paths used by later cells resolve from there.
if 'jbook' in os.getcwd():
    os.chdir(os.path.abspath("../../.."))

import pandas as pd
from IPython.display import HTML, display_html
import numpy as np

from bcd.analyze.dqa.cbis import CBISDQA
from bcd.data.dataset import CBISDataset

# Let text columns display up to 200 characters before pandas truncates them.
pd.options.display.max_colwidth = 200

In [2]:
# Relative path of the CBIS CSV file under data/meta/2_staged.
filepath = "data/meta/2_staged/cbis.csv"
# Build the dataset object from that file.
cbis = CBISDataset(filepath=filepath)
# NOTE(review): the CBISDQA class itself (not an instance) is assigned here;
# presumably CBISDataset.dqa is a property whose setter instantiates it - confirm.
cbis.dqa = CBISDQA

In [3]:
# Run the completeness analysis; the result exposes a printable `summary`
# and a per-column `detail` table.
dqc = cbis.dqa.analyze_completeness()
# Print with the default line ending, matching the uniqueness and validity
# summary cells (the former end=" " left a stray trailing space and no newline).
print(dqc.summary)



                          Completeness                          
                         Dataset | CBIS-DDSM
                         Records | 3568
                Complete Records | 3043
             Record Completeness | 0.853
                     Data Values | 96336
            Complete Data Values | 95809
         Data Value Completeness | 0.995

 

In [4]:
# Per-column completeness table (N, Complete, Missing, Completeness),
# rendered as the cell output.
dqc.detail

Unnamed: 0,N,Complete,Missing,Completeness
patient_id,3568,3568,0,1.0
breast_density,3568,3568,0,1.0
laterality,3568,3568,0,1.0
image_view,3568,3568,0,1.0
abnormality_id,3568,3568,0,1.0
abnormality_type,3568,3568,0,1.0
calc_type,3568,3544,24,0.99
calc_distribution,3568,3129,439,0.88
assessment,3568,3568,0,1.0
pathology,3568,3568,0,1.0


In [5]:
# Pathology breakdown of the records that are incomplete on calc_distribution.
(
    cbis.dqa
    .get_incomplete_data(subset='calc_distribution')['pathology']
    .value_counts()
    .to_frame()
)

Unnamed: 0_level_0,count
pathology,Unnamed: 1_level_1
BENIGN_WITHOUT_CALLBACK,434
BENIGN,5


In [6]:
# Run the uniqueness analysis and print its text summary.
dqu = cbis.dqa.analyze_uniqueness()
print(dqu.summary)



                           Uniqueness                           
                         Dataset | CBIS-DDSM
                         Records | 3568
                  Unique Records | 3568
               Record Uniqueness | 1.0
                     Data Values | 96336
              Unique Data Values | 23101
           Data Value Uniqueness | 0.24




In [7]:
# Run the validity analysis and print its text summary; the per-column
# breakdown is available on `dqv.detail`.
dqv = cbis.dqa.analyze_validity()
print(dqv.summary)



                            Validity                            
                         Dataset | CBIS-DDSM
                         Records | 3568
                   Valid Records | 3009
                 Record Validity | 0.843
                     Data Values | 96336
               Valid Data Values | 95608
             Data Value Validity | 0.992




In [8]:
# Per-column validity table (N, Valid, Invalid, Validity), rendered as the cell output.
dqv.detail

Unnamed: 0,N,Valid,Invalid,Validity
mmg_id,3568,3568,0,1.0
patient_id,3568,3568,0,1.0
breast_density,3568,3566,2,1.0
laterality,3568,3568,0,1.0
image_view,3568,3568,0,1.0
abnormality_id,3568,3568,0,1.0
abnormality_type,3568,3568,0,1.0
calc_type,3568,3347,221,0.94
calc_distribution,3568,3129,439,0.88
mass_shape,3568,3564,4,1.0


In [9]:
# Distinct calc_type values found among the records flagged invalid on calc_type.
(
    cbis.dqa
    .get_invalid_data(subset='calc_type')['calc_type']
    .unique()
)

array([nan, 'ROUND_AND_REGULAR-LUCENT_CENTER-DYSTROPHIC',
       'PUNCTATE-LUCENT_CENTER',
       'VASCULAR-COARSE-LUCENT_CENTER-ROUND_AND_REGULAR-PUNCTATE',
       'LUCENT_CENTER', 'ROUND_AND_REGULAR-LUCENT_CENTER',
       'LUCENT_CENTER-PUNCTATE', 'PLEOMORPHIC-PLEOMORPHIC',
       'COARSE-ROUND_AND_REGULAR-LUCENT_CENTER',
       'ROUND_AND_REGULAR-LUCENT_CENTER-PUNCTATE', 'COARSE-LUCENT_CENTER'],
      dtype=object)

In [10]:
# Show the full records flagged invalid on breast_density.
cbis.dqa.get_invalid_data(subset='breast_density')

Unnamed: 0,patient_id,breast_density,laterality,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,...,rows,cols,aspect_ratio,size,file_size,min_pixel_value,max_pixel_value,mean_pixel_value,std_pixel_value,filepath
1817,P_01743,0,RIGHT,CC,1,calcification,PLEOMORPHIC,DIFFUSELY_SCATTERED,5,MALIGNANT,...,4576,1831,0.4,8378656,16758290,0,65535,10701.68,19437.31,data/image/0_raw/CBIS-DDSM/Calc-Test_P_01743_RIGHT_CC/08-29-2017-DDSM-78699/1.000000-full mammogram images-62009/1-1.dcm
1818,P_01743,0,RIGHT,MLO,1,calcification,PLEOMORPHIC,DIFFUSELY_SCATTERED,5,MALIGNANT,...,5176,2716,0.52,14058016,28117014,0,65535,14108.36,20763.14,data/image/0_raw/CBIS-DDSM/Calc-Test_P_01743_RIGHT_MLO/08-29-2017-DDSM-15483/1.000000-full mammogram images-98404/1-1.dcm


In [11]:
# Start from the records returned by get_complete_data().
df = cbis.dqa.get_complete_data()
# Cancer cases sharing the PLEOMORPHIC calcification type.
df_similar_type = df[df['calc_type'].eq('PLEOMORPHIC') & df['cancer'].eq(True)]
# Cancer cases sharing the DIFFUSELY_SCATTERED calcification distribution.
# NOTE(review): nothing here excludes rows with an invalid breast_density, so
# those rows may themselves appear in both subsets - confirm this is intended.
df_similar_dist = df[df['calc_distribution'].eq('DIFFUSELY_SCATTERED') & df['cancer'].eq(True)]

In [12]:
# Frequency of each breast_density value among the PLEOMORPHIC cancer cases.
df_similar_type['breast_density'].value_counts().to_frame()

Unnamed: 0_level_0,count
breast_density,Unnamed: 1_level_1
3,158
2,123
4,121
1,23
0,2


In [13]:
# Frequency of each breast_density value among the DIFFUSELY_SCATTERED cancer cases.
df_similar_dist['breast_density'].value_counts().to_frame()

Unnamed: 0_level_0,count
breast_density,Unnamed: 1_level_1
2,2
0,2
3,1


In [14]:
# Show the full records flagged invalid on subtlety.
cbis.dqa.get_invalid_data(subset='subtlety')

Unnamed: 0,patient_id,breast_density,laterality,image_view,abnormality_id,abnormality_type,calc_type,calc_distribution,assessment,pathology,...,rows,cols,aspect_ratio,size,file_size,min_pixel_value,max_pixel_value,mean_pixel_value,std_pixel_value,filepath
2364,P_00710,2,RIGHT,CC,1,mass,NOT APPLICABLE,NOT APPLICABLE,0,BENIGN,...,4728,3112,0.66,14713536,29428168,0,65535,6370.67,12047.63,data/image/0_raw/CBIS-DDSM/Mass-Training_P_00710_RIGHT_CC/07-20-2016-DDSM-01521/1.000000-full mammogram images-11222/1-1.dcm
2365,P_00710,2,RIGHT,MLO,1,mass,NOT APPLICABLE,NOT APPLICABLE,0,BENIGN,...,4688,3120,0.67,14626560,29254222,0,65535,10743.82,15182.58,data/image/0_raw/CBIS-DDSM/Mass-Training_P_00710_RIGHT_MLO/07-20-2016-DDSM-11749/1.000000-full mammogram images-15311/1-1.dcm
