# Problem Description

## `parsing_metadata.py` doesn't correctly save ALL THE ROIs from the xmls to mask pngs.

Below  is an example, for which, even though `rois_df` has a `True` value in the `stored` column for all 'calcification' ROIs, only one ROI of mC is stored in the final mask.

To run the notebook put it in the `notebooks/` folder

In [52]:
from tqdm import tqdm

In [1]:
import sys; sys.path.insert(0, '..')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import SimpleITK as sitk
from database.dataset import INBreast_Dataset

from pathlib import Path
from tqdm import tqdm
import cv2
import os
import ast

from skimage import restoration
from dehazing import dehaze


We are using the latest dataset class from `dev` and re-running the latest version of parsing script from `dev`

In [4]:
db = INBreast_Dataset(
        return_lesions_mask=True,
        level='image',
        max_lesion_size_mm=1.0,
        extract_patches=False,
        extract_patches_method='all',  # 'centered'
        patch_size=256,
        stride=256,
        min_breast_fraction_roi=0.5,
        normalize=None,
        n_jobs=-1)

In [34]:
img_id =  22579730 # one image for which the problem exists

problem_index = db.img_df[db.img_df.img_id == img_id].index[0] # should be 38
assert problem_index == 38

print(len(db[38]['lesion_bboxes']))

17


So we would expect 17 separate ROI's in our mask that are calcifications

In [38]:
mask_path = db.full_mask_path / f'{img_id}_lesion_mask.png'
mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
print('original unfiltered mask:\t', np.unique(mask, return_counts=True))
print("However in the original mask we can find only 3 ROIs")
mask = db.adjust_mask_to_selected_lesions(mask, 38)
print('\nfiltered mask', np.unique(mask, return_counts=True))


original unfiltered mask:	 (array([ 0, 12, 13, 17], dtype=uint8), array([4030422,  120973,     104,   11829]))
However in the original mask we can find only 3 ROIs

filtered mask (array([ 0, 13], dtype=uint8), array([4163224,     104]))


We see that filtering reduced number of ROIs in the mask from 3 to 1, even though we initially had to have 17

From df below we can see that they all should have been stored in the mask, and that 12 and 17 are correctly filtered non calcification ROIs.

But we are still missing our 16 calcification ROIs

In [42]:
db.rois_df[db.rois_df.img_id == img_id][['img_id', 'area', 'index_in_image',  'number_of_points', 'micros', 'distortion', 'asymmetry', 'lesion_type', 'artifact',  'center', 'lesion_bbox',	'center_crop', 'stored', ]].sort_values(by=['index_in_image'])

Unnamed: 0,img_id,area,index_in_image,number_of_points,micros,distortion,asymmetry,lesion_type,artifact,center,lesion_bbox,center_crop,stored
886,22579730,0.0,1,1,1.0,0.0,0.0,calcification,False,"(2314, 920)","[(2314, 920), (2314, 920)]","(246, 920)",True
885,22579730,0.0,2,1,True,False,False,calcification,False,"(2309, 926)","[(2309, 926), (2309, 926)]","(251, 926)",True
883,22579730,0.001568,3,4,True,False,False,calcification,False,"(1818, 1161)","[(1816, 1157), (1821, 1165)]","(742, 1161)",True
882,22579730,0.0,4,1,True,False,False,calcification,False,"(1845, 1147)","[(1845, 1147), (1845, 1147)]","(715, 1147)",True
881,22579730,0.0,5,1,True,False,False,calcification,False,"(1988, 1160)","[(1988, 1160), (1988, 1160)]","(572, 1160)",True
896,22579730,0.0,6,1,True,False,False,calcification,False,"(1794, 1150)","[(1794, 1150), (1794, 1150)]","(766, 1150)",True
894,22579730,0.0,7,1,True,False,False,calcification,False,"(1767, 1158)","[(1767, 1158), (1767, 1158)]","(793, 1158)",True
888,22579730,0.0,8,1,True,False,False,calcification,False,"(1862, 1068)","[(1862, 1068), (1862, 1068)]","(698, 1068)",True
887,22579730,0.003825,9,6,True,False,False,calcification,False,"(1872, 1104)","[(1867, 1099), (1878, 1110)]","(688, 1104)",True
889,22579730,0.0,10,1,True,False,False,calcification,False,"(1864, 1094)","[(1864, 1094), (1864, 1094)]","(696, 1094)",True


We can also see that among those ROIs that weren't saved in the mask, some have are, so are not single pixel segmentations. And since `stored` == `True` for them, the problem probably wasn't the image boundary. 

In [41]:
db.rois_df[db.rois_df.img_id == img_id]

Unnamed: 0,case_id,img_id,side,view,area,center,center_crop,dev,index_in_image,max,...,micros,distortion,asymmetry,finding_notes,lesion_annot,pectoral_muscle,artifact,lesion_type,radius,partition
880,bbd6a3a35438c11b,22579730,R,MLO,0.0,"(1840, 1187)","(720, 1187)",0.0,18,3300.0,...,True,False,False,2 nódulos + micros,yes,False,False,calcification,0.0001,train
881,bbd6a3a35438c11b,22579730,R,MLO,0.0,"(1988, 1160)","(572, 1160)",0.0,5,3448.0,...,True,False,False,2 nódulos + micros,yes,False,False,calcification,0.0001,train
882,bbd6a3a35438c11b,22579730,R,MLO,0.0,"(1845, 1147)","(715, 1147)",0.0,4,3617.0,...,True,False,False,2 nódulos + micros,yes,False,False,calcification,0.0001,train
883,bbd6a3a35438c11b,22579730,R,MLO,0.001568,"(1818, 1161)","(742, 1161)",221.736923,3,3715.0,...,True,False,False,2 nódulos + micros,yes,False,False,calcification,4.0001,train
884,bbd6a3a35438c11b,22579730,R,MLO,0.0,"(1779, 1255)","(781, 1255)",0.0,19,3493.0,...,True,False,False,2 nódulos + micros,yes,False,False,calcification,0.0001,train
885,bbd6a3a35438c11b,22579730,R,MLO,0.0,"(2309, 926)","(251, 926)",0.0,2,1634.0,...,True,False,False,2 nódulos + micros,yes,False,False,calcification,0.0001,train
886,bbd6a3a35438c11b,22579730,R,MLO,0.0,"(2314, 920)","(246, 920)",0.0,1,1935.0,...,1.0,0.0,0.0,2 nódulos + micros,yes,False,False,calcification,0.0001,train
887,bbd6a3a35438c11b,22579730,R,MLO,0.003825,"(1872, 1104)","(688, 1104)",249.826996,9,3976.0,...,True,False,False,2 nódulos + micros,yes,False,False,calcification,5.700977,train
888,bbd6a3a35438c11b,22579730,R,MLO,0.0,"(1862, 1068)","(698, 1068)",0.0,8,3671.0,...,True,False,False,2 nódulos + micros,yes,False,False,calcification,0.0001,train
889,bbd6a3a35438c11b,22579730,R,MLO,0.0,"(1864, 1094)","(696, 1094)",0.0,10,3458.0,...,True,False,False,2 nódulos + micros,yes,False,False,calcification,0.0001,train


In [53]:
bad_imgs = []
for i in tqdm(range(310), total=310):
    if db[i]['lesion_bboxes'] != len(db.rois_df[db.rois_df.img_id == db.img_df.iloc[i].img_id]):
        bad_imgs.append(i)

100%|██████████| 310/310 [00:49<00:00,  6.31it/s]


In [54]:
len(bad_imgs)/310

1.0