In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from IPython.display import display, HTML

# import matplotlib and numpy
import matplotlib.pyplot as plt 
import matplotlib.image as mpimage
import matplotlib.patches as patches

from matplotlib import cm
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from mpl_toolkits.axes_grid1 import make_axes_locatable
import colorcet as cc

import numpy as np 
import pandas as pd
import cv2

# import operating system and glob libraries
import os
import time
from datetime import datetime
from pathlib import Path

cmap_grey = cc.cm.linear_grey_0_100_c0

In [3]:
commonPath     = Path("../../mammography-data/")
converted_dir  = commonPath/"converted_images"

cases = pd.read_csv(converted_dir/"converted_data_with_dimensions.csv", sep=";")
gp    = cases.groupby( ["purpose", "simple_pathology"])["case_id"].count()
print(f"Total:{len(cases)}")
print( gp )

dicomImage_issues = []
observationImage_issues = []


Total:3101
purpose  simple_pathology          
test     BENIGN                         381
         BENIGN;MALIGNANT                 4
         MALIGNANT                      260
train    BENIGN                        1346
         BENIGN;MALIGNANT                 5
         BENIGN;MALIGNANT;MALIGNANT       2
         MALIGNANT                     1097
         MALIGNANT;BENIGN                 6
Name: case_id, dtype: int64


In [4]:
equalized_dir = commonPath/"mammography-dogscats-match-equalization-BINS-CHX/full"
tile_dir      = commonPath/"mammography-dogscats-match-equalization-BINS-CHX/tiles/"
#print( list(equalized_dir.iterdir()) )

In [5]:
def centerOfObs(mask):
    #turn it to binary so that vi can use m00 as number of pixels i the roi. 
    #That is we do not give special treatment to different rois that touch 
    m = (mask.copy()>0).astype(np.uint8)
    img,contours,hierarchy = cv2.findContours( m, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE )
    centers = []
    bbs = []
    area = []
    # loop over the contours
    for c in contours:
        #ignore isonlated 1 pixels at the border og the observation
        if c.shape[0] < 3 : continue 

        c  = np.squeeze(c)
        bb = cv2.boundingRect(c)
        #arrange the bb totopleft , bottom right matrix coordinates
        bb = np.asarray([bb[i] for i in [1,0,3,2]])
        
        # compute the center of the contour
        moments = cv2.moments(c)
        m00 = moments["m00"]
        m10 = moments["m10"]
        m01 = moments["m01"]
        #print( f"{m00}, {m10}, {m01}")
        if m00 > 0:
            cX = int(m10 / m00)
            cY = int(m01 / m00)
            centers.append([cY,cX])
            bbs.append( bb )
            area.append(int(m00))
    return np.asarray(centers), np.asarray(bbs), np.asarray(area)

In [6]:
def pixel_count(im,value): return np.sum(im.flatten()==value)

In [7]:
from enum import Enum
class ObsType(Enum):
    EMPTY=0           ,"empty"                   # outside the breast
    NORMAL=1          ,"normal"                  # tissue with no segments
    BENIGN_CALC=2     ,"benign_calcification"    # tissue with benign calcification
    MALIGNANT_CALC=3  ,"malignant calcification"  # tissue with malignant calcification
    BENIGN_MASS=4     ,"benign_mass"              # tissue with a benign node
    MALIGNANT_MASS=5  ,"malignant:mass"           # tissue with a malignant node
def enum2int(e): return e.value[0]

In [8]:
def offset2Tiles(of, im_dir, ma_dir, case_id, im, mask, sz, obsArea, ignoreObs=False, min_Air=0.5):
    case_ids,tile_ids,fnImages, fnMasks, pAir,pHealthy, \
        pBenign_calc, pMalignant_calc, pBenign_mass, pMalignant_mass, \
            rCenter,cCenter,tile_size, obsAreas = [],[],[],[],[],[],[],[],[],[],[],[],[],[]

    noTilePixels = sz[0]*sz[1]
    for o in of:
        im_t  = im[o[0]:o[0]+sz[0], o[1]:o[1]+sz[1]]
        p_Air = pixel_count(im_t,ObsType.EMPTY)
        if p_Air/float(noTilePixels) > min_Air : continue
            
        ma               = mask[o[0]:o[0]+sz[0], o[1]:o[1]+sz[1]]
        p_Benign_calc    = pixel_count(ma,enum2int(ObsType.BENIGN_CALC))
        p_Malignant_calc = pixel_count(ma,enum2int(ObsType.MALIGNANT_CALC))
        p_Benign_mass    = pixel_count(ma,enum2int(ObsType.BENIGN_MASS))
        p_Malignant_mass = pixel_count(ma,enum2int(ObsType.MALIGNANT_MASS))
        p_Healthy        = pixel_count(ma,enum2int(ObsType.NORMAL))
        
        #if ignoreObs then skip tiles with benign og malignant tissue 
        if ignoreObs and (p_Healthy+p_Air) < noTilePixels: continue
            
        tile_id    = f"{case_id}_tile_{o[0]}_{o[1]}" 
        image_file = tile_id +".png" #f"{im_dir}/{tile_id}.png"
        mask_file  = tile_id +".png" #f"{ma_dir}/{tile_id}.png"

        case_ids.append(   str(case_id) )
        tile_ids.append(   tile_id )
        fnImages.append(   str(image_file) )
        fnMasks.append(    str(mask_file) )
        pAir.append(       p_Air )
        pHealthy.append(   p_Healthy )
        pBenign_calc.append(    p_Benign_calc )
        pMalignant_calc.append( p_Malignant_calc )
        pBenign_mass.append(    p_Benign_mass )
        pMalignant_mass.append( p_Malignant_mass )
        rCenter.append( round( (o[0]+0.5*sz[0])/im.shape[0] ,2) )
        cCenter.append( round( (o[1]+0.5*sz[1])/im.shape[1] ,2) )
        tile_size.append(  noTilePixels )
        obsAreas.append(   obsArea )
        
        cv2.imwrite( str( im_dir/image_file), (65535.0*im_t + 0.5).astype(np.uint16) )    
        cv2.imwrite( str( ma_dir/mask_file), ma )

    result = pd.DataFrame( 
        np.column_stack([
                    case_ids, tile_ids, pAir,  pHealthy, \
                    pBenign_calc, pMalignant_calc, pBenign_mass, pMalignant_mass, \
            rCenter, cCenter, tile_size, obsAreas,  fnImages, fnMasks ]),
        columns = ["case_id","tile_id","empty","normal", \
                   "benign_calc","malignant_calc", "benign_mass", "malignant_mass", \
                   "rCenter","cCenter","tile_size","obsAreas","fnImage","fnMask"])
        
    return result

In [9]:
def imageCoord(xy_arrayIndex) : return (xy_arrayIndex[1], xy_arrayIndex[0] )
#sz :size of the tiles
def generate_tile2file( im_dir, ma_dir, case_id, im, mask, tilesPRObservation, sz=[256,256] ):
    sz = np.asarray(sz)

    #generate tiles on a regular grid
    nb = np.ceil(im.shape / np.asarray(sz)).astype(int)
    last_of = im.shape - sz
    #calculate alle offset
    of = []
    for i in range(0,nb[0]):
        r = i*sz[0] if i+1<nb[0] else last_of[0]
        for j in range(0,nb[1]):
            c = j*sz[1] if j+1<nb[1] else last_of[1]
            of.append((r,c)) 
    #get alle tile that do not overlap with observations (ignoreObs=True)
    allResult = offset2Tiles(of, im_dir, ma_dir, case_id, im, mask, sz, obsArea=0, ignoreObs=True)

    #generate tiles that overlap the observations
    centers,bbs, area = centerOfObs(mask)
    for b,a in zip(bbs,area): 
        inset  = np.minimum(sz//2, b[2:4]//2)
        tl_min = b[0:2] + inset - sz
        tl_min = np.maximum(tl_min, np.zeros_like(tl_min))
        tl_max = b[0:2] + b[2:4] - inset
        tl_max = np.minimum(tl_max+sz, im.shape) -sz
        #print("b: ", b, " tl_min: ", tl_min, " tl_max: ",tl_max, " inset: ", inset, "area:", round(a/(sz[0]*sz[1]),2) )

        of = np.random.uniform( low=tl_min, high=tl_max, size=[tilesPRObservation,2] ).astype(int)
        result = offset2Tiles(of, im_dir, ma_dir, case_id, im, mask, sz, obsArea=a, )
        if allResult is None:    allResult = result 
        elif result is not None: allResult = allResult.append( result )
    return allResult


In [10]:
def generateTiles( cases, pathTrg, nbCommon, scaling, size ):
    
    dfTiles = None
    nb_cases = len(cases)
    print(f"Cases to process:{nb_cases}" )
    for i in range(len(cases)):
        r = cases.iloc[i]
        case_id, fnXray, fnMask = str(r.case_id), Path(r.fnXray), Path(r.fnMask)
                                  
        drImTrg  = pathTrg.joinpath(*fnXray.parts[nbCommon:]).parent
        drMskTrg = pathTrg.joinpath(*fnMask.parts[nbCommon:]).parent
        drImTrg.mkdir(parents=True, exist_ok=True)
        drMskTrg.mkdir(parents=True, exist_ok=True)
        
        im    = cv2.imread(str(fnXray),  cv2.IMREAD_ANYDEPTH)/65536.0
        mask  = cv2.imread(str(fnMask), cv2.IMREAD_ANYDEPTH).astype(np.uint8)
    
        if min(im.shape) >= size*scaling and scaling < 1.:
            dsize = (round(im.shape[1]*scaling),round(im.shape[0]*scaling))
            im    = cv2.resize(im,   dsize=dsize, interpolation=cv2.INTER_AREA)
            mask  = cv2.resize(mask, dsize=dsize, interpolation=cv2.INTER_NEAREST)
        elif min(im.shape) < size:
            s     = float(size)/min(im.shape)
            dsize = (round(im.shape[1]*s),round(im.shape[0]*s))
            im    = cv2.resize(im,   dsize=dsize, interpolation=cv2.INTER_AREA)
            mask  = cv2.resize(mask, dsize=dsize, interpolation=cv2.INTER_NEAREST)
       
        prObservation=3
        result  = generate_tile2file( drImTrg, drMskTrg, case_id, im, mask, 
                                      tilesPRObservation=prObservation, sz=[size,size] )
        dfTiles = result if dfTiles is None else dfTiles.append( result )
        
        if i-(int(i/50)*50) == 0 :
            percentage = int(np.around( i*100 / nb_cases))
            print( f"current case: {r.case_id}  Processed: {percentage}%  Generated tiles: ", dfTiles.shape[0] if dfTiles is not None else 0 )
    return dfTiles

In [11]:
start = time.perf_counter()

#build dataframe with case_id, fnXray, fnMask
files       = list(equalized_dir.glob("**/obs_masks/*.png"))
mskFiles    = {str(f.stem[:-9]):str(f) for f in files}

files       = list(equalized_dir.glob("**/xrays/*.png"))
xFiles      = {str(f.stem):str(f) for f in files}

caseFiles   = [ np.asarray([ k, xFiles[k], mskFiles[k] ]) for k in mskFiles.keys() ]
dfCaseFiles = pd.DataFrame(data=caseFiles, columns=["case_id","fnXray","fnMask"])

%time dfTiles = generateTiles(dfCaseFiles, Path(tile_dir), len(equalized_dir.parts), scaling = 1.0, size=512)

if dfTiles is not None: 
    print("total number og tiles generated: ", dfTiles.shape)
    tile_cases = pd.merge(cases, dfTiles, how="right", left_on='case_id', right_on='case_id')
    tile_cases = tile_cases.drop(["obs_id"],axis=1)
    tile_cases.to_csv(tile_dir /"tiles.csv", sep=";", index=False)
    print("tile_cases.shape: ",tile_cases.shape, "\nsaved at: ", tile_dir/"tiles.csv")
    display(tile_cases.head())
else: print("dfTiles is ZERO")
    
print("runTime ", time.perf_counter() - start )


Cases to process:3101
current case: Mass-Test_P_01183_LEFT_MLO  Processed: 0%  Generated tiles:  20
current case: Mass-Test_P_01551_LEFT_MLO  Processed: 2%  Generated tiles:  854
current case: Mass-Test_P_01518_LEFT_CC  Processed: 3%  Generated tiles:  1775
current case: Mass-Test_P_01477_LEFT_CC  Processed: 5%  Generated tiles:  2642
current case: Calc-Test_P_00643_LEFT_MLO  Processed: 6%  Generated tiles:  3711
current case: Mass-Test_P_01599_LEFT_MLO  Processed: 8%  Generated tiles:  4739
current case: Calc-Test_P_01224_RIGHT_CC  Processed: 10%  Generated tiles:  5744
current case: Mass-Test_P_01106_LEFT_MLO  Processed: 11%  Generated tiles:  6749
current case: Calc-Test_P_01030_RIGHT_CC  Processed: 13%  Generated tiles:  7844
current case: Mass-Test_P_01787_LEFT_MLO  Processed: 15%  Generated tiles:  8735
current case: Mass-Test_P_01797_LEFT_MLO  Processed: 16%  Generated tiles:  9853
current case: Calc-Test_P_00497_LEFT_MLO  Processed: 18%  Generated tiles:  10770
current case: Ma

Unnamed: 0.1,Unnamed: 0,case_id,height,width,simple_pathology,patient_id,purpose,density,left_right,projection,...,benign_calc,malignant_calc,benign_mass,malignant_mass,rCenter,cCenter,tile_size,obsAreas,fnImage,fnMask
0,0,Calc-Test_P_00038_LEFT_CC,3601,1296,BENIGN,P_00038,test,2.0,LEFT,CC,...,0,0,0,0,0.21,0.2,262144,0,Calc-Test_P_00038_LEFT_CC_tile_512_0.png,Calc-Test_P_00038_LEFT_CC_tile_512_0.png
1,0,Calc-Test_P_00038_LEFT_CC,3601,1296,BENIGN,P_00038,test,2.0,LEFT,CC,...,0,0,0,0,0.36,0.2,262144,0,Calc-Test_P_00038_LEFT_CC_tile_1024_0.png,Calc-Test_P_00038_LEFT_CC_tile_1024_0.png
2,0,Calc-Test_P_00038_LEFT_CC,3601,1296,BENIGN,P_00038,test,2.0,LEFT,CC,...,0,0,0,0,0.36,0.59,262144,0,Calc-Test_P_00038_LEFT_CC_tile_1024_512.png,Calc-Test_P_00038_LEFT_CC_tile_1024_512.png
3,0,Calc-Test_P_00038_LEFT_CC,3601,1296,BENIGN,P_00038,test,2.0,LEFT,CC,...,0,0,0,0,0.5,0.59,262144,0,Calc-Test_P_00038_LEFT_CC_tile_1536_512.png,Calc-Test_P_00038_LEFT_CC_tile_1536_512.png
4,0,Calc-Test_P_00038_LEFT_CC,3601,1296,BENIGN,P_00038,test,2.0,LEFT,CC,...,0,0,0,0,0.64,0.2,262144,0,Calc-Test_P_00038_LEFT_CC_tile_2048_0.png,Calc-Test_P_00038_LEFT_CC_tile_2048_0.png


runTime  3368.006689283
