In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting.
#   %reload_ext autoreload / %autoreload 2 : (re)load the autoreload extension and use its mode 2
#   %matplotlib inline                     : show matplotlib figures in the cell output
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from IPython.display import display, HTML

# import matplotlib and numpy
import matplotlib.pyplot as plt 
import matplotlib.image as mpimage
import matplotlib.patches as patches

from matplotlib import cm
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from mpl_toolkits.axes_grid1 import make_axes_locatable
import colorcet as cc

import numpy as np 
import pandas as pd
import cv2

# import operating system and glob libraries
import os
import time
from datetime import datetime
from pathlib import Path

# Module-level lists, presumably meant to collect problems found while processing
# DICOM images and observation images.
# NOTE(review): no visible cell appends to or reads these lists — confirm they are still needed.
dicomImage_issues = []
observationImage_issues = []

In [3]:
# Grey colormap `linear_grey_0_100_c0` from colorcet; in the visible cells it is only
# referenced from commented-out plotting calls.
cmap_grey = cc.cm.linear_grey_0_100_c0

In [4]:
# Root of the data tree (relative to the notebook's working directory) and the
# folder holding the converted images plus their case table.
commonPath     = Path("../../mammography-data/")
converted_dir  = commonPath/"converted_images/"

# Case table: a ';'-separated CSV with (at least) the columns used here:
# "purpose", "simple_pathology" and "case_id".
cases = pd.read_csv(converted_dir/"converted_data_with_dimensions.csv", sep=";")
# Number of cases per (purpose, simple_pathology) combination — a quick sanity check of the split.
gp    = cases.groupby( ["purpose", "simple_pathology"])["case_id"].count()
print(f"Total:{len(cases)}")
print( gp )


Total:3101
purpose  simple_pathology          
test     BENIGN                         381
         BENIGN;MALIGNANT                 4
         MALIGNANT                      260
train    BENIGN                        1346
         BENIGN;MALIGNANT                 5
         BENIGN;MALIGNANT;MALIGNANT       2
         MALIGNANT                     1097
         MALIGNANT;BENIGN                 6
Name: case_id, dtype: int64


In [5]:
# equalized_dir: input tree that is globbed for x-ray and observation-mask PNGs further down.
# tile_dir     : output tree (a "tiles" sub-folder of equalized_dir) for the generated tiles and tiles.csv.
equalized_dir = commonPath/"mammography-dogscats-match-equalization-BINS-CHX/"
tile_dir      = commonPath/"mammography-dogscats-match-equalization-BINS-CHX/tiles/"

In [6]:
def simplePlot(img, title, cmap, figsize=(8,8)):
    """Show one image in a new figure.

    Parameters
    ----------
    img     : image array accepted by `plt.imshow`.
    title   : text placed above the image.
    cmap    : matplotlib colormap used to render the image.
    figsize : figure size in inches, (width, height).
    """
    plt.figure(figsize=figsize)
    plt.title(title)
    plt.imshow(img, cmap=cmap)
    plt.show()

In [7]:
def doublePlot(mainTitle, title1, img1, title2, img2, colormap, width = 16 ):
    """Show two images side by side, each with a horizontal colorbar underneath.

    Parameters
    ----------
    mainTitle      : accepted for call compatibility; not drawn (no suptitle is set).
    title1, title2 : titles of the left and right panel.
    img1, img2     : images to show; the figure height is derived from the
                     aspect ratio of `img1`.
    colormap       : matplotlib colormap used for both panels.
    width          : figure width in inches.
    """
    # Keep the aspect ratio of the first image for the whole figure.
    aspect = img1.shape[0] / img1.shape[1]
    figure = plt.figure(figsize=(width, round(width * aspect)))
    plt.tight_layout()

    def _panel(position, panel_title, image):
        # One image panel plus a colorbar attached below it.
        axes = figure.add_subplot(position)
        axes.set_title(panel_title)
        shown = axes.imshow(image, cmap=colormap)
        bar_axes = make_axes_locatable(axes).append_axes("bottom", "5%", pad="3%")
        figure.colorbar(shown, cax=bar_axes, orientation="horizontal")

    # 221 / 222 are the two upper cells of a 2x2 grid.
    _panel(221, title1, img1)
    _panel(222, title2, img2)

    plt.show()
    return

In [8]:
def plotHistogramsBeforeAndAfter(title, before, after, limits=[0.01, 0.99], x=None):
    """Plot the value histograms of two arrays next to each other.

    Parameters
    ----------
    title  : figure title.
    before : array shown in the left histogram.
    after  : array shown in the right histogram.
    limits : [low, high] value range covered by the 512 histogram bins;
             values outside the range are not counted.
    x      : unused.
    """
    nb_plot_bins = 512

    figure = plt.figure(figsize=(12, 12), dpi=100)
    figure.suptitle(title, fontsize=15)
    plt.tight_layout()

    # 221 / 222 are the two upper cells of a 2x2 grid.
    panels = ((221, "before", before), (222, "After: ", after))
    for position, label, values in panels:
        axes = figure.add_subplot(position)
        axes.set_title(label)
        axes.hist(values.flatten(), bins=nb_plot_bins, range=limits)

    plt.show()

In [9]:
def centerOfObs(mask):
    """Find the external contours in `mask` and describe each one.

    Contours with fewer than 3 points (isolated pixels at the border of an
    observation) and contours with zero area are ignored.

    Parameters
    ----------
    mask : single-channel 8-bit image passed to `cv2.findContours`
           (a copy is passed, so `mask` itself is not modified).

    Returns
    -------
    centers : ndarray, one [row, col] centroid per kept contour.
    bbs     : ndarray, one [row, col, height, width] bounding box per kept
              contour (matrix order, i.e. OpenCV's x/y swapped).
    area    : ndarray, contour area (moment m00) truncated to int.
    All three arrays are empty when no contour is kept.
    """
    # cv2.findContours returns (image, contours, hierarchy) in OpenCV 3.x but
    # (contours, hierarchy) in 2.x and 4.x; the contour list is always the
    # second-to-last element, so index from the end to work on every version.
    contours = cv2.findContours( mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE )[-2]

    centers, bbs, area = [], [], []
    for c in contours:
        # ignore isolated pixels at the border of the observation
        if c.shape[0] < 3 : continue

        c = np.squeeze(c)

        # area and centroid come from the spatial moments of the contour
        moments = cv2.moments(c)
        m00 = moments["m00"]
        if m00 <= 0 : continue

        # boundingRect gives (x, y, w, h); store it in matrix order (row, col, height, width)
        x, y, w, h = cv2.boundingRect(c)

        cX = int(moments["m10"] / m00)
        cY = int(moments["m01"] / m00)
        centers.append([cY,cX])
        bbs.append( np.asarray([y, x, h, w]) )
        area.append(int(m00))

    return np.asarray(centers), np.asarray(bbs), np.asarray(area)

In [10]:
#def pixel_ratio(im,value): return round(np.sum(im.flatten()==value)/(im.shape[0]*im.shape[1]), 3)
def pixel_count(im,value):
    """Return how many elements of the array `im` are exactly equal to `value`."""
    matches = im.ravel() == value
    return matches.sum()

In [11]:
def offset2Tiles(of, im_dir, ma_dir, case_id, im, mask, sz, obsArea, ignoreObs=False, min_Air=0.5):
    """Cut tiles from an image/mask pair, write them as PNG files and describe them.

    For every offset in `of` a tile of size `sz` is sliced from `im` and `mask`.
    A tile is skipped when the fraction of image pixels equal to 0 ("air")
    exceeds `min_Air`, or, with `ignoreObs=True`, when its mask contains any
    pixel equal to 128 (counted as benign) or 255 (counted as malignant).
    Kept tiles are written to `im_dir` (image, scaled to uint16) and `ma_dir`
    (mask) under the name "<case_id>_tile_<row>_<col>.png".

    Parameters
    ----------
    of       : iterable of (row, col) top-left tile offsets.
    im_dir   : Path of the folder receiving the image tiles.
    ma_dir   : Path of the folder receiving the mask tiles.
    case_id  : identifier used as prefix of the tile id.
    im       : 2D image array; written as (65535.0*tile + 0.5) cast to uint16.
    mask     : 2D mask array aligned with `im`.
    sz       : (rows, cols) tile size.
    obsArea  : value stored unchanged in the "obsAreas" column of every row.
    ignoreObs: skip tiles that contain benign or malignant mask pixels.
    min_Air  : maximum accepted fraction of air pixels in a tile.

    Returns
    -------
    pandas.DataFrame with one row per written tile and the columns
    case_id, tile_id, pAir, pHealthy, pBenign, pMalignant, rCenter, cCenter,
    tile_size, obsAreas, fnImage, fnMask.
    NOTE(review): the table is built with np.column_stack from string and
    numeric lists, so the numeric columns presumably come back as strings — confirm.
    """
    column_names = ["case_id","tile_id","pAir","pHealthy",
                    "pBenign","pMalignant","rCenter","cCenter","tile_size","obsAreas","fnImage","fnMask"]
    table = {name: [] for name in column_names}

    tile_rows, tile_cols = sz[0], sz[1]
    noTilePixels = tile_rows*tile_cols

    for o in of:
        top, left = o[0], o[1]
        window = (slice(top, top+tile_rows), slice(left, left+tile_cols))

        # ignore tiles that are mostly air
        im_t  = im[window]
        p_Air = pixel_count(im_t, 0)
        if p_Air/noTilePixels > min_Air:
            continue

        ma          = mask[window]
        p_Benign    = pixel_count(ma, 128)
        p_Malignant = pixel_count(ma, 255)

        # with ignoreObs, skip tiles that contain benign or malignant tissue
        if ignoreObs and (p_Benign+p_Malignant)/noTilePixels > 0.0:
            continue

        # image and mask tile share the file name; they live in different folders
        tile_id   = f"{case_id}_tile_{top}_{left}"
        file_name = tile_id +".png"

        table["case_id"].append(    str(case_id) )
        table["tile_id"].append(    tile_id )
        table["pAir"].append(       p_Air )
        table["pHealthy"].append(   noTilePixels - p_Air - p_Benign - p_Malignant )
        table["pBenign"].append(    p_Benign )
        table["pMalignant"].append( p_Malignant )
        # tile centre as a fraction of the image height / width
        table["rCenter"].append(    round( (top +0.5*tile_rows)/im.shape[0] ,2) )
        table["cCenter"].append(    round( (left+0.5*tile_cols)/im.shape[1] ,2) )
        table["tile_size"].append(  noTilePixels )
        table["obsAreas"].append(   obsArea )
        table["fnImage"].append(    str(file_name) )
        table["fnMask"].append(     str(file_name) )

        cv2.imwrite( str( im_dir/file_name), (65535.0*im_t + 0.5).astype(np.uint16) )
        cv2.imwrite( str( ma_dir/file_name), ma )

    stacked = np.column_stack([table[name] for name in column_names])
    return pd.DataFrame(stacked, columns=column_names)

In [12]:
def imageCoord(xy_arrayIndex) :
    """Swap the first two entries of an index: array order (row, col) -> image order (x, y)."""
    row = xy_arrayIndex[0]
    col = xy_arrayIndex[1]
    return (col, row)
#sz :size of the tiles
def generate_tile2file( im_dir, ma_dir, case_id, im, mask, tilesPRObservation, sz=[256,256] ):
    """Write all tiles for one case and return their description table.

    Two sets of tiles are produced through `offset2Tiles`:
      1. tiles on a regular grid covering the image (the last row/column is
         shifted inwards so it ends at the image border); tiles that contain
         any observation pixel are left out (`ignoreObs=True`);
      2. for every observation found by `centerOfObs(mask)`,
         `tilesPRObservation` tiles at random offsets around its bounding box.

    Parameters
    ----------
    im_dir, ma_dir     : Path folders receiving the image and mask tiles.
    case_id            : identifier used as prefix of the tile ids.
    im                 : 2D image array.
    mask               : 2D mask array aligned with `im`.
    tilesPRObservation : number of random tiles drawn per observation.
    sz                 : (rows, cols) tile size.

    Returns
    -------
    pandas.DataFrame with the concatenated rows of all `offset2Tiles` calls,
    or None if there was no result at all.
    """
    sz = np.asarray(sz)
    im_shape = np.asarray(im.shape)

    # regular grid: nb tiles per axis, the last one aligned with the image border
    nb = np.ceil(im_shape / sz).astype(int)
    last_of = im_shape - sz
    of = []
    for i in range(0,nb[0]):
        r = i*sz[0] if i+1<nb[0] else last_of[0]
        for j in range(0,nb[1]):
            c = j*sz[1] if j+1<nb[1] else last_of[1]
            of.append((r,c))

    # Collect the per-call tables and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    frames = [ offset2Tiles(of, im_dir, ma_dir, case_id, im, mask, sz, obsArea=0, ignoreObs=True) ]

    # tiles that overlap the observations
    centers,bbs, area = centerOfObs(mask)
    for b,a in zip(bbs,area):
        # b is [row, col, height, width]; allow the tile to hang over the
        # bounding box by at most half a tile (or half the box if it is smaller)
        inset  = np.minimum(sz//2, b[2:4]//2)
        # smallest / largest top-left corner, clamped so the tile stays inside the image
        tl_min = b[0:2] + inset - sz
        tl_min = np.maximum(tl_min, np.zeros_like(tl_min))
        tl_max = b[0:2] + b[2:4] - inset
        tl_max = np.minimum(tl_max+sz, im.shape) -sz

        of = np.random.uniform( low=tl_min, high=tl_max, size=[tilesPRObservation,2] ).astype(int)
        frames.append( offset2Tiles(of, im_dir, ma_dir, case_id, im, mask, sz, obsArea=a, ) )

    frames = [f for f in frames if f is not None]
    if not frames:
        return None
    # Leave empty tables out of the concat: they add no rows and their dtypes would
    # otherwise take part in dtype resolution. With no rows at all, return an
    # empty table that still has the columns.
    non_empty = [f for f in frames if len(f) > 0]
    return pd.concat(non_empty) if non_empty else frames[0]


In [13]:
def generateTiles( cases, pathTrg, nbCommon, scaling, tilesPRObservation=5 ):
    """Generate tiles for every case and return one table describing all of them.

    For each row of `cases` the x-ray and its mask are read, optionally
    resized, and passed to `generate_tile2file`. The target folders mirror
    the source paths below `pathTrg`, with the first `nbCommon` path
    components removed.

    Image width handling:
      * width >= 512        : image and mask are resized by `scaling`;
      * 256 <= width < 512  : tiled at original size;
      * width < 256         : the case is skipped.

    Parameters
    ----------
    cases              : DataFrame with the columns case_id, fnXray, fnMask.
    pathTrg            : Path root folder for the generated tiles.
    nbCommon           : number of leading path components of fnXray/fnMask
                         that are dropped when building the target folder.
    scaling            : resize factor applied to images at least 512 wide.
    tilesPRObservation : random tiles generated per observation (default 5,
                         the value that used to be hard-coded).

    Returns
    -------
    pandas.DataFrame with the rows of all cases, or None if no case
    produced a result.
    """
    # Collect per-case tables and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and made the loop quadratic.
    frames   = []
    nb_tiles = 0
    nb_cases = len(cases)
    for i in range(nb_cases):
        r = cases.iloc[i]
        case_id, fnXray, fnMask = str(r.case_id), Path(r.fnXray), Path(r.fnMask)

        drImTrg  = pathTrg.joinpath(*fnXray.parts[nbCommon:]).parent
        drMskTrg = pathTrg.joinpath(*fnMask.parts[nbCommon:]).parent
        drImTrg.mkdir(parents=True, exist_ok=True)
        drMskTrg.mkdir(parents=True, exist_ok=True)

        # NOTE(review): the image is divided by 65536.0 here but the tiles are written
        # scaled by 65535.0 — confirm this asymmetry is intended.
        im    = cv2.imread(str(fnXray),  cv2.IMREAD_ANYDEPTH)/65536.0
        mask  = cv2.imread(str(fnMask), cv2.IMREAD_ANYDEPTH).astype(np.uint8)

        if im.shape[1] >= 512:
            dsize = (round(im.shape[1]*scaling),round(im.shape[0]*scaling))
            im    = cv2.resize(im,   dsize=dsize, interpolation=cv2.INTER_AREA)
            # nearest neighbour keeps the mask's discrete label values intact
            mask  = cv2.resize(mask, dsize=dsize, interpolation=cv2.INTER_NEAREST)
        elif im.shape[1] < 256: continue

        result = generate_tile2file( drImTrg, drMskTrg, case_id, im, mask, tilesPRObservation=tilesPRObservation )
        if result is not None:
            frames.append( result )
            nb_tiles += result.shape[0]

        # progress report every 50 cases
        if i % 50 == 0 :
            percentage = int(np.around( i*100 / nb_cases))
            print( f"current case: {r.case_id}  Processed: {percentage}%  Generated tiles: ", nb_tiles )

    if not frames:
        return None
    # Leave empty tables out of the concat; with no rows at all, return an
    # empty table that still has the columns.
    non_empty = [f for f in frames if len(f) > 0]
    return pd.concat(non_empty) if non_empty else frames[0]

In [14]:
# The string literal below is disabled debugging code (it is never executed): it would
# restrict `cases` to a hand-picked list of case ids.
"""
testcases =  [
#"Mass-Training_P_00797_LEFT_CC",
#"Mass-Training_P_00419_LEFT_MLO",
#"Mass-Test_P_00969_LEFT_CC",
#"Calc-Training_P_00937_RIGHT_MLO",
#"Mass-Training_P_00997_LEFT_CC" #the algo get width wrong because it is less<224
"Calc-Test_P_00038_LEFT_CC"
]
cases = cases[ np.isin( cases.case_id, testcases ) ]
"""
#tile_cases = cases[ np.isin( cases.purpose, ["training"x] ) ]
# wall-clock timer for the whole tiling run
start = time.perf_counter()

#build dataframe with case_id, fnXray, fnMask
# Mask files: keyed by the file stem with its last 9 characters removed, so that the key
# can be looked up in xFiles below.
# NOTE(review): presumably the 9 characters are a fixed-length mask suffix — confirm;
# several masks that map to the same key overwrite each other in this dict.
files       = list(equalized_dir.glob("**/full/*/obs_masks/*.png"))
mskFiles    = {str(f.stem[:-9]):str(f) for f in files}

# X-ray files: keyed by the full file stem.
files       = list(equalized_dir.glob("**/full/*/xrays/*.png"))
xFiles      = {str(f.stem):str(f) for f in files}

# One row [case_id, x-ray path, mask path] per mask key; raises KeyError if a mask key
# has no x-ray with the same stem.
caseFiles   = [ np.asarray([ k, xFiles[k], mskFiles[k] ]) for k in mskFiles.keys() ]

dfCaseFiles = pd.DataFrame(data=caseFiles, columns=["case_id","fnXray","fnMask"])
# nbCommon = len(equalized_dir.parts)+1: number of leading path components that generateTiles
# strips from each source path before re-rooting it under tile_dir.
dfTiles = generateTiles(dfCaseFiles, Path(tile_dir), len(equalized_dir.parts)+1,  scaling = 0.5)


print("runTime ", time.perf_counter() - start )

if dfTiles is not None: print("total number og tiles generated: ", dfTiles.shape)


current case: Mass-Test_P_01183_LEFT_MLO  Processed: 0%  Generated tiles:  38
current case: Mass-Test_P_01551_LEFT_MLO  Processed: 2%  Generated tiles:  1577
current case: Mass-Test_P_01518_LEFT_CC  Processed: 3%  Generated tiles:  3201
current case: Mass-Test_P_01477_LEFT_CC  Processed: 5%  Generated tiles:  4797
current case: Calc-Test_P_00643_LEFT_MLO  Processed: 6%  Generated tiles:  6628
current case: Mass-Test_P_01599_LEFT_MLO  Processed: 8%  Generated tiles:  8399
current case: Calc-Test_P_01224_RIGHT_CC  Processed: 10%  Generated tiles:  10152
current case: Mass-Test_P_01106_LEFT_MLO  Processed: 11%  Generated tiles:  11909
current case: Calc-Test_P_01030_RIGHT_CC  Processed: 13%  Generated tiles:  13764
current case: Mass-Test_P_01787_LEFT_MLO  Processed: 15%  Generated tiles:  15366
current case: Mass-Test_P_01797_LEFT_MLO  Processed: 16%  Generated tiles:  17250
current case: Calc-Test_P_00497_LEFT_MLO  Processed: 18%  Generated tiles:  18914
current case: Mass-Test_P_00947_

In [16]:
if dfTiles is not None:
    # Attach the case metadata to every tile (right join: keep all tiles), drop the
    # per-observation id and store the table next to the tiles.
    tile_cases = (
        pd.merge(cases, dfTiles, how="right", on="case_id")
          .drop(columns=["obs_id"])
    )
    tile_cases.to_csv(tile_dir /"tiles.csv", sep=";", index=False)
    print("tile_cases.shape: ",tile_cases.shape, " dfTiles.shape: ", dfTiles.shape)


tile_cases.shape:  (106955, 28)  dfTiles.shape:  (106955, 12)


In [17]:
#dfTiles.fnImage.head()
# Preview the first rows of the merged tile table.
# NOTE(review): tile_cases is only defined when dfTiles was not None in the cell that builds it.
display(tile_cases.head())
# Last expression of the cell: echoes the path of the CSV written for the tiles.
tile_dir  /"tiles.csv"

Unnamed: 0.1,Unnamed: 0,case_id,height,width,simple_pathology,patient_id,purpose,density,left_right,projection,...,pAir,pHealthy,pBenign,pMalignant,rCenter,cCenter,tile_size,obsAreas,fnImage,fnMask
0,0,Calc-Test_P_00038_LEFT_CC,3601,1296,BENIGN,P_00038,test,2.0,LEFT,CC,...,9280,56256,0,0,0.07,0.2,65536,0,Calc-Test_P_00038_LEFT_CC_tile_0_0.png,Calc-Test_P_00038_LEFT_CC_tile_0_0.png
1,0,Calc-Test_P_00038_LEFT_CC,3601,1296,BENIGN,P_00038,test,2.0,LEFT,CC,...,0,65536,0,0,0.21,0.2,65536,0,Calc-Test_P_00038_LEFT_CC_tile_256_0.png,Calc-Test_P_00038_LEFT_CC_tile_256_0.png
2,0,Calc-Test_P_00038_LEFT_CC,3601,1296,BENIGN,P_00038,test,2.0,LEFT,CC,...,6841,58695,0,0,0.21,0.59,65536,0,Calc-Test_P_00038_LEFT_CC_tile_256_256.png,Calc-Test_P_00038_LEFT_CC_tile_256_256.png
3,0,Calc-Test_P_00038_LEFT_CC,3601,1296,BENIGN,P_00038,test,2.0,LEFT,CC,...,0,65536,0,0,0.36,0.2,65536,0,Calc-Test_P_00038_LEFT_CC_tile_512_0.png,Calc-Test_P_00038_LEFT_CC_tile_512_0.png
4,0,Calc-Test_P_00038_LEFT_CC,3601,1296,BENIGN,P_00038,test,2.0,LEFT,CC,...,0,65536,0,0,0.36,0.59,65536,0,Calc-Test_P_00038_LEFT_CC_tile_512_256.png,Calc-Test_P_00038_LEFT_CC_tile_512_256.png


PosixPath('../../mammography-data/mammography-dogscats-match-equalization-BINS-CHX/tiles/tiles.csv')