In [1]:
import numpy as np, pandas as pd
import glob,os,sys,argparse,shutil,gc,copy,warnings,random,logging,multiprocessing,time
from datetime import timedelta
from wsi import slide,filters,tiles,util
import PIL,pyvips

# Wall-clock reference point; the final cell subtracts it to report elapsed time.
START_TIME = time.time()

# Root logger, configured to emit INFO and above.
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO)


In [2]:
# --- Run configuration -------------------------------------------------------
STAGE = 'train'
BASE_PAGE = 5           # page index used as the reference resolution
PAGES_TO_EXTRACT = [3]  # pages from which tiles are cut

# --- Locations (both roots come from environment variables) ------------------
DATA_DIR = os.environ['DIR_RAW_DATA']
RAW_IMG_DIR = f'{DATA_DIR}/images'
BASE_DIR = os.path.join(os.environ['DIR_PROCESSED_DATA'], 'workspace')
DIR_OUTPUT_TILES = f'{BASE_DIR}/{STAGE}/tiles'

# Coordinate multiplier per page: 2**(BASE_PAGE - page), so page BASE_PAGE maps to 1.
PAGE_IX_MULS = {page: 2 ** (BASE_PAGE - page) for page in range(BASE_PAGE + 1)}

# --- Annotation table --------------------------------------------------------
df = pd.read_csv(f'{DATA_DIR}/train.csv', sep=',')

# Slide id is the file name up to its first dot.
df['slide_id'] = df['filename'].str.split('.').str[0]
# Bounding box exactly as annotated, and a copy with x2/y2 clamped to max_x/max_y.
df['og_bbox'] = df.apply(lambda row: [row.x1, row.y1, row.x2, row.y2], axis=1)
df['corr_bbox'] = df.apply(
    lambda row: [row.x1, row.y1, min(row.max_x, row.x2), min(row.max_y, row.y2)],
    axis=1,
)

# Box extent, and the gap between the box's far edge and max_x / max_y.
df['w'] = df['x2'] - df['x1']
df['h'] = df['y2'] - df['y1']
df['xpad'] = df['max_x'] - df['x2']
df['ypad'] = df['max_y'] - df['y2']

# Annotations to exclude, keyed as '<slide_id>_<x1>_<y1>_<x2>_<y2>'.
# NOTE(review): the reason these entries are considered bad is not recorded here.
bads = [
    '7HxL729fl6_b_38839_78455_39604_79279',
    'HpWI7vJms2_a_71182_57113_71902_57462',
    'i9xm71KbYG_b_31467_113976_31986_114541',
    'JvxiXClFKl_a_20316_9218_20451_9502',
    'JvxiXClFKl_a_69751_37180_70029_37225',
    'Lzx7XfUujk_a_13308_34726_13336_34737',
    'Lzx7XfUujk_b_10468_84122_10792_84991',
    'Lzx7XfUujk_b_15486_145827_15738_146587',
    'Lzx7XfUujk_b_17181_145345_17593_145910',
    'Lzx7XfUujk_b_53455_82217_54007_82559',
    'Lzx7XfUujk_b_63207_142583_63569_143562',
    'Lzx7XfUujk_b_64715_142167_65022_142754',
    'Lzx7XfUujk_b_67782_116477_68140_117554',
    'Lzx7XfUujk_b_8802_84908_9241_85324',
    'rzsagNFXMn_a_15458_148084_15486_148204',
    'TFrBjcO8nJ_b_63103_127967_63818_128712',
    'WipCgQtJPE_b_25648_118162_26120_118743',
    'yJxYpOCh6m_b_18928_24013_18941_24038',
]
df['uniqid'] = df['slide_id'] + '_' + df['og_bbox'].apply(lambda box: '_'.join(map(str, box)))

# Split the table into excluded rows and the rows kept for processing.
is_bad = df['uniqid'].isin(bads)
df_bads = df[is_bad].reset_index(drop=True)
df = df[~is_bad].reset_index(drop=True)
print(f'dropped {len(df_bads)}',len(df))

dropped 18 909


In [3]:
# Keep the first annotation row of every file, giving one row per slide.
meta = df.drop_duplicates(subset=['filename'])
meta = meta.reset_index(drop=True)
meta.shape

(247, 15)

In [5]:
# Tiles whose 'Score' is below this value are dropped before extraction.
MIN_SCORE_THRESH = 0.1
# Upper bound on the number of tiles saved per slide for a single page.
MAX_TILES_PER_PAGE = 512

In [7]:
def save_tiles_for_page(cur_page,base_tile_sz,slide_name,df_tiles):
    """Cut the listed tiles of one slide out of one TIFF page and save them as PNGs.

    Rows of ``df_tiles`` are processed in the order given. For each row the
    'Row Start' / 'Col Start' values are multiplied by ``PAGE_IX_MULS[cur_page]``
    to obtain the tile's top-left corner on the page, and a square of
    ``PATCH_SIZES_ACT[cur_page]`` pixels is read from
    ``{RAW_IMG_DIR}/{slide_name}.tif``.

    Skipping rules:
      * rows with a negative 'Row Start' or 'Col Start' are skipped;
      * tiles that extend past the right or bottom edge of the page are
        skipped, except the row whose index label is 0, which is cropped to
        the page instead;
      * processing stops once ``MAX_TILES_PER_PAGE`` tiles have been saved.

    Output, under
    ``{DIR_OUTPUT_TILES}/{base_tile_sz}/{cur_page}_{patch_size}/{slide_name}``:
      * ``img/{tile_id}_{y1}_{x1}.png`` for every saved tile;
      * ``tile_meta.csv`` holding the input row of each saved tile plus the
        columns ``w``, ``h`` (saved tile size) and ``swidth``, ``sheight``
        (page size).

    Args:
        cur_page: TIFF page index to read; must be a key of the module-level
            ``PATCH_SIZES_ACT`` and ``PAGE_IX_MULS`` dicts.
        base_tile_sz: base tile size, used only to build the output path.
        slide_name: slide file name without the ``.tif`` extension.
        df_tiles: DataFrame with at least 'Row Start', 'Col Start' and
            'tile_id' columns.

    Returns:
        None. Failures on individual tiles are logged and do not abort the run.
    """
    patch_size = PATCH_SIZES_ACT[cur_page]
    slide_img = pyvips.Image.new_from_file(f'{RAW_IMG_DIR}/{slide_name}.tif', page=cur_page)
    res_mul = PAGE_IX_MULS[cur_page] #2**(base_page-cur_page)
    slide_width, slide_height = slide_img.width, slide_img.height
    # Use the real band count: assuming 3 bands would misread pages that carry
    # an extra (e.g. alpha) band without raising any error.
    n_bands = slide_img.bands

    dir_output = f'{DIR_OUTPUT_TILES}/{base_tile_sz}/{cur_page}_{patch_size}/{slide_name}' #b,p,sz
    dir_output_img = f'{dir_output}/img' #b,p,sz
    os.makedirs(dir_output_img,exist_ok=True)

    # One Region serves every fetch on this page; no need to rebuild it per tile.
    region = pyvips.Region.new(slide_img)

    ds_tiles=[]
    save_ctr=0
    for idx, row in df_tiles.iterrows():
        if save_ctr>=MAX_TILES_PER_PAGE: ##generated maximum tiles for page, exit
            break
        y = row['Row Start']
        x = row['Col Start']

        if (y<0 or x<0):
            continue

        x1 = x*res_mul
        y1 = y*res_mul

        region_width = region_height = patch_size
        # Tiles overhanging the page are dropped; only index label 0 is kept,
        # cropped to the page.
        #TODO better drop bad tiles logic
        if x1 + region_width > slide_width:
            region_width = slide_width - x1
            if idx>0:
                continue
        if y1 + region_height > slide_height:
            region_height = slide_height - y1
            if idx>0:
                continue

        try:
            pixels = region.fetch(x1, y1, region_width, region_height)
            # assumes 8-bit samples (uint8) — TODO confirm for other slide formats
            img = np.ndarray(
                buffer=pixels,
                dtype=np.uint8,
                shape=(region_height, region_width, n_bands))
            if n_bands > 3:
                # Keep the first three channels so the saved tile stays 3-channel.
                img = np.ascontiguousarray(img[:, :, :3])

            img = PIL.Image.fromarray(img)
            # No `quality` argument: it is a JPEG option and is not used by the PNG writer.
            img.save(f'{dir_output_img}/{row.tile_id}_{y1}_{x1}.png')
            save_ctr+=1

            row['w']=region_width
            row['h']=region_height
            row['swidth']=slide_width
            row['sheight']=slide_height

            ds_tiles.append(row)

        except Exception:
            # Best effort: one unreadable tile must not abort the whole slide,
            # but record enough context to find it again.
            logger.exception(
                'Failed tile for %s page %s: x=%s y=%s x1=%s y1=%s w=%s h=%s (page size %sx%s)',
                slide_name, cur_page, x, y, x1, y1,
                region_width, region_height, slide_width, slide_height)

    d = pd.DataFrame(ds_tiles)
    d.to_csv(f'{dir_output}/tile_meta.csv',index=False)

def generate_tiles_for_slide_list(slide_names,base_tile_sz,pages_to_extract):
    """Load each slide's tile-score table and save its tiles for every requested page.

    For every name in ``slide_names`` the file
    ``{slide.TILE_DATA_DIR}/{slide_name}-tile_data.csv`` is read, ordered by
    'Score' (highest first) and reduced to rows with
    ``Score >= MIN_SCORE_THRESH``. The surviving rows get a sequential
    ``tile_id``, the ``slide_id`` and the pre-filter row count (``og_ntiles``),
    and are handed to ``save_tiles_for_page`` once per page.

    Args:
        slide_names: iterable of slide names (no file extension).
        base_tile_sz: base tile size, forwarded to ``save_tiles_for_page``.
        pages_to_extract: iterable of page indices to extract tiles from.

    Returns:
        None.
    """
    for slide_name in slide_names:
        csv_path = f'{slide.TILE_DATA_DIR}/{slide_name}-tile_data.csv'
        # The first 14 lines are skipped — presumably a summary header written
        # ahead of the tile table; verify against the tile-data writer.
        tile_scores = pd.read_csv(csv_path, skiprows=14)
        tile_scores = tile_scores.sort_values(by='Score', ascending=False)

        # Record how many tiles existed before the score filter.
        tile_scores['og_ntiles'] = len(tile_scores)

        passes_threshold = tile_scores.Score >= MIN_SCORE_THRESH
        tile_scores = tile_scores[passes_threshold].reset_index(drop=True)

        # After the reset, tile_id 0 is the best-scoring tile.
        tile_scores['tile_id'] = tile_scores.index
        tile_scores['slide_id'] = slide_name

        for page in pages_to_extract:
            save_tiles_for_page(page, base_tile_sz, slide_name, tile_scores)


def multiprocess_generate_tiles(slides_list,base_tile_sz,pages_to_extract):
    """Extract tiles for many slides in parallel worker processes.

    ``slides_list`` is split into contiguous, near-equal chunks, one per
    worker, and each chunk is processed by ``generate_tiles_for_slide_list``.
    The number of workers is the smallest of the CPU count, 5 and the number
    of slides. The call blocks until every chunk has finished.

    Args:
        slides_list: sequence of slide names; an empty sequence is a no-op.
        base_tile_sz: base tile size, forwarded to the workers.
        pages_to_extract: page indices, forwarded to the workers.

    Returns:
        None.

    Raises:
        Whatever exception a worker raised, re-raised by ``AsyncResult.get``.
    """
    num_slides = len(slides_list)
    if num_slides == 0:
        # Nothing to do; also avoids dividing by a worker count of zero.
        return

    # Cap the worker count BEFORE creating the pool so no idle processes are spawned.
    num_processes = min(multiprocessing.cpu_count(), 5, num_slides)

    # Integer chunk boundaries: chunk k covers [k*n//p, (k+1)*n//p). Unlike a
    # floating-point stride this cannot lose a slide to rounding.
    tasks = []
    for chunk_ix in range(num_processes):
        start_index = chunk_ix * num_slides // num_processes
        end_index = (chunk_ix + 1) * num_slides // num_processes
        sublist = slides_list[start_index:end_index]
        tasks.append((sublist,base_tile_sz,pages_to_extract))

    # The context manager releases the worker processes once all results are
    # in (or an error propagates), so repeated calls do not leak pools.
    with multiprocessing.Pool(num_processes) as pool:
        results = [pool.apply_async(generate_tiles_for_slide_list, t) for t in tasks]
        for result in results:
            result.get()



In [None]:
# Slides to process; slice the list (e.g. [:10]) for a quick trial run.
NAMES = meta.slide_id.tolist()

BASE_TILE_SIZES = [128,160,192,224,256]

for BASE_TILE_SZ in BASE_TILE_SIZES:
    # Point the wsi helpers at the folders for this base tile size.
    tiles.TILE_SIZE_BASE = BASE_TILE_SZ
    slide.TILE_DATA_DIR = os.path.join(slide.BASE_DIR, f"tile_data/{BASE_TILE_SZ}")
    slide.TOP_TILES_DIR = os.path.join(slide.BASE_DIR, f"top_tiles/{BASE_TILE_SZ}")
    # Module-level on purpose: save_tiles_for_page reads PATCH_SIZES_ACT as a global.
    PATCH_SIZES_ACT ={i:BASE_TILE_SZ*2**(BASE_PAGE-i) for i in range(BASE_PAGE)} #patch size to extract for each page


    print(f'##### GENERATING TILE META {BASE_TILE_SZ} tile sizes: {PATCH_SIZES_ACT} #####')
    tiles.multiprocess_filtered_images_to_tiles(image_list=NAMES, display=False, save_summary=False, save_data=True, save_top_tiles=False)
    for PAGE in PAGES_TO_EXTRACT:
        SIZE = PATCH_SIZES_ACT[PAGE]
        print(f'##### GENERATING TILES {BASE_TILE_SZ}_{PAGE}_{SIZE} #####')
        # Pass only the current page: handing over the whole PAGES_TO_EXTRACT
        # list here would re-extract every page on each loop iteration.
        multiprocess_generate_tiles(NAMES,BASE_TILE_SZ,[PAGE])

    elapsed = time.time() - START_TIME
    # Report the base size and the page list rather than PAGE/SIZE left over
    # from the inner loop (stale, or undefined when no page was requested).
    print(f'##### DONE GENERATING TILES {BASE_TILE_SZ} PAGES: {PAGES_TO_EXTRACT} TOTAL TIME: {timedelta(seconds=elapsed)} #####')
    gc.collect()
