In [1]:
import shutil
from glob import glob
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

### Let's make a truncated directory containing only the images within the ROI, and let's copy over only the slides (not the masks)

In [2]:
# Load the tile-level metadata table. dtype=False turns off pandas' dtype
# inference, and the index is renumbered from 0.
img_names_df = (
    pd.read_json(
        path_or_buf='../../../data/zoom_1_256_256_partition/meta/all_info_0.4_grayscale_tol_with_roi.json',
        dtype=False,
    )
    .reset_index(drop=True)
)

In [3]:
# Keep only the tiles flagged as inside the ROI whose non-gray ratio exceeds 0.2.
# .copy() detaches the result from img_names_df.
img_names_df_trunc = img_names_df.loc[
    img_names_df['is_roi'].eq(1) & img_names_df['non_gray_ratio'].gt(0.2)
].copy()

In [4]:
# (rows, columns) remaining after the ROI / non-gray filter.
img_names_df_trunc.shape

(116768, 7)

In [5]:
# Preview the first rows of the filtered tile table.
img_names_df_trunc.head()

Unnamed: 0,file_name,img_id,include,label,non_gray_ratio,type,is_roi
34,tumor_slide_035_split_176_75.png,35,1,0,0.942184,train,1
133,tumor_slide_035_split_140_68.png,35,1,0,0.919449,train,1
171,tumor_slide_035_split_193_53.png,35,1,0,0.412689,train,1
202,tumor_slide_035_split_186_64.png,35,1,0,0.722992,train,1
203,tumor_slide_035_split_190_72.png,35,1,0,0.950317,train,1


In [6]:
# Label distribution among the filtered tiles.
img_names_df_trunc['label'].value_counts()

0    102613
1     14155
Name: label, dtype: int64

In [7]:
# Label distribution over all tiles (before filtering), for comparison.
img_names_df['label'].value_counts()

0    1251920
1      14374
Name: label, dtype: int64

In [22]:
# Number of filtered tiles per split.
# NOTE(review): this cell's execution count (22) is out of sequence, so it was
# run after the cells that follow it; re-run the notebook top to bottom.
img_names_df_trunc['type'].value_counts()

train    73600
test     23626
val      19542
Name: type, dtype: int64

In [8]:
# Slide-level metadata table (dtype inference disabled).
meta_info_df = pd.read_json(
    path_or_buf='../../../data/tumor_img_meta_info.json',
    dtype=False,
)
# Table assigning slides to splits; it is merged into meta_info_df on 'img_id'.
train_test_split_df = pd.read_json(
    path_or_buf='../../../data/train_val_test_split.json',
    dtype=False,
)

In [9]:
# Attach each slide's split assignment ('type') to the slide metadata.
# The guard keeps this cell idempotent: merging a second time would produce
# 'type_x' / 'type_y' columns and break the later meta_info_df['type'] lookups.
if 'type' not in meta_info_df.columns:
    meta_info_df = meta_info_df.merge(train_test_split_df, on='img_id')

In [10]:
# Preview the merged slide metadata, including the 'type' split column.
meta_info_df.head()

Unnamed: 0,img_id,level_5_img_size,level_5_mask_area,level_dimensions,level_downsamples,mask_img_filename,mask_proportion,slide_img_filename,type
0,1,21123072,11116,"[[97792, 221184], [48896, 110592], [24448, 552...","[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, ...",tumor_001_mask.tif,0.052625,tumor_001.tif,val
1,2,20976384,938,"[[97792, 219648], [48896, 109824], [24448, 549...","[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, ...",tumor_002_mask.tif,0.004472,tumor_002.tif,test
2,5,20976384,2323,"[[97792, 219648], [48896, 109824], [24448, 549...","[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, ...",tumor_005_mask.tif,0.011074,tumor_005.tif,train
3,12,20585216,1115,"[[97792, 215552], [48896, 107776], [24448, 538...","[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, ...",tumor_012_mask.tif,0.005417,tumor_012.tif,val
4,16,21123072,156981,"[[97792, 221184], [48896, 110592], [24448, 552...","[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, ...",tumor_016_mask.tif,0.743173,tumor_016.tif,train


In [11]:
# (rows, columns) of the merged slide metadata.
meta_info_df.shape

(21, 9)

In [None]:
# Root data directory. Defaults to the original author's local checkout, but can
# be overridden with the CAMELYON_DATA_DIR environment variable so the notebook
# is runnable on other machines.
DATA_ROOT_DIR = os.environ.get(
    'CAMELYON_DATA_DIR',
    '/home/sjb/Projects/Columbia/Applied_DL/CamelyonProject/data',
)

# Partition to copy tiles from, and the truncated partition to copy them into.
# NOTE(review): the recorded outputs of the copy cell mention
# 'zoom_1_256_256_partition_truncated', which does not match the destination
# below, and this cell has no execution count. It appears to have been edited
# after the copy was run; confirm the intended directories.
SOURCE_PARTITION_DIR = os.path.join(DATA_ROOT_DIR, 'zoom_2_256_256_zoom_1_pair')
DEST_PARTITION_DIR = os.path.join(DATA_ROOT_DIR, 'zoom_2_256_256_zoom_1_pair_truncated')

In [13]:
# File names of the filtered tiles, held in a set for fast membership tests
# while copying.
include_images = {tile_name for tile_name in img_names_df_trunc['file_name']}

In [14]:
# Number of distinct file names selected for copying; this matches the filtered
# row count only when file names are unique.
len(include_images)

116768

In [15]:
# Copy every tile listed in `include_images` from the source partition into the
# truncated destination partition, keeping the <split>/<img_type>/ layout.
# Only the 'train' and 'val' splits and only 'slide' images are handled.
for split_type in ['train', 'val']:

    dest_sub_dir = os.path.join(DEST_PARTITION_DIR, split_type)

    if not os.path.exists(dest_sub_dir):
        # makedirs (unlike mkdir) also creates DEST_PARTITION_DIR itself when it
        # does not exist yet, instead of raising FileNotFoundError.
        os.makedirs(dest_sub_dir, exist_ok=True)
        print('Creating directory', dest_sub_dir)

    # Slide ids assigned to the current split.
    filtered_df = meta_info_df[meta_info_df['type'] == split_type]
    img_id_list = filtered_df['img_id'].values

    for img_type in ['slide']:
        source_sub_sub_dir = os.path.join(SOURCE_PARTITION_DIR, split_type, img_type)
        dest_sub_sub_dir = os.path.join(dest_sub_dir, img_type)

        if not os.path.exists(dest_sub_sub_dir):
            os.makedirs(dest_sub_sub_dir, exist_ok=True)
            print('Creating directory', dest_sub_sub_dir)

        for img_id in tqdm(img_id_list):
            # NOTE(review): tile names look like 'tumor_slide_035_split_...', so
            # this assumes img_id is already the zero-padded id string (the
            # metadata is read with dtype=False) - confirm.
            file_prefix = 'tumor_{}_{}_split'.format(img_type, img_id)

            # All tiles of this slide in the source partition.
            source_pattern = os.path.join(source_sub_sub_dir, file_prefix) + '*.png'

            for source_file_path in glob(source_pattern):
                source_file_name = os.path.basename(source_file_path)

                # Copy only tiles that survived the ROI / non-gray filter.
                if source_file_name in include_images:
                    dest_file_path = os.path.join(dest_sub_sub_dir, source_file_name)
                    shutil.copy2(source_file_path, dest_file_path)



  0%|          | 0/13 [00:00<?, ?it/s]

Creating directory /home/sjb/Projects/Columbia/Applied_DL/CamelyonProject/data/zoom_1_256_256_partition_truncated/train
Creating directory /home/sjb/Projects/Columbia/Applied_DL/CamelyonProject/data/zoom_1_256_256_partition_truncated/train/slide


100%|██████████| 13/13 [01:10<00:00, 11.55s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

Creating directory /home/sjb/Projects/Columbia/Applied_DL/CamelyonProject/data/zoom_1_256_256_partition_truncated/val
Creating directory /home/sjb/Projects/Columbia/Applied_DL/CamelyonProject/data/zoom_1_256_256_partition_truncated/val/slide


100%|██████████| 4/4 [02:40<00:00, 43.97s/it]


## Sanity Check

In [16]:
# Collects one summary dict per (split, image type) for the sanity check.
infos = []

In [17]:
# Summarise what was actually written to the destination partition.
# The list is rebuilt from scratch here so re-running this cell does not append
# duplicate rows (which would inflate the n_png total).
infos = []

for split_type in ['train', 'val']:
    sub_dir = os.path.join(DEST_PARTITION_DIR, split_type)

    for img_type in ['slide']:
        sub_sub_dir = os.path.join(sub_dir, img_type)

        # Every PNG tile present in this destination sub-directory.
        valid_files = glob(os.path.join(sub_sub_dir, '*.png'))

        # Tile names look like 'tumor_slide_035_split_176_75.png', so the third
        # '_'-separated token is the slide id.
        uniq_imgs = np.unique([os.path.basename(x).split('_')[2] for x in valid_files])

        info = {
            'split_type': split_type,
            'img_type': img_type,
            'n_png': len(valid_files),
            'uniq_imgs': uniq_imgs
        }
        infos.append(info)
        
        

In [18]:
# One row per summary dict collected in `infos`.
info_df = pd.DataFrame(infos)

In [19]:
# Per-split tile counts and the slide ids found in the destination directory.
info_df.head()

Unnamed: 0,img_type,n_png,split_type,uniq_imgs
0,slide,73600,train,"[005, 016, 019, 023, 035, 059, 075, 081, 084, ..."
1,slide,19542,val,"[001, 012, 031, 064]"


In [20]:
# One row is expected per (split, image type) combination that was checked.
info_df.shape

(2, 4)

In [21]:
# Total number of copied PNG tiles. This should equal the train + val tile
# counts of the filtered table, since the copy loop does not handle 'test'.
info_df['n_png'].sum()

93142