In [1]:
import imutils.paths
import json
import os
import numpy as np
import matplotlib.pyplot as plt
from skimage.draw import polygon, ellipse, circle
import shutil
from tqdm import tqdm

# Seed NumPy's global RNG so the shuffle that defines the splits is repeatable.
np.random.seed(42)

# Target split: train/val/test -> 70/20/10.
# train -> images and their regions in JSON format.
# Root of the annotated alcohol dataset. Set the ALCOHOL_DATA_DIR environment
# variable to point somewhere else; the original location stays the default.
directory = os.environ.get(
    'ALCOHOL_DATA_DIR',
    '/home/choppy/TOC/datasets/alcohol/alcohol_marks/alcohol_full',
)

## Read all valid marks inside .json files

In [2]:
# Count every image entry listed in the annotation files found under `directory`.
total_images = 0

# Annotation files are the ones whose name contains 'via_region_data'.
json_file_list = sorted(imutils.paths.list_files(directory, contains='via_region_data'))

for json_file in tqdm(json_file_list):
    # Use a context manager so each annotation file is closed right after parsing.
    with open(json_file, 'rb') as annotation_file:
        js = json.load(annotation_file)
    # Folder prefix of the annotation file; image file names are relative to it.
    path = json_file.split('via_region_data.json')[0]
    images_list = [os.path.join(path, entry['filename']) for entry in js.values()]
    total_images += len(images_list)

100%|██████████| 444/444 [00:00<00:00, 1066.09it/s]


In [3]:
# Display the total image count zero-padded to six digits.
format(total_images, '06d')

'021455'

In [4]:
# Gather, for every image that has at least one annotated region, its source
# path and its list of regions (the two lists stay index-aligned).
json_file_list = sorted(imutils.paths.list_files(directory, contains='via_region_data'))
all_images_list = []
all_regions = []
newname_images = []

for json_file in tqdm(json_file_list):
    # Use a context manager so each annotation file is closed right after parsing.
    with open(json_file, 'rb') as annotation_file:
        js = json.load(annotation_file)
    # Folder prefix of the annotation file; image file names are relative to it.
    path = json_file.split('via_region_data.json')[0]
    images_list = [os.path.join(path, entry['filename']) for entry in js.values()]

    for entry, imdir in zip(js.values(), images_list):
        regions = entry['regions']
        if not regions:
            # Images without any mark are left out of the dataset.
            continue
        for reg in regions:
            # Fuse sclera marks: any label containing 'sclera' becomes plain 'sclera'.
            if 'sclera' in reg['region_attributes']['Eye']:
                reg['region_attributes']['Eye'] = 'sclera'
        all_regions.append(regions)
        all_images_list.append(imdir)

100%|██████████| 444/444 [00:00<00:00, 663.17it/s]


In [5]:
# Build a unique flat file name for every image: <7-digit index>_<sensor>_<original name>.
# The list is reset here so that re-running this cell does not append duplicates.
newname_images = []

for i, imdir in enumerate(all_images_list):
    path_parts = imdir.split('/')
    # The sensor folder is normally four levels up from the image; when that
    # component is not a known sensor name, fall back to the one three levels up.
    sensor = path_parts[-4]
    if sensor.lower() not in ('iritech', 'lg', 'gemini'):
        sensor = path_parts[-3]
    imname = f'{i:07d}_{sensor}_{os.path.basename(imdir)}'
    newname_images.append(imname)

In [6]:
# Spot-check two of the generated names: the one at position 1000 and the last one.
tuple(newname_images[pos] for pos in (1000, -1))

('0001000_Iritech_E_0_1_0_R_M_N_N_1982_75_2017.bmp',
 '0021308_gemini_1_15499172424147453_002632_99.png')

In [7]:
# The three parallel lists should all report the same length.
tuple(map(len, (all_regions, newname_images, all_images_list)))

(21309, 21309, 21309)

In [8]:
# Map each new image name to the regions stored at the same position in all_regions.
regions_dict = {
    new_name: all_regions[position]
    for position, new_name in enumerate(newname_images)
}

In [9]:
# Destination folder for the flattened, renamed copy of the dataset.
os.makedirs('alcohol/all_images', exist_ok=True)

# Write the labels through a context manager so the file is flushed and closed
# before a later cell reads it back.
with open('alcohol/all_images/regions.json', 'w') as regions_file:
    json.dump(regions_dict, regions_file, indent=4)

# Copy every annotated image next to the labels under its new unique name.
for src_path, new_name in tqdm(zip(all_images_list, newname_images),
                               total=len(all_images_list)):
    shutil.copyfile(src_path, 'alcohol/all_images/' + new_name)

100%|██████████| 21309/21309 [02:42<00:00, 131.46it/s]


In [10]:
# Re-read the exported dataset from disk: sorted image paths plus the label dict.
images_list = sorted(imutils.paths.list_images('alcohol/all_images/'))
# Use a context manager so the labels file is closed right after parsing.
with open('alcohol/all_images/regions.json') as regions_file:
    js = json.load(regions_file)

print(len(images_list), 'images')
print(len(js), 'regions')

21309 images
21309 regions


# The number of images must match the number of label entries

In [11]:
# Equal counts alone do not prove that every image has a label entry, so the keys
# are checked too: the split cells look labels up by image file name and would
# otherwise stop with a KeyError after part of the files were already copied.
assert len(images_list) == len(js), (
    f'{len(images_list)} images but {len(js)} label entries'
)
missing_labels = [p for p in images_list if os.path.basename(p) not in js]
assert not missing_labels, (
    f'{len(missing_labels)} images have no label entry, e.g. {missing_labels[0]}'
)

# train / test / validation split

In [12]:
# One integer position per image; this array is what gets shuffled and sliced into splits.
num_images = len(images_list)
indexes = np.arange(num_images)
indexes[:10]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [13]:
# Shuffle the index array in place using NumPy's global RNG (seeded with 42 in the first cell).
# NOTE: re-running this cell shuffles the already-shuffled array again, so the split is only
# reproducible when the notebook is executed once, top to bottom, on a fresh kernel.
np.random.shuffle(indexes)
indexes[:10]

array([10890, 13073,  9931, 14720, 19994, 13792, 10613,  1087,  8591,
        3485])

In [14]:
# Split ratios: train 70 % / val 20 % / test 10 %.
# Number of samples assigned to the training split, rounded down.
n_samples = len(indexes)
train_len = int(np.floor(n_samples * .7))
train_len

14916

In [15]:
# Despite its name this is a cumulative end offset (train share + 20 % val share),
# i.e. the position in `indexes` where the validation slice ends.
val_share = np.floor(len(indexes) * .2)
val_len = int(val_share + train_len)
val_len

19177

In [16]:
# Cumulative end offset after adding the 10 % test share on top of val_len.
# Because each share is floored separately, this can end up slightly below len(indexes).
test_share = np.floor(len(indexes) * .1)
test_len = int(test_share + val_len)
test_len

21307

In [17]:
# Cut the shuffled indexes at the two cumulative offsets:
# [0, train_len) -> train, [train_len, val_len) -> val, [val_len, end) -> test.
# The test split takes everything that remains, so no sample is dropped.
train_split_indexes, val_split_indexes, test_split_indexes = np.split(
    indexes, [train_len, val_len]
)

# Report the sizes in train / test / val order, followed by their total.
split_sizes = [train_split_indexes.size, test_split_indexes.size, val_split_indexes.size]
print(*split_sizes)
print(sum(split_sizes))

14916 2132 4261
21309


In [18]:
# Create one output folder per split; existing folders are left as they are.
for split_dir in ('alcohol/train', 'alcohol/test', 'alcohol/val'):
    os.makedirs(split_dir, exist_ok=True)

In [19]:
# Per-split label dictionaries (image file name -> regions), filled by the copy cells below.
train_regions, test_regions, val_regions = {}, {}, {}

In [20]:
# Copy the training images and collect their labels, keyed by image file name.
for idx in tqdm(train_split_indexes, desc='train_split'):
    src_path = images_list[idx]
    image_name = os.path.basename(src_path)  # computed once, used as file name and label key
    shutil.copyfile(src_path, 'alcohol/train/' + image_name)
    train_regions[image_name] = js[image_name]

# Write the split's labels through a context manager so the file is flushed and closed.
with open('alcohol/train/regions.json', 'w') as regions_file:
    json.dump(train_regions, regions_file, indent=4)

train_split: 100%|██████████| 14916/14916 [06:18<00:00, 39.44it/s] 


In [21]:
# Copy the test images and collect their labels, keyed by image file name.
for idx in tqdm(test_split_indexes, desc='test_split'):
    src_path = images_list[idx]
    image_name = os.path.basename(src_path)  # computed once, used as file name and label key
    shutil.copyfile(src_path, 'alcohol/test/' + image_name)
    test_regions[image_name] = js[image_name]

# Write the split's labels through a context manager so the file is flushed and closed.
with open('alcohol/test/regions.json', 'w') as regions_file:
    json.dump(test_regions, regions_file, indent=4)

test_split: 100%|██████████| 2132/2132 [00:55<00:00, 38.71it/s]


In [22]:
# Copy the validation images and collect their labels, keyed by image file name.
for idx in tqdm(val_split_indexes, desc='val_split'):
    src_path = images_list[idx]
    image_name = os.path.basename(src_path)  # computed once, used as file name and label key
    shutil.copyfile(src_path, 'alcohol/val/' + image_name)
    val_regions[image_name] = js[image_name]

# Write the split's labels through a context manager so the file is flushed and closed.
with open('alcohol/val/regions.json', 'w') as regions_file:
    json.dump(val_regions, regions_file, indent=4)

val_split: 100%|██████████| 4261/4261 [01:52<00:00, 37.81it/s]  
