# Cholec Data Organization

- After downloading the Cholec Segmentation dataset from Kaggle, we organize the dataset into two main folders:
    - `images` contains the input images for our semantic segmentation network
    - `gt` contains the groundtruth image masks for our semantic segmentation network

In [None]:
import os
from shutil import copy2
import numpy as np

In [None]:
# Root folder of the downloaded dataset; the copy cells below walk it recursively.
cholec_path = "../../../Downloads/synapse/"
# Show the length of the root path.
# NOTE(review): presumably this was used to work out the hard-coded
# `curr_file[33:35]` offsets in the copy cells — confirm.
print(len(cholec_path))
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
files = []

In [None]:
# Gather every file whose name contains "color" into one flat `gt` folder,
# prefixing each copied file name with two characters taken from its path.
gt_dir = "../../../Downloads/cholec/gt/"
# copy2 does not create missing folders, so make sure the target exists.
os.makedirs(gt_dir, exist_ok=True)

for dirname, _, filenames in os.walk(cholec_path):
    for filename in filenames:
        if "color" not in filename:
            continue
        curr_file = os.path.join(dirname, filename)
        # Hard-coded character offsets into the full path.
        # NOTE(review): presumably these select the two-digit video number and
        # depend on the exact length of `cholec_path` — re-check them whenever
        # the root path changes.
        video_num = curr_file[33:35]
        new_file_name = video_num + "_" + filename
        # Copy straight to the final name. The earlier copy-then-rename left a
        # transient file under the un-prefixed name and could fail on a re-run
        # on platforms where os.rename refuses to overwrite an existing target.
        copy2(curr_file, os.path.join(gt_dir, new_file_name))

In [None]:
# Gather every file whose name does NOT contain "mask" into one flat `images`
# folder, prefixing each copied file name with two characters taken from its path.
images_dir = "../../../Downloads/cholec/images/"
# copy2 does not create missing folders, so make sure the target exists.
os.makedirs(images_dir, exist_ok=True)

for dirname, _, filenames in os.walk(cholec_path):
    for filename in filenames:
        # NOTE(review): this keeps ANY file without "mask" in its name, not
        # only image frames — confirm the download holds no other stray files.
        if "mask" in filename:
            continue
        curr_file = os.path.join(dirname, filename)
        # Hard-coded character offsets into the full path.
        # NOTE(review): presumably these select the two-digit video number and
        # depend on the exact length of `cholec_path` — re-check them whenever
        # the root path changes.
        video_num = curr_file[33:35]
        new_file_name = video_num + "_" + filename
        # Copy straight to the final name. The earlier copy-then-rename left a
        # transient file under the un-prefixed name and could fail on a re-run
        # on platforms where os.rename refuses to overwrite an existing target.
        copy2(curr_file, os.path.join(images_dir, new_file_name))

# Split Data

- The code below splits the data in `images` and `gt` into `train`/`val`/`test` sets with the following ratios:
    - `train` = 0.7
    - `val` = 0.15
    - `test` = 0.15

- This split code is adapted from the Stanford CS230 blog post.

In [1]:
import glob
import random
import os
from PIL import Image
from tqdm import tqdm

In [2]:
# List every entry in the flat `images` and `gt` folders.
# NOTE(review): these paths climb four directory levels, while the copy cells
# earlier in the notebook write to "../../../Downloads/cholec/..." (three
# levels) — confirm both refer to the same folders from where this runs.
image_list = glob.glob("../../../../Downloads/cholec/images/*")
gt_list = glob.glob("../../../../Downloads/cholec/gt/*")

In [3]:
# Sort both lists in place so their order is deterministic before they are
# paired by position.
# NOTE(review): the positional pairing presumably relies on each image and its
# mask sorting to the same index — confirm against the file names.
image_list.sort()
gt_list.sort()

In [4]:
# Seed Python's global RNG so the shuffle that follows produces the same order
# each time the notebook is run from the top.
random.seed(2021)

In [5]:
# Pair each image with the mask at the same position, shuffle the pairs as a
# unit so they stay together, then unzip back into two sequences.
# zip() silently stops at the shorter input, which would drop files and hide a
# missing mask, so fail loudly when the two folders disagree.
if len(image_list) != len(gt_list):
    raise ValueError(
        "images/gt count mismatch: {} images vs {} masks".format(
            len(image_list), len(gt_list)
        )
    )
# An empty input would otherwise surface later as an opaque unpacking error.
if len(image_list) == 0:
    raise ValueError("no files found in the images/gt folders")

combined_list = list(zip(image_list, gt_list))
random.shuffle(combined_list)
# After unzipping, both names refer to tuples rather than lists.
image_list, gt_list = zip(*combined_list)

In [6]:
# Cut points expressed as sample counts: everything before `split_1` is the
# test set, `split_1`..`split_2` is the validation set, the rest is training.
n_samples = len(image_list)
split_1 = int(0.15 * n_samples)
split_2 = int(0.3 * n_samples)

# Use the same slice for images and masks so the pairs stay aligned.
test_part = slice(None, split_1)
val_part = slice(split_1, split_2)
train_part = slice(split_2, None)

test_images, test_gt = image_list[test_part], gt_list[test_part]
val_images, val_gt = image_list[val_part], gt_list[val_part]
train_images, train_gt = image_list[train_part], gt_list[train_part]

In [7]:
# Fraction of all images that landed in each split, printed in the order
# train, test, val.
for subset in (train_images, test_images, val_images):
    print(len(subset) / len(image_list))

0.7
0.15
0.15


In [8]:
# Fraction of all masks that landed in each split, printed in the order
# train, test, val.
for subset in (train_gt, test_gt, val_gt):
    print(len(subset) / len(gt_list))

0.7
0.15
0.15


In [9]:
# Map each split name (also used as the output folder name) to the file paths
# that belong in it.
im_filenames = dict(
    train_images=train_images,
    val_images=val_images,
    test_images=test_images,
)
gt_filenames = dict(
    train_gt=train_gt,
    val_gt=val_gt,
    test_gt=test_gt,
)

In [10]:
# Parent folder under which the per-split image folders are created.
im_output_dir = "../../../../Downloads/cholec"

In [11]:
def resize_and_save(filename, output_dir):
    """Re-save the image at `filename` into `output_dir` under the same file name.

    Despite the function's name, no resizing is performed: the image is opened
    with PIL and written back out at its original size.

    Args:
        filename: path of the image file to read.
        output_dir: directory to write into; it is not created here.
    """
    # basename() handles the OS-specific separators that glob can return
    # (e.g. backslashes on Windows), which a plain split('/') does not.
    save_path = os.path.join(output_dir, os.path.basename(filename))
    # The context manager closes the underlying file as soon as the image has
    # been written, instead of leaving one open handle per processed file.
    with Image.open(filename) as image:
        image.save(save_path)

In [12]:
# Write each image split into its own folder under `im_output_dir`.
for split in ('train_images', 'val_images', 'test_images'):
    output_dir_split = os.path.join(im_output_dir, split)
    print(output_dir_split)

    # An existing folder is reused (with a warning); otherwise it is created.
    if os.path.exists(output_dir_split):
        print("Warning: dir {} already exists".format(output_dir_split))
    else:
        os.mkdir(output_dir_split)

    print("Processing {} data, saving preprocessed data to {}".format(split, output_dir_split))
    split_files = im_filenames[split]
    for filename in tqdm(split_files):
        resize_and_save(filename, output_dir_split)

print("Done building dataset")

  0%|          | 1/5656 [00:00<16:40,  5.65it/s]

../../../../Downloads/cholec/train_images
Processing train_images data, saving preprocessed data to ../../../../Downloads/cholec/train_images


100%|██████████| 5656/5656 [07:12<00:00, 13.07it/s]
  0%|          | 2/1212 [00:00<01:49, 11.09it/s]

../../../../Downloads/cholec/val_images
Processing val_images data, saving preprocessed data to ../../../../Downloads/cholec/val_images


100%|██████████| 1212/1212 [01:32<00:00, 13.07it/s]
  0%|          | 2/1212 [00:00<01:52, 10.78it/s]

../../../../Downloads/cholec/test_images
Processing test_images data, saving preprocessed data to ../../../../Downloads/cholec/test_images


100%|██████████| 1212/1212 [01:32<00:00, 13.05it/s]

Done building dataset





In [13]:
# Parent folder under which the per-split ground-truth folders are created.
gt_output_dir = "../../../../Downloads/cholec"

In [14]:
# Write each ground-truth split into its own folder under `gt_output_dir`.
for split in ('train_gt', 'val_gt', 'test_gt'):
    output_dir_split = os.path.join(gt_output_dir, split)
    print(output_dir_split)

    # An existing folder is reused (with a warning); otherwise it is created.
    if os.path.exists(output_dir_split):
        print("Warning: dir {} already exists".format(output_dir_split))
    else:
        os.mkdir(output_dir_split)

    print("Processing {} data, saving preprocessed data to {}".format(split, output_dir_split))
    split_files = gt_filenames[split]
    for filename in tqdm(split_files):
        resize_and_save(filename, output_dir_split)

print("Done building dataset")

  0%|          | 5/5656 [00:00<01:54, 49.53it/s]

../../../../Downloads/cholec/train_gt
Processing train_gt data, saving preprocessed data to ../../../../Downloads/cholec/train_gt


100%|██████████| 5656/5656 [01:27<00:00, 64.76it/s]
  1%|          | 7/1212 [00:00<00:19, 62.15it/s]

../../../../Downloads/cholec/val_gt
Processing val_gt data, saving preprocessed data to ../../../../Downloads/cholec/val_gt


100%|██████████| 1212/1212 [00:18<00:00, 64.73it/s]
  1%|          | 7/1212 [00:00<00:19, 60.85it/s]

../../../../Downloads/cholec/test_gt
Processing test_gt data, saving preprocessed data to ../../../../Downloads/cholec/test_gt


100%|██████████| 1212/1212 [00:18<00:00, 64.73it/s]

Done building dataset



