In [None]:
import shutil
import os
import numpy as np
import time
import cv2
import tqdm
from glob import glob
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset paths used below resolve;
# force_remount=True is passed so an already-mounted drive is mounted again.
drive.mount('/content/drive', force_remount=True)

In [None]:
# Collect the source images/masks and split the image paths into
# train/valid/test lists.
ROOT = "/content/drive/MyDrive/Images"
# glob gives no ordering guarantee; sort so the seeded split below
# is reproducible from run to run
all_images = sorted(glob(f"{ROOT}/out_pics/*.jpg"))
all_masks = sorted(glob(f"{ROOT}/out_masks/*.tif"))

# images split into train/valid/test
# 60% train, 20% valid, 20% test
train, test = train_test_split(all_images, test_size = 0.4, random_state = 21)
# halve the 40% hold-out into test and valid; the name must be `valid`
# because the rest of the notebook refers to it by that name
test, valid = train_test_split(test, test_size = 0.5, random_state = 21)
print(f"{len(train)} training images")
print(f"{len(valid)} validation images")
print(f"{len(test)} testing images")

# make mask split using the filepaths from
# train, test, valid image sets
def get_masks(image_paths):
  """Map image paths to their corresponding mask paths.

  Each path has its 'out_pics' folder component replaced with 'out_masks'
  and its three-character extension replaced with 'tif'.

  Args:
    image_paths: iterable of image path strings containing 'out_pics'.

  Returns:
    A list of mask path strings, in the same order as image_paths.
  """
  masks = []
  # iterate over the argument (not the global `train`) so each split
  # gets its own masks
  for image_path in image_paths:
    parts = image_path.split('out_pics')
    # drop the 3-character extension (e.g. "jpg") and use "tif" instead
    end = parts[1][:-3] + "tif"
    masks.append(parts[0] + 'out_masks' + end)
  return masks

# derive the mask path list for every split from its image path list
train_masks = get_masks(train)
valid_masks = get_masks(valid)
test_masks = get_masks(test)

# report how many masks ended up in each split
for split_label, split_masks in (
    ("training", train_masks),
    ("validation", valid_masks),
    ("testing", test_masks),
):
  print(f"{len(split_masks)} {split_label} masks")

# make all necessary folders
def make_folders(setname):
  """Create `setname/images` and `setname/masks` under the current directory.

  Existing folders are left untouched. The working directory is never
  changed, so nested set names (e.g. "a/b") are handled correctly.

  Args:
    setname: name (or relative path) of the dataset split folder.
  """
  for subfolder in ("images", "masks"):
    # makedirs also creates `setname` itself when it is missing
    os.makedirs(os.path.join(setname, subfolder), exist_ok=True)

# work from the dataset root, then build the folder pair for each split
os.chdir(ROOT)
for split_name in ("train", "test", "valid"):
  make_folders(split_name)

# move all images into train, test, and valid folders
def move(src_paths, setname, destname, root=None):
  """Move files into `<root>/<setname>/<destname>`.

  Args:
    src_paths: iterable of file paths to move.
    setname: dataset split folder name (e.g. 'train').
    destname: subfolder inside the split (e.g. 'images' or 'masks').
    root: base directory; defaults to the notebook-level ROOT.

  Returns:
    The number of files that could not be moved.
  """
  if root is None:
    root = ROOT
  dest_dir = f"{root}/{setname}/{destname}"
  failed_moves = 0
  for img in src_paths:
    try:
      # shutil.move does the actual move; calling `move` here would recurse
      shutil.move(img, dest_dir)
    except OSError:
      # shutil.Error is an OSError subclass, so missing sources and
      # already-existing destinations are both counted here
      print(f"{img} did not move to {setname}/{destname}")
      failed_moves += 1
  return failed_moves

# move each split's images and masks into its own folder, counting failures;
# every call must receive the list that belongs to that split/subfolder
failed_moves = 0
failed_moves += move(train, 'train', 'images')
failed_moves += move(train_masks, 'train', 'masks')
failed_moves += move(valid, 'valid', 'images')
failed_moves += move(valid_masks, 'valid', 'masks')
failed_moves += move(test, 'test', 'images')
failed_moves += move(test_masks, 'test', 'masks')
print(f"\n{failed_moves} failed moves")

In [None]:
# remount drive & see if images/masks have loaded
# in their moved locations
# note: may need to change ROOT based on your file structure
# also, target loading numbers may be different
# based on amount of data being used

drive.mount("/content/drive", force_remount=True)

# expected number of files per split; adjust to the amount of data in use
EXPECTED_COUNTS = {"train": 768, "valid": 256, "test": 256}

for setname, expected in EXPECTED_COUNTS.items():
  for subdir, ext in (("images", "jpg"), ("masks", "tif")):
    # use a dedicated name so the `test` split list is not overwritten
    found = glob(f"{ROOT}/{setname}/{subdir}/*.{ext}")
    print(f"{setname} {subdir} loaded: {len(found)}/{expected}")

In [None]:
# checking to see if the image & mask files match

# compare extension-less file names of the training images and masks
image_names = {os.path.splitext(os.path.basename(p))[0] for p in train}
mask_names = {os.path.splitext(os.path.basename(p))[0] for p in train_masks}

# check both directions: images lacking a mask AND masks lacking an image
unmatched = image_names.symmetric_difference(mask_names)

if not unmatched:
  print("All good - mask and image files match!")
else:
  print("Oops, there are some unmatching mask and image files:")
  print(unmatched)