In [1]:
# Notebook-wide IPython setup; keep this as the first cell of every notebook.
# - autoreload (mode 2): re-import edited modules before each cell executes,
#   so changes to helper .py files are picked up without restarting the kernel.
# - matplotlib inline: render figures directly below the cell that creates them.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [64]:
import pathlib
from sklearn.model_selection import train_test_split
from shutil import copy
import os

# Source dataset: create_experiment_folders() expects one sub-folder per class here.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DATAPATH = pathlib.Path( "/home/john/data/Hey-Waldo/64/")
# Destination root that receives the train/valid/test folder tree.
OUTPUTPATH = pathlib.Path("/home/john/data/WheresWally/")

def create_experiment_folders(datapath, outputpath, test_size=0.2, valid_size=0.2,
                              random_state=None):
    """Copy a one-folder-per-class dataset into train/valid/test folders.

    The resulting layout is ``outputpath/{train,valid,test}/<class>/<file>``.
    Each class is split independently, so every split receives files from
    every class.

    Args:
        datapath: Directory (str or Path) containing one sub-directory per
            class. Entries that are not directories are ignored.
        outputpath: Directory (str or Path) that receives the split folders.
            It is created, including missing parents, if it does not exist.
        test_size: Passed as ``test_size`` to the first ``train_test_split``
            call, which separates the test set from the rest of each class.
        valid_size: Passed as ``test_size`` to the second ``train_test_split``
            call, which runs on the files left after the test split. It is
            therefore relative to the train+valid pool, not to the full class
            (with the defaults 0.2/0.2 the result is roughly 64/16/20).
        random_state: Forwarded to both ``train_test_split`` calls. Pass an
            int to get the same split on every run; the default ``None``
            keeps the split random.

    Returns:
        None. Files are copied and progress is printed.
    """
    datapath = pathlib.Path(datapath)
    outputpath = pathlib.Path(outputpath)

    # Only sub-directories are classes; a stray file next to them (README,
    # .DS_Store, ...) must not be treated as one. Sorting makes the order,
    # and with it a seeded split, independent of the filesystem listing order.
    class_names = sorted(entry.name for entry in datapath.iterdir() if entry.is_dir())
    print(f"Found {len(class_names)} classes: {class_names} ")

    trn = outputpath/"train"
    val = outputpath/"valid"
    tst = outputpath/"test"
    for split_dir in (trn, val, tst):
        # parents=True also creates outputpath (and anything above it).
        split_dir.mkdir(parents=True, exist_ok=True)

    for cl in class_names:
        # Regular files only: shutil.copy cannot copy a nested directory.
        elems = sorted(entry.name for entry in (datapath/cl).iterdir() if entry.is_file())
        trainVal_set, test_set = train_test_split(
            elems, test_size=test_size, random_state=random_state)
        train_set, val_set = train_test_split(
            trainVal_set, test_size=valid_size, random_state=random_state)
        print(f'{cl}: train: {len(train_set)} val: {len(val_set)} test: {len(test_set)}')
        for set_path, file_names in zip([trn, val, tst], [train_set, val_set, test_set]):
            class_dir = set_path/cl
            # Create the class folder once per split instead of once per file.
            class_dir.mkdir(exist_ok=True)
            for file_name in file_names:
                copy(datapath/cl/file_name, class_dir/file_name)

    print(f'Finished building experiment folders in {outputpath}')
    
    

In [65]:
# Build the train/valid/test folders under OUTPUTPATH from the class folders in
# DATAPATH, using the default test_size=0.2 and valid_size=0.2.
create_experiment_folders(DATAPATH, OUTPUTPATH)

Found 2 classes: ['notwaldo', 'waldo'] 
notwaldo: train: 3415 val: 854 test: 1068
waldo: train: 24 val: 7 test: 8
Finished building experiment folders in /home/john/data/WheresWally


# Unfortunately, neither of the other 2 datasets provides its images in one-folder-per-class form :(

In [39]:
# Folder holding the ship image files; the cells below use the first character
# of each file name ("0" or "1") as the label.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
ships_path = pathlib.Path("/home/john/data/ships-in-satellite-imagery/shipsnet/")

In [41]:
# List regular files only, in sorted order. A later cell creates `fulldata/`
# inside ships_path, so a plain os.listdir() would return that directory on a
# re-run and the labelling loop would report it as "ERROR fulldata".
image_list = sorted(entry.name for entry in ships_path.iterdir() if entry.is_file())

In [46]:
# Quick look at the labelling convention: does the first file name start with "0"?
# Index image_list directly instead of using `s`, which is only bound once the
# labelling loop further down has run and is undefined on a fresh kernel.
image_list[0][0] == "0"

True

In [50]:
# Sort the file names into two lists keyed by their first character:
# "0" -> not_ship, "1" -> is_ship. Anything else is reported and left out.
not_ship, is_ship = [], []
label_buckets = {'0': not_ship, '1': is_ship}
for s in image_list:
    bucket = label_buckets.get(s[0])
    if bucket is None:
        print(f"ERROR {s}")
        continue
    bucket.append(s)
    
        

In [55]:
# Materialise the two label lists as a one-folder-per-class layout under
# ships_path/fulldata, which is the input format create_experiment_folders reads.
full_data_path = ships_path / "fulldata"
full_data_path.mkdir(exist_ok=True)

files_by_label = {"is_ship": is_ship, "not_ship": not_ship}
for label, files in files_by_label.items():
    folder = full_data_path / label
    folder.mkdir(exist_ok=True)
    for f in files:
        source_file = ships_path / f
        target_file = folder / f
        copy(source_file, target_file)
    print(f"copied {len(files)} files to {folder}")
        

copied 700 files to /home/john/data/ships-in-satellite-imagery/shipsnet/fulldata/is_ship
copied 2100 files to /home/john/data/ships-in-satellite-imagery/shipsnet/fulldata/not_ship


In [61]:
# Split the per-label folders in full_data_path into train/valid/test folders
# under sat-ships, using the default test_size=0.2 and valid_size=0.2.
# The output path is given as a plain string; the function converts it to a Path.
create_experiment_folders(full_data_path, "/home/john/data/sat-ships/")

Found 2 classes: ['is_ship', 'not_ship'] 
is_ship: train: 448 val: 112 test: 140
not_ship: train: 1344 val: 336 test: 420
Finished building experiment folders in /home/john/data/sat-ships


# Ok, now let's do the same for the Arabic ones...