This notebook performs a simple, class-balanced train/cv/test split (60/20/20, where "cv" is the cross-validation set) of the labeled images for the very first classification model.

In [18]:
from pathlib import Path
import shutil

import numpy as np

In [19]:
# Source: root of the labeled images, one sub-directory per class label.
BASE_PATH = Path("~/data/labeled").expanduser()

# Destination: sibling directory of the labeled data that receives the split.
DEST_BASE_PATH = BASE_PATH.with_name("split-balanced")
DEST_BASE_PATH.mkdir(exist_ok=True)

In [20]:
# Names of the class sub-directories (positive class first, negative second).
label_pos, label_neg = "BirdHome", "BirdRoaming"

In [21]:
def glob_print(label):
    """Collect the ``*.jpeg`` files of one class and print how many were found.

    Args:
        label: Name of the class sub-directory under ``BASE_PATH``.

    Returns:
        list[Path]: The matching files, sorted by path.
    """
    path = BASE_PATH / label
    # Path.glob yields entries in arbitrary, filesystem-dependent order; sorting
    # gives every caller the same sequence on every run and machine, which is a
    # prerequisite for reproducible sampling further down.
    files = sorted(path.glob("*.jpeg"))
    print(f"{path.name}: {len(files)} images")
    return files

In [22]:
# Fixed seed so that Restart & Run All always selects the same images; without
# it every run draws a different subset, and because the copy step below adds
# to existing folders, images could end up in more than one split.
SEED = 42
rng = np.random.default_rng(SEED)


def _balanced_subsample(candidates, size, rng):
    """Return ``size`` items drawn without replacement from ``candidates``.

    The candidates are sorted first so the draw depends only on the seed and
    the set of files, not on the order in which the filesystem listed them.
    The result is returned in sorted order.
    """
    ordered = sorted(candidates)
    # Sample positions rather than the Path objects themselves, so numpy never
    # has to build an object array out of the paths.
    keep = rng.choice(len(ordered), size=size, replace=False)
    return [ordered[i] for i in sorted(keep)]


# Keep the full listings under their own names so re-running this cell always
# samples from the complete set.
files_pos_all = glob_print(label_pos)
files_neg_all = glob_print(label_neg)

# Balance the classes by down-sampling both to the size of the smaller one.
max_single_class = min(len(files_pos_all), len(files_neg_all))

files_pos = _balanced_subsample(files_pos_all, max_single_class, rng)
files_neg = _balanced_subsample(files_neg_all, max_single_class, rng)

BirdHome: 4221 images
BirdRoaming: 1444 images


In [23]:
# Split names and the fraction of each class assigned to them, in matching order.
SPLIT_NAMES = ["train", "cv", "test"]
SPLIT = [0.6, 0.2, 0.2]

In [24]:
def _split_bounds(n, fractions):
    """Return one ``(start, stop)`` slice per fraction for a sequence of ``n`` items.

    Each boundary is the rounded *cumulative* fraction, so consecutive slices
    are contiguous and never overlap. Rounding every fraction on its own can
    make the sizes add up to more or less than ``n`` (e.g. n=7 with
    [.6, .2, .2] gives 4 + 1 + 1 and silently drops one item).
    """
    bounds = []
    start = 0
    cumulative = 0.0
    for frac in fractions:
        cumulative += frac
        # Clamp to [start, n] so float error can never produce a slice that
        # runs backwards or past the end.
        stop = max(start, min(n, round(n * cumulative)))
        bounds.append((start, stop))
        start = stop
    return bounds


for files in [files_neg, files_pos]:
    if not files:
        # No file means no parent directory to read the label from.
        print("No files selected for one of the classes; skipping it")
        continue

    # The class label is the name of the directory the images live in.
    label = files[0].parent.name

    for (start, stop), split_name in zip(_split_bounds(len(files), SPLIT), SPLIT_NAMES):
        dest_path = DEST_BASE_PATH / split_name / label
        dest_path.mkdir(exist_ok=True, parents=True)

        # Copies are added to whatever is already there; files left over from
        # an earlier run with a different selection would contaminate the split.
        num_existing = sum(1 for _ in dest_path.iterdir())
        if num_existing:
            print(f"Warning: {dest_path} already contains {num_existing} entries from an earlier run")

        sel_files = files[start:stop]
        print(f"Selected {len(sel_files)} jpegs to copy to {dest_path}")

        for jpeg in sel_files:
            shutil.copy(jpeg, dest_path)

Selected 866 jpegs to copy to /home/jvlier/data/split-balanced/train/BirdRoaming
Selected 289 jpegs to copy to /home/jvlier/data/split-balanced/cv/BirdRoaming
Selected 289 jpegs to copy to /home/jvlier/data/split-balanced/test/BirdRoaming
Selected 866 jpegs to copy to /home/jvlier/data/split-balanced/train/BirdHome
Selected 289 jpegs to copy to /home/jvlier/data/split-balanced/cv/BirdHome
Selected 289 jpegs to copy to /home/jvlier/data/split-balanced/test/BirdHome
