In [6]:
import os, shutil, random
from pathlib import Path

In [2]:
# Both the full projections tree and the sampled subset live under the same
# parameter-run directory, so spell that directory out only once.
params_root = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403'
projections_root = f'{params_root}/projections'
subset_root = f'{params_root}/subset_n1000'
# Create the subset folder up front; exist_ok makes re-running the cell harmless.
os.makedirs(subset_root, exist_ok=True)

Create a subset of the data with 1,000 samples per class (`n_arms`). With 7 classes (4–10 arms), this yields 7,000 samples in each of the `default`, `2ds`, and `phips` sub-folders, for 21,000 samples in total.

In [31]:
%%time
def reservoir_sample_files(folder_path, k, seed=42):
    """Randomly sample k file-name prefixes from a directory with reservoir sampling.

    The directory is streamed once with ``os.scandir`` so the full listing is
    never held in memory. Each regular file contributes its name with the
    last ``-<suffix>`` part removed (``name.rsplit('-', 1)[0]``).

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory to sample from. Entries that are not files are skipped.
    k : int
        Maximum number of prefixes to return.
    seed : int, default 42
        Seed for the private random generator used for the sampling.

    Returns
    -------
    list of str
        ``k`` prefixes, or fewer if the directory holds fewer than ``k``
        files (empty if ``k <= 0``).

    Notes
    -----
    ``os.scandir`` yields entries in arbitrary, filesystem-dependent order,
    so a given seed reproduces the same sample only while that order is
    unchanged.
    """
    # Private generator: same draw sequence as random.seed(seed) but without
    # resetting the process-wide RNG state as a side effect.
    rng = random.Random(seed)
    reservoir = []
    files_seen = 0  # counts regular files only; skipped entries must not inflate it
    with os.scandir(folder_path) as it:
        for entry in it:
            if not entry.is_file():
                continue
            files_seen += 1
            if len(reservoir) < k:
                # Fill phase: the first k files are taken unconditionally.
                reservoir.append(entry.name.rsplit('-', 1)[0])
                continue
            # Replacement phase: the n-th file replaces a slot with
            # probability k/n, where n is the number of *files* seen so far.
            j = rng.randint(0, files_seen - 1)
            if j < k:
                reservoir[j] = entry.name.rsplit('-', 1)[0]
    return reservoir

def create_subset(source_dir, dest_dir,
    num_samples_per_subfolder=10,
    seed=42
):
    """Copy a random subset of files into dest_dir, aligned across view folders.

    The source tree is walked as ``source_dir/<view_type>/<n_arms>/``, and
    files are expected to be named ``<prefix>-<view_type>.png``. For each
    ``n_arms`` class the prefixes are sampled once, from the first view in
    which that class folder is encountered, and then reused for every other
    view so that all views receive the same samples.

    Parameters
    ----------
    source_dir : str or os.PathLike
        Root folder containing one sub-folder per view type.
    dest_dir : str or os.PathLike
        Root folder of the subset; created if missing. The
        ``<view_type>/<n_arms>`` layout of the source is mirrored beneath it.
    num_samples_per_subfolder : int, default 10
        Number of prefixes to sample per ``n_arms`` class.
    seed : int, default 42
        Seed passed to ``reservoir_sample_files`` for every class.

    Notes
    -----
    ``shutil.copy`` raises if a sampled prefix has no matching file in one of
    the views; that error is intentionally not suppressed.
    """
    source_dir = Path(source_dir)
    dest_dir = Path(dest_dir)
    # Make sure destination directory exists
    dest_dir.mkdir(parents=True, exist_ok=True)
    prefix_dict = {}  # n_arms -> sampled prefixes shared by all views
    # Loop through top-level categories (one folder per view type)
    for top_folder in source_dir.iterdir():
        if not top_folder.is_dir():
            continue
        view_type = top_folder.name
        # Now loop through second-level folders (one folder per class)
        for sub_folder in top_folder.iterdir():
            if not sub_folder.is_dir():
                continue
            n_arms = sub_folder.name
            # Sample the first time a class is seen, in whichever view that
            # happens, instead of only during the first view folder; a class
            # missing from the first view no longer raises KeyError.
            if n_arms not in prefix_dict:
                prefix_dict[n_arms] = reservoir_sample_files(
                    sub_folder, num_samples_per_subfolder, seed
                )
            selected_files = [
                f'{prefix}-{view_type}.png' for prefix in prefix_dict[n_arms]
            ]
            # Create destination subfolder mirroring <view_type>/<n_arms>
            target_subfolder = dest_dir / sub_folder.relative_to(source_dir)
            target_subfolder.mkdir(parents=True, exist_ok=True)
            # Copy files
            for filename in selected_files:
                shutil.copy(sub_folder / filename, target_subfolder)
            print(f"Copied {len(selected_files)} files to {target_subfolder}")
# main: copy 1,000 sampled files per class folder from projections_root into subset_root
create_subset(projections_root, subset_root, num_samples_per_subfolder=1000)

Copied 1000 files to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000/phips/6
Copied 1000 files to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000/phips/5
Copied 1000 files to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000/phips/8
Copied 1000 files to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000/phips/9
Copied 1000 files to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000/phips/7
Copied 1000 files to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000/phips/10
Copied 1000 files to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000/phips/4
Copied 1000 files to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000/default/6
Copied 1000 files to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000/default/5
Copied 1000 files to /glade/derecho/scratch/joko/synth-ros/params_20