In [1]:
import os, shutil, random
from pathlib import Path
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Input tree of projection images and the output location for the sampled subset.
projections_root = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/projections'
subset_root = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000'
# Create the output folder (and any missing parents); a no-op if it already exists.
Path(subset_root).mkdir(parents=True, exist_ok=True)

# Create Subset

Create a subset of the data with 1,000 samples per class (`n_arms`). This yields 7,000 samples in each of the `default`, `2ds`, and `phips` sub-folders, i.e. 21,000 samples in total.

In [None]:
%%time
def reservoir_sample_files(folder_path, k, seed=42):
    """Randomly sample up to ``k`` file-name prefixes from a directory.

    The directory is streamed once with ``os.scandir`` and sampled with
    reservoir sampling, so the full listing is never held in memory. Only
    regular files are candidates; sub-directories and other non-file entries
    are skipped and do not advance the sampling index.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory to sample from.
    k : int
        Maximum number of samples to return.
    seed : int, default 42
        Seed for the private random generator used for the draw.

    Returns
    -------
    list of str
        File names with everything from the last ``'-'`` onward removed
        (``'a-b-view.png'`` becomes ``'a-b'``). The list holds
        ``min(k, number_of_files)`` items.

    Notes
    -----
    ``os.scandir`` yields entries in arbitrary order, so a given seed only
    reproduces the same sample while the directory listing order is unchanged.
    """
    # Private generator: same draw sequence as seeding the module-level RNG,
    # but without clobbering global random state.
    rng = random.Random(seed)
    reservoir = []
    n_seen = 0  # number of regular files encountered so far
    with os.scandir(folder_path) as it:
        for entry in it:
            if not entry.is_file():
                continue
            prefix = entry.name.rsplit('-', 1)[0]
            if len(reservoir) < k:
                # Fill phase: the first k files always enter the reservoir.
                reservoir.append(prefix)
            else:
                # Replacement phase: the file with 0-based index n_seen
                # replaces a random slot with probability k / (n_seen + 1).
                j = rng.randint(0, n_seen)
                if j < k:
                    reservoir[j] = prefix
            n_seen += 1
    return reservoir

def create_subset(source_dir, dest_dir, 
    num_samples_per_subfolder=10, 
    seed=42
):
    """Copy a matched random subset of ``source_dir`` into ``dest_dir``.

    ``source_dir`` is walked two levels deep: each first-level folder name is
    used as the view type and each second-level folder name as the class
    (``n_arms``). File names are expected to follow ``<prefix>-<view>.png``.
    Prefixes are sampled once per class, from the first view in which that
    class is encountered, and reused for every other view so that all views
    contain the same samples.

    Parameters
    ----------
    source_dir : str or os.PathLike
        Root folder laid out as ``<view>/<n_arms>/<files>``.
    dest_dir : str or os.PathLike
        Output root; created if missing. The ``<view>/<n_arms>`` layout is
        mirrored underneath it.
    num_samples_per_subfolder : int, default 10
        Number of files to sample per ``n_arms`` folder.
    seed : int, default 42
        Seed passed to ``reservoir_sample_files``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``shutil.copy`` if a sampled prefix has no matching
        file in another view.
    """
    random.seed(seed)  # retained: also seeds the module-level RNG, as before
    source_dir = Path(source_dir)
    dest_dir = Path(dest_dir)
    # Make sure destination directory exists
    dest_dir.mkdir(parents=True, exist_ok=True)

    # n_arms -> prefixes sampled the first time that class is seen. Keyed
    # lookup (rather than a "first folder" flag) also covers a class that is
    # missing from the first view but present in a later one.
    prefix_dict = {}

    # Sorted iteration makes the choice of reference view deterministic.
    for top_folder in sorted(p for p in source_dir.iterdir() if p.is_dir()):
        view_type = top_folder.name
        for sub_folder in sorted(p for p in top_folder.iterdir() if p.is_dir()):
            n_arms = sub_folder.name
            if n_arms not in prefix_dict:
                prefix_dict[n_arms] = reservoir_sample_files(
                    sub_folder, num_samples_per_subfolder, seed
                )
            selected_files = [f'{prefix}-{view_type}.png' for prefix in prefix_dict[n_arms]]

            # Mirror the <view>/<n_arms> layout under the destination root.
            target_subfolder = dest_dir / sub_folder.relative_to(source_dir)
            target_subfolder.mkdir(parents=True, exist_ok=True)

            for filename in selected_files:
                shutil.copy(sub_folder / filename, target_subfolder)
            print(f"Copied {len(selected_files)} files to {target_subfolder}")
# Build the subset: sample 1,000 files per n_arms sub-folder and copy them from
# projections_root into subset_root, keeping the same relative folder layout.
create_subset(
    source_dir=projections_root, 
    dest_dir=subset_root, 
    num_samples_per_subfolder=1000
)

# Now re-organize the directory structure for the subset dataset

In [None]:
def flat_split_dataset(
    source_dir,
    dest_dir,
    split_ratio=(0.7, 0.15, 0.15),
    seed=42,
    valid_exts={".jpg", ".jpeg", ".png"}
):
    """Split a class-per-folder image dataset into flat train/val/test folders.

    Each immediate sub-directory of ``source_dir`` is treated as one class.
    Its images are split per class, so every class contributes to each split
    in the requested proportions, and copied into ``dest_dir/train``,
    ``dest_dir/val`` and ``dest_dir/test``. The output is flat: the class is
    not encoded in the destination path, so files with the same name in
    different classes would overwrite one another.

    Parameters
    ----------
    source_dir : str or os.PathLike
        Folder containing one sub-folder per class.
    dest_dir : str or os.PathLike
        Output root; the three split folders are created under it.
    split_ratio : tuple of float, default (0.7, 0.15, 0.15)
        Fractions for (train, val, test); must sum to 1.
    seed : int, default 42
        ``random_state`` passed to both ``train_test_split`` calls.
    valid_exts : set of str
        Lower-case file suffixes (with leading dot) that count as images.

    Raises
    ------
    AssertionError
        If ``split_ratio`` does not sum to 1 (within floating-point tolerance).
    """
    random.seed(seed)
    source_dir = Path(source_dir)
    dest_dir = Path(dest_dir)

    # Tolerant comparison: exact float equality can reject valid ratios
    # (e.g. 0.7 + 0.2 + 0.1 may evaluate to 0.9999999999999999).
    assert abs(sum(split_ratio) - 1.0) < 1e-9, "Split ratios must sum to 1.0"

    class_dirs = sorted(d for d in source_dir.iterdir() if d.is_dir())

    # Create split folders
    for split in ["train", "val", "test"]:
        (dest_dir / split).mkdir(parents=True, exist_ok=True)

    for class_dir in class_dirs:
        class_name = class_dir.name
        print(f"Processing class: {class_name}")

        # Sort the listing so the seeded split does not depend on the
        # filesystem's (arbitrary) directory order; keep regular files only.
        image_files = sorted(
            f for f in class_dir.iterdir()
            if f.is_file() and f.suffix.lower() in valid_exts
        )

        # Split into train and temp (val + test)
        train_files, temp_files = train_test_split(
            image_files,
            train_size=split_ratio[0],
            random_state=seed,
            shuffle=True,
        )

        # Split temp into val and test; val's share is relative to val + test.
        val_ratio = split_ratio[1] / (split_ratio[1] + split_ratio[2])
        val_files, test_files = train_test_split(
            temp_files,
            train_size=val_ratio,
            random_state=seed,
            shuffle=True,
        )

        split_map = {
            "train": train_files,
            "val": val_files,
            "test": test_files
        }

        for split, files in split_map.items():
            for file_path in files:
                dest_path = dest_dir / split / file_path.name
                shutil.copy(file_path, dest_path)

        print(f"  → {len(train_files)} train, {len(val_files)} val, {len(test_files)} test")

In [None]:
# Split the 'default' view of the subset into train/val/test folders using the
# function's default 70/15/15 ratio and seed.
flat_split_dataset(
    source_dir="/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000/default",
    dest_dir="/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000_default_split"
)

# Create labels file

In [None]:
# Load the merged table that holds the label values; later cells read its
# id, n_arms, rho_eff and sa_eff columns.
ros_data = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/data/ros-data-merged.txt'
df = pd.read_csv(ros_data)
# Preview the first rows.
df.head()

In [None]:
# List the columns available in the label table.
print(df.columns)

In [None]:
# Show the n_arms value at index label 0, converted to a string.
str(df.n_arms[0])

In [None]:
# Exploratory version of the label-building cell: list the PNGs and peek at one.
labels = []
ds_dir = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000_default_split'
ds_path = Path(ds_dir)
# Walk the tree recursively and keep only regular files whose suffix is
# .png (compared case-insensitively); directories are dropped.
all_files = [
    entry
    for entry in ds_path.rglob("*")
    if entry.is_file() and entry.suffix.lower() == ".png"
]
# Display the first match to check the path layout.
all_files[0]

In [None]:
# Check how the split name can be recovered from a file path: it is the
# second-to-last path component (the file's parent folder).
test = all_files[0]
test.as_posix().rsplit('/',2)[1]

In [None]:
# Collect the inputs for the labels file: an empty record list and the path of
# every PNG under the split dataset.
labels = []
ds_dir = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/subset_n1000_default_split'
ds_path = Path(ds_dir)
# Recursive walk; directories and non-.png files (suffix compared
# case-insensitively) are filtered out.
all_files = [
    entry
    for entry in ds_path.rglob("*")
    if entry.is_file() and entry.suffix.lower() == ".png"
]
def get_labels(filename, split, id, df): 
    """Look up the label values for one image and return them as a record.

    Parameters
    ----------
    filename : str
        Image file name; passed through to the record unchanged.
    split : str
        Split name; passed through to the record unchanged.
    id : str or int
        Sample id; converted with ``int()`` before matching ``df['id']``.
    df : pandas.DataFrame
        Table with ``id``, ``n_arms``, ``rho_eff`` and ``sa_eff`` columns.

    Returns
    -------
    list
        ``[filename, split, n_arms, rho_eff, sa_eff]`` taken from the first
        row whose ``id`` matches; ``n_arms`` is an integer rendered as a string.

    Raises
    ------
    IndexError
        If no row in ``df`` has the requested id.
    """
    sample_id = int(id)
    matches = df[df["id"] == sample_id]
    if matches.empty:
        # Same exception type as the bare .iloc[0] would raise, but with
        # enough context to find the offending file.
        raise IndexError(f"no row with id={sample_id} in df (needed for {filename!r})")
    data = matches.iloc[0]
    n_arms = str(int(data['n_arms'])) # string 
    rho_eff = data['rho_eff']
    sa_eff = data['sa_eff']
    return [filename, split, n_arms, rho_eff, sa_eff]
# Build one label record per image file and append it to `labels`.
for f in all_files:
    filename = f.name
    # The third '-'-separated token of the file name is the sample id
    # (e.g. '000000' in 'ros-projection-000000-000-phips.png').
    # Not named `id`: at module level that would shadow the builtin for the
    # rest of the kernel session.
    sample_id = filename.split('-')[2]
    # The parent folder name is the split the file was copied into.
    split = f.parent.name
    record = get_labels(filename, split, sample_id, df)
    labels.append(record)
# turn into dataframe and save as csv 
colnames = ['filename', 'split', 'n_arms', 'rho_eff', 'sa_eff']
df_labels = pd.DataFrame(labels, columns=colnames)
savepath = os.path.join(ds_dir, 'labels.csv')
df_labels.to_csv(savepath, index=False)

In [None]:
# Sanity check: read the saved labels file back from savepath and preview it.
df_labels = pd.read_csv(savepath)
df_labels.head()

# Scratch

In [None]:
# Scratch: open a single projection image from the phips view for inspection.
from PIL import Image
test_file = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/projections/phips/4/ros-projection-000000-000-phips.png'
image = Image.open(test_file)

In [None]:
# Report the image dimensions and its PIL mode string.
print(image.size)
print(image.mode)