In [1]:
import h5py
import pandas as pd
import os
import numpy as np

# Create sorted filename list

In [16]:
views = ['default', 'phips', '2ds']
save_dir = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/'
# Load the tabular data once; every view is sliced out of the same frame
data_path = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/ros-tabular-data.parquet'
df = pd.read_parquet(data_path)
for view in views:
    print(f"Processing view: {view}")
    # Rows belonging to this view, ordered by ros_id and then proj_id
    view_mask = df['view'] == view
    df_sorted = df[view_mask].sort_values(by=['ros_id', 'proj_id'])
    print('finished sorting...')
    # Filenames in sorted order, as a plain Python list
    filenames = df_sorted['filename'].tolist()
    # Write one filename per line to filenames_sorted_<view>.txt
    print(f"Saving filenames to {save_dir}")
    savepath = os.path.join(save_dir, f'filenames_sorted_{view}.txt')
    with open(savepath, 'w') as out_file:
        out_file.writelines(f"{filename}\n" for filename in filenames)

Processing view: default
finished sorting...
Saving filenames to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/
Processing view: phips
finished sorting...
Saving filenames to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/
Processing view: 2ds
finished sorting...
Saving filenames to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/


# Create randomly shuffled filenames

In [2]:
views = ['default', 'phips', '2ds']
save_dir = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/'
n_rand = 666
np.random.seed(n_rand)  # fixed seed so the shuffled order is reproducible
# read tabular data
data_path = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/ros-tabular-data.parquet'
df = pd.read_parquet(data_path)
# Size the permutation from the data rather than a hard-coded row count.
# It is drawn once and reused for every view, so line k of each output file
# points at the same rank in that view's (ros_id, proj_id) ordering.
n_per_view = int((df['view'] == views[0]).sum())
indices = np.arange(n_per_view)
shuffled_indices = np.random.permutation(indices)
for view in views:
    print(f"Processing view: {view}")
    df_subset = df[df['view']==view]
    # A shared permutation is only valid when every view has the same number
    # of rows: a shorter view would make .iloc raise IndexError and a longer
    # one would silently lose rows, so fail loudly with a clear message.
    if len(df_subset) != n_per_view:
        raise ValueError(
            f"view {view!r} has {len(df_subset)} rows, expected {n_per_view}"
        )
    # Sort, then replace the original row labels with a fresh 0..n-1 index
    df_sorted = df_subset.sort_values(by=['ros_id', 'proj_id']).reset_index(drop=True)
    print('finished sorting...')
    # get filenames as list, taken in the shuffled positional order
    filenames = df_sorted['filename'].iloc[shuffled_indices].tolist()
    print('finished shuffling...')
    # save as text file, one filename per line
    print(f"Saving filenames to {save_dir}")
    savepath = os.path.join(save_dir, f'filenames_shuffled_{view}.txt')
    with open(savepath, 'w') as f:
        f.writelines(f"{filename}\n" for filename in filenames)

Processing view: default
finished sorting...
finished shuffling...
Saving filenames to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/
Processing view: phips
finished sorting...
finished shuffling...
Saving filenames to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/
Processing view: 2ds
finished sorting...
finished shuffling...
Saving filenames to /glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/


# Examine h5 files

In [3]:
# Load the whole 'filenames' dataset from sorted/default_sorted.h5 and preview
# its first ten entries.
# NOTE(review): the recorded preview is identical to the one for default.h5 and
# is not in (ros_id, proj_id) order — confirm default_sorted.h5 was written as intended.
hdf_file = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/imgs-ml-ready/sorted/default_sorted.h5'
with h5py.File(hdf_file, mode='r') as h5_in:
    filenames = h5_in['filenames'][...]
filenames[:10]

array([b'ros-projection-062418-021-default.png',
       b'ros-projection-063447-032-default.png',
       b'ros-projection-062825-059-default.png',
       b'ros-projection-067703-057-default.png',
       b'ros-projection-067836-003-default.png',
       b'ros-projection-060345-051-default.png',
       b'ros-projection-061123-087-default.png',
       b'ros-projection-062664-088-default.png',
       b'ros-projection-063753-036-default.png',
       b'ros-projection-060758-005-default.png'], dtype='|S256')

In [4]:
# Load the whole 'filenames' dataset from default.h5 and preview its first ten entries
hdf_file = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/imgs-ml-ready/default.h5'
with h5py.File(hdf_file, mode='r') as h5_in:
    filenames = h5_in['filenames'][...]
filenames[:10]

array([b'ros-projection-062418-021-default.png',
       b'ros-projection-063447-032-default.png',
       b'ros-projection-062825-059-default.png',
       b'ros-projection-067703-057-default.png',
       b'ros-projection-067836-003-default.png',
       b'ros-projection-060345-051-default.png',
       b'ros-projection-061123-087-default.png',
       b'ros-projection-062664-088-default.png',
       b'ros-projection-063753-036-default.png',
       b'ros-projection-060758-005-default.png'], dtype='|S256')

In [12]:
# Load the whole 'filenames' dataset from phips.h5 and preview its first ten entries
hdf_file = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/imgs-ml-ready/phips.h5'
with h5py.File(hdf_file, mode='r') as h5_in:
    filenames = h5_in['filenames'][...]
filenames[:10]

array([b'ros-projection-068064-025-phips.png',
       b'ros-projection-063031-067-phips.png',
       b'ros-projection-060865-084-phips.png',
       b'ros-projection-068702-077-phips.png',
       b'ros-projection-063978-098-phips.png',
       b'ros-projection-065948-050-phips.png',
       b'ros-projection-063904-075-phips.png',
       b'ros-projection-066038-089-phips.png',
       b'ros-projection-067454-069-phips.png',
       b'ros-projection-067414-005-phips.png'], dtype='|S256')

In [2]:
# Load the full table, then keep only the rows whose view is 'default'
data_path = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/ros-tabular-data.parquet'
df = pd.read_parquet(data_path)
is_default = df['view'] == 'default'
df_default = df[is_default]
# Report (rows, columns) and show the first rows; the original row labels are kept
print(df_default.shape)
df_default.head()

(7000000, 23)


Unnamed: 0,filename,unique_id,ros_id,proj_id,view,a,c,f_r0,f_hp,f_h0,...,sa_eff,rho_eff,aspect_ratio,aspect_ratio_elip,extreme_pts,contour_area,contour_perimeter,area_ratio,complexity,circularity
1050000,ros-projection-024953-013-default.png,024953_013_default,24953,13,default,20.955594,97.69879,1.199427,0.957635,0.889027,...,0.219582,0.022356,0.94086,0.809387,48.63239,3836.5,655.452877,0.208787,0.804566,0.112218
1050001,ros-projection-024298-067-default.png,024298_067_default,24298,67,default,26.827208,78.268505,1.171522,1.127847,0.809171,...,0.351269,0.056389,0.97124,0.896102,51.867469,5894.0,609.528999,0.327295,0.722699,0.199357
1050002,ros-projection-029809-000-default.png,029809_000_default,29809,0,default,20.697343,80.169697,1.098286,0.839677,0.974711,...,0.316608,0.046194,0.596813,0.586093,43.754286,4282.0,521.989895,0.269727,0.697406,0.197484
1050003,ros-projection-028644-063-default.png,028644_063_default,28644,63,default,27.383479,107.252723,0.800426,1.083508,1.131781,...,0.279258,0.037315,0.898351,0.842052,48.900249,4558.0,572.357426,0.271461,0.732954,0.174844
1050004,ros-projection-025290-037-default.png,025290_037_default,25290,37,default,31.87249,150.829674,0.973202,1.15934,1.094094,...,0.207776,0.022235,0.725217,0.699946,52.065914,4285.0,680.239676,0.201858,0.793888,0.116369


In [3]:
# Order the default-view rows by ros_id, then by proj_id within each ros_id
sort_keys = ['ros_id', 'proj_id']
df_default_sorted = df_default.sort_values(by=sort_keys)
df_default_sorted.head()

Unnamed: 0,filename,unique_id,ros_id,proj_id,view,a,c,f_r0,f_hp,f_h0,...,sa_eff,rho_eff,aspect_ratio,aspect_ratio_elip,extreme_pts,contour_area,contour_perimeter,area_ratio,complexity,circularity
16418272,ros-projection-000000-000-default.png,000000_000_default,0,0,default,19.677777,26.434892,0.922732,1.120977,1.199872,...,0.471492,0.131525,0.760534,0.745769,51.118582,7602.5,489.310749,0.482908,0.543065,0.399022
16337892,ros-projection-000000-001-default.png,000000_001_default,0,1,default,19.677777,26.434892,0.922732,1.120977,1.199872,...,0.471492,0.131525,0.857027,0.919697,46.469748,6954.0,451.788883,0.473983,0.50514,0.428128
16237900,ros-projection-000000-002-default.png,000000_002_default,0,2,default,19.677777,26.434892,0.922732,1.120977,1.199872,...,0.471492,0.131525,0.697876,0.647976,49.808226,8278.0,514.925967,0.450684,0.53495,0.392325
16418320,ros-projection-000000-003-default.png,000000_003_default,0,3,default,19.677777,26.434892,0.922732,1.120977,1.199872,...,0.471492,0.131525,0.756598,0.84588,50.007499,7630.5,498.516807,0.446881,0.540699,0.385836
3718024,ros-projection-000000-004-default.png,000000_004_default,0,4,default,19.677777,26.434892,0.922732,1.120977,1.199872,...,0.471492,0.131525,0.866028,0.862108,44.601009,7094.0,456.676186,0.510381,0.523868,0.427449


In [4]:
# Pull the filename column of the sorted default-view table, preserving the sorted order
filenames_default = df_default_sorted.loc[:, 'filename'].values

In [5]:
import h5py
import numpy as np

hdf_file = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/imgs-ml-ready/default.h5'
with h5py.File(hdf_file, mode='r') as h5_in:
    # Stored as fixed-width byte strings; convert to str so they can be matched
    # against the filenames taken from the table
    filenames = h5_in['filenames'][...].astype(str)

# Map each filename to its position in the HDF5 file, then look up that position
# for every name in filenames_default (the table's sorted order).
# A name missing from the HDF5 file raises KeyError.
filename_to_index = dict(zip(filenames, range(len(filenames))))
indices = np.array(list(map(filename_to_index.__getitem__, filenames_default)))

In [6]:
# Put the HDF5 positions into increasing order for h5py, keeping the ordering
# (sort_idx) so the original arrangement can be restored afterwards
print(indices[:10])
sort_idx = indices.argsort()
print(sort_idx[:10])
indices_sorted = indices[sort_idx]
print(indices_sorted[:10])

[1645241 1170274 1004258 1673850 1514339 1856851 1224854 1405272 1123028
 1689300]
[6241821 6344732 6282559 6770357 6783603 6034551 6112387 6266488 6375336
 6075805]
[0 1 2 3 4 5 6 7 8 9]


In [7]:
# Argsort of the sort permutation gives its inverse, which undoes the sort above
unsort_idx = sort_idx.argsort()
print(unsort_idx[:10])

[1645241 1170274 1004258 1673850 1514339 1856851 1224854 1405272 1123028
 1689300]


In [9]:
# Sanity check: the sorted positions must be exactly 0..6_999_999, i.e. every
# HDF5 row is referenced once and only once
expected_positions = np.arange(7_000_000)
np.array_equal(indices_sorted, expected_positions)

True

In [None]:
# NOTE(review): disabled draft, never executed in this notebook. It would read
# each dataset at the increasing positions in `indices_sorted`; presumably the
# results would then need reordering with `unsort_idx` — confirm before enabling.
# with h5py.File(hdf_file, 'r') as f:
#     filenames_sorted = f['filenames'][indices_sorted]
#     images_sorted = f['images'][indices_sorted]
#     n_arms_sorted = f['n_arms'][indices_sorted]
#     rho_eff_sorted = f['rho_eff'][indices_sorted]
#     sa_eff_sorted = f['sa_eff'][indices_sorted]

In [12]:
# Invert the sort permutation and use it to arrange the HDF5 filenames in the
# same order as filenames_default
unsort_idx = sort_idx.argsort()
filenames_reordered = filenames[unsort_idx]
# Print the first 200 names to eyeball the resulting order
print(filenames_reordered[:200])

['ros-projection-000000-000-default.png'
 'ros-projection-000000-001-default.png'
 'ros-projection-000000-002-default.png'
 'ros-projection-000000-003-default.png'
 'ros-projection-000000-004-default.png'
 'ros-projection-000000-005-default.png'
 'ros-projection-000000-006-default.png'
 'ros-projection-000000-007-default.png'
 'ros-projection-000000-008-default.png'
 'ros-projection-000000-009-default.png'
 'ros-projection-000000-010-default.png'
 'ros-projection-000000-011-default.png'
 'ros-projection-000000-012-default.png'
 'ros-projection-000000-013-default.png'
 'ros-projection-000000-014-default.png'
 'ros-projection-000000-015-default.png'
 'ros-projection-000000-016-default.png'
 'ros-projection-000000-017-default.png'
 'ros-projection-000000-018-default.png'
 'ros-projection-000000-019-default.png'
 'ros-projection-000000-020-default.png'
 'ros-projection-000000-021-default.png'
 'ros-projection-000000-022-default.png'
 'ros-projection-000000-023-default.png'
 'ros-projection

In [8]:
# Load the full table, then keep only the rows whose view is '2ds'
data_path = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/ros-tabular-data.parquet'
df = pd.read_parquet(data_path)
is_2ds = df['view'] == '2ds'
df_2ds = df[is_2ds]
# Report (rows, columns) and show the first rows; the original row labels are kept
print(df_2ds.shape)
df_2ds.head()

(7000000, 23)


Unnamed: 0,filename,unique_id,ros_id,proj_id,view,a,c,f_r0,f_hp,f_h0,...,sa_eff,rho_eff,aspect_ratio,aspect_ratio_elip,extreme_pts,contour_area,contour_perimeter,area_ratio,complexity,circularity
0,ros-projection-007347-085-2ds.png,007347_085_2ds,7347,85,2ds,31.233891,84.989032,0.936455,0.818327,1.057092,...,0.31751,0.064757,0.53943,0.458701,51.799493,5456.5,445.345234,0.309982,0.505857,0.345725
1,ros-projection-009656-040-2ds.png,009656_040_2ds,9656,40,2ds,11.94591,49.107549,1.025795,0.915659,1.066461,...,0.194514,0.025743,0.376434,0.39195,44.749302,3315.5,395.144226,0.245474,0.571417,0.266838
2,ros-projection-002710-078-2ds.png,002710_078_2ds,2710,78,2ds,24.946133,63.507982,0.985212,1.110469,0.990414,...,0.317656,0.063475,0.853006,0.686887,58.403633,7549.0,633.244728,0.349796,0.681698,0.236568
3,ros-projection-006468-095-2ds.png,006468_095_2ds,6468,95,2ds,17.849968,25.95642,1.162849,0.909022,1.072718,...,0.481623,0.133458,0.62589,0.630274,48.327011,7489.5,483.605119,0.450465,0.522866,0.402421
4,ros-projection-004114-062-2ds.png,004114_062_2ds,4114,62,2ds,26.554307,73.456699,0.874707,0.939581,0.986178,...,0.282056,0.050547,0.647354,0.661365,54.861188,5718.5,538.859952,0.319061,0.651347,0.24748


In [9]:
# Load the full table, then keep only the rows whose view is 'phips'
data_path = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2/ros-tabular-data.parquet'
df = pd.read_parquet(data_path)
is_phips = df['view'] == 'phips'
df_phips = df[is_phips]
# Report (rows, columns) and show the first rows; the original row labels are kept
print(df_phips.shape)
df_phips.head()

(7000000, 23)


Unnamed: 0,filename,unique_id,ros_id,proj_id,view,a,c,f_r0,f_hp,f_h0,...,sa_eff,rho_eff,aspect_ratio,aspect_ratio_elip,extreme_pts,contour_area,contour_perimeter,area_ratio,complexity,circularity
210000,ros-projection-014594-074-phips.png,014594_074_phips,14594,74,phips,21.252757,75.52368,0.951059,0.996672,1.122356,...,0.268148,0.04044,0.663873,0.659896,46.483868,4553.0,531.705624,0.258692,0.683361,0.202379
210001,ros-projection-010571-039-phips.png,010571_039_phips,10571,39,phips,14.170335,30.619789,1.012859,1.096243,1.181607,...,0.369812,0.074645,0.923866,0.915664,48.046202,5830.5,560.139175,0.365484,0.692617,0.23352
210002,ros-projection-010822-027-phips.png,010822_027_phips,10822,27,phips,23.35117,55.754342,1.102658,0.87949,0.849975,...,0.363803,0.069088,0.972138,0.69265,47.840882,5622.5,487.60512,0.328526,0.58742,0.297169
210003,ros-projection-016829-064-phips.png,016829_064_phips,16829,64,phips,32.726273,70.239021,1.245951,0.897048,1.179614,...,0.397625,0.081532,0.93416,0.851817,46.365228,6569.5,550.256921,0.440168,0.672966,0.272654
210004,ros-projection-018460-080-phips.png,018460_080_phips,18460,80,phips,26.26575,37.368659,1.223532,0.837535,1.108414,...,0.528155,0.149016,0.809115,0.720523,49.736807,6992.0,479.73001,0.414137,0.527898,0.381784


In [10]:
# Show the smallest and largest original row label found in each per-view frame
for label, frame in (('df_default', df_default), ('df_phips', df_phips), ('df_2ds', df_2ds)):
    row_labels = frame.index
    print(f"{label} index range: {row_labels.min()} to {row_labels.max()}")

df_default index range: 1050000 to 20579999
df_phips index range: 210000 to 20999999
df_2ds index range: 0 to 20789999


In [11]:
# Count the distinct row labels in each per-view frame
for label, frame in (('df_default', df_default), ('df_phips', df_phips), ('df_2ds', df_2ds)):
    print(f"Number of unique index values in {label}: {frame.index.nunique()}")

Number of unique index values in df_default: 7000000
Number of unique index values in df_phips: 7000000
Number of unique index values in df_2ds: 7000000
