In [1]:
import pandas as pd
import os

In [2]:
# Read the complete tabular parquet file into `df`.
# Later cells split `df` by its 'view' column ('default', '2ds', 'phips').
filedir = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2'
filename = 'ros-tabular-data.parquet'
filepath = os.path.join(filedir, filename)
df = pd.read_parquet(filepath)

In [3]:
# Preview the first rows of the full table as a quick sanity check of the load
df.head()

Unnamed: 0,filename,unique_id,ros_id,proj_id,view,a,c,f_r0,f_hp,f_h0,...,sa_eff,rho_eff,aspect_ratio,aspect_ratio_elip,extreme_pts,contour_area,contour_perimeter,area_ratio,complexity,circularity
0,ros-projection-007347-085-2ds.png,007347_085_2ds,7347,85,2ds,31.233891,84.989032,0.936455,0.818327,1.057092,...,0.31751,0.064757,0.53943,0.458701,51.799493,5456.5,445.345234,0.309982,0.505857,0.345725
1,ros-projection-009656-040-2ds.png,009656_040_2ds,9656,40,2ds,11.94591,49.107549,1.025795,0.915659,1.066461,...,0.194514,0.025743,0.376434,0.39195,44.749302,3315.5,395.144226,0.245474,0.571417,0.266838
2,ros-projection-002710-078-2ds.png,002710_078_2ds,2710,78,2ds,24.946133,63.507982,0.985212,1.110469,0.990414,...,0.317656,0.063475,0.853006,0.686887,58.403633,7549.0,633.244728,0.349796,0.681698,0.236568
3,ros-projection-006468-095-2ds.png,006468_095_2ds,6468,95,2ds,17.849968,25.95642,1.162849,0.909022,1.072718,...,0.481623,0.133458,0.62589,0.630274,48.327011,7489.5,483.605119,0.450465,0.522866,0.402421
4,ros-projection-004114-062-2ds.png,004114_062_2ds,4114,62,2ds,26.554307,73.456699,0.874707,0.939581,0.986178,...,0.282056,0.050547,0.647354,0.661365,54.861188,5718.5,538.859952,0.319061,0.651347,0.24748


In [6]:
# List every column name in the full table
print(df.columns)

Index(['filename', 'unique_id', 'ros_id', 'proj_id', 'view', 'a', 'c', 'f_r0',
       'f_hp', 'f_h0', 'n_arms', 'sa', 'vol', 'sa_eff', 'rho_eff',
       'aspect_ratio', 'aspect_ratio_elip', 'extreme_pts', 'contour_area',
       'contour_perimeter', 'area_ratio', 'complexity', 'circularity'],
      dtype='object')


In [4]:
# Split the full table into one frame per value of the 'view' column.
df_default, df_2ds, df_phips = (
    df[df['view'] == view_name] for view_name in ('default', '2ds', 'phips')
)

In [5]:
# Show (rows, columns) for each per-view frame
print(df_default.shape, df_2ds.shape, df_phips.shape)

(7000000, 23) (7000000, 23) (7000000, 23)


In [7]:
# Within each view, arrange the rows by ros_id first and proj_id second.
df_default, df_2ds, df_phips = (
    view_df.sort_values(by=['ros_id', 'proj_id'])
    for view_df in (df_default, df_2ds, df_phips)
)

In [11]:
# Give every per-view frame a fresh 0..n-1 index, discarding the old one.
df_default, df_2ds, df_phips = (
    view_df.reset_index(drop=True)
    for view_df in (df_default, df_2ds, df_phips)
)

In [13]:
# Build a string merge key '<ros_id>_<proj_id>' on each per-view frame (added in place).
for view_df in (df_default, df_2ds, df_phips):
    view_df['key'] = view_df['ros_id'].astype(str) + '_' + view_df['proj_id'].astype(str)

In [15]:
# Restrict every view to the merge key plus the columns needed for ML.
columns_subset = [
    'key',
    'aspect_ratio',
    'aspect_ratio_elip',
    'extreme_pts',
    'contour_area',
    'contour_perimeter',
    'area_ratio',
    'complexity',
    'circularity',
    'rho_eff',
    'sa_eff',
    'n_arms',
]
df_default_subset, df_2ds_subset, df_phips_subset = (
    view_df[columns_subset] for view_df in (df_default, df_2ds, df_phips)
)

In [24]:
# Combine default and 2ds on key: each default-view row is paired with the 2ds
# row that has the same 'key'. Columns present in both frames get the suffixes
# '_default' / '_2ds'.
# validate='one_to_one' makes pandas raise MergeError if a key is duplicated on
# either side, which would otherwise silently multiply rows.
df_default_2ds = pd.merge(
    df_default_subset,
    df_2ds_subset,
    on='key',
    suffixes=('_default', '_2ds'),
    validate='one_to_one',
)
# The default inner join drops keys found in only one view; fail loudly if any were lost.
assert len(df_default_2ds) == len(df_default_subset) == len(df_2ds_subset), \
    'default and 2ds views do not share the same set of keys'
df_default_2ds.head()

Unnamed: 0,key,aspect_ratio_default,aspect_ratio_elip_default,extreme_pts_default,contour_area_default,contour_perimeter_default,area_ratio_default,complexity_default,circularity_default,rho_eff_default,...,aspect_ratio_elip_2ds,extreme_pts_2ds,contour_area_2ds,contour_perimeter_2ds,area_ratio_2ds,complexity_2ds,circularity_2ds,rho_eff_2ds,sa_eff_2ds,n_arms_2ds
0,0_0,0.760534,0.745769,51.118582,7602.5,489.310749,0.482908,0.543065,0.399022,0.131525,...,0.875256,44.225982,6677.0,433.646749,0.513359,0.504437,0.446189,0.131525,0.471492,4.0
1,0_1,0.857027,0.919697,46.469748,6954.0,451.788883,0.473983,0.50514,0.428128,0.131525,...,0.777229,49.061186,7555.0,513.788885,0.443472,0.570234,0.359646,0.131525,0.471492,4.0
2,0_2,0.697876,0.647976,49.808226,8278.0,514.925967,0.450684,0.53495,0.392325,0.131525,...,0.858947,48.515461,8718.5,533.144223,0.556797,0.588941,0.385445,0.131525,0.471492,4.0
3,0_3,0.756598,0.84588,50.007499,7630.5,498.516807,0.446881,0.540699,0.385836,0.131525,...,0.786811,51.515623,8002.5,515.712764,0.483161,0.567124,0.378111,0.131525,0.471492,4.0
4,0_4,0.866028,0.862108,44.601009,7094.0,456.676186,0.510381,0.523868,0.427449,0.131525,...,0.784926,48.189081,8015.5,526.315796,0.461753,0.574173,0.36362,0.131525,0.471492,4.0


In [25]:
# sa_eff, rho_eff and n_arms must be identical in the default and 2ds copies
# before one copy of each is dropped.
for shared_col in ('sa_eff', 'rho_eff', 'n_arms'):
    assert df_default_2ds[f'{shared_col}_default'].equals(df_default_2ds[f'{shared_col}_2ds'])

In [26]:
# Tidy the merged frame in one chain:
#   1. drop the 2ds copies of the columns verified to be identical,
#   2. strip the '_default' suffix from the surviving copies,
#   3. drop the merge key, which is no longer needed.
df_default_2ds = (
    df_default_2ds
    .drop(columns=['sa_eff_2ds', 'rho_eff_2ds', 'n_arms_2ds'])
    .rename(columns={
        'sa_eff_default': 'sa_eff',
        'rho_eff_default': 'rho_eff',
        'n_arms_default': 'n_arms',
    })
    .drop(columns=['key'])
)
print(df_default_2ds.columns)

Index(['aspect_ratio_default', 'aspect_ratio_elip_default',
       'extreme_pts_default', 'contour_area_default',
       'contour_perimeter_default', 'area_ratio_default', 'complexity_default',
       'circularity_default', 'rho_eff', 'sa_eff', 'n_arms',
       'aspect_ratio_2ds', 'aspect_ratio_elip_2ds', 'extreme_pts_2ds',
       'contour_area_2ds', 'contour_perimeter_2ds', 'area_ratio_2ds',
       'complexity_2ds', 'circularity_2ds'],
      dtype='object')


In [28]:
# Dimensions of the cleaned default+2ds frame
df_default_2ds.shape

(7000000, 19)

In [29]:
# Combine default and phips on key: each default-view row is paired with the
# phips row that has the same 'key'. Columns present in both frames get the
# suffixes '_default' / '_phips'.
# validate='one_to_one' makes pandas raise MergeError if a key is duplicated on
# either side, which would otherwise silently multiply rows.
df_default_phips = pd.merge(
    df_default_subset,
    df_phips_subset,
    on='key',
    suffixes=('_default', '_phips'),
    validate='one_to_one',
)
# The default inner join drops keys found in only one view; fail loudly if any were lost.
assert len(df_default_phips) == len(df_default_subset) == len(df_phips_subset), \
    'default and phips views do not share the same set of keys'
# check that sa_eff, rho_eff, and n_arms are the same for default and phips
assert df_default_phips['sa_eff_default'].equals(df_default_phips['sa_eff_phips'])
assert df_default_phips['rho_eff_default'].equals(df_default_phips['rho_eff_phips'])
assert df_default_phips['n_arms_default'].equals(df_default_phips['n_arms_phips'])
# delete the phips copies of sa_eff, rho_eff and n_arms (verified identical above)
df_default_phips = df_default_phips.drop(columns=['sa_eff_phips', 'rho_eff_phips', 'n_arms_phips'])
# rename the surviving copies back to their unsuffixed names
df_default_phips = df_default_phips.rename(columns={'sa_eff_default': 'sa_eff',
                                                    'rho_eff_default': 'rho_eff',
                                                    'n_arms_default': 'n_arms'})
# delete key column
df_default_phips = df_default_phips.drop(columns=['key'])
print(df_default_phips.columns)
print(df_default_phips.shape)

Index(['aspect_ratio_default', 'aspect_ratio_elip_default',
       'extreme_pts_default', 'contour_area_default',
       'contour_perimeter_default', 'area_ratio_default', 'complexity_default',
       'circularity_default', 'rho_eff', 'sa_eff', 'n_arms',
       'aspect_ratio_phips', 'aspect_ratio_elip_phips', 'extreme_pts_phips',
       'contour_area_phips', 'contour_perimeter_phips', 'area_ratio_phips',
       'complexity_phips', 'circularity_phips'],
      dtype='object')
(7000000, 19)


In [30]:
# Write both merged frames to parquet in savedir, without storing the index.
savedir = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2'
for out_name, merged_df in (
    ('ros-tabular-data-stereo-default-2ds.parquet', df_default_2ds),
    ('ros-tabular-data-stereo-default-phips.parquet', df_default_phips),
):
    merged_df.to_parquet(os.path.join(savedir, out_name), index=False)

# Create stereo tabular files from shuffled parquets

- Recreate the stereo (paired-view) data from the shuffled per-view parquet files, for good measure.
- Last edit: June 3, 2025

In [17]:
# default + 2ds
# Build a "stereo" table by placing the shuffled default-view rows next to the
# shuffled 2ds-view rows and saving the result as a new parquet file.
df1_path = '/home/jko/synth-ros-data/tabular-data-v2/shuffled/ros-tabular-data-shuffled-default.parquet'
df2_path = '/home/jko/synth-ros-data/tabular-data-v2/shuffled/ros-tabular-data-shuffled-2ds.parquet'
df1 = pd.read_parquet(df1_path)
df2 = pd.read_parquet(df2_path)
# combine dataframes (side by side); the suffixes keep the two views' columns apart
df1_renamed = df1.add_suffix('_1')
df2_renamed = df2.add_suffix('_2')
df_stereo = pd.concat([df1_renamed, df2_renamed], axis=1)
# pd.concat(axis=1) pairs rows purely by index, and the *_2 id columns are
# discarded below. Verify now that both files are in the same order so that
# mismatched stereo pairs cannot be written silently.
assert len(df_stereo) == len(df1) == len(df2), \
    'default and 2ds shuffled files differ in length or index'
for id_col in ('ros_id', 'proj_id'):
    assert (df_stereo[f'{id_col}_1'] == df_stereo[f'{id_col}_2']).all(), \
        f'{id_col} differs between the default and 2ds shuffled files'
# clean up columns: keep all view-1 columns of interest, but only the view label
# and the per-view shape features from view 2
columns_subset = ['ros_id_1', 'proj_id_1', 'view_1', 'a_1',
       'c_1', 'f_r0_1', 'f_hp_1', 'f_h0_1', 'n_arms_1', 'sa_1', 'vol_1',
       'sa_eff_1', 'rho_eff_1', 'aspect_ratio_1', 'aspect_ratio_elip_1',
       'extreme_pts_1', 'contour_area_1', 'contour_perimeter_1',
       'area_ratio_1', 'complexity_1', 'circularity_1', 'view_2', 'aspect_ratio_2', 'aspect_ratio_elip_2', 'extreme_pts_2',
       'contour_area_2', 'contour_perimeter_2', 'area_ratio_2', 'complexity_2',
       'circularity_2']
df_stereo = df_stereo[columns_subset]
# columns kept from view 1 only lose their '_1' suffix
rename_dict = {
    'ros_id_1':'ros_id',
    'proj_id_1':'proj_id',
    'a_1':'a',
    'c_1':'c',
    'f_r0_1':'f_r0',
    'f_hp_1':'f_hp',
    'f_h0_1':'f_h0',
    'n_arms_1':'n_arms',
    'sa_1':'sa',
    'vol_1':'vol',
    'sa_eff_1':'sa_eff',
    'rho_eff_1':'rho_eff'
}
df_stereo = df_stereo.rename(columns=rename_dict)
# save file
save_dir = '/home/jko/synth-ros-data/tabular-data-v2/shuffled'
filename = 'ros-tabular-data-stereo-default-2ds-shuffled.parquet'
filepath = os.path.join(save_dir, filename)
df_stereo.to_parquet(filepath, index=False)

In [18]:
# QA: reload the default+2ds stereo parquet from disk and preview its first rows
# NOTE: this rebinds `df`, which earlier in the notebook held the full table.
filepath = '/home/jko/synth-ros-data/tabular-data-v2/shuffled/ros-tabular-data-stereo-default-2ds-shuffled.parquet'
df = pd.read_parquet(filepath)
df.head()

Unnamed: 0,ros_id,proj_id,view_1,a,c,f_r0,f_hp,f_h0,n_arms,sa,...,circularity_1,view_2,aspect_ratio_2,aspect_ratio_elip_2,extreme_pts_2,contour_area_2,contour_perimeter_2,area_ratio_2,complexity_2,circularity_2
0,21990,26,default,17.45864,43.077772,1.189638,1.032805,1.137534,6.0,68249.972471,...,0.220227,2ds,0.905313,0.816939,54.446648,6762.5,611.251865,0.306524,0.673085,0.227445
1,46703,31,default,12.647542,51.060213,0.983185,0.862107,1.090951,8.0,61562.306164,...,0.119358,2ds,0.877923,0.846997,53.182322,6030.5,790.933108,0.325025,0.830911,0.121139
2,61326,60,default,20.868725,40.825651,0.94441,0.888785,1.03987,10.0,123963.219683,...,0.28281,2ds,0.934106,0.900084,46.420463,8761.0,668.724958,0.545181,0.734669,0.246189
3,12156,11,default,31.350973,133.259627,0.90835,0.800681,0.861034,5.0,295143.308252,...,0.169213,2ds,0.844804,0.693414,52.760662,4557.0,590.994948,0.219445,0.721485,0.163954
4,17163,91,default,24.711198,33.377526,1.135275,1.197088,1.100092,5.0,75146.531856,...,0.361445,2ds,0.853943,0.833836,46.103145,7650.5,536.634556,0.522619,0.632515,0.333843


In [19]:
# Inspect the last rows of the frame currently bound to `df`
df.tail()

Unnamed: 0,ros_id,proj_id,view_1,a,c,f_r0,f_hp,f_h0,n_arms,sa,...,circularity_1,view_2,aspect_ratio_2,aspect_ratio_elip_2,extreme_pts_2,contour_area_2,contour_perimeter_2,area_ratio_2,complexity_2,circularity_2
6999995,31486,6,default,19.79122,84.921713,1.005007,1.198505,0.898053,7.0,165413.503848,...,0.137772,2ds,0.769962,0.798301,54.635011,6076.5,742.967603,0.296554,0.797856,0.138332
6999996,16976,94,default,13.702366,78.161821,0.90632,1.040585,0.904197,5.0,67687.359131,...,0.12082,2ds,0.754176,0.496991,55.490427,3757.0,702.891477,0.178053,0.819785,0.09556
6999997,18753,73,default,32.290098,113.685144,1.019038,0.945371,1.098285,5.0,263548.399771,...,0.169437,2ds,0.856608,0.79064,53.553128,5312.5,650.849845,0.27084,0.75902,0.157597
6999998,45320,98,default,22.71874,111.681915,1.227355,1.063774,1.125782,8.0,268193.454791,...,0.100634,2ds,0.998154,0.883091,46.143797,3441.0,662.700572,0.230271,0.836721,0.09846
6999999,52328,76,default,18.545971,53.55982,1.058392,0.834564,1.065051,9.0,113323.863379,...,0.179387,2ds,0.864557,0.729803,44.712799,5798.0,607.470126,0.419211,0.757332,0.197441


In [22]:
# Column names of the frame currently bound to `df`
df.columns

Index(['ros_id', 'proj_id', 'view_1', 'a', 'c', 'f_r0', 'f_hp', 'f_h0',
       'n_arms', 'sa', 'vol', 'sa_eff', 'rho_eff', 'aspect_ratio_1',
       'aspect_ratio_elip_1', 'extreme_pts_1', 'contour_area_1',
       'contour_perimeter_1', 'area_ratio_1', 'complexity_1', 'circularity_1',
       'view_2', 'aspect_ratio_2', 'aspect_ratio_elip_2', 'extreme_pts_2',
       'contour_area_2', 'contour_perimeter_2', 'area_ratio_2', 'complexity_2',
       'circularity_2'],
      dtype='object')

In [23]:
# default + phips
# Build a "stereo" table by placing the shuffled default-view rows next to the
# shuffled phips-view rows and saving the result as a new parquet file.
df1_path = '/home/jko/synth-ros-data/tabular-data-v2/shuffled/ros-tabular-data-shuffled-default.parquet'
df2_path = '/home/jko/synth-ros-data/tabular-data-v2/shuffled/ros-tabular-data-shuffled-phips.parquet'
df1 = pd.read_parquet(df1_path)
df2 = pd.read_parquet(df2_path)
# combine dataframes (side by side); the suffixes keep the two views' columns apart
df1_renamed = df1.add_suffix('_1')
df2_renamed = df2.add_suffix('_2')
df_stereo = pd.concat([df1_renamed, df2_renamed], axis=1)
# pd.concat(axis=1) pairs rows purely by index, and the *_2 id columns are
# discarded below. Verify now that both files are in the same order so that
# mismatched stereo pairs cannot be written silently.
assert len(df_stereo) == len(df1) == len(df2), \
    'default and phips shuffled files differ in length or index'
for id_col in ('ros_id', 'proj_id'):
    assert (df_stereo[f'{id_col}_1'] == df_stereo[f'{id_col}_2']).all(), \
        f'{id_col} differs between the default and phips shuffled files'
# clean up columns: keep all view-1 columns of interest, but only the view label
# and the per-view shape features from view 2
columns_subset = ['ros_id_1', 'proj_id_1', 'view_1', 'a_1',
       'c_1', 'f_r0_1', 'f_hp_1', 'f_h0_1', 'n_arms_1', 'sa_1', 'vol_1',
       'sa_eff_1', 'rho_eff_1', 'aspect_ratio_1', 'aspect_ratio_elip_1',
       'extreme_pts_1', 'contour_area_1', 'contour_perimeter_1',
       'area_ratio_1', 'complexity_1', 'circularity_1', 'view_2', 'aspect_ratio_2', 'aspect_ratio_elip_2', 'extreme_pts_2',
       'contour_area_2', 'contour_perimeter_2', 'area_ratio_2', 'complexity_2',
       'circularity_2']
df_stereo = df_stereo[columns_subset]
# columns kept from view 1 only lose their '_1' suffix
rename_dict = {
    'ros_id_1':'ros_id',
    'proj_id_1':'proj_id',
    'a_1':'a',
    'c_1':'c',
    'f_r0_1':'f_r0',
    'f_hp_1':'f_hp',
    'f_h0_1':'f_h0',
    'n_arms_1':'n_arms',
    'sa_1':'sa',
    'vol_1':'vol',
    'sa_eff_1':'sa_eff',
    'rho_eff_1':'rho_eff'
}
df_stereo = df_stereo.rename(columns=rename_dict)
# save file
save_dir = '/home/jko/synth-ros-data/tabular-data-v2/shuffled'
filename = 'ros-tabular-data-stereo-default-phips-shuffled.parquet'
filepath = os.path.join(save_dir, filename)
df_stereo.to_parquet(filepath, index=False)

In [24]:
# QA: reload the default+phips stereo parquet from disk and preview its first rows
# NOTE: this rebinds `df` again, replacing the frame loaded by the previous QA cell.
filepath = '/home/jko/synth-ros-data/tabular-data-v2/shuffled/ros-tabular-data-stereo-default-phips-shuffled.parquet'
df = pd.read_parquet(filepath)
df.head()

Unnamed: 0,ros_id,proj_id,view_1,a,c,f_r0,f_hp,f_h0,n_arms,sa,...,circularity_1,view_2,aspect_ratio_2,aspect_ratio_elip_2,extreme_pts_2,contour_area_2,contour_perimeter_2,area_ratio_2,complexity_2,circularity_2
0,21990,26,default,17.45864,43.077772,1.189638,1.032805,1.137534,6.0,68249.972471,...,0.220227,phips,0.81115,0.768786,55.782502,6578.5,624.766587,0.334246,0.708487,0.211788
1,46703,31,default,12.647542,51.060213,0.983185,0.862107,1.090951,8.0,61562.306164,...,0.119358,phips,0.898018,0.822794,49.045738,5415.0,707.862039,0.293273,0.800444,0.135804
2,61326,60,default,20.868725,40.825651,0.94441,0.888785,1.03987,10.0,123963.219683,...,0.28281,phips,0.903549,0.764968,47.643861,9196.0,667.837657,0.559473,0.724344,0.2591
3,12156,11,default,31.350973,133.259627,0.90835,0.800681,0.861034,5.0,295143.308252,...,0.169213,phips,0.953307,0.687583,50.888972,4635.5,596.634557,0.23232,0.72983,0.16364
4,17163,91,default,24.711198,33.377526,1.135275,1.197088,1.100092,5.0,75146.531856,...,0.361445,phips,0.831544,0.847918,48.20529,8187.0,563.220343,0.517592,0.641265,0.324323


# Create subset of 700k

- Use the shuffled datasets.
- Take the first 700k rows of each file as the subset, assuming the data is adequately shuffled.

In [26]:
# Directory holding the full shuffled parquet files, and the directory that
# will receive the reduced copies.
shuffled_dir = '/home/jko/synth-ros-data/tabular-data-v2/shuffled'
dest_dir = '/home/jko/synth-ros-data/tabular-data-v2/shuffled_small'
# Files to subset: the three single-view tables and the two stereo tables.
filenames = ['ros-tabular-data-shuffled-2ds.parquet', 
'ros-tabular-data-stereo-default-2ds-shuffled.parquet',
'ros-tabular-data-shuffled-default.parquet',
'ros-tabular-data-stereo-default-phips-shuffled.parquet',
'ros-tabular-data-shuffled-phips.parquet']

In [29]:
# subset each full dataset
# For every file in `filenames`, keep only the first n_subset rows and write
# them to dest_dir as '<original stem>-subset-<n_subset>.parquet'.
n_subset = 700_000
# Create the destination up front so the write does not fail when the
# directory does not exist yet; a no-op if it is already there.
os.makedirs(dest_dir, exist_ok=True)
for fname in filenames:
    # splitext strips only the final extension, so a dot elsewhere in the
    # name does not truncate the output filename.
    name_no_ext = os.path.splitext(fname)[0]
    fpath = os.path.join(shuffled_dir, fname)
    df = pd.read_parquet(fpath)
    # head() returns the whole frame if it has fewer than n_subset rows
    df_subset = df.head(n_subset)
    out_fname = f'{name_no_ext}-subset-{n_subset}.parquet'
    out_filepath = os.path.join(dest_dir, out_fname)
    df_subset.to_parquet(out_filepath, index=False)

In [32]:
# QA: open every regular file found in dest_dir and print its first rows.
filepaths = []
for entry in os.listdir(dest_dir):
    candidate = os.path.join(dest_dir, entry)
    # skip sub-directories and anything else that is not a plain file
    if os.path.isfile(candidate):
        filepaths.append(candidate)

for f in filepaths:
    df = pd.read_parquet(f)
    print(f'Checking {f}...')
    print(df.head())

Checking /home/jko/synth-ros-data/tabular-data-v2/shuffled_small/ros-tabular-data-shuffled-2ds-subset-700000.parquet...
                            filename       unique_id  ros_id  proj_id view  \
0  ros-projection-021990-026-2ds.png  021990_026_2ds   21990       26  2ds   
1  ros-projection-046703-031-2ds.png  046703_031_2ds   46703       31  2ds   
2  ros-projection-061326-060-2ds.png  061326_060_2ds   61326       60  2ds   
3  ros-projection-012156-011-2ds.png  012156_011_2ds   12156       11  2ds   
4  ros-projection-017163-091-2ds.png  017163_091_2ds   17163       91  2ds   

           a           c      f_r0      f_hp      f_h0  ...    sa_eff  \
0  17.458640   43.077772  1.189638  1.032805  1.137534  ...  0.370000   
1  12.647542   51.060213  0.983185  0.862107  1.090951  ...  0.336052   
2  20.868725   40.825651  0.944410  0.888785  1.039870  ...  0.689925   
3  31.350973  133.259627  0.908350  0.800681  0.861034  ...  0.231581   
4  24.711198   33.377526  1.135275  1.197088  