In [1]:
import pandas as pd
import os

In [2]:
# Name and directory of the input parquet table.
filename = 'ros-tabular-data.parquet'
filedir = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2'
# Build the full path, then read the whole table into a DataFrame.
filepath = os.path.join(filedir, filename)
df = pd.read_parquet(path=filepath)

In [3]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,filename,unique_id,ros_id,proj_id,view,a,c,f_r0,f_hp,f_h0,...,sa_eff,rho_eff,aspect_ratio,aspect_ratio_elip,extreme_pts,contour_area,contour_perimeter,area_ratio,complexity,circularity
0,ros-projection-007347-085-2ds.png,007347_085_2ds,7347,85,2ds,31.233891,84.989032,0.936455,0.818327,1.057092,...,0.31751,0.064757,0.53943,0.458701,51.799493,5456.5,445.345234,0.309982,0.505857,0.345725
1,ros-projection-009656-040-2ds.png,009656_040_2ds,9656,40,2ds,11.94591,49.107549,1.025795,0.915659,1.066461,...,0.194514,0.025743,0.376434,0.39195,44.749302,3315.5,395.144226,0.245474,0.571417,0.266838
2,ros-projection-002710-078-2ds.png,002710_078_2ds,2710,78,2ds,24.946133,63.507982,0.985212,1.110469,0.990414,...,0.317656,0.063475,0.853006,0.686887,58.403633,7549.0,633.244728,0.349796,0.681698,0.236568
3,ros-projection-006468-095-2ds.png,006468_095_2ds,6468,95,2ds,17.849968,25.95642,1.162849,0.909022,1.072718,...,0.481623,0.133458,0.62589,0.630274,48.327011,7489.5,483.605119,0.450465,0.522866,0.402421
4,ros-projection-004114-062-2ds.png,004114_062_2ds,4114,62,2ds,26.554307,73.456699,0.874707,0.939581,0.986178,...,0.282056,0.050547,0.647354,0.661365,54.861188,5718.5,538.859952,0.319061,0.651347,0.24748


In [6]:
# List every column name available in the loaded table.
print(df.columns)

Index(['filename', 'unique_id', 'ros_id', 'proj_id', 'view', 'a', 'c', 'f_r0',
       'f_hp', 'f_h0', 'n_arms', 'sa', 'vol', 'sa_eff', 'rho_eff',
       'aspect_ratio', 'aspect_ratio_elip', 'extreme_pts', 'contour_area',
       'contour_perimeter', 'area_ratio', 'complexity', 'circularity'],
      dtype='object')


In [4]:
# Split the table into one frame per value of the 'view' column.
df_default = df.loc[df['view'].eq('default')]
df_2ds = df.loc[df['view'].eq('2ds')]
df_phips = df.loc[df['view'].eq('phips')]

In [5]:
# Print the (rows, columns) shape of each per-view frame on a single line.
print(*(view_frame.shape for view_frame in (df_default, df_2ds, df_phips)))

(7000000, 23) (7000000, 23) (7000000, 23)


In [7]:
# Sort every per-view frame by ros_id first and proj_id second (ascending).
# One statement per frame so each old frame can be released before the next sort.
df_default = df_default.sort_values(by=['ros_id', 'proj_id'], ascending=True)
df_2ds = df_2ds.sort_values(by=['ros_id', 'proj_id'], ascending=True)
df_phips = df_phips.sort_values(by=['ros_id', 'proj_id'], ascending=True)

In [11]:
# Give each per-view frame a fresh 0..n-1 index; the previous index is
# discarded (drop=True) rather than kept as a column.
df_default = df_default.reset_index(drop=True, inplace=False)
df_2ds = df_2ds.reset_index(drop=True, inplace=False)
df_phips = df_phips.reset_index(drop=True, inplace=False)

In [13]:
# Build a string join key of the form '<ros_id>_<proj_id>' on every per-view frame.
df_default['key'] = df_default['ros_id'].astype(str).str.cat(df_default['proj_id'].astype(str), sep='_')
df_2ds['key'] = df_2ds['ros_id'].astype(str).str.cat(df_2ds['proj_id'].astype(str), sep='_')
df_phips['key'] = df_phips['ros_id'].astype(str).str.cat(df_phips['proj_id'].astype(str), sep='_')

In [15]:
# Columns kept for the ML tables: the join key plus the selected measurement columns.
columns_subset = [
    'key',
    'aspect_ratio', 'aspect_ratio_elip', 'extreme_pts',
    'contour_area', 'contour_perimeter', 'area_ratio',
    'complexity', 'circularity',
    'rho_eff', 'sa_eff', 'n_arms',
]
# Restrict each per-view frame to exactly those columns, in that order.
df_default_subset = df_default.loc[:, columns_subset]
df_2ds_subset = df_2ds.loc[:, columns_subset]
df_phips_subset = df_phips.loc[:, columns_subset]

In [24]:
# Pair each default-view row with the 2ds-view row that carries the same key.
# Columns present in both inputs receive the '_default' / '_2ds' suffixes.
# validate='one_to_one' makes pandas raise MergeError if a key is duplicated
# on either side, instead of silently emitting extra cross-matched rows.
df_default_2ds = pd.merge(
    df_default_subset,
    df_2ds_subset,
    on='key',
    how='inner',
    suffixes=('_default', '_2ds'),
    validate='one_to_one',
)
df_default_2ds.head()

Unnamed: 0,key,aspect_ratio_default,aspect_ratio_elip_default,extreme_pts_default,contour_area_default,contour_perimeter_default,area_ratio_default,complexity_default,circularity_default,rho_eff_default,...,aspect_ratio_elip_2ds,extreme_pts_2ds,contour_area_2ds,contour_perimeter_2ds,area_ratio_2ds,complexity_2ds,circularity_2ds,rho_eff_2ds,sa_eff_2ds,n_arms_2ds
0,0_0,0.760534,0.745769,51.118582,7602.5,489.310749,0.482908,0.543065,0.399022,0.131525,...,0.875256,44.225982,6677.0,433.646749,0.513359,0.504437,0.446189,0.131525,0.471492,4.0
1,0_1,0.857027,0.919697,46.469748,6954.0,451.788883,0.473983,0.50514,0.428128,0.131525,...,0.777229,49.061186,7555.0,513.788885,0.443472,0.570234,0.359646,0.131525,0.471492,4.0
2,0_2,0.697876,0.647976,49.808226,8278.0,514.925967,0.450684,0.53495,0.392325,0.131525,...,0.858947,48.515461,8718.5,533.144223,0.556797,0.588941,0.385445,0.131525,0.471492,4.0
3,0_3,0.756598,0.84588,50.007499,7630.5,498.516807,0.446881,0.540699,0.385836,0.131525,...,0.786811,51.515623,8002.5,515.712764,0.483161,0.567124,0.378111,0.131525,0.471492,4.0
4,0_4,0.866028,0.862108,44.601009,7094.0,456.676186,0.510381,0.523868,0.427449,0.131525,...,0.784926,48.189081,8015.5,526.315796,0.461753,0.574173,0.36362,0.131525,0.471492,4.0


In [25]:
# sa_eff, rho_eff and n_arms must match between the default and 2ds halves
# of every merged row; stop here if any of them differs.
for shared_col in ('sa_eff', 'rho_eff', 'n_arms'):
    assert df_default_2ds[f'{shared_col}_default'].equals(df_default_2ds[f'{shared_col}_2ds'])

In [26]:
# Tidy the merged default/2ds table in one chain:
#   1. drop the 2ds copies of sa_eff / rho_eff / n_arms,
#   2. strip the '_default' suffix from the copies that remain,
#   3. drop the join key, which is no longer needed.
df_default_2ds = (
    df_default_2ds
    .drop(columns=['sa_eff_2ds', 'rho_eff_2ds', 'n_arms_2ds'])
    .rename(columns={
        'sa_eff_default': 'sa_eff',
        'rho_eff_default': 'rho_eff',
        'n_arms_default': 'n_arms',
    })
    .drop(columns=['key'])
)
print(df_default_2ds.columns)

Index(['aspect_ratio_default', 'aspect_ratio_elip_default',
       'extreme_pts_default', 'contour_area_default',
       'contour_perimeter_default', 'area_ratio_default', 'complexity_default',
       'circularity_default', 'rho_eff', 'sa_eff', 'n_arms',
       'aspect_ratio_2ds', 'aspect_ratio_elip_2ds', 'extreme_pts_2ds',
       'contour_area_2ds', 'contour_perimeter_2ds', 'area_ratio_2ds',
       'complexity_2ds', 'circularity_2ds'],
      dtype='object')


In [28]:
# Show the (rows, columns) shape of the cleaned default/2ds table.
df_default_2ds.shape

(7000000, 19)

In [29]:
# Pair each default-view row with the phips-view row that carries the same key.
# Columns present in both inputs receive the '_default' / '_phips' suffixes.
# validate='one_to_one' makes pandas raise MergeError if a key is duplicated
# on either side, instead of silently emitting extra cross-matched rows.
df_default_phips = pd.merge(
    df_default_subset,
    df_phips_subset,
    on='key',
    how='inner',
    suffixes=('_default', '_phips'),
    validate='one_to_one',
)
# sa_eff, rho_eff and n_arms must match between the default and phips halves
# of every merged row; the message names the first column that does not.
for shared_col in ('sa_eff', 'rho_eff', 'n_arms'):
    assert df_default_phips[f'{shared_col}_default'].equals(
        df_default_phips[f'{shared_col}_phips']
    ), f'{shared_col} differs between default and phips views'
# Drop the phips copies of the shared columns, strip the '_default' suffix
# from the copies that remain, then drop the join key.
df_default_phips = (
    df_default_phips
    .drop(columns=['sa_eff_phips', 'rho_eff_phips', 'n_arms_phips'])
    .rename(columns={
        'sa_eff_default': 'sa_eff',
        'rho_eff_default': 'rho_eff',
        'n_arms_default': 'n_arms',
    })
    .drop(columns=['key'])
)
print(df_default_phips.columns)
print(df_default_phips.shape)

Index(['aspect_ratio_default', 'aspect_ratio_elip_default',
       'extreme_pts_default', 'contour_area_default',
       'contour_perimeter_default', 'area_ratio_default', 'complexity_default',
       'circularity_default', 'rho_eff', 'sa_eff', 'n_arms',
       'aspect_ratio_phips', 'aspect_ratio_elip_phips', 'extreme_pts_phips',
       'contour_area_phips', 'contour_perimeter_phips', 'area_ratio_phips',
       'complexity_phips', 'circularity_phips'],
      dtype='object')
(7000000, 19)


In [30]:
# Write both merged tables to parquet files in savedir, without the index.
savedir = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/tabular-data-v2'
for out_name, merged_frame in (
    ('ros-tabular-data-stereo-default-2ds.parquet', df_default_2ds),
    ('ros-tabular-data-stereo-default-phips.parquet', df_default_phips),
):
    merged_frame.to_parquet(os.path.join(savedir, out_name), index=False)