In [3]:
import pandas as pd 
import os

April 18, 2025: Not all 1 million files were generated. We need to find the files that were not generated and then resume generation from where we left off.

In [14]:
# Directory that holds the projection images, with one subfolder per entry in `subfolders`
root_dir = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/projections-v2'
subfolders = ['default', '2ds', 'phips']
# Filled further down: subfolder name -> list of '<id>-<n_rand>' strings found under it
files_dict = dict()
def get_files(folder):
    """Collect the '<id>-<n_rand>' key of every projection file under ``folder``.

    Walks ``folder`` recursively. For a file named like
    'ros-projection-002275-043-default.png' the third and fourth
    dash-separated fields are joined into the key '002275-043'.

    Files whose names have fewer than four dash-separated fields (hidden
    files, logs, ...) are skipped instead of raising IndexError.

    Parameters
    ----------
    folder : str
        Directory to walk. A missing directory yields an empty list,
        because os.walk ignores listing errors by default.

    Returns
    -------
    list of str
        One key per matching file, in os.walk order; duplicates are kept.
    """
    files = []
    for _dirpath, _dirnames, filenames in os.walk(folder):
        for filename in filenames:
            # os.walk already yields bare file names, so no path stripping is needed
            parts = filename.split('-')      # ['ros', 'projection', '002275', '043', 'default.png']
            if len(parts) < 4:
                continue                     # not a projection file name
            files.append(f"{parts[2]}-{parts[3]}")   # '002275-043'
    return files
# Walk each subfolder of root_dir once and store the keys found there under the subfolder's name
for f in subfolders:
    folder = os.path.join(root_dir, f)
    files_dict[f] = get_files(folder)

In [17]:
# Number of keys collected per subfolder, one count per line
print(*(len(files_dict[name]) for name in ('default', '2ds', 'phips')), sep='\n')

913947
913897
913896


In [16]:
# Keys that appear in all three subfolders, i.e. files that were fully processed
common_ids = list(
    set.intersection(*(set(files_dict[name]) for name in ('default', '2ds', 'phips')))
)
len(common_ids)

913896

In [20]:
# Order the keys by their two dash-separated fields, compared as integers
sorted_ids = sorted(
    common_ids,
    key=lambda key: tuple(int(part) for part in key.split('-')),
)
sorted_ids[:10]

['000000-000',
 '000000-001',
 '000000-002',
 '000000-003',
 '000000-004',
 '000000-005',
 '000000-006',
 '000000-007',
 '000000-008',
 '000000-009']

In [29]:
from collections import defaultdict

# Keys present in all three subfolders, as strings like '002042-019'
data = sorted_ids

# Step 1: Build mapping from ID → set of n_rand values seen for that ID
id_to_nrand = defaultdict(set)
for entry in data:
    id_part, n_rand = entry.split('-')
    id_to_nrand[id_part].add(n_rand)

# Step 2: Expected sets (10,000 IDs x 100 n_rand values = 1,000,000 files)
expected_ids = {f"{i:06d}" for i in range(10000)}         # 000000 to 009999
expected_nrand = {f"{i:03d}" for i in range(100)}         # 000 to 099

# Step 3: Find IDs that are missing entirely or lack some n_rand values.
# Iterate in sorted order: expected_ids is a set, so iterating it directly
# gives an arbitrary order that can differ between kernel restarts, which is
# unhelpful for a list meant to drive a resume.
# .get() returns None for an absent ID (never equal to expected_nrand) and,
# unlike indexing, does not insert an empty entry into the defaultdict.
incomplete_ids = [
    id_ for id_ in sorted(expected_ids)
    if id_to_nrand.get(id_) != expected_nrand
]

print(f"Number of incomplete IDs: {len(incomplete_ids)}")
print(len(incomplete_ids))

Number of incomplete IDs: 888
888


In [30]:
# Sanity check against the 1,000,000 expected files. Uses the computed counts
# instead of a literal copied from an earlier output, so it stays correct on re-run.
# This is an upper bound: an ID that is present but incomplete contributes its
# existing files via common_ids and a full set of n_rand values here as well.
len(incomplete_ids) * len(expected_nrand) + len(common_ids)

1002696

# Create new ros-data-merged.txt

Merge the old ros-data table with the new one and save the result as `ros-data-merged-v2.txt` in the `data` directory.

In [9]:
import os
import pandas as pd

In [1]:
# Old merged table lives under data/, the new one under data-v2/
path_old = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/data/ros-data-merged.txt'
path_new = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/data-v2/ros-data-merged.txt'
# Read both files with the default CSV settings
df_old, df_new = (pd.read_csv(path) for path in (path_old, path_new))

In [2]:
# Quick look at the old table: (rows, columns) first, then the first five rows
print(df_old.shape)
df_old.head()

(70000, 11)


Unnamed: 0,id,a,c,f_r0,f_hp,f_h0,n_arms,sa,vol,sa_eff,rho_eff
0,35000,13.817545,23.540122,0.867449,0.961225,0.943525,7,33247.371151,181231.94769,0.57603,0.138993
1,35001,13.817545,23.540122,0.867449,0.961225,0.943525,7,33022.961732,184536.198465,0.560433,0.137205
2,35002,13.817545,23.540122,0.867449,0.961225,0.943525,7,33530.598131,187678.178569,0.585745,0.145727
3,35003,13.817545,23.540122,0.867449,0.961225,0.943525,7,32592.037777,166089.393327,0.543729,0.120358
4,35004,13.817545,23.540122,0.867449,0.961225,0.943525,7,32892.678757,179534.751064,0.583751,0.142747


In [4]:
# Quick look at the new table: (rows, columns) first, then the first five rows
print(df_new.shape)
df_new.head()

(10000, 11)


Unnamed: 0,id,a,c,f_r0,f_hp,f_h0,n_arms,sa,vol,sa_eff,rho_eff
0,6000,18.853575,44.355912,0.803497,0.950749,1.093555,4,44934.342645,329883.10055,0.362739,0.080466
1,6001,18.853575,44.355912,0.803497,0.950749,1.093555,4,42663.086171,308125.52866,0.317901,0.066652
2,6002,18.853575,44.355912,0.803497,0.950749,1.093555,4,45063.01673,340197.868636,0.345141,0.076687
3,6003,18.853575,44.355912,0.803497,0.950749,1.093555,4,47337.486621,361178.174563,0.339395,0.073739
4,6004,18.853575,44.355912,0.803497,0.950749,1.093555,4,45971.094763,371909.871954,0.389581,0.097574


In [6]:
# Keep every row of df_old whose n_arms is not 4, then append all rows of df_new.
# Boolean indexing already returns a new frame, so df_old itself is left untouched.
df = pd.concat(
    [df_old.loc[df_old['n_arms'] != 4], df_new],
    ignore_index=True,
)
df.shape

(70000, 11)

In [7]:
# Number of distinct values in the id column (a NaN, if present, counts once)
df['id'].nunique(dropna=False)

70000

In [10]:
# Write the merged table into the same data/ directory as the old file, under a new name
filename = 'ros-data-merged-v2.txt'
save_dir = '/glade/derecho/scratch/joko/synth-ros/params_200_50_20250403/data'
filepath = os.path.join(save_dir, filename)
# Comma is to_csv's default separator; index=False keeps the row index out of the file
df.to_csv(filepath, index=False)