In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Set to True to force a full refresh of the data
full_refresh = True

In [3]:
repo_root = os.popen('git rev-parse --show-toplevel').read().strip()

In [4]:
data_folder = os.path.join(repo_root, 'multi-parameter-estimation', 'data')

# Get list of data directories
data_dirs = os.listdir(data_folder)
data_dirs = [d for d in data_dirs if os.path.isdir(os.path.join(data_folder, d))]

# skip old-data
if 'old-data' in data_dirs:
    data_dirs.remove('old-data')

data_dirs.sort()
data_dirs

['2025-05-30--14h-02m-13s',
 '2025-05-30--14h-03m-06s',
 '2025-05-30--14h-04m-07s',
 '2025-05-30--14h-05m-08s',
 '2025-05-30--14h-19m-25s',
 '2025-05-30--14h-22m-01s',
 '2025-05-30--14h-24m-36s',
 '2025-05-30--14h-27m-02s',
 '2025-05-30--14h-43m-17s',
 '2025-05-30--14h-48m-08s',
 '2025-05-30--14h-52m-48s',
 '2025-05-30--14h-57m-52s']

In [5]:
new_data_dirs = data_dirs.copy()

if not full_refresh:
    for d in data_dirs:
        if os.path.exists(os.path.join(data_folder, d, "chunked_coincidences_n=200.csv")):
            new_data_dirs.remove(d)

new_data_dirs

['2025-05-30--14h-02m-13s',
 '2025-05-30--14h-03m-06s',
 '2025-05-30--14h-04m-07s',
 '2025-05-30--14h-05m-08s',
 '2025-05-30--14h-19m-25s',
 '2025-05-30--14h-22m-01s',
 '2025-05-30--14h-24m-36s',
 '2025-05-30--14h-27m-02s',
 '2025-05-30--14h-43m-17s',
 '2025-05-30--14h-48m-08s',
 '2025-05-30--14h-52m-48s',
 '2025-05-30--14h-57m-52s']

In [6]:
def load_chunks(data_dir):
    if not os.path.exists(os.path.join(data_folder, data_dir, "chunked_coincidences_n=40.csv")):
        print(f"Skipping {data_dir} as it does not have the required file.")
        return pd.DataFrame()
    coincidences = pd.read_csv(os.path.join(data_folder, data_dir, "chunked_coincidences_n=40.csv"))
    coincidences["data_dir"] = data_dir
    return coincidences

chunks_df = pd.concat([load_chunks(d) for d in new_data_dirs], ignore_index=True)
chunks_df 

Unnamed: 0,data_dir,C,DB_H,DB_V,SB,N
0,2025-05-30--14h-02m-13s,2.0,37.0,0.0,0.0,39.0
1,2025-05-30--14h-02m-13s,0.0,39.0,0.0,0.0,39.0
2,2025-05-30--14h-02m-13s,0.5,40.0,0.0,0.0,40.5
3,2025-05-30--14h-02m-13s,1.0,38.0,0.0,0.0,39.0
4,2025-05-30--14h-02m-13s,0.5,40.0,0.0,0.0,40.5
...,...,...,...,...,...,...
1594,2025-05-30--14h-57m-52s,1.0,36.0,0.0,2.0,39.0
1595,2025-05-30--14h-57m-52s,0.0,37.0,0.0,3.5,40.5
1596,2025-05-30--14h-57m-52s,0.5,35.0,0.0,3.5,39.0
1597,2025-05-30--14h-57m-52s,0.5,35.0,0.0,4.0,39.5


In [7]:
# drop columns that are not needed
def k_wise_sum(group, k):
    # Drop last rows if not divisible by k
    n = len(group) - (len(group) % k)
    group = group.iloc[:n].reset_index(drop=True)
    # Sum every k rows
    kwise = group.groupby(group.index // k).sum()
    # Restore data_dir from the first row of each group
    kwise['data_dir'] = group['data_dir'].iloc[::k].values
    return kwise

# Example usage for k=3
df_80 = chunks_df.groupby('data_dir', group_keys=False).apply(lambda g: k_wise_sum(g, k=2)).reset_index(drop=True)
df_120 = chunks_df.groupby('data_dir', group_keys=False).apply(lambda g: k_wise_sum(g, k=3)).reset_index(drop=True)
df_160 = chunks_df.groupby('data_dir', group_keys=False).apply(lambda g: k_wise_sum(g, k=4)).reset_index(drop=True)
df_200 = chunks_df.groupby('data_dir', group_keys=False).apply(lambda g: k_wise_sum(g, k=5)).reset_index(drop=True)
df_200


  df_80 = chunks_df.groupby('data_dir', group_keys=False).apply(lambda g: k_wise_sum(g, k=2)).reset_index(drop=True)
  df_120 = chunks_df.groupby('data_dir', group_keys=False).apply(lambda g: k_wise_sum(g, k=3)).reset_index(drop=True)
  df_160 = chunks_df.groupby('data_dir', group_keys=False).apply(lambda g: k_wise_sum(g, k=4)).reset_index(drop=True)
  df_200 = chunks_df.groupby('data_dir', group_keys=False).apply(lambda g: k_wise_sum(g, k=5)).reset_index(drop=True)


Unnamed: 0,data_dir,C,DB_H,DB_V,SB,N
0,2025-05-30--14h-02m-13s,4.0,194.0,0.0,0.0,198.0
1,2025-05-30--14h-02m-13s,4.0,195.0,0.0,1.0,200.0
2,2025-05-30--14h-02m-13s,3.0,195.0,0.0,1.5,199.5
3,2025-05-30--14h-02m-13s,3.5,195.0,0.0,0.5,199.0
4,2025-05-30--14h-02m-13s,3.5,196.0,0.0,0.5,200.0
...,...,...,...,...,...,...
312,2025-05-30--14h-57m-52s,3.5,187.0,0.0,10.0,200.5
313,2025-05-30--14h-57m-52s,2.5,186.0,0.0,11.0,199.5
314,2025-05-30--14h-57m-52s,5.5,181.0,0.0,12.5,199.0
315,2025-05-30--14h-57m-52s,3.0,187.0,0.0,11.0,201.0


In [8]:
# save the dataframes to csv files
# save the dataframes to csv files based on the data_dir
for data_dir in new_data_dirs:
    df_80_subset = df_80[df_80["data_dir"] == data_dir]
    df_120_subset = df_120[df_120["data_dir"] == data_dir]
    df_160_subset = df_160[df_160["data_dir"] == data_dir]
    df_200_subset = df_200[df_200["data_dir"] == data_dir]
    df_80_subset.to_csv(os.path.join(data_folder, data_dir, "chunked_coincidences_n=80.csv"), index=False)
    df_120_subset.to_csv(os.path.join(data_folder, data_dir, "chunked_coincidences_n=120.csv"), index=False)
    df_160_subset.to_csv(os.path.join(data_folder, data_dir, "chunked_coincidences_n=160.csv"), index=False)
    df_200_subset.to_csv(os.path.join(data_folder, data_dir, "chunked_coincidences_n=200.csv"), index=False)