In [14]:
import pandas as pd
import numpy as np
import os

In [15]:
# Set to True to force a full refresh of the data.
# When False, data directories that already contain a saved estimators file
# are dropped from the list of directories to process further down.
full_refresh = True
# Correction factor: the delta_phi estimators below scale the
# (SB - C') / (SB + C') contrast by 1 / eta before taking the arccos.
# NOTE(review): presumably a measured interference visibility -- confirm its origin.
eta = 0.985

# Load data

In [16]:
# Ask git for the top-level directory of the repository.
# The pipe is opened in a `with` block so it is closed as soon as the output
# has been read instead of being left open for the lifetime of the kernel.
# If the command prints nothing on stdout, repo_root ends up as ''.
with os.popen('git rev-parse --show-toplevel') as git_pipe:
    repo_root = git_pipe.read().strip()

In [17]:
# Folder that contains the data directories
data_folder = os.path.join(repo_root, 'multi-parameter-estimation', 'data')

# Keep sub-directories only, skip 'old-data', and order the names alphabetically
data_dirs = sorted(
    entry
    for entry in os.listdir(data_folder)
    if entry != 'old-data' and os.path.isdir(os.path.join(data_folder, entry))
)
data_dirs

['2025-06-03--16h-33m-47s',
 '2025-06-03--16h-35m-37s',
 '2025-06-03--16h-37m-20s',
 '2025-06-03--16h-39m-05s',
 '2025-06-03--16h-40m-50s',
 '2025-06-03--16h-42m-36s',
 '2025-06-03--16h-44m-21s',
 '2025-06-03--16h-46m-06s',
 '2025-06-03--16h-47m-52s',
 '2025-06-03--16h-49m-39s',
 '2025-06-03--16h-51m-26s',
 '2025-06-03--16h-53m-14s',
 '2025-06-03--16h-54m-59s',
 '2025-06-03--16h-56m-47s',
 '2025-06-03--16h-58m-36s',
 '2025-06-03--17h-00m-23s',
 '2025-06-03--17h-02m-09s',
 '2025-06-03--17h-03m-56s',
 '2025-06-03--17h-05m-43s',
 '2025-06-03--17h-07m-28s',
 '2025-06-03--17h-09m-15s',
 '2025-06-03--17h-11m-02s',
 '2025-06-03--17h-12m-49s',
 '2025-06-03--17h-14m-36s',
 '2025-06-03--17h-16m-22s',
 '2025-06-03--17h-18m-08s',
 '2025-06-03--17h-19m-55s',
 '2025-06-03--17h-21m-56s',
 '2025-06-03--17h-23m-44s',
 '2025-06-03--17h-25m-29s',
 '2025-06-03--17h-27m-14s',
 '2025-06-03--17h-29m-01s',
 '2025-06-03--17h-30m-48s',
 '2025-06-03--17h-32m-37s',
 '2025-06-03--17h-34m-23s',
 '2025-06-03--17h-36

In [18]:
# Decide which data directories need (re)processing.
# With full_refresh every directory is processed. Otherwise a directory is
# skipped when it already holds "corrected_estimators.csv" -- the file this
# notebook writes in its save step. The previous check looked for
# "estimators.csv", which this notebook never writes.
new_data_dirs = [
    d
    for d in data_dirs
    if full_refresh
    or not os.path.exists(os.path.join(data_folder, d, "corrected_estimators.csv"))
]

new_data_dirs

['2025-06-03--16h-33m-47s',
 '2025-06-03--16h-35m-37s',
 '2025-06-03--16h-37m-20s',
 '2025-06-03--16h-39m-05s',
 '2025-06-03--16h-40m-50s',
 '2025-06-03--16h-42m-36s',
 '2025-06-03--16h-44m-21s',
 '2025-06-03--16h-46m-06s',
 '2025-06-03--16h-47m-52s',
 '2025-06-03--16h-49m-39s',
 '2025-06-03--16h-51m-26s',
 '2025-06-03--16h-53m-14s',
 '2025-06-03--16h-54m-59s',
 '2025-06-03--16h-56m-47s',
 '2025-06-03--16h-58m-36s',
 '2025-06-03--17h-00m-23s',
 '2025-06-03--17h-02m-09s',
 '2025-06-03--17h-03m-56s',
 '2025-06-03--17h-05m-43s',
 '2025-06-03--17h-07m-28s',
 '2025-06-03--17h-09m-15s',
 '2025-06-03--17h-11m-02s',
 '2025-06-03--17h-12m-49s',
 '2025-06-03--17h-14m-36s',
 '2025-06-03--17h-16m-22s',
 '2025-06-03--17h-18m-08s',
 '2025-06-03--17h-19m-55s',
 '2025-06-03--17h-21m-56s',
 '2025-06-03--17h-23m-44s',
 '2025-06-03--17h-25m-29s',
 '2025-06-03--17h-27m-14s',
 '2025-06-03--17h-29m-01s',
 '2025-06-03--17h-30m-48s',
 '2025-06-03--17h-32m-37s',
 '2025-06-03--17h-34m-23s',
 '2025-06-03--17h-36

In [19]:
def load_chunks(data_dir, n):
    """Read the chunked-coincidence CSV of one data directory for a given ``n``.

    Looks for ``corrected_chunked_coincidences_n={n}.csv`` inside
    ``data_folder/data_dir``. When the file is absent a message is printed and
    an empty DataFrame is returned; otherwise the CSV contents are returned
    with an extra ``data_dir`` column holding ``data_dir``.
    """
    csv_path = os.path.join(data_folder, data_dir, f"corrected_chunked_coincidences_n={n}.csv")
    if os.path.exists(csv_path):
        frame = pd.read_csv(csv_path)
        # Tag every row with the directory it was read from
        frame["data_dir"] = data_dir
        return frame
    print(f"Skipping {data_dir} n={n} as file does not exist.")
    return pd.DataFrame()

# Load every (n, data directory) combination: for each n the directories are
# stacked first, then the per-n frames are stacked into one table with a
# fresh integer index.
per_n_frames = [
    pd.concat([load_chunks(run_dir, n) for run_dir in new_data_dirs])
    for n in (40, 80, 120, 160, 200)
]
chunks_df = pd.concat(per_n_frames, ignore_index=True)
chunks_df

Unnamed: 0,data_dir,C',HH,SB,VV,N
0,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0
1,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0
2,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0
3,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0
4,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0
...,...,...,...,...,...,...
3813678,2025-06-03--20h-24m-15s,99.0,52.0,3.0,46.0,200.0
3813679,2025-06-03--20h-24m-15s,96.0,55.0,3.0,46.0,200.0
3813680,2025-06-03--20h-24m-15s,106.0,47.0,1.0,46.0,200.0
3813681,2025-06-03--20h-24m-15s,78.0,75.0,3.0,44.0,200.0


# All samples

In [20]:
# Sum the N == 40 rows of each data directory into one aggregate row per directory
is_n40 = chunks_df["N"].eq(40)
summed_per_dir = chunks_df.loc[is_n40].groupby("data_dir").sum()
all_samples = summed_per_dir.reset_index()

# Append the aggregate rows after the existing rows.
# NOTE: chunks_df is reassigned here, so running this cell again appends the
# aggregate rows a second time.
chunks_df = pd.concat([chunks_df, all_samples], ignore_index=True)
chunks_df

Unnamed: 0,data_dir,C',HH,SB,VV,N
0,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0
1,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0
2,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0
3,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0
4,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0
...,...,...,...,...,...,...
3813808,2025-06-03--20h-17m-09s,210784.0,137746.0,25241.0,121229.0,495000.0
3813809,2025-06-03--20h-18m-55s,218136.0,137965.0,18121.0,122538.0,496760.0
3813810,2025-06-03--20h-20m-41s,221794.0,136637.0,12866.0,121503.0,492800.0
3813811,2025-06-03--20h-22m-29s,224729.0,136514.0,9374.0,121863.0,492480.0


130

# Estimate

In [21]:
# theta estimate: arccos of the HH/VV imbalance normalised by N, (HH - VV) / N
hh_vv_imbalance = chunks_df["HH"].sub(chunks_df["VV"]).div(chunks_df["N"])
chunks_df["theta_estimate"] = np.arccos(hh_vv_imbalance)

chunks_df

Unnamed: 0,data_dir,C',HH,SB,VV,N,theta_estimate
0,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000
1,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000
2,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000
3,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000
4,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000
...,...,...,...,...,...,...,...
3813808,2025-06-03--20h-17m-09s,210784.0,137746.0,25241.0,121229.0,495000.0,1.537422
3813809,2025-06-03--20h-18m-55s,218136.0,137965.0,18121.0,122538.0,496760.0,1.539736
3813810,2025-06-03--20h-20m-41s,221794.0,136637.0,12866.0,121503.0,492800.0,1.540081
3813811,2025-06-03--20h-22m-29s,224729.0,136514.0,9374.0,121863.0,492480.0,1.541043


In [22]:
# delta_phi estimate: half the arccos of the (SB - C') / (SB + C') contrast
# scaled by 1 / eta. No clipping is applied here, so an argument outside
# [-1, 1] comes out of arccos as NaN.
sb_counts = chunks_df["SB"]
c_counts = chunks_df["C'"]
scaled_contrast = 1 / eta * (sb_counts - c_counts) / (sb_counts + c_counts)
chunks_df["delta_phi_estimate"] = 1 / 2 * np.arccos(scaled_contrast)

chunks_df

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,data_dir,C',HH,SB,VV,N,theta_estimate,delta_phi_estimate
0,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000,
1,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000,
2,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000,
3,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000,
4,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000,
...,...,...,...,...,...,...,...,...
3813808,2025-06-03--20h-17m-09s,210784.0,137746.0,25241.0,121229.0,495000.0,1.537422,1.247455
3813809,2025-06-03--20h-18m-55s,218136.0,137965.0,18121.0,122538.0,496760.0,1.539736,1.302535
3813810,2025-06-03--20h-20m-41s,221794.0,136637.0,12866.0,121503.0,492800.0,1.540081,1.349801
3813811,2025-06-03--20h-22m-29s,224729.0,136514.0,9374.0,121863.0,492480.0,1.541043,1.388017


In [23]:
# Same formula as delta_phi_estimate, except that the arccos argument is
# clipped to [-1, 1] first.
sb_counts = chunks_df["SB"]
c_counts = chunks_df["C'"]
clipped_contrast = np.clip(
    1 / eta * (sb_counts - c_counts) / (sb_counts + c_counts), -1, 1
)
chunks_df["clipped_delta_phi_estimate"] = 1 / 2 * np.arccos(clipped_contrast)

chunks_df

Unnamed: 0,data_dir,C',HH,SB,VV,N,theta_estimate,delta_phi_estimate,clipped_delta_phi_estimate
0,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000,,
1,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000,,
2,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000,,
3,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000,,
4,2025-06-03--16h-33m-47s,0.0,40.0,0.0,0.0,40.0,0.000000,,
...,...,...,...,...,...,...,...,...,...
3813808,2025-06-03--20h-17m-09s,210784.0,137746.0,25241.0,121229.0,495000.0,1.537422,1.247455,1.247455
3813809,2025-06-03--20h-18m-55s,218136.0,137965.0,18121.0,122538.0,496760.0,1.539736,1.302535,1.302535
3813810,2025-06-03--20h-20m-41s,221794.0,136637.0,12866.0,121503.0,492800.0,1.540081,1.349801,1.349801
3813811,2025-06-03--20h-22m-29s,224729.0,136514.0,9374.0,121863.0,492480.0,1.541043,1.388017,1.388017


In [24]:
# Non-null entries per column for one data directory
in_selected_dir = chunks_df["data_dir"] == "2025-06-03--20h-15m-22s"
chunks_df[in_selected_dir].count()

data_dir                      28244
C'                            28244
HH                            28244
SB                            28244
VV                            28244
N                             28244
theta_estimate                28244
delta_phi_estimate            27530
clipped_delta_phi_estimate    28244
dtype: int64

# Save

In [25]:
from concurrent.futures import ThreadPoolExecutor
import os

# Split chunks_df once into a {data_dir: sub-frame} lookup used when saving
grouped_chunks = {
    dir_name: dir_frame for dir_name, dir_frame in chunks_df.groupby("data_dir")
}

def save_estimators(data_dir):
    """Write the rows of one data directory to its ``corrected_estimators.csv``.

    The rows are taken from ``grouped_chunks[data_dir]`` and written without
    the index to ``data_folder/data_dir/corrected_estimators.csv``. If
    ``data_dir`` has no entry in ``grouped_chunks`` nothing is written. A
    message is printed in both cases.
    """
    if data_dir not in grouped_chunks:
        print(f"No data to save for {data_dir}")
        return
    output_file = os.path.join(data_folder, data_dir, "corrected_estimators.csv")
    grouped_chunks[data_dir].to_csv(output_file, index=False)
    print(f"Saved estimators for {data_dir} to {output_file}")

# Use threading to speed up saving.
# executor.map only re-raises a worker's exception when its result is read,
# so the results are drained with list(); a failed save now raises here
# instead of passing silently.
with ThreadPoolExecutor(max_workers=8) as executor:
    list(executor.map(save_estimators, new_data_dirs))


Saved estimators for 2025-06-03--16h-35m-37s to /home/jh115/Heriot-Watt University Team Dropbox/RES_EPS_EMQL/projects/multi-parameter-estimation/multi-parameter-estimation/data/2025-06-03--16h-35m-37s/corrected_estimators.csv
Saved estimators for 2025-06-03--16h-33m-47s to /home/jh115/Heriot-Watt University Team Dropbox/RES_EPS_EMQL/projects/multi-parameter-estimation/multi-parameter-estimation/data/2025-06-03--16h-33m-47s/corrected_estimators.csv
Saved estimators for 2025-06-03--16h-37m-20s to /home/jh115/Heriot-Watt University Team Dropbox/RES_EPS_EMQL/projects/multi-parameter-estimation/multi-parameter-estimation/data/2025-06-03--16h-37m-20s/corrected_estimators.csv
Saved estimators for 2025-06-03--16h-39m-05s to /home/jh115/Heriot-Watt University Team Dropbox/RES_EPS_EMQL/projects/multi-parameter-estimation/multi-parameter-estimation/data/2025-06-03--16h-39m-05s/corrected_estimators.csv
Saved estimators for 2025-06-03--16h-40m-50s to /home/jh115/Heriot-Watt University Team Dropbox/