# Pipeline

In [None]:
import pandas as pd
import numpy as np

# Seeded generator shared by every draw below, so the mock data is reproducible
rng = np.random.default_rng(123)

# Number of rows in each of the three mock catalogs
n1 = 100_000
n2 = 200_000
n3 = 300_000
# For a quick test with tiny catalogs use instead: n1, n2, n3 = 30, 40, 50

def random_coords(n):
    """Draw ``n`` random RA/DEC positions from the module-level ``rng``.

    RA is sampled uniformly between 0 and 360 and DEC uniformly between
    -90 and 90.  RA is drawn before DEC, which fixes the order in which the
    shared generator is consumed.

    Returns:
        A ``(ra, dec)`` tuple of two arrays, each holding ``n`` values.
    """
    ra_values = rng.uniform(0, 360, n)
    dec_values = rng.uniform(-90, 90, n)
    return ra_values, dec_values

def add_noise(ra, dec):
    """Shift a position by a small random offset to mimic measurement error.

    One offset per axis is drawn from the module-level ``rng``, uniformly
    within +/-0.00007 (about 0.25 arcsec when the coordinates are in
    degrees).  The RA offset is drawn first, then the DEC offset.  Because
    a single scalar offset is drawn per axis, array inputs are shifted
    rigidly by the same amount.

    The result is kept inside the valid coordinate ranges: RA is wrapped
    modulo 360 and DEC is clamped to [-90, 90].  Without this, an input
    sitting at a range boundary could produce a declination beyond the pole,
    which coordinate libraries reject as an invalid latitude.  Inputs whose
    perturbed value is already in range are returned unchanged.

    Args:
        ra: Right ascension (scalar or array).
        dec: Declination (scalar or array).

    Returns:
        A ``(ra, dec)`` tuple with the perturbed coordinates.
    """
    delta_ra = rng.uniform(-0.00007, 0.00007)
    delta_dec = rng.uniform(-0.00007, 0.00007)
    # RA is periodic: a value nudged past either end re-enters at the other.
    noisy_ra = (ra + delta_ra) % 360.0
    # DEC is not periodic: stop at the pole instead of overshooting it.
    noisy_dec = np.clip(dec + delta_dec, -90.0, 90.0)
    return noisy_ra, noisy_dec

# NOTE: every statement in this section draws from the shared seeded `rng`,
# so the order of the statements determines the generated values.

# --- 1. Objects common to all three catalogs ---
# Five reference positions and redshifts. Each tuple in flags_common3 holds the
# z_flag used for that object in (catalog 1, catalog 2, catalog 3).
ra_common3, dec_common3 = random_coords(5)
z_common3 = rng.uniform(0.1, 1.0, 5)
flags_common3 = [(4,3,3), (4,4,3), (3,4,3), (3,4,4), (3,3,4)]  # Different z_flags in each catalog

# --- 2. Objects shared only between catalog 1 and 2 ---
# Five objects; each flag tuple is (z_flag in catalog 1, z_flag in catalog 2).
ra_common12_extra, dec_common12_extra = random_coords(5)
z_common12_extra = rng.uniform(0.1, 1.0, 5)
flags_common12_extra = [(3,4), (4,3), (3,3), (4,4), (3,4)]

# --- 3. Objects shared only between catalog 2 and 3 ---
# Two objects; each flag tuple is (z_flag in catalog 2, z_flag in catalog 3).
ra_common23_extra, dec_common23_extra = random_coords(2)
z_common23_extra = rng.uniform(0.1, 1.0, 2)
flags_common23_extra = [(3,3), (4,3)]

# --- 4. Objects shared only between catalog 1 and 3 ---
# Three objects; each flag tuple is (z_flag in catalog 1, z_flag in catalog 3).
ra_common13_extra, dec_common13_extra = random_coords(3)
z_common13_extra = rng.uniform(0.1, 1.0, 3)
flags_common13_extra = [(3,4), (3,3), (4,3)]

def gen_ids(prefix, start, count):
    """Build ``count`` consecutive catalog IDs beginning at number ``start``.

    Each ID has the form ``<prefix>_id_<number>``, with the number
    zero-padded to at least seven digits.

    Returns:
        A list of ``count`` ID strings (empty when ``count`` is not positive).
    """
    ids = []
    for number in range(start, start + count):
        ids.append("{}_id_{:07d}".format(prefix, number))
    return ids

# One list of (id, ra, dec, z, z_flag) tuples per catalog
rows1, rows2, rows3 = [], [], []

# Objects present in all three catalogs: catalog 1 keeps the reference
# position, catalogs 2 and 3 each get their own perturbed copy, a slightly
# shifted redshift and their own z_flag.
for i, (ra, dec, z_ref, (flag1, flag2, flag3)) in enumerate(
    zip(ra_common3, dec_common3, z_common3, flags_common3)
):
    rows1.append((f"c1_id_common3_{i}", ra, dec, z_ref, flag1))
    ra2, dec2 = add_noise(ra, dec)
    rows2.append((f"c2_id_common3_{i}", ra2, dec2, z_ref + 0.0001, flag2))
    ra3, dec3 = add_noise(ra, dec)
    rows3.append((f"c3_id_common3_{i}", ra3, dec3, z_ref + 0.0002, flag3))

# Objects present only in catalogs 1 and 2 (catalog 2 holds the perturbed copy)
for i, (ra, dec, z_ref, (flag1, flag2)) in enumerate(
    zip(ra_common12_extra, dec_common12_extra, z_common12_extra, flags_common12_extra)
):
    rows1.append((f"c1_id_common12_{i}", ra, dec, z_ref, flag1))
    ra2, dec2 = add_noise(ra, dec)
    rows2.append((f"c2_id_common12_{i}", ra2, dec2, z_ref + 0.0001, flag2))

# Objects present only in catalogs 2 and 3 (catalog 3 holds the perturbed copy)
for i, (ra, dec, z_ref, (flag2, flag3)) in enumerate(
    zip(ra_common23_extra, dec_common23_extra, z_common23_extra, flags_common23_extra)
):
    rows2.append((f"c2_id_common23_{i}", ra, dec, z_ref, flag2))
    ra3, dec3 = add_noise(ra, dec)
    rows3.append((f"c3_id_common23_{i}", ra3, dec3, z_ref + 0.0001, flag3))

# Objects present only in catalogs 1 and 3 (catalog 3 holds the perturbed copy)
for i, (ra, dec, z_ref, (flag1, flag3)) in enumerate(
    zip(ra_common13_extra, dec_common13_extra, z_common13_extra, flags_common13_extra)
):
    rows1.append((f"c1_id_common13_{i}", ra, dec, z_ref, flag1))
    ra3, dec3 = add_noise(ra, dec)
    rows3.append((f"c3_id_common13_{i}", ra3, dec3, z_ref + 0.0001, flag3))

# Number of hand-placed rows per catalog; used below as the first running
# number for the randomly generated IDs.
count1, count2, count3 = len(rows1), len(rows2), len(rows3)

def fill_random(rows, target, prefix, start_idx):
    """Pad ``rows`` in place with random objects until it has ``target`` rows.

    Each new row is an ``(id, ra, dec, z, z_flag)`` tuple: a random position
    from ``random_coords``, a redshift drawn uniformly between 0.1 and 1.0,
    a z_flag picked from {3, 4}, and an ID from ``gen_ids`` numbered from
    ``start_idx``.  Positions, redshifts and flags are drawn from the shared
    ``rng`` in that order.

    Args:
        rows: List of row tuples to extend (mutated in place).
        target: Desired final number of rows.
        prefix: Catalog prefix passed to ``gen_ids``.
        start_idx: First running number for the generated IDs.
    """
    missing = target - len(rows)
    ra_vals, dec_vals = random_coords(missing)
    z_vals = rng.uniform(0.1, 1.0, missing)
    flag_vals = rng.choice([3, 4], size=missing)
    new_ids = gen_ids(prefix, start_idx, missing)
    rows.extend(zip(new_ids, ra_vals, dec_vals, z_vals, flag_vals))

# Pad each catalog up to its target size (call order fixes the random stream)
fill_random(rows1, n1, "c1", count1)
fill_random(rows2, n2, "c2", count2)
fill_random(rows3, n3, "c3", count3)

# Turn the row lists into DataFrames sharing one column layout
catalog_columns = ["id", "ra", "dec", "z", "z_flag"]
df1 = pd.DataFrame(rows1, columns=catalog_columns)
df2 = pd.DataFrame(rows2, columns=catalog_columns)
df3 = pd.DataFrame(rows3, columns=catalog_columns)

# Write each catalog to its own Parquet file, without the index
for catalog_df, out_file in (
    (df1, "mock-specz-1.parquet"),
    (df2, "mock-specz-2.parquet"),
    (df3, "mock-specz-3.parquet"),
):
    catalog_df.to_parquet(out_file, index=False)

# Inspection

In [None]:
from astropy.coordinates import SkyCoord
import astropy.units as u
import pandas as pd

# --- 1. Read the three mock catalogs back from disk ---
df1, df2, df3 = (
    pd.read_parquet(catalog_file)
    for catalog_file in (
        "mock-specz-1.parquet",
        "mock-specz-2.parquet",
        "mock-specz-3.parquet",
    )
)

# --- 2. Wrap RA/DEC (degrees) in SkyCoord objects for spherical matching ---
coords1, coords2, coords3 = (
    SkyCoord(ra=catalog["ra"].values * u.deg, dec=catalog["dec"].values * u.deg)
    for catalog in (df1, df2, df3)
)

# --- 3. Crossmatch helper for two coordinate lists ---
def crossmatch(coords_a, coords_b, max_sep=1 * u.arcsec):
    """Find every pair of positions from A and B closer than ``max_sep``.

    ``coords_a.search_around_sky(coords_b, ...)`` reports the indices into
    its argument (``coords_b``) first and the indices into ``coords_a``
    second, so the two arrays are swapped before returning.

    Args:
        coords_a: Coordinates of the first catalog.
        coords_b: Coordinates of the second catalog.
        max_sep: Maximum on-sky separation for a pair (default 1 arcsec).

    Returns:
        ``(index_in_a, index_in_b)``: two parallel index arrays, one entry
        per matched pair.
    """
    into_b, into_a, _sep2d, _dist3d = coords_a.search_around_sky(coords_b, max_sep)
    return into_a, into_b

# Pairwise crossmatches: each table lists the IDs of both members of a pair.

# Catalog 1 against catalog 2
idx1_12, idx2_12 = crossmatch(coords1, coords2)
match12 = pd.DataFrame(
    {
        "id1": df1["id"].iloc[idx1_12].values,
        "id2": df2["id"].iloc[idx2_12].values,
    }
)

# Catalog 2 against catalog 3
idx2_23, idx3_23 = crossmatch(coords2, coords3)
match23 = pd.DataFrame(
    {
        "id2": df2["id"].iloc[idx2_23].values,
        "id3": df3["id"].iloc[idx3_23].values,
    }
)

# Catalog 1 against catalog 3
idx1_13, idx3_13 = crossmatch(coords1, coords3)
match13 = pd.DataFrame(
    {
        "id1": df1["id"].iloc[idx1_13].values,
        "id3": df3["id"].iloc[idx3_13].values,
    }
)

# --- 4. Triple matches: objects linked across all three catalogs ---
# Chain the 1–2 and 2–3 pairs through their shared catalog-2 ID (inner join) ...
match12_23 = pd.merge(left=match12, right=match23, on="id2")
# ... and keep only chains whose ends are themselves a direct 1–3 pair
match123 = pd.merge(left=match12_23, right=match13, on=["id1", "id3"])

# --- 5. Summary counts ---
print(f"Match 1–2: {len(match12)} pairs")
print(f"Match 2–3: {len(match23)} pairs")
print(f"Match 1–3: {len(match13)} pairs")
print(f"Match 1–2–3: {len(match123)} triplets")

# --- 6. Validate matched IDs against expected test objects (catalog 1 perspective) ---
# Expected IDs for 1–2 matches (5 shared in all three + 5 exclusive to 1–2)
expected_12_ids = [f"c1_id_common3_{i}" for i in range(5)] + [f"c1_id_common12_{i}" for i in range(5)]
expected_12_ids_set = set(expected_12_ids)

# Extract unique IDs from match12
matched_1_ids = set(match12["id1"].values)

# Identify unexpected IDs (false positives)
extra_1_ids = matched_1_ids - expected_12_ids_set
if extra_1_ids:
    print(f"⚠️ Unexpected objects in match 1–2 (id1): {extra_1_ids}")

# Identify expected IDs that were not matched (false negatives); without this
# check a crossmatch that silently drops planted pairs would go unnoticed.
missing_1_ids = expected_12_ids_set - matched_1_ids
if missing_1_ids:
    print(f"⚠️ Expected objects missing from match 1–2 (id1): {sorted(missing_1_ids)}")

# --- 7. Annotate which matches are expected ---
match12["is_expected"] = match12["id1"].isin(expected_12_ids_set)

# --- 8. Attach the full catalog rows to every 1–2 pair for inspection ---
# Catalog 1 is joined first; the second join adds catalog 2 and tags the
# columns the two catalogs share with _cat1 / _cat2.
df_inspect_12 = (
    match12
    .merge(df1, left_on="id1", right_on="id", suffixes=("", "_cat1"))
    .merge(df2, left_on="id2", right_on="id", suffixes=("_cat1", "_cat2"))
)

# --- 9. Keep only the columns needed for a side-by-side comparison ---
inspect_columns = [
    "id1", "ra_cat1", "dec_cat1", "z_cat1", "z_flag_cat1",
    "id2", "ra_cat2", "dec_cat2", "z_cat2", "z_flag_cat2",
    "is_expected",
]
df_inspect_12 = df_inspect_12[inspect_columns]

# --- 10. Sort so that unexpected matches (is_expected == False) come first ---
pd.set_option("display.max_rows", None)  # let the notebook show every row
df_inspect_12_sorted = df_inspect_12.sort_values(by="is_expected")

# Notebook output: the sorted inspection table
df_inspect_12_sorted