# Generating mock data for CRC

In [None]:
import pandas as pd
import numpy as np
import os

# Create the output folder up front; this is a no-op when it already exists.
os.makedirs(name="data", exist_ok=True)

# One seeded generator is shared by every helper below, so the mock catalogs
# are reproducible as long as the order of random draws does not change.
rng = np.random.default_rng(42)

# =============================
# Helper functions
# =============================

def jitter_coords(base_ra, base_dec, n, jitter=0.0001):
    """Return randomly perturbed copies of a set of base coordinates.

    Each axis receives ``n`` independent offsets drawn uniformly from
    ``[-jitter, jitter)`` with the module-level ``rng``; the offsets are
    added directly to ``base_ra`` and ``base_dec``.

    Returns a ``(ra, dec)`` tuple.
    """
    # RA offsets are drawn before Dec offsets; keep this order so the seeded
    # random stream yields the same catalogs.
    ra_offset = rng.uniform(-jitter, jitter, size=n)
    dec_offset = rng.uniform(-jitter, jitter, size=n)
    return base_ra + ra_offset, base_dec + dec_offset

def generate_unique_sources(n, ra_range=(0, 360), dec_range=(-30, 30)):
    """Draw ``n`` random positions, uniform in RA and in Dec.

    ``ra_range`` and ``dec_range`` are ``(low, high)`` bounds passed to the
    module-level ``rng``. Returns a ``(ra, dec)`` tuple of length-``n`` arrays.
    """
    ra_low, ra_high = ra_range
    dec_low, dec_high = dec_range
    # RA is sampled first, then Dec (order matters for reproducibility).
    ra = rng.uniform(ra_low, ra_high, size=n)
    dec = rng.uniform(dec_low, dec_high, size=n)
    return ra, dec

def generate_redshifts(n):
    """Return ``n`` redshifts drawn uniformly from ``[0.01, 1.0)``."""
    z_min, z_max = 0.01, 1.0
    return rng.uniform(z_min, z_max, size=n)

def generate_redshifts_with_jitter(base_z, low_jitter=0.0003, high_jitter=0.005, frac_low=0.8):
    """Perturb ``base_z`` with a mix of small and large uniform offsets.

    Each element independently gets a small offset (uniform within
    ``±low_jitter``) with probability ``frac_low``; otherwise it gets a large
    offset (uniform within ``±high_jitter``). Returns ``base_z`` plus the
    chosen offsets.
    """
    count = len(base_z)
    # Draw order (selector, small offsets, large offsets) is fixed so the
    # seeded stream stays reproducible. Both offset arrays are always drawn,
    # even though only one value per element is used.
    use_small = rng.random(count) < frac_low
    small_offsets = rng.uniform(-low_jitter, low_jitter, count)
    large_offsets = rng.uniform(-high_jitter, high_jitter, count)
    return base_z + np.where(use_small, small_offsets, large_offsets)

def generate_primus_flags(n):
    """Return ``n`` random quality flags for the PRIMUS-style mock.

    Distribution: 60% flag 4, 30% flag 3, and 10% split evenly between
    flag 2 and flag -1.
    """
    flag_values = [4, 3, 2, -1]
    flag_probs = [0.60, 0.30, 0.05, 0.05]  # 2 + (-1) = 10%
    return rng.choice(flag_values, size=n, p=flag_probs)

def generate_2dflens_flags(n):
    """Return ``n`` random quality flags for the 2dFLenS-style mock.

    Distribution: 60% flag 4, 30% flag 3, 8% split evenly between flags 2
    and 1, and 2% flag 6.
    """
    flag_values = [4, 3, 2, 1, 6]
    flag_probs = [0.60, 0.30, 0.04, 0.04, 0.02]
    return rng.choice(flag_values, size=n, p=flag_probs)

def generate_2mrs_zerr(n):
    """Return ``n`` redshift errors drawn uniformly from ``[1e-5, 1e-3)``."""
    err_min, err_max = 0.00001, 0.001
    return rng.uniform(err_min, err_max, size=n)

def build_catalog(id_prefix, ra, dec, z, suffix, z_flag=None, z_err=None, survey=None):
    """Assemble one mock catalog as a DataFrame.

    Every column name ends in ``_{suffix}``. The frame always has ``id``,
    ``ra``, ``dec``, ``z``, ``survey`` and ``random`` columns; ``z_flag`` and
    ``z_err`` columns are appended only when the matching argument is given.

    IDs are ``"{id_prefix}_{row}"`` with ``row`` counting from 0, ``survey``
    is repeated on every row, and ``random`` is a fresh uniform draw in
    ``[0, 1)`` from the module-level ``rng``.
    """
    n_rows = len(ra)
    row_ids = [f"{id_prefix}_{row}" for row in range(n_rows)]

    columns = {
        f"id_{suffix}": row_ids,
        f"ra_{suffix}": ra,
        f"dec_{suffix}": dec,
        f"z_{suffix}": z,
        f"survey_{suffix}": survey,
        f"random_{suffix}": rng.uniform(0, 1, n_rows),
    }

    # Optional columns go after the mandatory ones, flag before error.
    optional_columns = (
        (f"z_flag_{suffix}", z_flag),
        (f"z_err_{suffix}", z_err),
    )
    for column_name, values in optional_columns:
        if values is not None:
            columns[column_name] = values

    return pd.DataFrame(columns)

def generate_exact_selfmatch_coords(n_pairs):
    """Return coordinates for ``n_pairs`` pairs of exactly coincident sources.

    ``n_pairs`` positions are drawn with ``generate_unique_sources`` and each
    is repeated twice in a row, so both outputs have length ``2 * n_pairs``
    and entries ``2k`` and ``2k + 1`` are identical.
    """
    unique_ra, unique_dec = generate_unique_sources(n_pairs)
    return np.repeat(unique_ra, 2), np.repeat(unique_dec, 2)

# =============================
# Generate matched & unique sources
# =============================

# Group sizes used below. The statement order in this section fixes the order
# of draws from the seeded generator, so do not reorder the calls.
N_TRIPLE = 10      # sources shared by all three catalogs
N_PAIR_12 = 20     # sources shared by mock1 and mock2 only
N_PAIR_23 = 30     # sources shared by mock2 and mock3 only
N_PAIR_13 = 40     # sources shared by mock1 and mock3 only
N_DUP_PAIRS = 25   # exact-duplicate pairs inside each catalog (×2 rows)
N_UNIQUE = 950     # unmatched sources per catalog

# Match among all 3 catalogs: one base position, jittered once per catalog
base_ra_123, base_dec_123 = generate_unique_sources(N_TRIPLE)
ra1_123, dec1_123 = jitter_coords(base_ra_123, base_dec_123, N_TRIPLE)
ra2_123, dec2_123 = jitter_coords(base_ra_123, base_dec_123, N_TRIPLE)
ra3_123, dec3_123 = jitter_coords(base_ra_123, base_dec_123, N_TRIPLE)

# Match between mock1 and mock2
base_ra_12, base_dec_12 = generate_unique_sources(N_PAIR_12)
ra1_12, dec1_12 = jitter_coords(base_ra_12, base_dec_12, N_PAIR_12)
ra2_12, dec2_12 = jitter_coords(base_ra_12, base_dec_12, N_PAIR_12)

# Match between mock2 and mock3
base_ra_23, base_dec_23 = generate_unique_sources(N_PAIR_23)
ra2_23, dec2_23 = jitter_coords(base_ra_23, base_dec_23, N_PAIR_23)
ra3_23, dec3_23 = jitter_coords(base_ra_23, base_dec_23, N_PAIR_23)

# Match between mock1 and mock3
base_ra_13, base_dec_13 = generate_unique_sources(N_PAIR_13)
ra1_13, dec1_13 = jitter_coords(base_ra_13, base_dec_13, N_PAIR_13)
ra3_13, dec3_13 = jitter_coords(base_ra_13, base_dec_13, N_PAIR_13)

# Internal duplicates (50 rows per catalog = 25 pairs × 2)
ra1_self, dec1_self = generate_exact_selfmatch_coords(N_DUP_PAIRS)
ra2_self, dec2_self = generate_exact_selfmatch_coords(N_DUP_PAIRS)
ra3_self, dec3_self = generate_exact_selfmatch_coords(N_DUP_PAIRS)

# Unique sources (950 per catalog)
ra1_unique, dec1_unique = generate_unique_sources(N_UNIQUE)
ra2_unique, dec2_unique = generate_unique_sources(N_UNIQUE)
ra3_unique, dec3_unique = generate_unique_sources(N_UNIQUE)

# =============================
# Generate base z for matched objects
# =============================
# One base redshift per shared source (and per duplicate pair); the catalog
# builders perturb or repeat these values.
z_123 = generate_redshifts(N_TRIPLE)
z_12 = generate_redshifts(N_PAIR_12)
z_23 = generate_redshifts(N_PAIR_23)
z_13 = generate_redshifts(N_PAIR_13)
z_self1 = generate_redshifts(N_DUP_PAIRS)
z_self2 = generate_redshifts(N_DUP_PAIRS)
z_self3 = generate_redshifts(N_DUP_PAIRS)

# =============================
# Build mock catalogs
# =============================

def make_mock1():
    """Build the mock1 catalog (suffix ``1``, survey ``PRIMUS``).

    Rows are laid out as: triple matches, mock1-mock2 pairs, mock1-mock3
    pairs, internal duplicates, then unique sources. Matched rows get a
    jittered copy of the shared base redshift, each duplicate pair shares
    one redshift, and the unique sources get independent redshifts. A
    ``z_flag`` column is drawn with ``generate_primus_flags``.
    """
    ra = np.concatenate([ra1_123, ra1_12, ra1_13, ra1_self, ra1_unique])
    dec = np.concatenate([dec1_123, dec1_12, dec1_13, dec1_self, dec1_unique])

    # Evaluated top to bottom; this order fixes the sequence of random draws.
    z_segments = [
        generate_redshifts_with_jitter(z_123),
        generate_redshifts_with_jitter(z_12),
        generate_redshifts_with_jitter(z_13),
        np.repeat(z_self1, 2),
        generate_redshifts(950),
    ]
    z = np.concatenate(z_segments)

    flags = generate_primus_flags(len(ra))
    return build_catalog("mock1", ra, dec, z, suffix="1", z_flag=flags, survey="PRIMUS")

def make_mock2():
    """Build the mock2 catalog (suffix ``2``, survey ``2DFLENS``).

    Rows are laid out as: triple matches, mock1-mock2 pairs, mock2-mock3
    pairs, internal duplicates, then unique sources. Matched rows get a
    jittered copy of the shared base redshift, each duplicate pair shares
    one redshift, and the unique sources get independent redshifts. A
    ``z_flag`` column is drawn with ``generate_2dflens_flags``.
    """
    ra = np.concatenate([ra2_123, ra2_12, ra2_23, ra2_self, ra2_unique])
    dec = np.concatenate([dec2_123, dec2_12, dec2_23, dec2_self, dec2_unique])

    # Evaluated top to bottom; this order fixes the sequence of random draws.
    z_segments = [
        generate_redshifts_with_jitter(z_123),
        generate_redshifts_with_jitter(z_12),
        generate_redshifts_with_jitter(z_23),
        np.repeat(z_self2, 2),
        generate_redshifts(950),
    ]
    z = np.concatenate(z_segments)

    flags = generate_2dflens_flags(len(ra))
    return build_catalog("mock2", ra, dec, z, suffix="2", z_flag=flags, survey="2DFLENS")

def make_mock3():
    """Build the mock3 catalog (suffix ``3``, survey ``2MRS``).

    Rows are laid out as: triple matches, mock2-mock3 pairs, mock1-mock3
    pairs, internal duplicates, then unique sources. Matched rows get a
    jittered copy of the shared base redshift, each duplicate pair shares
    one redshift, and the unique sources get independent redshifts. Instead
    of quality flags, this catalog carries a ``z_err`` column drawn with
    ``generate_2mrs_zerr``.
    """
    ra = np.concatenate([ra3_123, ra3_23, ra3_13, ra3_self, ra3_unique])
    dec = np.concatenate([dec3_123, dec3_23, dec3_13, dec3_self, dec3_unique])

    # Evaluated top to bottom; this order fixes the sequence of random draws.
    z_segments = [
        generate_redshifts_with_jitter(z_123),
        generate_redshifts_with_jitter(z_23),
        generate_redshifts_with_jitter(z_13),
        np.repeat(z_self3, 2),
        generate_redshifts(950),
    ]
    z = np.concatenate(z_segments)

    errors = generate_2mrs_zerr(len(ra))
    return build_catalog("mock3", ra, dec, z, suffix="3", z_err=errors, survey="2MRS")

# =============================
# Save to Parquet
# =============================
# Build all three catalogs first (in this order, so the random stream is
# consumed exactly as before), then write each one out.
mock1, mock2, mock3 = make_mock1(), make_mock2(), make_mock3()

for catalog, parquet_path in (
    (mock1, "data/mock1.parquet"),
    (mock2, "data/mock2.parquet"),
    (mock3, "data/mock3.parquet"),
):
    # index=False: the row index carries no information worth persisting.
    catalog.to_parquet(parquet_path, index=False)

# Optional preview
mock1.head(2), mock2.head(2), mock3.head(2)

# Validation

In [None]:
import pandas as pd
from astropy.coordinates import SkyCoord
from astropy import units as u

# =============================
# 1. Load catalogs
# =============================
mock1 = pd.read_parquet("data/mock1.parquet")
mock2 = pd.read_parquet("data/mock2.parquet")
mock3 = pd.read_parquet("data/mock3.parquet")

# =============================
# 2. Create SkyCoord objects
# =============================
coords1 = SkyCoord(ra=mock1["ra_1"].values * u.deg, dec=mock1["dec_1"].values * u.deg)
coords2 = SkyCoord(ra=mock2["ra_2"].values * u.deg, dec=mock2["dec_2"].values * u.deg)
coords3 = SkyCoord(ra=mock3["ra_3"].values * u.deg, dec=mock3["dec_3"].values * u.deg)

# =============================
# 3. Crossmatching between catalogs
# =============================
# Each match_to_catalog_sky call returns, for every row of the *source*
# catalog, its nearest neighbour in the target catalog. The resulting boolean
# masks therefore have one entry per source row and may only be combined with
# masks built from the same source catalog.
radius = 1.0 / 3600  # 1 arcsecond in degrees

# mock1 → mock2
idx1_2, sep1_2, _ = coords1.match_to_catalog_sky(coords2)
matches12 = sep1_2.deg < radius

# mock2 → mock3
idx2_3, sep2_3, _ = coords2.match_to_catalog_sky(coords3)
matches23 = sep2_3.deg < radius

# mock1 → mock3
idx1_3, sep1_3, _ = coords1.match_to_catalog_sky(coords3)
matches13 = sep1_3.deg < radius

# For exclusive matches: avoid shape mismatch
# mock2 → mock1 (reverse of matches12)
idx2_1, sep2_1, _ = coords2.match_to_catalog_sky(coords1)
reverse_matches21 = sep2_1.deg < radius

# mock3 → mock1 and mock3 → mock2: needed to classify mock3 rows, since the
# masks above are indexed by mock1 or mock2 rows.
idx3_1, sep3_1, _ = coords3.match_to_catalog_sky(coords1)
reverse_matches31 = sep3_1.deg < radius

idx3_2, sep3_2, _ = coords3.match_to_catalog_sky(coords2)
reverse_matches32 = sep3_2.deg < radius

# =============================
# 4. Match classification
# =============================

# Triple match: objects in mock1 matching both mock2 and mock3
triplet = matches12 & matches13

# Exclusive pairs (not matched with the third catalog)
pair_12_only = matches12 & ~matches13           # indexed by mock1 rows
pair_23_only = matches23 & ~reverse_matches21   # indexed by mock2 rows
pair_13_only = matches13 & ~matches12           # indexed by mock1 rows

# =============================
# 5. Internal duplicates (identical RA/DEC within the same catalog)
# =============================
# keep=False flags every member of a duplicated group, not just the repeats.
internal1 = mock1.duplicated(subset=["ra_1", "dec_1"], keep=False)
internal2 = mock2.duplicated(subset=["ra_2", "dec_2"], keep=False)
internal3 = mock3.duplicated(subset=["ra_3", "dec_3"], keep=False)

# =============================
# 6. Unmatched objects (no external match)
# =============================
no_match1 = ~matches12 & ~matches13
no_match2 = ~matches23 & ~reverse_matches21
# Built from mock3-indexed masks; reusing the mock2-indexed masks here would
# just repeat no_match2 and say nothing about mock3.
no_match3 = ~reverse_matches31 & ~reverse_matches32

# =============================
# 7. Print results
# =============================
print("🔗 Triple match (1 ∩ 2 ∩ 3):", triplet.sum())
print("🔗 Pair match only 1 ∩ 2:", pair_12_only.sum())
print("🔗 Pair match only 2 ∩ 3:", pair_23_only.sum())
print("🔗 Pair match only 1 ∩ 3:", pair_13_only.sum())
print()
print("♻️ Internal duplicates:")
print(" - mock1:", internal1.sum())
print(" - mock2:", internal2.sum())
print(" - mock3:", internal3.sum())
print()
print("🧩 Objects with no external match:")
print(" - mock1:", no_match1.sum())
print(" - mock2:", no_match2.sum())
print(" - mock3:", no_match3.sum())

# Spatial Distribution

In [None]:
import matplotlib.pyplot as plt

# Overlay the three catalogs on one set of axes. Each series is labelled so
# the catalogs can be told apart in the legend.
plt.scatter(mock1["ra_1"], mock1["dec_1"], label="mock1")
plt.scatter(mock2["ra_2"], mock2["dec_2"], label="mock2")
plt.scatter(mock3["ra_3"], mock3["dec_3"], label="mock3")
plt.xlabel("RA (deg)")
plt.ylabel("Dec (deg)")
plt.legend()
plt.show()