# Imports

In [None]:
import os
import glob
import warnings
import pyarrow.parquet as pq

from IPython.display import display

import numpy as np
import pandas as pd

import getpass
import os

# Login name of the current user; only referenced by the per-user scratch
# paths that are listed (currently commented out) in the configuration cell.
user = getpass.getuser()

# Configuration

In [None]:
# =========================================================
# CONFIGURATION SECTION
# =========================================================
# Explicit list of input catalogs, kept for reference. It is currently
# disabled in favour of the glob-based assignment below.
#input_paths = [
#    f"/scratch/users/{user}/speczs-catalogs/processed/2dfgrs_final_release.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/2dflens_final_release.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/2mrs_v240.parquet",
#    f"/scratch/users/luigi.silva/speczs-catalogs/processed/3dhst_v4.1.5.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/6dfgs_dr3.parquet",
#    f"/scratch/users/luigi.silva/speczs-catalogs/processed/astrodeep_jwst.parquet",
#    f"/scratch/users/luigi.silva/speczs-catalogs/processed/astrodeep-gs43.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/desi_dr1_in_lsst_dp1_fields.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/jades_dr3.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/mosdef_final_release.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/ozdes_dr2.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/primus_dr1.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/vandels_dr4.parquet",
#    f"/scratch/users/luigi.silva/speczs-catalogs/processed/vlt_vimos_v2.0.1.parquet",
#    f"/scratch/users/luigi.silva/speczs-catalogs/processed/vuds_dr1.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/vvds_final_release.parquet",
#    f"/scratch/users/luigi.silva/speczs-catalogs/johns-catalogs/z_cat_CANDELS_clean_sitcomtn-154.parquet",
#    f"/scratch/users/luigi.silva/speczs-catalogs/johns-catalogs/z_cat_NED_clean_sitcomtn-154.parquet",
#]

# Input catalogs: every parquet file directly under test_data/ (resolved
# relative to the current working directory).
input_paths = glob.glob('test_data/*.parquet')

# Final merged catalog, and the directory that is later searched for the
# intermediate "prepared*" parquet files. Plain string literals: there is
# nothing to interpolate, so no f-prefix is needed.
final_catalog_path = "./process001/outputs/crd.parquet"
prepared_temp_dir = "./process001/temp/"

# Options: "concatenate", "concatenate_and_mark_duplicates", or
# "concatenate_and_remove_duplicates"
combine_mode = "concatenate_and_mark_duplicates"

# Validation

## Basic Info

Counting input and output rows.

In [None]:
# =========================================================
# COUNT INPUT ROWS
# =========================================================
# Row counts are taken from each parquet file's metadata instead of loading
# the tables into memory.
total_input_rows = 0
for path in input_paths:
    if not os.path.exists(path):
        # A missing input is reported but does not abort the count.
        warnings.warn(f"⚠️ File not found: {path}")
        continue

    parquet_file = pq.ParquetFile(path)
    n_rows = parquet_file.metadata.num_rows
    print(f"{path} -> {n_rows} rows")
    total_input_rows += n_rows

print(f"✅ Total number of input rows: {total_input_rows}")

# =========================================================
# LOAD FINAL MERGED CATALOG
# =========================================================
# Everything below needs the final catalog, so its absence is a hard error.
if not os.path.exists(final_catalog_path):
    raise FileNotFoundError(f"❌ Final catalog not found: {final_catalog_path}")

df_final = pd.read_parquet(final_catalog_path)
print(f"✅ Total number of rows in final catalog: {len(df_final)}")

Printing the dataframe.

In [None]:
# Bare expression as the last statement of the cell: the notebook renders it.
df_final

Dataframe columns.

In [None]:
# Column labels of the final catalog.
df_final.columns

DataFrame column types.

In [None]:
# Data type of each column of the final catalog.
df_final.dtypes

Basic statistics.

In [None]:
# Summary statistics from DataFrame.describe() with its default arguments.
df_final.describe()

Counting tie_result values.

In [None]:
# Skipped in plain "concatenate" mode.
# NOTE(review): presumably no tie_result column is produced in that mode — confirm.
if combine_mode != "concatenate":
    print(df_final["tie_result"].value_counts())

Counting source values.

In [None]:
# Number of rows per distinct value of the "source" column.
df_final["source"].value_counts()

Checking the percentage of unresolved objects (rows with `tie_result == 2`).

In [None]:
if combine_mode != "concatenate":
    # Size of the whole final catalog
    total_all = len(df_final)

    # Rows that took part in a comparison: compared_to is neither null nor ""
    mask_compared = df_final["compared_to"].notna() & (df_final["compared_to"] != "")
    df_compared = df_final[mask_compared]

    # Rows with tie_result == 2, over the whole catalog and over the compared subset
    count_tie2 = (df_final["tie_result"] == 2).sum()
    count_tie2_compared = (df_compared["tie_result"] == 2).sum()

    # Percentages, falling back to 0 when the denominator is empty
    if total_all > 0:
        percent_all = (count_tie2 / total_all) * 100
    else:
        percent_all = 0

    if len(df_compared) > 0:
        percent_compared = (count_tie2_compared / len(df_compared)) * 100
    else:
        percent_compared = 0

    # Report
    print("📊 tie_result == 2 represents:")
    print(f"  • {percent_all:.2f}% of the total ({count_tie2} out of {total_all})")
    print(f"  • {percent_compared:.2f}% of the compared objects ({count_tie2_compared} out of {len(df_compared)})")

## Individual Catalogs Deduplication Validation

In [None]:
# When the pipeline-generated sample is one of the inputs, leave its rows
# (source "019_pipeline_sample") out of the validation subset; otherwise the
# whole final catalog is validated.
df_final_val = (
    df_final[df_final["source"] != "019_pipeline_sample"]
    if "test_data/pipeline_generated_sample.parquet" in input_paths
    else df_final
)

In [None]:
# IPython autoreload in mode 2: modules are reloaded before each cell runs, so
# edits to validation_functions are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project-local validation helpers used by the cells that follow.
import validation_functions as vf

In [None]:
# Intra-source validation on the validation subset, grouped by the "source" column.
# NOTE(review): ndp=4 presumably is the number of decimal places used to bin coordinates — confirm in validation_functions.
res = vf.validate_intra_source_cells_fast(df_final_val, ndp=4, source_col="source")

In [None]:
# Report on the intra-source result; ndp_used is kept equal to the ndp=4
# passed to the validation call that produced `res`.
vf.explain_intra_source_validation_output(
    res,
    top_k=5,
    df_original=df_final_val,   
    ndp_used=4,
    samples_per_source=2,   
    max_sources=5,         
    source_col="source",
)

## Cross Catalog Deduplication Validation

In [None]:
# Cross-catalog tie validation on the validation subset.
# NOTE(review): threshold=0.0005 presumably is a redshift tolerance and max_groups caps the groups inspected — confirm in validation_functions.
report = vf.validate_tie_results_fast(df_final_val, threshold=0.0005, max_groups=20000, include_rows=True)

In [None]:
# Report on `report`.
# NOTE(review): show_per_rule=3 presumably limits the examples shown per rule — confirm.
vf.explain_tie_validation_output(report, show_per_rule=3)

## Non-compared Validation

In [None]:
# Validation of rows that were not compared to anything.
# NOTE(review): assert_if_invalid=False presumably makes this report-only instead of raising — confirm.
result = vf.render_na_compared_to_validation(df_final_val, show_max=10, assert_if_invalid=False)

## Own Pipeline Product Validation

In [None]:
# The pipeline-generated sample exactly as stored in test_data/ ...
df_original = pd.read_parquet("test_data/pipeline_generated_sample.parquet")
# ... and the rows of the final catalog labelled with that sample's source name.
df_processed = df_final[df_final["source"] == "019_pipeline_sample"]

In [None]:
# Compare the sample before and after processing, matching rows on CRD_ID.
# NOTE(review): presumably checks that tie information already present in the sample survives the pipeline — confirm.
res = vf.validate_tie_preservation(df_original, df_processed, key="CRD_ID")
print(vf.explain_tie_preservation(res))

## Manual Validation (Optional)

In [None]:
#results = vf.analyze_groups_by_compared_to(
#    df_final,
#    threshold=0.0005,
#    max_groups=5000,
#    max_examples_per_case=4,
#    render=True,
#)

## Validation - Prepared Catalogs

In [None]:
# Rule dictionary equivalent to the YAML configuration.
# Keyed by survey name. Each regular entry holds two translations:
#   - "z_flag_translation": how the survey's own z_flag maps to the homogenized flag
#   - "instrument_type_translation": the homogenized instrument type
# A translation is either a plain {original: translated} dict (optionally with
# a "default" key) or a dict with "conditions" (an ordered list of
# {"expr", "value"} pairs, where the first expression that evaluates true wins)
# plus an optional "default".
translation_rules = {
    "2DFGRS": {
        "z_flag_translation": {1: 0, 2: 1, 3: 3, 4: 4, 5: 4},
        "instrument_type_translation": {"default": "s"},
    },
    "2DFLENS": {
        "z_flag_translation": {1: 0, 2: 1, 3: 3, 4: 4, 6: 6},
        "instrument_type_translation": {"default": "s"},
    },
    "2MRS": {
        "z_flag_translation": {
            "conditions": [
                {"expr": "z_err == 0", "value": 3},
                {"expr": "0 < z_err < 0.0005", "value": 4},
                {"expr": "z_err >= 0.0005", "value": 3},
            ],
            "default": 0,
        },
        "instrument_type_translation": {"default": "s"},
    },
    "3D-HST": {
        "z_flag_translation": {
            "conditions": [
                {"expr": "z_best_s == 0", "value": 6},
                {"expr": "z_best_s == 1 and z_spec != -1", "value": 4},
                {"expr": "z_best_s == 2 and use_zgrism == 1 and flag1 == 0 and flag2 == 0", "value": 3},
                {"expr": "z_best_s == 3 and use_phot == 1", "value": 3},
            ],
            "default": 0,
        },
        "instrument_type_translation": {
            "conditions": [
                {"expr": "z_best_s == 1", "value": "s"},
                {"expr": "z_best_s == 2", "value": "g"},
                {"expr": "z_best_s == 3", "value": "p"},
            ],
            "default": "g",
        },
    },
    "6DFGS": {
        "z_flag_translation": {1: 0, 2: 1, 3: 3, 4: 4, 6: 6},
        "instrument_type_translation": {"default": "s"},
    },
    "ASTRODEEP": {
        "z_flag_translation": {
            "conditions": [
                {"expr": "zspec_survey != '-'", "value": 4},
                {"expr": "zspec_survey == '-'", "value": 3},
            ],
            "default": 0,
        },
        "instrument_type_translation": {
            "conditions": [
                {"expr": "zspec_survey != '-'", "value": "s"},
                {"expr": "zspec_survey == '-'", "value": "p"},
            ],
            "default": "p",
        },
    },
    "ASTRODEEP-JWST": {
        "z_flag_translation": {
            "conditions": [
                {"expr": "zspec != -99 and z_flag < 400 and (len(str(int(z_flag))) <= 1 or int(str(int(z_flag))[-2]) <= 3)", "value": 4},
                {"expr": "zspec == -99 and z_flag < 400 and (len(str(int(z_flag))) <= 1 or int(str(int(z_flag))[-2]) <= 3)", "value": 3},
            ],
            "default": 0,
        },
        "instrument_type_translation": {
            "conditions": [
                {"expr": "zspec != -99", "value": "s"},
                {"expr": "zspec == -99", "value": "p"},
            ],
            "default": "p",
        },
    },
    "DESI": {
        "z_flag_translation": {
            "conditions": [
                {"expr": "ZCAT_PRIMARY != True", "value": 0},
                {"expr": "z_flag != 0 and ZCAT_PRIMARY == True", "value": 1},
                {"expr": "z_flag == 0 and ZCAT_PRIMARY == True and z_err < 0.0005", "value": 4},
                {"expr": "z_flag == 0 and ZCAT_PRIMARY == True and z_err >= 0.0005", "value": 3},
            ],
            "default": 0,
        },
        "instrument_type_translation": {"default": "s"},
    },
    "JADES": {
        "z_flag_translation": {4: 4, 3: 3, 2: 2, 1: 1, 0: 0},
        "instrument_type_translation": {"default": "s"},
    },
    "MOSDEF": {
        "z_flag_translation": {7: 4, 6: 3, 5: 2, 4: 2, 3: 1, 2: 1, 1: 0, 0: 0},
        "instrument_type_translation": {"default": "s"},
    },
    "OZDES": {
        "z_flag_translation": {1: 0, 2: 1, 3: 3, 4: 4, 6: 6},
        "instrument_type_translation": {"default": "s"},
    },
    "PRIMUS": {
        "z_flag_translation": {-1: 0, 2: 1, 3: 2, 4: 3},
        "instrument_type_translation": {"default": "g"},
    },
    "VANDELS": {
        "z_flag_translation": {
            0: 0, 1: 1, 2: 2, 3: 4, 4: 4, 9: 3,
            10: 0, 11: 1, 12: 2, 13: 4, 14: 4, 19: 3,
            20: 0, 21: 1, 22: 2, 23: 4, 24: 4, 29: 3,
            210: 0, 211: 1, 212: 2, 213: 4, 214: 4, 219: 3,
        },
        "instrument_type_translation": {"default": "s"},
    },
    "VIMOS": {
        "z_flag_translation": {4: 4, 3: 3, 2: 2, 1: 1, 0: 0},
        "instrument_type_translation": {"default": "s"},
    },
    "VUDS": {
        "z_flag_translation": {
            1: 1, 11: 1, 21: 1, 31: 1, 41: 1,
            2: 2, 12: 2, 22: 2, 32: 2, 42: 2, 9: 2, 19: 2, 29: 2, 39: 2, 49: 2,
            3: 3, 13: 3, 23: 3, 33: 3, 43: 3,
            4: 4, 14: 4, 24: 4, 34: 4, 44: 4,
        },
        "instrument_type_translation": {"default": "s"},
    },
    "VVDS": {
        "z_flag_translation": {
            0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 9: 2,
            10: 0, 11: 1, 12: 2, 13: 3, 14: 4, 19: 2,
            20: 0, 21: 1, 22: 2, 23: 3, 24: 4, 29: 2,
            210: 0, 211: 1, 212: 2, 213: 3, 214: 4, 219: 2,
        },
        "instrument_type_translation": {"default": "s"},
    },

    # Special cases using continuous rule and inherited type
    "CANDELS": {"_special": "CANDELS_NED"},
    "NED": {"_special": "CANDELS_NED"},
}

def _safe_eval_expr(expr: str, ctx: dict) -> bool:
    """
    Avalia 'expr' usando apenas variáveis do ctx e funções básicas.
    Retorna True/False; se der erro, retorna False.
    """
    try:
        # Permitir apenas funções básicas e numpy
        allowed_globals = {
            "__builtins__": {"len": len, "int": int, "str": str, "float": float},
            "np": np,
        }
        return bool(eval(expr, allowed_globals, ctx))
    except Exception:
        return False

def _apply_translation(value_map, row_ctx):
    """
    value_map pode ser:
      - dict simples {orig: dest} (pode conter 'default')
      - dict com 'conditions' (lista de {expr, value}) e opcional 'default'
    Retorna (valor_traduzido, matched_bool)
    """
    if isinstance(value_map, dict) and "conditions" in value_map:
        for cond in value_map["conditions"]:
            expr = cond.get("expr", "")
            val = cond.get("value", np.nan)
            if expr and _safe_eval_expr(expr, row_ctx):
                return val, True
        # nenhum matched -> usa default se houver
        if "default" in value_map:
            return value_map["default"], True
        return np.nan, False

    # mapeamento direto (sem 'conditions'):
    if isinstance(value_map, dict):
        key = row_ctx.get("z_flag", np.nan)
        if key in value_map:
            return value_map[key], True
        # Se não houver chave correspondente, mas existir 'default', use-o
        if "default" in value_map:
            return value_map["default"], True
        return np.nan, False

    return np.nan, False

def validate_row(row):
    """
    Compute the expected homogenized values for one catalog row.

    row must offer .get() and .items() (a pandas Series or a dict). It is read
    for 'survey', 'z_flag', 'instrument_type' and whatever columns the
    survey's rule expressions refer to.

    Returns (z_expected, type_expected). Either element is np.nan when no
    expectation can be derived: unknown survey, missing rule, or a
    CANDELS/NED z_flag that is not a number in [0, 1].
    """
    survey = row.get("survey", None)

    # Special cases (CANDELS and NED): continuous 0..1 rule and inherited type
    if survey in ("CANDELS", "NED"):
        x = row.get("z_flag", np.nan)
        # NumPy scalars (e.g. np.float32, np.int64) are not all instances of
        # float/int, so they are accepted explicitly.
        is_number = isinstance(x, (float, int, np.floating, np.integer))

        if x == 0.0:
            z_expected = 0.0
        elif not is_number:
            z_expected = np.nan
        elif 0.0 < x < 0.7:
            z_expected = 1.0
        elif 0.7 <= x < 0.9:
            z_expected = 2.0
        elif 0.9 <= x < 0.99:
            z_expected = 3.0
        elif 0.99 <= x <= 1.0:
            z_expected = 4.0
        else:
            # NaN or outside [0, 1]
            z_expected = np.nan

        # The expected type is the row's own 'instrument_type'
        return z_expected, row.get("instrument_type", np.nan)

    # General survey rules
    rules = translation_rules.get(survey, None)
    if rules is None:
        return np.nan, np.nan

    # Evaluation context with None -> np.nan, to avoid comparison errors.
    # Built only here because the early returns above never use it.
    ctx = {k: (np.nan if v is None else v) for k, v in row.items()}

    # Expected z_flag_homogenized
    z_rules = rules.get("z_flag_translation", None)
    if z_rules is None:
        z_expected = np.nan
    else:
        z_expected, _ = _apply_translation(z_rules, ctx)

    # Expected instrument_type_homogenized
    t_rules = rules.get("instrument_type_translation", None)
    if t_rules is None:
        type_expected = np.nan
    else:
        type_expected, _ = _apply_translation(t_rules, ctx)

    return z_expected, type_expected


# =========================================================
# VALIDATE TRANSLATIONS IN TEMP FILES
# =========================================================
# Prepared parquet files under the temp directory, leaving out anything that
# belongs to the pipeline-generated sample.
merged_files = [
    f
    for f in glob.glob(os.path.join(prepared_temp_dir, "prepared*/*.parquet"))
    if "pipeline_sample" not in f
]

if merged_files:
    issues = []

    for merged_file in merged_files:
        print(f"🔍 Validating {merged_file}")
        df = pd.read_parquet(merged_file)

        for _, row in df.iterrows():
            z_exp, type_exp = validate_row(row)

            # Same check for both homogenized columns, z flag first
            for field, expected in (
                ("z_flag_homogenized", z_exp),
                ("instrument_type_homogenized", type_exp),
            ):
                found = row[field]
                # Expected and stored value both missing counts as agreement
                if pd.isna(expected) and pd.isna(found):
                    continue
                if expected != found:
                    issue = row.to_dict()
                    issue["field"] = field
                    issue["expected"] = expected
                    issue["found"] = found
                    issues.append(issue)

    if not issues:
        print("✅ All homogenized fields match the expected values.")
    else:
        issues_df = pd.DataFrame(issues)
        display(issues_df)
        print(f"⚠️ {len(issues)} mismatches found!")
else:
    print("⚠️ No prepared parquet files found for validation.")

# Time Profiler

In [None]:
import os
import re
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from collections import defaultdict

# ============================================
# 1. CONFIGURATION
# ============================================

log_dir = "process001/process_info"

log_files = [
    "prepare_all.log",
    "import_all.log",
    "margin_cache_all.log",
    "crossmatch_and_merge_all.log",
    "process.log"
]

# Both patterns capture the timestamp, the task name and the task id
START_RE = re.compile(
    r"(?P<timestamp>\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d+): Starting: (?P<task>[\w_]+) id=(?P<id>[\w\d_]+)"
)
FINISH_RE = re.compile(
    r"(?P<timestamp>\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d+): Finished: (?P<task>[\w_]+) id=(?P<id>[\w\d_]+)"
)

# For every "task|id" key we ALWAYS keep the first start and the last finish
start_times = {}
end_times = {}

# ============================================
# 2. READING AND PARSING THE LOGS
# ============================================

for file in log_files:
    path = os.path.join(log_dir, file)
    if not os.path.exists(path):
        # Logs that were not written are simply skipped
        continue

    with open(path) as f:
        for line in f:
            m_start = START_RE.search(line)
            m_finish = FINISH_RE.search(line)

            if m_start:
                task_id = f"{m_start.group('task')}|{m_start.group('id')}"
                ts = datetime.strptime(m_start.group("timestamp"), "%Y-%m-%d-%H:%M:%S.%f")
                # keep the smallest (first) start
                start_times[task_id] = min(ts, start_times.get(task_id, ts))

            if m_finish:
                task_id = f"{m_finish.group('task')}|{m_finish.group('id')}"
                ts = datetime.strptime(m_finish.group("timestamp"), "%Y-%m-%d-%H:%M:%S.%f")
                # keep the largest (last) finish
                end_times[task_id] = max(ts, end_times.get(task_id, ts))

# ============================================
# 3. BUILDING THE Y AXIS IN A CUSTOM ORDER
# ============================================

# Only tasks that have both a start and an end are considered
all_ids = sorted(start_times.keys() & end_times.keys())

pipeline_init_id = "pipeline_init|pipeline_init"
consolidate_id = "consolidate|consolidate"

# Individual prepare_catalog ids (the prepare_catalogs aggregator is excluded)
prepare_ids = [
    tid for tid in all_ids
    if tid.startswith("prepare_catalog|") and tid != "prepare_catalogs|prepare_catalogs"
]

# Prepares sorted by start: earlier start -> smaller y -> drawn lower
prepare_ids_sorted = sorted(prepare_ids, key=start_times.__getitem__)

# Initial cat0 import, if present
import_cat0 = ["import_catalog|cat0_hats"] if "import_catalog|cat0_hats" in all_ids else []

# Everything else (not pipeline_init, consolidate, a prepare or import_cat0)
excluded_ids = {*prepare_ids, *import_cat0, pipeline_init_id, consolidate_id}
remaining_ids = [tid for tid in all_ids if tid not in excluded_ids]

# Group by numeric step (catX, merged_stepX, ...) so tasks can be ordered
# within each step; ids without a step number are left out
step_dict = defaultdict(list)
for tid in remaining_ids:
    match = re.search(r"(?:cat|merged_step)(\d+)", tid)
    if match:
        step = int(match.group(1))
        step_dict[step].append(tid)


def task_order(tid):
    # Rank of the pipeline phase a task id belongs to; unknown phases go last
    phase_prefixes = (
        "import_catalog|cat",
        "generate_margin_cache",
        "crossmatch_and_merge",
        "import_catalog|merged_step",
    )
    for rank, prefix in enumerate(phase_prefixes):
        if tid.startswith(prefix):
            return rank
    return 99


ordered_step_ids = []
for step in sorted(step_dict):
    step_tasks = step_dict[step]
    # Within a step, keep a logical order by phase
    ordered_step_ids.extend(sorted(step_tasks, key=task_order))

# Final order: pipeline_init -> prepares (by start) -> import_cat0 -> steps -> consolidate
ordered_ids = (
    ([pipeline_init_id] if pipeline_init_id in all_ids else [])
    + prepare_ids_sorted
    + import_cat0
    + ordered_step_ids
    + ([consolidate_id] if consolidate_id in all_ids else [])
)

# ============================================
# 4. ASSEMBLING THE DATA FOR THE PLOT
# ============================================

# Extra time subtracted from the start of pipeline_init (in seconds)
aditional_pipeline_init_time = 3  # adjust as needed

# If pipeline_init exists, move its start a little earlier (purely cosmetic)
if pipeline_init_id in start_times:
    start_times[pipeline_init_id] -= timedelta(seconds=aditional_pipeline_init_time)

# Without any plottable task, min()/max() below would fail with an opaque
# "empty sequence" ValueError; raise the same exception type with a message
# that says what is actually missing.
if not ordered_ids:
    raise ValueError(
        f"No task with both a start and a finish record was found in the logs under {log_dir!r}; "
        "nothing to plot."
    )

# Time zero is the earliest start among the tasks that will be plotted
start_zero = min(start_times[tid] for tid in ordered_ids)

# --------------------------------------------------
# MANUALLY INSERT THE "register" TASK
# --------------------------------------------------
register_id = "register|register"
register_duration = 3  # duration of the "register" task in seconds

# "register" goes last and starts at the latest finish seen in the logs.
# The consolidate finish is itself one of end_times' values, so a separate
# consolidate branch would compute exactly the same maximum. end_times is
# non-empty here because every id in ordered_ids has an end time.
ordered_ids.append(register_id)
register_start = max(end_times.values())
register_end = register_start + timedelta(seconds=register_duration)

start_times[register_id] = register_start
end_times[register_id] = register_end
# --------------------------------------------------

# Labels plus start/end offsets in seconds relative to start_zero
y_labels = list(ordered_ids)
start_list = [(start_times[tid] - start_zero).total_seconds() for tid in ordered_ids]
end_list = [(end_times[tid] - start_zero).total_seconds() for tid in ordered_ids]

# ============================================
# 4b. ADJUST Y POSITIONS TO SET "register" APART
# ============================================

# Default y positions, with extra spacing before the last entry (register)
y_positions = list(range(len(ordered_ids)))
y_positions[-1] += 5.0  # pushes "register" further up the y axis

# ============================================
# 5. PLOTTING THE TIME PROFILE
# ============================================

plt.figure(figsize=(12, 4))

# === Colour of each group
group_colors = {
    "pipeline_init": "#003f5c",       # dark blue
    "prepare_catalogs": "#b8860b",    # dark yellow
    "crossmatch": "#2f855a",          # dark green
    "consolidate": "#003f5c",         # same as pipeline_init
    "register": "#003f5c",            # same dark blue as pipeline_init
}

# === Group a task id belongs to
def get_group(tid):
    if tid == pipeline_init_id:
        return "pipeline_init"
    if tid == register_id:
        return "register"
    if tid in prepare_ids:
        return "prepare_catalogs"
    if tid == consolidate_id:
        return "consolidate"
    # Everything else is drawn as part of the crossmatch group
    return "crossmatch"

# === One horizontal segment per task, with a dot of the same colour at each end
for y, start, end, tid in zip(y_positions, start_list, end_list, ordered_ids):
    group = get_group(tid)
    color = group_colors[group]
    plt.hlines(y, start, end, colors=color, linewidth=2)
    plt.scatter(start, y, color=color, s=10)  # start
    plt.scatter(end, y, color=color, s=10)    # end

# ============================================
# Group the Y axis labels
# ============================================

# y positions occupied by each group
group_positions = defaultdict(list)
for y, tid in zip(y_positions, ordered_ids):
    group_positions[get_group(tid)].append(y)

group_labels = []
group_ticks = []

# One tick per non-empty group, centred on the mean y of its tasks
for label in ["pipeline_init", "prepare_catalogs", "crossmatch", "consolidate", "register"]:
    if not group_positions[label]:
        continue
    center = sum(group_positions[label]) / len(group_positions[label])
    group_labels.append(label)
    group_ticks.append(center)

# ============================================
# Final touches
# ============================================

plt.yticks(group_ticks, group_labels, fontsize=20)
plt.xticks(fontsize=12)
plt.xlabel("Time (s)", fontsize=20)
plt.grid(True, linestyle="--", alpha=0.5)
plt.tight_layout()