# Imports

In [None]:
import os
import glob
import warnings
import pyarrow.parquet as pq

from IPython.display import display

import numpy as np
import pandas as pd

import getpass
import os

user = getpass.getuser()

# Configuration

In [None]:
# =========================================================
# CONFIGURATION SECTION
# =========================================================
#input_paths = [
#    f"/scratch/users/{user}/speczs-catalogs/processed/2dfgrs_final_release.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/2dflens_final_release.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/2mrs_v240.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/3dhst_v4.1.5.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/6dfgs_dr3.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/astrodeep_jwst.parquet",
#    f"/scratch/users/luigi.silva/speczs-catalogs/processed/astrodeep-gs43.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/desi_dr1_in_lsst_dp1_fields.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/jades_dr3.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/mosdef_final_release.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/ozdes_dr2.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/primus_dr1.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/vandels_dr4.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/vlt_vimos_v2.0.1.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/vuds_dr1.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/processed/vvds_final_release.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/johns-catalogs/z_cat_CANDELS_clean_sitcomtn-154.parquet",
#    f"/scratch/users/{user}/speczs-catalogs/johns-catalogs/z_cat_NED_clean_sitcomtn-154.parquet",
#    f"/scratch/users/{user}/pzserver_pipelines/combine_redshift_dedup/test_data/pipeline_generated_sample.parquet"
#]

# Sorted so the per-file listing is reproducible: glob.glob() returns matches
# in arbitrary, filesystem-dependent order.
input_paths = sorted(glob.glob("test_data/*.parquet"))

final_catalog_path = "./process001/outputs/crd.parquet"
prepared_temp_dir = "./process001/temp/"

# Supported ways of combining the input catalogs.
VALID_COMBINE_MODES = (
    "concatenate",
    "concatenate_and_mark_duplicates",
    "concatenate_and_remove_duplicates",
)
combine_mode = "concatenate_and_mark_duplicates"

# Fail fast on a typo: the validation cells only test `combine_mode != "concatenate"`,
# so a misspelled mode would silently be treated as a deduplication mode.
if combine_mode not in VALID_COMBINE_MODES:
    raise ValueError(
        f"Invalid combine_mode {combine_mode!r}; expected one of {VALID_COMBINE_MODES}"
    )

# Validation

## Basic Info

Counting input and output rows.

In [None]:
# ---------------------------------------------------------
# Row count of every input catalog (taken from the Parquet metadata,
# so the files themselves are not loaded)
# ---------------------------------------------------------
row_counts = []
for path in input_paths:
    if not os.path.exists(path):
        warnings.warn(f"⚠️ File not found: {path}")
        continue
    n_rows = pq.ParquetFile(path).metadata.num_rows
    print(f"{path} -> {n_rows} rows")
    row_counts.append(n_rows)

total_input_rows = sum(row_counts)
print(f"✅ Total number of input rows: {total_input_rows}")

# ---------------------------------------------------------
# Final merged catalog
# ---------------------------------------------------------
if os.path.exists(final_catalog_path):
    df_final = pd.read_parquet(final_catalog_path)
else:
    raise FileNotFoundError(f"❌ Final catalog not found: {final_catalog_path}")

print(f"✅ Total number of rows in final catalog: {len(df_final)}")

Printing the dataframe.

In [None]:
df_final

Dataframe columns.

In [None]:
df_final.columns

Dataframe column types.

In [None]:
df_final.dtypes

Basic statistics.

In [None]:
# `group_id` is a grouping label, so it is left out of the summary statistics.
# errors="ignore" makes the drop a no-op when the column is absent, so one call
# covers every combine mode and cannot raise KeyError on a missing column.
display(df_final.drop(columns=["group_id"], errors="ignore").describe())

Counting tie_result values.

In [None]:
if combine_mode != "concatenate":
    print(df_final["tie_result"].value_counts())

Counting source values.

In [None]:
df_final["source"].value_counts()

Checking the percentage of objects with unresolved ties (`tie_result == 2`).

In [None]:
if combine_mode != "concatenate" and "compared_to" in df_final.columns:
    # Rows that took part in a comparison: compared_to is neither null nor empty.
    compared_to = df_final["compared_to"]
    df_compared = df_final[compared_to.notna() & (compared_to != "")]

    n_total = len(df_final)
    n_compared = len(df_compared)

    # Occurrences of tie_result == 2, overall and among compared rows only.
    n_tie2 = (df_final["tie_result"] == 2).sum()
    n_tie2_compared = (df_compared["tie_result"] == 2).sum()

    # Guard both ratios against an empty denominator.
    pct_total = (n_tie2 / n_total) * 100 if n_total > 0 else 0
    pct_compared = (n_tie2_compared / n_compared) * 100 if n_compared > 0 else 0

    print("📊 tie_result == 2 represents:")
    print(f"  • {pct_total:.2f}% of the total ({n_tie2} out of {n_total})")
    print(f"  • {pct_compared:.2f}% of the compared objects ({n_tie2_compared} out of {n_compared})")

## Individual Catalogs Deduplication Validation

In [None]:
if combine_mode != "concatenate":
    # When the pipeline-generated sample is one of the inputs, leave its rows
    # out of the frame used by the validation cells.
    has_pipeline_sample = "test_data/pipeline_generated_sample.parquet" in input_paths
    df_final_val = (
        df_final[df_final["source"] != "019_pipeline_sample"]
        if has_pipeline_sample
        else df_final
    )

In [None]:
# The validation helpers are only needed when duplicates were marked or removed.
if combine_mode != "concatenate":
    # IPython autoreload (mode 2): modules are reloaded before each cell runs,
    # so edits to validation_functions.py are picked up without a kernel restart.
    %load_ext autoreload
    %autoreload 2
    
    # Project-local module with the validation routines, used as `vf`.
    import validation_functions as vf

In [None]:
# Intra-source validation needs a deduplication mode and the compared_to column.
if combine_mode != "concatenate" and "compared_to" in df_final.columns:
    res = vf.validate_intra_source_cells_fast(
        df_final_val,
        ndp=4,
        source_col="source",
    )

In [None]:
# Report for the intra-source validation result computed in the previous cell.
if combine_mode != "concatenate" and "compared_to" in df_final.columns:
    explain_options = dict(
        top_k=5,
        df_original=df_final_val,
        ndp_used=4,
        samples_per_source=2,
        max_sources=5,
        source_col="source",
    )
    vf.explain_intra_source_validation_output(res, **explain_options)

## Cross Catalog Deduplication Validation

In [None]:
# Cross-catalog tie validation; skipped in plain "concatenate" mode.
if combine_mode != "concatenate":
    report = vf.validate_tie_results_fast(
        df_final_val,
        threshold=0.0005,
        max_groups=40000,
        include_rows=True,
    )

In [None]:
# Report for the cross-catalog validation computed in the previous cell.
if combine_mode != "concatenate":
    vf.explain_tie_validation_output(
        report,
        show_per_rule=3,
    )

## Non-compared Validation

In [None]:
# Non-compared validation: report only, assert_if_invalid is switched off.
if combine_mode != "concatenate" and "compared_to" in df_final.columns:
    result = vf.render_na_compared_to_validation(
        df_final_val,
        show_max=10,
        assert_if_invalid=False,
    )

## Own Pipeline Product Validation

In [None]:
if combine_mode != "concatenate" and "compared_to" in df_final.columns:
    # Pipeline-generated sample as it was fed to the pipeline ...
    df_original = pd.read_parquet("test_data/pipeline_generated_sample.parquet")
    # ... and the rows of the final catalog labelled with that source.
    is_pipeline_sample = df_final["source"] == "019_pipeline_sample"
    df_processed = df_final[is_pipeline_sample]

In [None]:
# Compare the sample before and after processing, matching rows on CRD_ID.
if combine_mode != "concatenate" and "compared_to" in df_final.columns:
    res = vf.validate_tie_preservation(df_original, df_processed, key="CRD_ID")
    explanation = vf.explain_tie_preservation(res)
    print(explanation)

## Manual Validation (Optional)

In [None]:
if combine_mode != "concatenate":
    # Columns passed to the group analysis; compared_to is inserted right after
    # tie_result only when the final catalog actually has it.
    manual_cols = ["CRD_ID", "ra", "dec", "z", "survey", "source", "tie_result"]
    if "compared_to" in df_final.columns:
        manual_cols.append("compared_to")
    manual_cols += ["z_flag_homogenized", "instrument_type_homogenized", "group_id"]
    df_for_manual = df_final[manual_cols]

    analysis_options = dict(
        threshold=0.0005,
        max_groups=10000,
        max_examples_per_case=3,
        render=True,
        compute_same_source_pair=True,
    )
    results = vf.analyze_groups_by_group_id_fast(df_for_manual, **analysis_options)

## Validation of groups with compared_to `<NA>`

In [None]:
import pandas as pd
import numpy as np

def check_group_id_integrity(
    df: pd.DataFrame,
    group_col: str = "group_id",
    compared_col: str = "compared_to",
    id_col: str = "CRD_ID",
    zf_col: str = "z_flag_homogenized",
    show_examples: int = 5,
):
    """Find groups of two or more rows in which no row has `compared_col` filled.

    Parameters
    ----------
    df : DataFrame to inspect; it must contain `group_col`.
    group_col : column holding the group identifier.
    compared_col : column checked for non-empty values. If the column is
        absent, every row counts as empty.
    id_col : row identifier, used only to order the displayed examples.
    zf_col : column counted per group where its numeric value equals 6.
    show_examples : number of suspect groups whose rows are displayed.

    Returns
    -------
    dict with
      - "stats": per-group frame with columns `size`, `n_cmp_nonempty`, `n_star`;
      - "suspects": the rows of "stats" with size >= 2 and n_cmp_nonempty == 0.

    Raises
    ------
    KeyError if `group_col` is not a column of `df`.
    """
    if group_col not in df.columns:
        raise KeyError(f"'{group_col}' não está no DataFrame.")

    # compared_to as a clean string series (all empty when the column is missing)
    if compared_col in df.columns:
        cmp_str = df[compared_col].astype("string").fillna("").str.strip()
    else:
        cmp_str = pd.Series([""] * len(df), index=df.index, dtype="string")

    # Row-level flags
    cmp_nonempty = cmp_str.ne("")
    is_star = pd.Series(False, index=df.index)
    if zf_col in df.columns:
        # NOTE(review): the name suggests flag value 6 marks stars - confirm.
        is_star = pd.to_numeric(df[zf_col], errors="coerce").eq(6)

    # Per-group aggregates; dropna=False keeps a NaN group id as its own group
    gkey = df[group_col]
    size = gkey.groupby(gkey, dropna=False).size().rename("size")
    n_cmp_nonempty = cmp_nonempty.groupby(gkey, dropna=False).sum().rename("n_cmp_nonempty")
    n_star = is_star.groupby(gkey, dropna=False).sum().rename("n_star")

    stats = pd.concat([size, n_cmp_nonempty, n_star], axis=1).fillna(0)
    stats["n_cmp_nonempty"] = stats["n_cmp_nonempty"].astype(int)
    stats["n_star"] = stats["n_star"].astype(int)

    # Suspects: groups with 2+ rows and NO compared_to filled in
    suspects = stats[(stats["size"] >= 2) & (stats["n_cmp_nonempty"] == 0)]

    print(f"Grupos suspeitos (size ≥ 2 e compared_to vazio para todos): {len(suspects)}")
    if len(suspects) > 0:
        display(suspects.sort_values(["size"], ascending=False).head(10))
        ex_gids = suspects.index[:show_examples]
        cols = [c for c in [id_col, "survey", "source", "z", zf_col, compared_col, group_col] if c in df.columns]
        # Sort only by keys that exist: `cols` is already filtered by presence,
        # so sorting unconditionally by `id_col` raised KeyError without it.
        sort_keys = [c for c in (group_col, id_col) if c in cols]
        sample = df.loc[df[group_col].isin(ex_gids), cols].sort_values(sort_keys, na_position="last")
        display(sample)

    # Returned so the caller can save or inspect the tables later
    return {"stats": stats, "suspects": suspects}

In [None]:
# Guarded like the other checks that touch `group_id`: they are all skipped in
# plain "concatenate" mode, and check_group_id_integrity raises KeyError when
# the column is missing.
if combine_mode != "concatenate":
    out = check_group_id_integrity(df_final)
    assert len(out["suspects"]) == 0, "There is group_id with size>=2 and compared_to empty!"

## Validation - Prepared Catalogs

In [None]:
# Translation rules dictionary, equivalent to the YAML configuration
translation_rules = {
    "2DFGRS": {
        "z_flag_translation": {1: 0, 2: 1, 3: 3, 4: 4, 5: 4},
        "instrument_type_translation": {"default": "s"},
    },
    "2DFLENS": {
        "z_flag_translation": {1: 0, 2: 1, 3: 3, 4: 4, 6: 6},
        "instrument_type_translation": {"default": "s"},
    },
    "2MRS": {
        "z_flag_translation": {
            "conditions": [
                {"expr": "z_err == 0", "value": 3},
                {"expr": "0 < z_err < 0.0005", "value": 4},
                {"expr": "z_err >= 0.0005", "value": 3},
            ],
            "default": 0,
        },
        "instrument_type_translation": {"default": "s"},
    },
    "3D-HST": {
        "z_flag_translation": {
            "conditions": [
                {"expr": "z_best_s == 0", "value": 6},
                {"expr": "z_best_s == 1 and z_spec != -1", "value": 4},
                {"expr": "z_best_s == 2 and use_zgrism == 1 and flag1 == 0 and flag2 == 0", "value": 3},
                {"expr": "z_best_s == 3 and use_phot == 1", "value": 3},
            ],
            "default": 0,
        },
        "instrument_type_translation": {
            "conditions": [
                {"expr": "z_best_s == 1", "value": "s"},
                {"expr": "z_best_s == 2", "value": "g"},
                {"expr": "z_best_s == 3", "value": "p"},
            ],
            "default": "g",
        },
    },
    "6DFGS": {
        "z_flag_translation": {1: 0, 2: 1, 3: 3, 4: 4, 6: 6},
        "instrument_type_translation": {"default": "s"},
    },
    "ASTRODEEP": {
        "z_flag_translation": {
            "conditions": [
                {"expr": "zspec_survey != '-'", "value": 4},
                {"expr": "zspec_survey == '-'", "value": 3},
            ],
            "default": 0,
        },
        "instrument_type_translation": {
            "conditions": [
                {"expr": "zspec_survey != '-'", "value": "s"},
                {"expr": "zspec_survey == '-'", "value": "p"},
            ],
            "default": "p",
        },
    },
    "ASTRODEEP-JWST": {
        "z_flag_translation": {
            "conditions": [
                {"expr": "zspec != -99 and z_flag < 400 and (len(str(int(z_flag))) <= 1 or int(str(int(z_flag))[-2]) <= 3)", "value": 4},
                {"expr": "zspec == -99 and z_flag < 400 and (len(str(int(z_flag))) <= 1 or int(str(int(z_flag))[-2]) <= 3)", "value": 3},
            ],
            "default": 0,
        },
        "instrument_type_translation": {
            "conditions": [
                {"expr": "zspec != -99", "value": "s"},
                {"expr": "zspec == -99", "value": "p"},
            ],
            "default": "p",
        },
    },
    "DESI": {
        "z_flag_translation": {
            "conditions": [
                {"expr": "ZCAT_PRIMARY != True", "value": 0},
                {"expr": "z_flag != 0 and ZCAT_PRIMARY == True", "value": 1},
                {"expr": "z_flag == 0 and ZCAT_PRIMARY == True and z_err < 0.0005", "value": 4},
                {"expr": "z_flag == 0 and ZCAT_PRIMARY == True and z_err >= 0.0005", "value": 3},
            ],
            "default": 0,
        },
        "instrument_type_translation": {"default": "s"},
    },
    "JADES": {
        "z_flag_translation": {4: 4, 3: 3, 2: 2, 1: 1, 0: 0},
        "instrument_type_translation": {"default": "s"},
    },
    "MOSDEF": {
        "z_flag_translation": {7: 4, 6: 3, 5: 2, 4: 2, 3: 1, 2: 1, 1: 0, 0: 0},
        "instrument_type_translation": {"default": "s"},
    },
    "OZDES": {
        "z_flag_translation": {1: 0, 2: 1, 3: 3, 4: 4, 6: 6},
        "instrument_type_translation": {"default": "s"},
    },
    "PRIMUS": {
        "z_flag_translation": {-1: 0, 2: 1, 3: 2, 4: 3},
        "instrument_type_translation": {"default": "g"},
    },
    "VANDELS": {
        "z_flag_translation": {
            0: 0, 1: 1, 2: 2, 3: 4, 4: 4, 9: 3,
            10: 0, 11: 1, 12: 2, 13: 4, 14: 4, 19: 3,
            20: 0, 21: 1, 22: 2, 23: 4, 24: 4, 29: 3,
            210: 0, 211: 1, 212: 2, 213: 4, 214: 4, 219: 3,
        },
        "instrument_type_translation": {"default": "s"},
    },
    "VIMOS": {
        "z_flag_translation": {4: 4, 3: 3, 2: 2, 1: 1, 0: 0},
        "instrument_type_translation": {"default": "s"},
    },
    "VUDS": {
        "z_flag_translation": {
            1: 1, 11: 1, 21: 1, 31: 1, 41: 1,
            2: 2, 12: 2, 22: 2, 32: 2, 42: 2, 9: 2, 19: 2, 29: 2, 39: 2, 49: 2,
            3: 3, 13: 3, 23: 3, 33: 3, 43: 3,
            4: 4, 14: 4, 24: 4, 34: 4, 44: 4,
        },
        "instrument_type_translation": {"default": "s"},
    },
    "VVDS": {
        "z_flag_translation": {
            0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 9: 2,
            10: 0, 11: 1, 12: 2, 13: 3, 14: 4, 19: 2,
            20: 0, 21: 1, 22: 2, 23: 3, 24: 4, 29: 2,
            210: 0, 211: 1, 212: 2, 213: 3, 214: 4, 219: 2,
        },
        "instrument_type_translation": {"default": "s"},
    },

    # Special cases using continuous rule and inherited type
    "CANDELS": {"_special": "CANDELS_NED"},
    "NED": {"_special": "CANDELS_NED"},
}

def _safe_eval_expr(expr: str, ctx: dict) -> bool:
    """
    Avalia 'expr' usando apenas variáveis do ctx e funções básicas.
    Retorna True/False; se der erro, retorna False.
    """
    try:
        # Permitir apenas funções básicas e numpy
        allowed_globals = {
            "__builtins__": {"len": len, "int": int, "str": str, "float": float},
            "np": np,
        }
        return bool(eval(expr, allowed_globals, ctx))
    except Exception:
        return False

def _apply_translation(value_map, row_ctx):
    """
    value_map pode ser:
      - dict simples {orig: dest} (pode conter 'default')
      - dict com 'conditions' (lista de {expr, value}) e opcional 'default'
    Retorna (valor_traduzido, matched_bool)
    """
    if isinstance(value_map, dict) and "conditions" in value_map:
        for cond in value_map["conditions"]:
            expr = cond.get("expr", "")
            val = cond.get("value", np.nan)
            if expr and _safe_eval_expr(expr, row_ctx):
                return val, True
        # nenhum matched -> usa default se houver
        if "default" in value_map:
            return value_map["default"], True
        return np.nan, False

    # mapeamento direto (sem 'conditions'):
    if isinstance(value_map, dict):
        key = row_ctx.get("z_flag", np.nan)
        if key in value_map:
            return value_map[key], True
        # Se não houver chave correspondente, mas existir 'default', use-o
        if "default" in value_map:
            return value_map["default"], True
        return np.nan, False

    return np.nan, False

def validate_row(row):
    """Compute the expected homogenized values for one catalog row.

    `row` is any mapping-like object offering `.get()` and `.items()`
    (a pandas Series row or a plain dict).

    Returns a tuple (z_flag_expected, instrument_type_expected). A value is
    np.nan when no rule can be applied: unknown or missing survey, missing
    rule set, or a flag outside the ranges handled below.
    """
    survey = row.get("survey", None)

    # pd.NA has no truth value, so `survey in (...)` would raise TypeError on
    # it. A missing survey has no rules, exactly like an unknown survey.
    if survey is None or survey is pd.NA:
        return np.nan, np.nan

    # Special cases (CANDELS and NED): z_flag is a continuous value in 0..1 and
    # the instrument type is taken from the row itself.
    if survey in ("CANDELS", "NED"):
        x = row.get("z_flag", np.nan)
        # NumPy scalars are accepted too: np.float32 and np.int64 are not
        # subclasses of the Python float/int types.
        numeric_types = (bool, int, float, np.bool_, np.integer, np.floating)
        if x is None or x is pd.NA or not isinstance(x, numeric_types):
            z_expected = np.nan
        elif x == 0.0:
            z_expected = 0.0
        elif 0.0 < x < 0.7:
            z_expected = 1.0
        elif 0.7 <= x < 0.9:
            z_expected = 2.0
        elif 0.9 <= x < 0.99:
            z_expected = 3.0
        elif 0.99 <= x <= 1.0:
            z_expected = 4.0
        else:
            # NaN, negative values and values above 1 have no expected flag.
            z_expected = np.nan

        # The expected type is the row's own instrument_type.
        return z_expected, row.get("instrument_type", np.nan)

    # General survey rules
    rules = translation_rules.get(survey, None)
    if rules is None:
        return np.nan, np.nan

    # Evaluation context: None becomes np.nan so comparisons do not raise.
    ctx = {k: (np.nan if v is None else v) for k, v in row.items()}

    # Expected z_flag_homogenized
    z_rules = rules.get("z_flag_translation", None)
    z_expected = np.nan if z_rules is None else _apply_translation(z_rules, ctx)[0]

    # Expected instrument_type_homogenized
    t_rules = rules.get("instrument_type_translation", None)
    type_expected = np.nan if t_rules is None else _apply_translation(t_rules, ctx)[0]

    return z_expected, type_expected


# =========================================================
# VALIDATE TRANSLATIONS IN TEMP FILES
# =========================================================
def _homogenized_mismatch(expected, found):
    """Return True when `expected` and `found` disagree.

    Two missing values (None, NaN, pd.NA) agree; a missing value never agrees
    with a present one. Missing values are settled before any `!=` because
    comparing with pd.NA yields pd.NA, which cannot be used in an `if`.
    """
    expected_missing = pd.isna(expected)
    found_missing = pd.isna(found)
    if expected_missing or found_missing:
        return not (expected_missing and found_missing)
    return expected != found


merged_files = glob.glob(os.path.join(prepared_temp_dir, "prepared*/*.parquet"))
# The pipeline-generated sample is left out of this check.
merged_files = [f for f in merged_files if "pipeline_sample" not in f]

if not merged_files:
    print("⚠️ No prepared parquet files found for validation.")
else:
    issues = []
    # Same order as the tuple returned by validate_row().
    checked_fields = ("z_flag_homogenized", "instrument_type_homogenized")

    for merged_file in merged_files:
        print(f"🔍 Validating {merged_file}")
        df = pd.read_parquet(merged_file)

        for _, row in df.iterrows():
            for field, expected in zip(checked_fields, validate_row(row)):
                found = row[field]
                if _homogenized_mismatch(expected, found):
                    # One record per mismatching field: the full row plus
                    # what was expected and what was found.
                    issue = row.to_dict()
                    issue["field"] = field
                    issue["expected"] = expected
                    issue["found"] = found
                    issues.append(issue)

    if issues:
        issues_df = pd.DataFrame(issues)
        display(issues_df)
        print(f"⚠️ {len(issues)} mismatches found!")
    else:
        print("✅ All homogenized fields match the expected values.")

# Time Profiler

In [None]:
import os, re
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from collections import defaultdict

# Pipeline log parsed by this cell.
log_path = "process001/process_info/pipeline.log"
# Seconds subtracted from the init START time before plotting (cosmetic only).
INIT_LEFT_PAD_S = 3

# ---------- Regex ----------
# Timestamp (YYYY-MM-DD-HH:MM:SS.fraction) and START/END marker shared by all patterns.
TS = r"(?P<ts>\d{4}-\d{2}-\d{2}-\d{2}:\d{2}:\d{2}\.\d+)"
PH = r"(?P<ph>START|END)"

# Macro phases found in the "stage" column (3rd column): preparation, automatch, crossmatch, deduplication, consolidation, register
PHASES = r"(?:preparation|automatch|crossmatch|deduplication|consolidation|register)"
RE_PHASE = re.compile(
    fr"{TS}\s*\|\s*INFO\s*\|\s*(?P<phase>{PHASES})\s*\|\s*crc\s*\|\s*{PH}\s+(?P<msg>.*)$"
)

# init (bootstrap)
RE_INIT = re.compile(fr"{TS}\s*\|\s*INFO\s*\|\s*init\s*\|\s*crc\s*\|\s*{PH}\s+init:", re.X)

# prepare_catalog (one entry per product)
RE_PREPARE = re.compile(
    fr"{TS}\s*\|\s*INFO\s*\|\s*preparation\s*\|\s*crc\.specz\s*\|\s*{PH}\s+prepare_catalog\s+product=(?P<name>[\w.\-\d]+)",
    re.X,
)

# automatch (one entry per artifact)
RE_AUTOMATCH = re.compile(
    fr"{TS}\s*\|\s*INFO\s*\|\s*automatch\s*\|\s*crc\.crossmatch_auto\s*\|\s*{PH}\s+automatch:\s+artifact=(?P<name>[\w\.\-\d]+)",
    re.X,
)

# crossmatch (one entry per step number)
RE_XMATCH = re.compile(
    fr"{TS}\s*\|\s*INFO\s*\|\s*crossmatch\s*\|\s*crc\.crossmatch\s*\|\s*{PH}\s+crossmatch_update_compared_to:\s+step=(?P<step>\d+)",
    re.X,
)

def parse_ts(s: str) -> datetime:
    """Parse a log timestamp such as ``2024-01-31-12:34:56.789`` into a datetime.

    The timestamp regex used for the log accepts any number of fractional
    digits, while ``%f`` accepts at most six. The fraction is therefore cut to
    microsecond precision before parsing instead of raising ValueError.

    Raises ValueError if the string does not follow the expected format.
    """
    head, dot, fraction = s.partition(".")
    if dot:
        s = f"{head}.{fraction[:6]}"
    return datetime.strptime(s, "%Y-%m-%d-%H:%M:%S.%f")

# ---------- Scan ----------
# Earliest START and latest END seen so far, keyed by "<task_type>|<ident>".
start_times, end_times = {}, {}

def reg(task_type: str, ident: str, ts: datetime, ph: str):
    """Record timestamp `ts` for a task.

    For ph == "START" the earliest timestamp is kept in `start_times`; for any
    other value the latest timestamp is kept in `end_times`.
    """
    key = f"{task_type}|{ident}"
    if ph == "START":
        previous = start_times.get(key)
        if previous is None or ts < previous:
            start_times[key] = ts
        return

    previous = end_times.get(key)
    if previous is None or ts > previous:
        end_times[key] = ts

if not os.path.exists(log_path):
    raise FileNotFoundError(log_path)

# (pattern, match -> (task_type, ident)) pairs, tried in order. The specific
# patterns come first; the generic macro-phase pattern must stay last.
_line_matchers = (
    (RE_INIT, lambda m: ("init", "init")),
    (RE_PREPARE, lambda m: ("prepare_catalog", m.group("name"))),
    (RE_AUTOMATCH, lambda m: ("automatch", m.group("name"))),
    (RE_XMATCH, lambda m: ("crossmatch", f"step{int(m.group('step')):02d}")),
    (RE_PHASE, lambda m: (f"{m.group('phase')}_phase", m.group("phase"))),
)

with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        for pattern, describe in _line_matchers:
            m = pattern.search(line)
            if m is None:
                continue
            task_type, ident = describe(m)
            reg(task_type, ident, parse_ts(m.group("ts")), m.group("ph"))
            break  # only the first matching pattern registers a line

# ---------- Keep only complete tasks ----------
# A task is complete when both a START and an END were recorded for it.
all_keys = list(filter(end_times.__contains__, start_times))
if len(all_keys) == 0:
    raise RuntimeError("Nenhuma tarefa com START e END encontrada no log.")

# Cosmetic: move the init start a little to the left.
init_key = "init|init"
if init_key in start_times:
    start_times[init_key] = start_times[init_key] - timedelta(seconds=INIT_LEFT_PAD_S)

# ---------- Ordenação ----------
def group_of(key: str) -> str:
    """Return the task-type part of a ``"<task_type>|<ident>"`` key."""
    task_type, _, _ = key.partition("|")
    return task_type

def start_of(key: str) -> datetime:
    """Return the recorded start time of `key` (used as a sort key)."""
    return start_times[key]

def crossmatch_sort_key(key: str):
    """Sort key for crossmatch tasks: the step number, or 10**9 when the
    identifier is not numeric, so that such keys sort last."""
    step = key.split("|", 1)[1].replace("step", "")
    if step.isdigit():
        return int(step)
    return 10**9

# Colour of each task group. The insertion order is also the top-to-bottom
# order of the groups in the chart.
palette = {
    "init": "#1f77b4",
    "preparation_phase": "#ff7f0e",   # macro bar of the phase
    "prepare_catalog": "#ffbb78",     # one bar per product
    "automatch_phase": "#2ca02c",
    "automatch": "#98df8a",           # one bar per artifact
    "crossmatch_phase": "#d62728",
    "crossmatch": "#ff9896",          # one bar per step
    "deduplication_phase": "#9467bd",
    "consolidation_phase": "#8c564b",
    "register_phase": "#e377c2",      # shown only if present in the log
}
groups = list(palette)

# Bucket the complete tasks by group, then emit them group by group:
# crossmatch steps by step number, everything else by start time.
keys_by_group = defaultdict(list)
for key in all_keys:
    keys_by_group[group_of(key)].append(key)

ordered_keys = []
for group in groups:
    sorter = crossmatch_sort_key if group == "crossmatch" else start_of
    ordered_keys.extend(sorted(keys_by_group.get(group, []), key=sorter))

# --- Inject a register_phase entry when the log has none ---
has_register = any(k.startswith("register_phase|") for k in start_times)
if not has_register:
    if ordered_keys:
        last_end = max(end_times[k] for k in ordered_keys)
    else:
        last_end = min(start_times.values())
    # Placeholder bar: starts 0.5 s after the last end and lasts 1 s.
    reg_start = last_end + timedelta(seconds=0.5)
    reg_end = reg_start + timedelta(seconds=1.0)
    start_times["register_phase|register"] = reg_start
    end_times["register_phase|register"] = reg_end
    ordered_keys.append("register_phase|register")

# ---------- Time reference ----------
# Offsets in seconds from the earliest start among the plotted tasks.
t0 = min(start_times[k] for k in ordered_keys)
starts = [(start_times[k] - t0).total_seconds() for k in ordered_keys]
ends = [(end_times[k] - t0).total_seconds() for k in ordered_keys]

# ---------- Plot ----------
fig, ax = plt.subplots(figsize=(15, 8))

# The macro bars of these three phases are not drawn.
skip_globals = {"preparation_phase", "automatch_phase", "crossmatch_phase"}

# One horizontal bar per task, with a marker at each end.
for row, (key, x_start, x_end) in enumerate(zip(ordered_keys, starts, ends)):
    group = group_of(key)
    if group in skip_globals:
        continue

    color = palette.get(group, "#444444")
    # The remaining macro phases (dedup/consolidation/register) are drawn thicker.
    width = 4 if group.endswith("_phase") else 2
    ax.hlines(y=row, xmin=x_start, xmax=x_end, colors=color, linewidth=width)
    ax.scatter([x_start, x_end], [row, row], s=14, color=color, zorder=3)

# Keep one tick per row but blank out the default Y labels.
n_rows = len(ordered_keys)
ax.set_yticks(range(n_rows))
ax.set_yticklabels([""] * n_rows)

# Add a single label per block, vertically centred on it.
def add_block_label(prefix, text):
    """Write `text` left of the axes, centred on the rows of one task group.

    `prefix` is the task-type part of the "<task_type>|<ident>" keys in
    `ordered_keys`. It is matched as a whole task type, so "automatch" selects
    the "automatch|..." rows but not "automatch_phase|...". Matching the bare
    prefix also picked up the `*_phase` row, which shifted the label away from
    the rows of the named group. Nothing is drawn when no row belongs to the
    group.
    """
    group_prefix = prefix if prefix.endswith("|") else f"{prefix}|"
    idxs = [i for i, k in enumerate(ordered_keys) if k.startswith(group_prefix)]
    if idxs:
        mid = (min(idxs) + max(idxs)) / 2
        ax.text(-5, mid, text, va="center", ha="right", fontsize=10, fontweight="bold")

# One label per block: (key prefix, label text).
block_labels = (
    ("init", "init"),
    ("prepare_catalog", "preparation"),
    ("automatch", "automatch"),
    ("crossmatch", "crossmatch"),
    ("deduplication_phase", "deduplication"),
    ("consolidation_phase", "consolidation"),
    ("register_phase", "register"),
)
for label_prefix, label_text in block_labels:
    add_block_label(label_prefix, label_text)

# X axis and margins (room is left on the left for the labels placed at x=-5)
xmax = max(ends, default=1.0)
ax.set_xlim(-6, xmax * 1.02)
ax.set_xlabel("Time (s)", fontsize=12)
ax.grid(True, linestyle="--", alpha=0.3, axis="x")
ax.set_title("CRC – Time Profile (pipeline.log)", fontsize=16)

# Reserve space for the left-hand labels without relying on tight_layout.
plt.subplots_adjust(left=0.20, right=0.98, top=0.92, bottom=0.08)
plt.show()