In [1]:
import pathlib as pl
import pandas as pd
import statistics
import collections as col

# Opt in to the pandas "future.no_silent_downcasting" behaviour for this session.
pd.set_option('future.no_silent_downcasting', True)

# Change (quietly, -q) into the notebooks folder so that the relative path to
# the project config notebook below can be resolved. strict=True makes
# resolve() raise if that notebook does not exist.
# NOTE(review): the absolute path is machine-specific.
%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"
_PROJECT_CONFIG_NB = str(pl.Path("00_project_config.ipynb").resolve(strict=True))

# Run the project config notebook. PROJECT_DATA_ROOT and TABLE_OUT_SUPPL are
# not defined in this cell and are presumably provided by it - TODO confirm.
%run $_PROJECT_CONFIG_NB

# created in notebook
# dump-hprc-gap-table.ipynb
SUMMARY_FILE = PROJECT_DATA_ROOT.joinpath(
    "2024_gap_eval", "sample_summary.all-t.pre-supp-table.tsv"
)
# Tab-separated input; the first row supplies the column names that are
# relabelled further down.
table = pd.read_csv(SUMMARY_FILE, sep="\t", header=0)


def norm_err_t(error):
    """Turn an error-threshold token into a human-readable label.

    Args:
        error: Threshold token as it appears at the end of a column name,
            e.g. "01pct" or "1pct".

    Returns:
        A label of the form "err. <X%". The token "01pct" is special-cased
        to "err. <0.1%"; for every other token a trailing "pct" is removed
        and the remainder is used verbatim as X.
    """
    if error == "01pct":
        return "err. <0.1%"
    # Drop only the literal "pct" suffix. str.strip("pct") would treat the
    # argument as a set of characters and strip any of p/c/t from BOTH ends.
    value = error[:-len("pct")] if error.endswith("pct") else error
    return f"err. <{value}%"


def norm_column_label(column):
    """Map a raw summary-table column name to its publication label.

    Args:
        column: Column name as read from the input table.

    Returns:
        The relabelled column name. "chrom", "start" and "end" are passed
        through unchanged.

    Raises:
        ValueError: If the column name matches none of the known patterns;
            the offending name is the exception argument.
    """
    # Coordinate columns keep their names.
    if column in ("chrom", "start", "end"):
        return column

    if column == "gap_length":
        return "gap size (bp)"

    # Must be tested before the generic "hap" substring check further down.
    if column == "max_closed_haps":
        return "max. closing haplotypes (n)"

    if column.startswith("closed"):
        tokens = column.split("_")
        # The last underscore-separated token carries the error threshold.
        threshold = norm_err_t(tokens[-1])
        # The third token tells counts ("n") apart from everything else,
        # which is labelled as a percentage.
        unit = "n" if tokens[2] == "n" else "%"
        return f"closed in haplotypes at {threshold} ({unit})"

    if "hap" in column:
        # Everything left of the last "." is used as the assembly unit name.
        asm_unit, *_ = column.rsplit(".", 1)
        threshold = norm_err_t(column.split("_")[-1])
        return f"{asm_unit} closed at {threshold}"

    raise ValueError(column)
            

# Relabel every column of the summary table for the supplementary table.
new_columns = list(map(norm_column_label, table.columns))
table.columns = new_columns

# Write the relabelled table as TSV with a header row and without the index.
out_file = TABLE_OUT_SUPPL.joinpath("table_SXPE_hprc-gap-summary.tsv")
table.to_csv(
    out_file,
    sep="\t",
    header=True,
    index=False,
)