In [3]:
import pathlib as pl
import pandas as pd
import re

# Enable the pandas option 'future.no_silent_downcasting' for this session.
pd.set_option('future.no_silent_downcasting', True)

# Change (quietly, -q) into the notebooks directory so that the relative
# path to the project config notebook can be resolved on the next line.
%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"
# strict=True makes resolve() raise if the config notebook does not exist.
_PROJECT_CONFIG_NB = str(pl.Path("00_project_config.ipynb").resolve(strict=True))

# Run the project config notebook. Names used further down that are not
# defined in this cell (get_nb_stamp, PROJECT_BASE, HGSVC_SAMPLES,
# TABLE_OUT_SUPPL) presumably come from it -- TODO confirm.
%run $_PROJECT_CONFIG_NB

# Name of this notebook as passed to get_nb_stamp().
_MYNAME="norm-assm-stats-table-header"
_NBSTAMP=get_nb_stamp(_MYNAME)

# Selects which assembler's statistics table is processed below; the code
# below has branches for "verkko" and "hifiasm".
ASSEMBLER = "verkko"


def norm_stat_header(header):
    """Translate a raw statistic column name into a display label and dtype.

    The raw name is split on underscores. The token "grt" must appear at
    position 2 or 3 (position 2 takes precedence) and the token right after
    it is used as the sequence length cutoff. A cutoff of "0bp" is left out
    of the label; any other cutoff is rendered as " >CUTOFF ".

    Args:
        header: Raw column name, for example "total_length_grt_1Mbp".

    Returns:
        A tuple (label, dtype) where label is the normalized column name and
        dtype is the Python type (int or float) to cast the column to.

    Raises:
        ValueError: If no "grt" token is found at position 2 or 3, if no
            cutoff token follows it, or if the header prefix matches none of
            the known statistics. The offending header is the exception
            argument.
    """
    parts = header.split("_")

    # Bounds-checked search for the "grt" marker so that headers with too
    # few fields raise ValueError(header) instead of a bare IndexError.
    marker_idx = next(
        (idx for idx in (2, 3) if len(parts) > idx and parts[idx] == "grt"),
        None,
    )
    if marker_idx is None or len(parts) <= marker_idx + 1:
        raise ValueError(header)

    cutoff = parts[marker_idx + 1]
    length_info = " " if cutoff == "0bp" else f" >{cutoff} "

    if header.startswith("cov_xfold"):
        return f"Coverage{length_info}(x-fold)", float
    if header.startswith(("length_N50", "length_auN")):
        # parts[1] carries the statistic name (the field after "length").
        return f"Length {parts[1]}{length_info}(bp)", int
    if header.startswith("total_length"):
        return f"Length{length_info}(bp)", int
    if header.startswith("total_num"):
        return f"Sequences{length_info}(n)", int
    if header.startswith("pct_dip"):
        # The cutoff is intentionally not part of this label.
        return "Relative length (% H1/H2)", float
    raise ValueError(header)

# Normalize the assembly statistics table of the selected assembler
# (index, column headers, dtypes) and write it out as a TSV file.

# Resolve the input table for the selected assembler. An unexpected value
# fails here with a clear message instead of handing None to pd.read_csv.
source_file_names = {
    "verkko": "verkko_assemblies.hgsvc3.tsv",
    "hifiasm": "hifiasm_assemblies.hgsvc3.tsv",
}
try:
    source_file = PROJECT_BASE.joinpath(
        "annotations", "autogen", source_file_names[ASSEMBLER]
    )
except KeyError:
    raise ValueError(
        f"Unsupported ASSEMBLER {ASSEMBLER!r}; "
        f"expected one of {sorted(source_file_names)}"
    ) from None


# The table is read with two header rows (column MultiIndex) and the first
# three columns as row index; lines starting with '#' are skipped.
table = pd.read_csv(source_file, sep="\t", comment="#", header=[0,1], index_col=[0,1,2])

if ASSEMBLER == "verkko":
    # 2024-12-03 replace accessions w/ updated list provided by Feyza
    verkko_acc_file = PROJECT_BASE.joinpath("annotations", "external", "accessions", "20241120_verkko_assembly_acc.txt")
    verkko_acc = pd.read_csv(verkko_acc_file, sep="\t")
    # The sample name is the part of the description before the first '.'.
    verkko_acc["sample"] = verkko_acc["description"].apply(lambda x: x.split(".")[0])
    #verkko_acc["sample"] = verkko_acc["sample"].str.replace("GM", "NA")
    verkko_acc = verkko_acc.set_index("sample", inplace=False)
    verkko_acc.rename({"id": "accession"}, axis=1, inplace=True)

# Rename the column level "sequence" and reduce the row index to the level
# that remains after dropping "sample_num" and "verkko_batch".
table.columns = table.columns.rename("asm_unit", level="sequence")
table.index = table.index.droplevel(["sample_num", "verkko_batch"])

# Rebuild the row index as (sample, sex[, accession]); the sex is taken
# from the first matching row of HGSVC_SAMPLES.
new_index = []
for sample in table.index:
    sex = HGSVC_SAMPLES.loc[HGSVC_SAMPLES["sample"] == sample, "sex"].iloc[0]
    if ASSEMBLER == "verkko":
        acc = verkko_acc.at[sample, "accession"]
        new_index.append((sample, sex, acc))
    else:
        new_index.append((sample, sex))

if ASSEMBLER == "verkko":
    new_index = pd.MultiIndex.from_tuples(new_index, names=["sample", "sex", "accession"])
else:
    new_index = pd.MultiIndex.from_tuples(new_index, names=["sample", "sex"])

table.index = new_index

# Normalize the statistic part of each column header and remember the dtype
# each column should be cast to (same order as the columns).
new_columns = []
data_types = []
for (au, stat) in table.columns:
    norm_stat, data_type = norm_stat_header(stat)
    new_columns.append((au, norm_stat))
    data_types.append(data_type)

new_columns = pd.MultiIndex.from_tuples(new_columns, names=["assembly_unit", "statistic"])
table.columns = new_columns

# Cast every column to the dtype reported by norm_stat_header.
for dtype, column in zip(data_types, table.columns):
    table[column] = table[column].astype(dtype)

table.sort_index(axis=0, inplace=True)

table_sx_asm_stats = TABLE_OUT_SUPPL.joinpath(
    f"table_SXPE_{ASSEMBLER}-assembly-stats.tsv"
)

table.to_csv(table_sx_asm_stats, sep="\t", header=True, index=True)