In [1]:
import pandas as pd
import pathlib

# Change into the notebooks folder (-q: do not echo the new directory) so that
# the relative path to the config notebook below can be resolved.
# NOTE(review): this is an absolute, user-specific path; the cell fails on any
# machine where this directory does not exist.
%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"

# strict=True makes resolve() raise if the config notebook is missing, so the
# cell stops here instead of running without the project configuration.
_PROJECT_CONFIG_NB = str(pathlib.Path("00_project_config.ipynb").resolve(strict=True))

# Execute the project config notebook; presumably this is where PROJECT_BASE
# (used right below) is defined - confirm in 00_project_config.ipynb.
%run $_PROJECT_CONFIG_NB

# Input table with the raw (un-normalized) column headers.
raw_header_file = PROJECT_BASE.joinpath(
    "annotations", "data_freezes", "hgsvc3_assembly_data_sources.final-raw-header.tsv"
)

# Tab-separated file; anything following a "#" on a line is ignored by the parser.
table = pd.read_csv(raw_header_file, sep="\t", comment="#")

def norm_column_header(column):
    """Map a raw column name of the data table to its display header.

    Three kinds of names are recognised:

    * plain metadata columns, returned with underscores replaced by spaces;
    * columns that have a fixed replacement label;
    * read statistic columns containing ``_grt_``, where the token following
      ``grt`` is used as the threshold (a ``0bp`` threshold is left out of
      the label).

    Args:
        column: raw column name.

    Returns:
        The display header as a string.

    Raises:
        ValueError: if the name matches none of the known patterns; the
            offending name is passed as the exception argument.
    """
    plain_columns = (
        "sample",
        "sex",
        "population",
        "datatype",
        "filename",
        "file_accession",
        "project_accession",
    )
    if column in plain_columns:
        return column.replace("_", " ")

    fixed_labels = {
        "supergroup": "continental group",
        "verkko_assembly_accession": "Verkko assembly accession",
        "matched_name": "metadata filename",
        "match_quality": "metadata annotation",
        "reuse_data": "use data",
    }
    if column in fixed_labels:
        return fixed_labels[column]

    if "_grt_" not in column:
        raise ValueError(column)

    tokens = column.split("_")
    cutoff = tokens[tokens.index("grt") + 1]
    suffix = "" if cutoff == "0bp" else f" >{cutoff}"

    # The "num" test on the second token takes precedence over the tests on
    # the first token, so e.g. "total_num_..." is a read count, not a length.
    if tokens[1] == "num":
        return f"total reads{suffix} (n)"

    statistic = tokens[0]
    if statistic == "total":
        return f"total read length{suffix} (bp)"
    if statistic == "cov":
        return f"coverage{suffix} (fold)"
    if statistic == "length":
        return f"read length {tokens[1]}{suffix} (bp)"
    raise ValueError(column)
    

# Row filter: keep every sample except NA24385.
select_hgsvc = table["sample"] != "NA24385"

# Coverage columns to summarise.
cov_columns = ["cov_xfold_grt_100kbp_at_3Gbp", "cov_xfold_grt_0bp_at_3Gbp"]

# For each read data type, print the number of selected rows and the median
# of the coverage columns (rounded to whole numbers), using only the rows
# whose filename field is "all".
for datatype in ("hifi", "ont"):
    select_type = table["datatype"] == datatype
    select_all = table["filename"] == "all"
    selector = select_hgsvc & select_type & select_all
    sub = table.loc[selector, cov_columns]

    print(datatype, sub.shape, sep="\n")
    print(sub.median().round())


# Build the raw -> display header mapping; norm_column_header raises
# ValueError for any column it does not know, which aborts the export.
renamer = {raw_name: norm_column_header(raw_name) for raw_name in table.columns}

# Replace the raw headers with the display headers (returns a new frame).
table = table.rename(columns=renamer)

table_sx_assm_data = TABLE_OUT_SUPPL.joinpath("table_SXPE_verkko_assembly_data.tsv")

# Write a tab-separated file with a header row and without the row index.
table.to_csv(table_sx_assm_data, sep="\t", index=False)

hifi
(64, 2)
cov_xfold_grt_100kbp_at_3Gbp     0.0
cov_xfold_grt_0bp_at_3Gbp       40.0
dtype: float64
ont
(64, 2)
cov_xfold_grt_100kbp_at_3Gbp    35.0
cov_xfold_grt_0bp_at_3Gbp       55.0
dtype: float64
