In [28]:
import pathlib as pl
import pandas as pd
import re

# Opt in to the pandas option 'future.no_silent_downcasting' for this session.
pd.set_option('future.no_silent_downcasting', True)

# Switch (quietly, -q) into the notebooks folder so that the relative path to
# the project config notebook below can be resolved.
%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"
# strict=True makes resolve() raise if the config notebook does not exist.
_PROJECT_CONFIG_NB = str(pl.Path("00_project_config.ipynb").resolve(strict=True))

# Execute the project config notebook in this namespace.
# NOTE(review): get_nb_stamp, PROJECT_DATA_ROOT and TABLE_OUT_SUPPL are not
# defined in this cell - presumably they are provided by this %run; confirm.
%run $_PROJECT_CONFIG_NB

# Identifier of this notebook, passed to get_nb_stamp() to build its stamp.
_MYNAME="dump-qc-label-regions-1p"
_NBSTAMP=get_nb_stamp(_MYNAME)

# Value written into the "assembler" column and used in the output file name.
ASSEMBLER = "verkko"

# Folder that is searched for the "*subset-1p.regions.stats.tsv" input tables.
TABLE_FOLDER = PROJECT_DATA_ROOT.joinpath(
    "2024_merged_labels/summaries"
)


def normalize_column_name(name):
    """Translate a raw statistics column name into a readable table header.

    The identifier columns "sample" and "assembler" are returned unchanged.
    Every other name must contain "error_region" (reported as
    "flagged regions") or "clean_region" (reported as "clean regions"),
    followed by a recognized statistic:

    - "size_pct<N>": percentile of the region size; N is read from the
      trailing digits of the name, and the 50th percentile is reported
      as the median
    - "size_mean", "size_min", "size_max": mean / min / max region size
    - names ending in "num": number of regions
    - "_pct" / "_bp": total size in percent / in base pairs

    Args:
        name: Raw column name as found in the statistics table.

    Returns:
        The normalized header, formatted as "<prefix> <suffix>".

    Raises:
        ValueError: If the region type or the statistic cannot be
            identified, or a "size_pct" name does not end in digits.
    """
    if name in ("sample", "assembler"):
        return name

    if "error_region" in name:
        prefix = "flagged regions"
    elif "clean_region" in name:
        prefix = "clean regions"
    else:
        raise ValueError(name)

    if "size_pct" in name:
        # Read all trailing digits instead of a fixed two-character slice,
        # so that e.g. "pct5" and "pct100" are parsed correctly.
        trailing_digits = re.search(r"([0-9]+)$", name)
        if trailing_digits is None:
            raise ValueError(name)
        num = int(trailing_digits.group(1))
        if num == 50:
            suffix = "size median (bp)"
        else:
            # English ordinals: 11th-13th are irregular, otherwise the
            # last digit decides (1st, 2nd, 3rd, everything else "th").
            if 10 <= num % 100 <= 20:
                ordinal = "th"
            else:
                ordinal = {1: "st", 2: "nd", 3: "rd"}.get(num % 10, "th")
            suffix = f"size {num}{ordinal} %ile (bp)"
    elif "size_mean" in name:
        suffix = "size mean (bp)"
    elif name.endswith("num"):
        suffix = "count (n)"
    elif "size_min" in name or "size_max" in name:
        suffix = f"size {name.split('_')[-1]} (bp)"
    elif "_pct" in name:
        # Checked after "size_pct", hence this is the share of total bp.
        suffix = "size total (%bp)"
    elif "_bp" in name:
        suffix = "size total (bp)"
    else:
        raise ValueError(name)
    return f"{prefix} {suffix}"
            

# Load each matching region statistics table, print a few summary numbers
# and bring the table into its final layout (readable headers, integer
# counts and sizes).
for table_file in TABLE_FOLDER.glob("*subset-1p.regions.stats.tsv"):
    # Tables carrying "wg" in their file name are not processed here.
    if "wg" in table_file.name:
        continue
    print(table_file.name)

    df = pd.read_csv(table_file, sep="\t", header=0)
    # Keep only the part of the sample identifier before the first dot.
    df["sample"] = df["sample"].apply(lambda sample_id: sample_id.split(".", 1)[0])
    df.insert(1, "assembler", ASSEMBLER)

    # Share of clean bp among all (flagged + clean) bp, in percent.
    total_bp = df["error_regions_bp"] + df["clean_regions_bp"]
    pct_clean = (df["clean_regions_bp"] / total_bp * 100).round(1)
    df.insert(3, "clean_regions_pct", pct_clean)

    print("=== reported stats")
    print("median clean regions pct/total ", df["clean_regions_pct"].describe()["50%"])
    for title, stat_column in (
        ("max clean region size", "clean_region_size_max"),
        ("median clean region sizes", "clean_region_size_pct50"),
    ):
        print("---")
        print(title)
        print(df[stat_column].describe())
    print("===")

    # Standard deviation columns are not part of the final table.
    drop_cols = [c for c in df.columns if "stddev" in c]
    df.drop(columns=drop_cols, inplace=True)
    norm_columns = list(map(normalize_column_name, df.columns))
    df.columns = norm_columns

    # Counts are cast to int as they are; percentile, min, max, mean and
    # median sizes are rounded to whole numbers first.
    rounded_markers = ("%ile", "min", "max", "mean", "median")
    for column in norm_columns:
        if "count (n)" in column:
            df[column] = df[column].astype(int)
        if any(marker in column for marker in rounded_markers):
            df[column] = df[column].round(0).astype(int)


# Written after the loop: `df` is the table left over from the last
# processed file.
table_out = TABLE_OUT_SUPPL.joinpath(f"table_sxpe_{ASSEMBLER}-flagged-regions.tsv")
df.to_csv(table_out, sep="\t", header=True, index=False)


SAMPLES.verkko.merged-issues.ps-no-ont.subset-1p.regions.stats.tsv
=== reported stats
median clean regions pct/total  99.6
---
max clean region size
count    6.500000e+01
mean     2.210476e+07
std      8.594619e+06
min      7.779989e+06
25%      1.630436e+07
50%      2.118054e+07
75%      2.542078e+07
max      5.504328e+07
Name: clean_region_size_max, dtype: float64
---
median clean region sizes
count    6.500000e+01
mean     8.473282e+05
std      4.187525e+05
min      3.964800e+04
25%      4.764165e+05
50%      8.590960e+05
75%      1.092393e+06
max      1.820387e+06
Name: clean_region_size_pct50, dtype: float64
===
