In [5]:
import pandas as pd
import pathlib
import math
import numpy as np
import collections as col
import pickle as pck

# Switch (quietly) to the notebooks folder so that the relative notebook
# names below can be resolved.
%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"

# Resolve both helper notebooks to absolute paths; strict=True raises
# immediately if one of them does not exist.
_PROJECT_CONFIG_NB = str(pathlib.Path("00_project_config.ipynb").resolve(strict=True))
_ASSM_STATS_NB = str(pathlib.Path("10_assm_stats.ipynb").resolve(strict=True))

# Execute the helper notebooks.
# NOTE(review): presumably these define the PROJECT_* / HGSVC_* constants and
# the *_ASSM_STATS tables used by the functions below - confirm.
%run $_PROJECT_CONFIG_NB
%run $_ASSM_STATS_NB

def get_assm_stat(assembler, statistic):
    """Print the median of one assembly statistic across all haplotypes.

    The statistic is looked up for "hap1" and "hap2" of every sample in
    ``HGSVC_FEMALES + HGSVC_MALES`` in the statistics table that belongs
    to the selected assembler; the median over all values is printed.

    Args:
        assembler: Either "verkko" or "hifiasm".
        statistic: Name of the statistic; it is combined with the
            haplotype into the lookup key ``(hap, statistic)``.

    Returns:
        None. The result is only printed.

    Raises:
        ValueError: If ``assembler`` is not one of the supported names.
    """
    if assembler == "verkko":
        table = VRK_ASSM_STATS
    elif assembler == "hifiasm":
        table = HSM_ASSM_STATS
    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # on ``table`` further down.
        raise ValueError(
            f"unknown assembler: {assembler!r} (expected 'verkko' or 'hifiasm')"
        )

    # NOTE(review): the third get_stat() argument (1e6) presumably scales
    # the value to Mbp, matching the unit in the printed message - confirm
    # against the get_stat() implementation.
    values = [
        table.get_stat(sample, (hap, statistic), int(1e6))
        for sample in HGSVC_FEMALES + HGSVC_MALES
        for hap in ("hap1", "hap2")
    ]
    # dtype=int truncates fractional values before the median is taken;
    # kept as in the original so the reported numbers do not change.
    median = np.median(np.array(values, dtype=int))
    print(f"{assembler} median {statistic}: {median} Mbp")

# Report the median "aun" statistic for both assemblers.
get_assm_stat("verkko", "aun")
get_assm_stat("hifiasm", "aun")

def get_qv_stat(assembler):
    """Print the median variant-based and Merqury QV estimates.

    Reads ``<PROJECT_BASE>/annotations/autogen/<assembler>_qv-est.tsv``
    (tab-separated, lines starting with "#" are ignored), keeps the rows
    whose "asm_unit" column equals "phased" and prints the median of the
    "variant_qv_est" and "merqury_qv_est" columns, rounded to zero
    decimals.

    Args:
        assembler: Assembler name used as the prefix of the table file
            name (the notebook calls this with "verkko" and "hifiasm").

    Returns:
        None. The results are only printed.

    Raises:
        AssertionError: If the number of "phased" rows differs from
            ``HGSVC_TOTAL``.
    """
    table_file = PROJECT_BASE.joinpath(
        "annotations", "autogen", f"{assembler}_qv-est.tsv"
    )
    table = pd.read_csv(table_file, sep="\t", header=0, comment="#")

    # The selection is only read, never modified, so no defensive copy
    # is needed.
    phased_units = table.loc[table["asm_unit"] == "phased", :]
    # Sanity check: the table must contain exactly HGSVC_TOTAL phased rows.
    assert phased_units.shape[0] == HGSVC_TOTAL, (
        f"{table_file}: expected {HGSVC_TOTAL} phased rows, "
        f"found {phased_units.shape[0]}"
    )

    var_qv = round(phased_units["variant_qv_est"].median(), 0)
    merq_qv = round(phased_units["merqury_qv_est"].median(), 0)
    print(f"{assembler} median variant QV: {var_qv}")
    print(f"{assembler} median Merqury QV: {merq_qv}")

# Report the median QV estimates for both assemblers.
get_qv_stat("verkko")
get_qv_stat("hifiasm")

def compute_flagger_stats(assembler):
    """Print the median percentage of phased sequence labeled "Hap".

    For every ``*.bed`` file in the assembler's flagger folder, the
    labeled intervals are summed per sample, assembly unit and label and
    expressed as a percentage of the total sequence length of that
    sample's assembly unit. Sequence lengths come from the pickled cache
    ``cache.seqlens.<assembler>.pck`` in ``PROJECT_NB_CACHE``. The median
    percentage over all hap1/hap2 entries is printed for the label "Hap".

    Args:
        assembler: Either "verkko" or "hifiasm".

    Returns:
        None. The result is only printed.

    Raises:
        ValueError: If ``assembler`` is not supported, if a contig name
            cannot be assigned to an assembly unit, or if the flagger
            folder contains no BED files.
        KeyError: If a sequence is missing from the length cache.
    """
    suffix_by_assembler = {"verkko": ".vrk-ps-sseq", "hifiasm": ".hsm-ps-sseq"}
    if assembler not in suffix_by_assembler:
        # Fail before any file access instead of hitting an unbound
        # ``assm_suffix`` inside the loop.
        raise ValueError(
            f"unknown assembler: {assembler!r} (expected 'verkko' or 'hifiasm')"
        )
    assm_suffix = suffix_by_assembler[assembler]

    def assign_au(contig):
        """Map a contig name to "hap1", "hap2" or "unassigned"."""
        if contig.startswith(("h1tg", "haplotype1")):
            return "hap1"
        if contig.startswith(("h2tg", "haplotype2")):
            return "hap2"
        if "unassigned" in contig:
            return "unassigned"
        # A bare ``raise`` without an active exception only yields an
        # uninformative RuntimeError; name the offending contig instead.
        raise ValueError(f"cannot assign an assembly unit to contig: {contig!r}")

    cache_seqlen_file = PROJECT_NB_CACHE.joinpath(f"cache.seqlens.{assembler}.pck")
    # NOTE: pickle executes arbitrary code on load; this must only ever
    # read the project's own cache file, never untrusted data.
    with open(cache_seqlen_file, "rb") as cache:
        seqlens = pck.load(cache)["data"]

    folder = pathlib.Path(
        f"/home/ebertp/work/share/hgsvc3/assembly_annotations/flagger/{assembler}"
    )

    full_output = []
    agg_lengths = dict()
    agg_pct = col.defaultdict(list)
    for bed_file in folder.glob("*.bed"):
        # skiprows=1 drops the first line of each file
        # (presumably a track/header line - TODO confirm).
        df = pd.read_csv(bed_file, sep="\t", skiprows=1, header=None, usecols=[0,1,2,3])
        # The sample name is the part of the file name before the first dot.
        sample = bed_file.name.split(".")[0]
        assembly = sample + assm_suffix
        df.columns = ["seq", "start", "end", "label"]
        df["sample"] = sample
        df["asm_unit"] = df["seq"].apply(assign_au)
        df["assembly"] = assembly
        # NOTE(review): the flag is derived from the interval END
        # coordinate, not from the interval length; it is not used in the
        # statistics below - confirm the intent before relying on it.
        df["is_large"] = (df["end"] > int(1e6) - 1).astype(int)
        df["length"] = df["end"] - df["start"]
        # Cache keys have the form (assembly, None, sequence name).
        df["seq_length"] = [seqlens[(assembly, None, seq)] for seq in df["seq"]]
        for au, seqs in df.groupby("asm_unit"):
            # Count each sequence once when summing the unit's total length.
            unique_seqs = seqs.drop_duplicates("seq")
            agg_lengths[(sample, au)] = int(unique_seqs["seq_length"].sum())
        full_output.append(df)

    if not full_output:
        raise ValueError(f"no flagger BED files found in {folder}")
    full_output = pd.concat(full_output, axis=0, ignore_index=False)

    by_au = full_output.groupby(["sample", "asm_unit", "label"])["length"].sum()
    for (sample, au, label), value in by_au.items():
        sample_au_len = agg_lengths[(sample, au)]
        pct_len = round(value/sample_au_len * 100, 2)
        agg_pct[(au, label)].append(pct_len)
        if au != "unassigned":
            # hap1 and hap2 are additionally pooled under "phased".
            agg_pct[("phased", label)].append(pct_len)

    for label in ["Hap"]:
        median_pct = round(np.median(agg_pct[("phased", label)]), 1)
        print(f"{assembler} median {label}: {median_pct}%")

    return

# Report the flagger summary; only the verkko call is currently enabled.
compute_flagger_stats("verkko")
#compute_flagger_stats("hifiasm")

def compute_mutual_support(assembler):
    """Print the median "supported_pct" per precision level.

    Two summary tables exist: "hsm-to-vrk" and "vrk-to-hsm". For
    "verkko" only the first one is processed, for "hifiasm" only the
    second one; any other value processes both. For every processed
    table the function prints the second dot-separated token of the file
    name, then, for each group of the "precision" column, the group key
    and the median of "supported_pct", followed by a "===" separator.

    Args:
        assembler: "verkko" or "hifiasm" to select one table.

    Returns:
        None. The results are only printed.
    """
    # ``pathlib`` is the module name imported in this notebook cell; the
    # previous ``pl`` alias is not imported here.
    summary_files = [
        pathlib.Path(
            "/home/ebertp/work/projects/hgsvc/2024_asm_compare/stats/hsm-to-vrk/summary",
            "SAMPLES.hsm-to-vrk.mapq-1.seq-100k.aln-10k.merged-regions.tsv"
        ),
        pathlib.Path(
            "/home/ebertp/work/projects/hgsvc/2024_asm_compare/stats/vrk-to-hsm/summary",
            "SAMPLES.vrk-to-hsm.mapq-1.seq-100k.aln-10k.merged-regions.tsv"
        )
    ]

    if assembler == "verkko":
        summary_files = summary_files[:1]
    if assembler == "hifiasm":
        summary_files = summary_files[1:]

    for summary_file in summary_files:
        df = pd.read_csv(summary_file, sep="\t", header=0)
        # E.g. "hsm-to-vrk" for the first table.
        qry_trg = summary_file.name.split(".")[1]
        print(qry_trg)

        for prec, stats in df.groupby("precision"):
            print(prec)
            print(stats["supported_pct"].median(), "%")
        print("===")
        
#compute_mutual_support("verkko")
#compute_mutual_support("hifiasm")

verkko median aun: 137.0 Mbp
hifiasm median aun: 95.0 Mbp
verkko median variant QV: 54.0
verkko median Merqury QV: 57.0
hifiasm median variant QV: 53.0
hifiasm median Merqury QV: 58.0
verkko median Hap: 99.7%
