In [38]:
import pathlib as pl
import pandas as pd
import re

# Opt in to the pandas "future" behavior of not silently downcasting dtypes.
pd.set_option('future.no_silent_downcasting', True)

# IPython magic: change (quietly, -q) into the notebooks folder so that the
# relative notebook path below can be resolved.
%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"
# strict=True makes resolve() fail right here if the config notebook is missing.
_PROJECT_CONFIG_NB = str(pl.Path("00_project_config.ipynb").resolve(strict=True))

# IPython magic: execute the project config notebook.
# NOTE(review): get_nb_stamp, PROJECT_DATA_ROOT and TABLE_OUT_SUPPL are not
# defined in this cell and presumably come from that notebook - confirm.
%run $_PROJECT_CONFIG_NB

# Name of this notebook/task and the stamp derived from it by get_nb_stamp().
_MYNAME="dump-label-associations-tables"
_NBSTAMP=get_nb_stamp(_MYNAME)

# Only table files whose name contains this assembler string are processed;
# the value is also written to the "assembler" column and the output file names.
ASSEMBLER = "verkko"

# Folder that is searched for the "*assoc-tests.tsv" input tables.
TABLE_FOLDER = PROJECT_DATA_ROOT.joinpath(
    "2024_merged_labels/summaries"
)

# Lookup from the six-letter error label codes to a human-readable
# description; used to add a plain-text label column to the output table.
label_to_plain_text = dict(
    BSCDUP="BUSCO single-copy gene (duplicated)",
    BSCFRG="BUSCO single-copy gene (fragmented)",
    DEEPVR="DeepVariant variant call",
    FLGCOL="Flagger collapsed region",
    FLGERR="Flagger error",
    FLGDUP="Flagger false duplication",
    FLGUNK="Flagger unknown issue",
    ISPCHF="Inspector flagged region (HiFi)",
    MRQURY="Merqury unsupported k-mers",
    NUCFRQ="NucFreq flagged region",
    SSQBRK="Strand-seq breakpoint (switch error)",
)

# Locate the association-test table for the selected assembler.
# Everything after this block works on a single `df`, so exactly one input
# table must match. Looping and rebinding `df` would silently keep only the
# last match (in arbitrary glob order), or leave `df` undefined/stale when
# nothing matches - hence the explicit checks.
matching_tables = sorted(
    candidate
    for candidate in TABLE_FOLDER.glob("*assoc-tests.tsv")
    if ASSEMBLER in candidate.name and ".ps-no-ont." in candidate.name
)
if not matching_tables:
    raise FileNotFoundError(
        f"No '*assoc-tests.tsv' table for assembler '{ASSEMBLER}' "
        f"(with '.ps-no-ont.' in its name) found in: {TABLE_FOLDER}"
    )
if len(matching_tables) > 1:
    raise ValueError(
        f"Expected exactly one association table for assembler '{ASSEMBLER}', "
        f"found {len(matching_tables)}: {[p.name for p in matching_tables]}"
    )
table_file = matching_tables[0]

df = pd.read_csv(
    table_file, sep="\t", header=0,
    # Read the p-value columns as strings so they are not parsed as floats
    # and are written out again exactly as they appear in the input.
    dtype={
        "fet_pvalue": str,
        "adj_pvalue": str,
    },
)
# Labels without an entry in the lookup are kept unchanged by replace().
plain_text_labels = df["error_label"].replace(label_to_plain_text, inplace=False)
# Keep only the part of the sample name before the first dot.
df["sample"] = df["sample"].str.split(".", n=1).str[0]
# Insert order matters: "assembler" goes to position 1 first, then the
# description column to position 3 of the already widened table.
df.insert(1, "assembler", ASSEMBLER)
df.insert(3, "label_plain_text", plain_text_labels)

# Summarize the significant tests: for every combination of error label,
# label description and annotation, count the rows marked "sig." and take
# the median of their odds ratios.
group_columns = ["error_label", "label_plain_text", "annotation"]
sub = df[df["sig_label"] == "sig."]
sig_groups = sub.groupby(group_columns)

common_assoc_size = sig_groups.size()
common_assoc_or = sig_groups["odds_ratio"].median()

common_assoc = (
    pd.concat([common_assoc_size, common_assoc_or], axis=1, ignore_index=False)
    # name the two value columns (count first, median second)
    .set_axis(["significant associations (n)", "odds ratio (median)"], axis=1)
    # turn the grouping keys back into regular columns
    .reset_index(drop=False)
    .rename(
        columns={
            "error_label": "error flag label",
            "label_plain_text": "label (description)",
        }
    )
)


# Replace the internal column names of the full test table with the
# human-readable headers used in the exported table.
output_column_names = {
    "error_label": "error flag label",
    "label_plain_text": "label (description)",
    "odds_ratio": "odds ratio (OR)",
    "or_ci_low": "OR 95% CI (lower)",
    "or_ci_high": "OR 95% CI (upper)",
    "fet_pvalue": "FET p-value (raw)",
    "adj_pvalue": "FET p-value (BY-adjusted)",
    "sig_label": "adj. p-value < alpha",
}
df.rename(columns=output_column_names, inplace=True)

# Both tables are written as tab-separated text with a header row and
# without the row index.
tsv_options = {"sep": "\t", "header": True, "index": False}

# Output locations: one file for all tests, one for the summary of the
# significant associations.
table_sxpe_all_tests = TABLE_OUT_SUPPL.joinpath(
    f"table_sxpe_{ASSEMBLER}-qc-label-assoc.tsv"
)
table_sxpe_sig_assoc = TABLE_OUT_SUPPL.joinpath(
    f"table_sxpe_{ASSEMBLER}-sig-assoc.tsv"
)

df.to_csv(table_sxpe_all_tests, **tsv_options)
common_assoc.to_csv(table_sxpe_sig_assoc, **tsv_options)