In [1]:
import pathlib as pl
import datetime as dt
import io
import pandas as pd

# Root of the local repository checkout; every annotation path below is
# derived from it.
LOCAL_BASE = "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies"

# strict=True: fail immediately if the checkout does not exist
PROJECT_BASE = pl.Path(LOCAL_BASE).resolve(strict=True)

# Root folder for plots and supplementary tables
PROJECT_DATA_ROOT = pl.Path("/home/ebertp/work/projects/hgsvc").resolve()

PROJECT_NB_CACHE = (PROJECT_BASE / "notebooks" / ".cache").resolve()
PROJECT_NB_CACHE.mkdir(parents=True, exist_ok=True)

PLOT_ROOT = (PROJECT_DATA_ROOT / "plotting").resolve()
PLOT_ROOT.mkdir(parents=True, exist_ok=True)

# NOTE: the two figure folders are only defined here, not created
PLOT_OUT_MAIN_FIG1 = PLOT_ROOT / "2024_main_fig1"
PLOT_OUT_SUPPL_FIG = PLOT_ROOT / "2024_suppl_fig"

TABLE_OUT_SUPPL = PROJECT_DATA_ROOT / "2024_suppl_tables"
TABLE_OUT_SUPPL.mkdir(parents=True, exist_ok=True)

# One timestamp (minute resolution, local time) shared under three names
_ts = dt.datetime.now()
TIMESTAMP = _ts.strftime("%Y%m%dT%H%M")
TSNOW = TODAY = TIMESTAMP

# Load HGSVC sample table

HGSVC_SAMPLES_TABLE = PROJECT_BASE / "annotations" / "projectmng" / "hgsvc_samples.tsv"

HGSVC_SAMPLES = pd.read_csv(HGSVC_SAMPLES_TABLE, sep="\t", header=0, comment="#")
# normalize the categorical annotation columns to lower case
for _column in ("sex", "member"):
    HGSVC_SAMPLES[_column] = HGSVC_SAMPLES[_column].str.lower()
# flag samples whose family role marks them as offspring
HGSVC_SAMPLES["is_child"] = HGSVC_SAMPLES["member"].isin(["son", "daughter", "child"])
# use the "NA" spelling in sample IDs (every "GM" is replaced by "NA")
HGSVC_SAMPLES["sample"] = HGSVC_SAMPLES["sample"].str.replace("GM", "NA")

# Load the data production status table and derive a
# sample ID -> batch number lookup from it

DATA_PRODUCTION_TABLE = PROJECT_BASE / "annotations" / "projectmng" / "data_production_status.tsv"

DATA_PRODUCTION_STATUS = pd.read_csv(DATA_PRODUCTION_TABLE, comment="#", sep="\t", header=0)
HGSVC_SAMPLE_BATCH_NUMBERS = {}
for row in DATA_PRODUCTION_STATUS.itertuples():
    # only these batch labels are expected in the table
    assert row.sample_batch in [1,2,3,-1]
    # register the ID as listed plus its "GM" and "NA" spellings
    _aliases = (
        row.sample,
        row.sample.replace("NA", "GM"),
        row.sample.replace("GM", "NA"),
    )
    for _alias in _aliases:
        HGSVC_SAMPLE_BATCH_NUMBERS[_alias] = row.sample_batch

# The bound __getitem__ (rather than the dict itself) is passed to map() so
# that a sample without batch number raises KeyError instead of becoming NaN.
HGSVC_SAMPLES["batch_num"] = HGSVC_SAMPLES["sample"].map(HGSVC_SAMPLE_BATCH_NUMBERS.__getitem__)

# Expected cohort composition. Only samples with a positive batch number
# are counted; the assertions fail if the annotation tables do not add up
# to these numbers.
HGSVC_TOTAL = 65
HGSVC_ALL = HGSVC_SAMPLES.loc[HGSVC_SAMPLES["batch_num"] > 0, "sample"].values
assert HGSVC_ALL.size == HGSVC_TOTAL
HGSVC_MALE = 30
HGSVC_FEMALE = 35
HGSVC_MALES = []
HGSVC_FEMALES = []
for row in HGSVC_SAMPLES.itertuples():
    if row.batch_num < 1:
        # samples without a positive batch number are not counted
        continue
    if row.sex == "male":
        HGSVC_MALES.append(row.sample)
    elif row.sex == "female":
        HGSVC_FEMALES.append(row.sample)
    else:
        # A bare `raise` has no active exception here and would only produce
        # "RuntimeError: No active exception to reraise"; name the offending
        # sample instead.
        raise ValueError(
            f"Unexpected sex annotation for sample {row.sample}: {row.sex!r}"
        )
assert len(HGSVC_MALES) == HGSVC_MALE
assert len(HGSVC_FEMALES) == HGSVC_FEMALE

HGSVC_MALES = sorted(HGSVC_MALES)
HGSVC_FEMALES = sorted(HGSVC_FEMALES)

# Table with the reference coordinates of regions of interest (ROI);
# it is read on demand by load_ref_roi()

ROI_ANNOTATIONS = PROJECT_BASE / "annotations" / "roi" / "roi_ref_coords.tsv"

# adding population colors

COLOR_TABLE_FILE = PROJECT_BASE.joinpath(
    "annotations", "projectmng", "hgsvc3_pop_colors.tsv"
)
COLOR_TABLE = None
if COLOR_TABLE_FILE.is_file():
    # Only whole-line comments are dropped here; read_csv(comment="#") would
    # additionally truncate every field that contains a "#".
    # NOTE(review): presumably needed because of hex color codes in the
    # table - confirm against the file.
    with open(COLOR_TABLE_FILE) as table:
        _buffer = io.StringIO(
            "".join(line for line in table if not line.startswith("#"))
        )
    COLOR_TABLE = pd.read_csv(_buffer, sep="\t", header=0)

if COLOR_TABLE is not None:
    # color used whenever the table has no entry for a (super)population
    _default_rgb = "0,0,0"
    pop_colors = []
    super_colors = []
    # this: 1KGP color table knows "AMR|AFR" continental group;
    # set this for samples in case population codes match
    adapted_super_pop = []
    for row in HGSVC_SAMPLES.itertuples():
        this_pop = row.population
        pop_entry = COLOR_TABLE.loc[COLOR_TABLE["population"] == this_pop, :]
        if pop_entry.empty:
            # Every sample must contribute exactly one value to each list,
            # otherwise the column assignments below fail with a length
            # mismatch: keep the annotated supergroup, use default colors.
            print(f"No color table entry for population: {this_pop}")
            adapted_super_pop.append(row.supergroup)
            pop_colors.append(_default_rgb)
            super_colors.append(_default_rgb)
            continue
        pop_rgb = pop_entry["rgb_rel"].iloc[0]
        alt_superpop = pop_entry["continental_group"].iloc[0]
        # the continental group is looked up via the "population" column
        super_entry = COLOR_TABLE.loc[COLOR_TABLE["population"] == alt_superpop, "rgb_rel"]
        if super_entry.empty:
            print(f"No color table entry for continental group: {alt_superpop}")
            super_rgb = _default_rgb
        else:
            super_rgb = super_entry.iloc[0]
        adapted_super_pop.append(alt_superpop)
        pop_colors.append(pop_rgb)
        super_colors.append(super_rgb)
    HGSVC_SAMPLES["supergroup"] = adapted_super_pop
    HGSVC_SAMPLES["rgb_rel_pop"] = pop_colors
    HGSVC_SAMPLES["rgb_rel_super"] = super_colors


def process_data_freeze_table(table):
    """Condense the data freeze table to one row of read statistics per sample.

    Only rows whose "filename" is "all" are used. The two "length_N50_*"
    columns are divided by 1000 and rounded to one decimal
    (presumably bp -> kbp; confirm against the table producer).

    Args:
        table: DataFrame with the columns "sample", "filename", "datatype"
            and the four coverage / N50 statistics columns selected below.
            The input is not modified.

    Returns:
        DataFrame with the columns "sample", "hifi_cov", "hifi_n50",
        "ont_cov", "ont_n50", "ontul_cov" and "ontul_n50"; one row per
        sample, ordered by sample.

    Raises:
        IndexError: if a sample has no "hifi" or no "ont" row.
    """
    stat_columns = [
        "cov_xfold_grt_0bp_at_3Gbp", "length_N50_grt_0bp",
        "cov_xfold_grt_100kbp_at_3Gbp", "length_N50_grt_100kbp"
    ]
    keep_columns = ["sample", "filename", "datatype"] + stat_columns
    summary = table.loc[table["filename"] == "all", keep_columns].copy()
    for name in stat_columns:
        if name.startswith("length"):
            summary[name] = (summary[name] / 1000).round(1).astype(float)

    # (output column, read data type, source column)
    layout = [
        ("hifi_cov", "hifi", "cov_xfold_grt_0bp_at_3Gbp"),
        ("hifi_n50", "hifi", "length_N50_grt_0bp"),
        ("ont_cov", "ont", "cov_xfold_grt_0bp_at_3Gbp"),
        ("ont_n50", "ont", "length_N50_grt_0bp"),
        ("ontul_cov", "ont", "cov_xfold_grt_100kbp_at_3Gbp"),
        ("ontul_n50", "ont", "length_N50_grt_100kbp"),
    ]

    records = []
    for sample_name, sample_rows in summary.groupby("sample"):
        rows_by_type = {
            read_type: sample_rows.loc[sample_rows["datatype"] == read_type, :]
            for read_type in ("hifi", "ont")
        }
        record = {"sample": sample_name}
        for out_column, read_type, source_column in layout:
            # the first row per data type provides the value
            record[out_column] = rows_by_type[read_type][source_column].iloc[0]
        records.append(record)

    return pd.DataFrame.from_records(records)
    
    
# Optional: add per-sample read statistics from the data freeze table

DATA_SOURCE_FILE = (
    PROJECT_BASE / "annotations" / "data_freezes"
    / "hgsvc3_assembly_data_sources.draft.tsv"
)
DATA_FREEZE = None
if DATA_SOURCE_FILE.is_file():
    DATA_FREEZE = process_data_freeze_table(
        pd.read_csv(DATA_SOURCE_FILE, sep="\t", comment="#", header=0)
    )
    # inner join: samples missing from the data freeze are dropped
    # from HGSVC_SAMPLES
    HGSVC_SAMPLES = pd.merge(HGSVC_SAMPLES, DATA_FREEZE, on="sample", how="inner")


def _match_names(query, candidates):
    """Return the candidates that contain query, ignoring case.

    If exactly one candidate equals the query (ignoring case), only that
    candidate is returned, so a name that is a substring of other names
    can still be selected unambiguously.
    """
    needle = query.lower()
    matches = [name for name in candidates if needle in name.lower()]
    exact = [name for name in matches if name.lower() == needle]
    if len(exact) == 1:
        return exact
    return matches


def load_ref_roi(reference, roi_name):
    """Look up the coordinates of a region of interest (ROI) on a reference.

    The ROI table (ROI_ANNOTATIONS) is read on every call. Both arguments
    are matched case-insensitively as substrings of the known names; a
    unique exact match takes precedence over longer names containing it.

    Args:
        reference: (part of) a name listed in the "reference" column.
        roi_name: (part of) a name listed in the "name" column.

    Returns:
        Tuple (chrom, start, end) taken from the matching table row.

    Raises:
        ValueError: if either argument matches nothing or cannot be
            narrowed down to a single name.
        AssertionError: if the table does not hold exactly one row for the
            selected reference / ROI combination.
    """
    roi_table = pd.read_csv(ROI_ANNOTATIONS, sep="\t", header=0, comment="#")

    match_ref = _match_names(reference, roi_table["reference"].unique())
    match_roi = _match_names(roi_name, roi_table["name"].unique())
    if not match_ref or not match_roi:
        raise ValueError(f"Cannot match ref or ROI: {reference} / {roi_name}")
    if len(match_ref) > 1 or len(match_roi) > 1:
        raise ValueError(f"Ambiguous reference or ROI: {reference} / {roi_name}")
    select_ref = match_ref[0]
    select_roi = match_roi[0]

    selector = (roi_table["reference"] == select_ref) & (roi_table["name"] == select_roi)
    row = roi_table.loc[selector, :]
    assert row.shape[0] == 1, (
        f"Expected one table row for {select_ref} / {select_roi}, found {row.shape[0]}"
    )
    return row["chrom"].values[0], row["start"].values[0], row["end"].values[0]


def get_nb_stamp(nb_name):
    """Build a "<notebook path>|<timestamp>" stamp for a project notebook.

    All "*.ipynb" files below PROJECT_BASE/notebooks are searched
    recursively for a file name containing nb_name; files whose parent
    folder name contains "checkpoint" are ignored.

    Args:
        nb_name: (part of) the notebook file name.

    Returns:
        The notebook path relative to PROJECT_BASE, joined by "|" with TODAY.

    Raises:
        AssertionError: unless exactly one notebook matches.
    """
    notebook_root = PROJECT_BASE.joinpath("notebooks")
    this_nb = []
    for candidate in notebook_root.glob("**/*.ipynb"):
        if "checkpoint" in candidate.parent.name:
            continue
        if nb_name in candidate.name:
            this_nb.append(candidate)
    assert len(this_nb) == 1, f"Missing: {nb_name} /// {this_nb}"
    relative_path = this_nb[0].relative_to(PROJECT_BASE)
    return f"{relative_path}|{TODAY}"


def get_samples_ordered_by_pop():
    """List all samples as (supergroup, sample) tuples in sorted order.

    HGSVC_SAMPLES is sorted in ascending order by supergroup, then by
    sample; the module-level table itself is left untouched.

    NOTE(review): the length check covers every row of HGSVC_SAMPLES, not
    only those with batch_num > 0 - confirm that the table is expected to
    hold exactly HGSVC_TOTAL rows when this function is called.

    Returns:
        List of (supergroup, sample) tuples.

    Raises:
        AssertionError: if the number of samples differs from HGSVC_TOTAL.
    """
    ordered = HGSVC_SAMPLES.sort_values(by=["supergroup", "sample"])
    samples_pop = list(zip(ordered["supergroup"], ordered["sample"]))
    assert len(samples_pop) == HGSVC_TOTAL
    return samples_pop
