In [1]:
%run "../00_project_config.ipynb"

import pathlib as pl
import pandas as pd
import collections as col

# Local mount point of the project file system; database paths discovered
# below it get this part replaced by `remote_prefix` before they are recorded.
local_mount = pl.Path("/mounts/hilbert/project")

# Prefix that substitutes the local mount point in recorded database paths.
remote_prefix = "/gpfs/project"

# Folder that is searched for the trio k-mer databases ("*meryl.tar.gz").
kmdb_folders = (
    local_mount
    / "projects/medbioinf/projects/assemblies/hybrids/kmer_db/wd/results/databases/trios"
)

# Sample sheet that is read and filtered for samples with k-mer databases.
sample_sheet_all_input = pl.Path("../../samples/vrk_unps_all_samples.tsv")

# Destination of the generated trio sample sheet.
sample_sheet_all_output = pl.Path("../../samples/special_runs/vrk_trio_samples.tsv")

def create_sseq_phasing_sample_sheet(kmer_dbs, sample_sheet_file_input, sample_sheet_file_output):
    """Write the trio sample sheet for all samples that have k-mer databases.

    The tab-separated input sheet is read ("#" starts a comment), reduced to
    the rows whose "sample" value is a key of `kmer_dbs`, extended by the
    columns "target" (always "trio"), "hap1" and "hap2", sorted by sample and
    written as TSV below a block of "#"-prefixed header lines.

    Args:
        kmer_dbs: Mapping of sample name to a mapping that provides the
            entries "hap1" and "hap2" for that sample.
        sample_sheet_file_input: Path of the sample sheet to read; the columns
            "sample", "hifi" and "ont" are required.
        sample_sheet_file_output: Path of the TSV file to (over)write.

    Returns:
        None

    Raises:
        KeyError: If a required column is missing from the input sheet or a
            selected sample has no "hap1"/"hap2" entry in `kmer_dbs`.

    Note:
        The first header line uses the module-level name TODAY, which is not
        defined in this block (presumably provided by the project config that
        is run at the top of the notebook - TODO confirm).
    """
    output_columns = ["sample", "target", "hap1", "hap2", "hifi", "ont"]

    all_samples = pd.read_csv(
        sample_sheet_file_input,
        sep="\t",
        header=0,
        comment="#",
    )

    # Iterating a mapping yields its keys, so this keeps exactly the samples
    # for which databases are known.
    has_databases = all_samples["sample"].isin(kmer_dbs)
    trio_samples = all_samples.loc[has_databases, :].copy()

    trio_samples["target"] = "trio"
    for hap in ("hap1", "hap2"):
        trio_samples[hap] = trio_samples["sample"].apply(
            lambda name, hap=hap: kmer_dbs[name][hap]
        )

    trio_samples = trio_samples[output_columns].sort_values("sample")

    remark = (
        "# - follow hifiasm CLI convention that\n"
        "# hap1 is paternal\n"
        "# and\n"
        "# hap2 is maternal\n"
        "# HG00733R - only Revio hifi data\n"
        "# HG00514 and NA19240 topped up with additional hifi Revio data\n"
    )

    with open(sample_sheet_file_output, "w") as tsv:
        # Commented header first: date, number of samples, fixed remarks.
        tsv.writelines(
            [
                f"# {TODAY}\n",
                f"# {trio_samples.shape[0]} samples\n",
                remark,
            ]
        )
        trio_samples.to_csv(tsv, sep="\t", header=True, index=False)
    return None


def load_kmer_databases():
    """Collect the parent-specific k-mer databases per sample.

    Scans the module-level folder `kmdb_folders` for files matching
    "*meryl.tar.gz" whose name does not contain "parental". Each remaining
    file is assigned to a haplotype by its name ("paternal" -> "hap1",
    "maternal" -> "hap2") and to a sample:

    * names containing "NA24385" are stored under the sample "NA24385",
    * other names containing "HXT" are skipped,
    * otherwise the sample is the part of the file name before the first "-".

    Databases of "HG00733" are additionally registered under "HG00733R".

    Returns:
        collections.defaultdict: Maps sample name to a dict with the keys
        "hap1" and/or "hap2"; the values are the database paths as strings
        with the leading `local_mount` replaced by `remote_prefix`.

    Raises:
        ValueError: If a database file name contains neither "paternal" nor
            "maternal".
    """
    collect_dbs = col.defaultdict(dict)
    for db in kmdb_folders.glob("*meryl.tar.gz"):
        if "parental" in db.name:
            continue

        hap = None
        if "paternal" in db.name:
            hap = "hap1"
        # Checked second on purpose: a name containing both markers is
        # assigned to hap2.
        if "maternal" in db.name:
            hap = "hap2"
        if hap is None:
            # Explicit raise instead of `assert`: the check must not vanish
            # when Python runs with -O, and the message names the offender.
            raise ValueError(
                f"Cannot assign k-mer database to a haplotype "
                f"(neither 'paternal' nor 'maternal' in file name): {db}"
            )

        # Only the leading mount point is translated; later occurrences of
        # the same string inside the path are left alone.
        remote_path = str(db).replace(str(local_mount), remote_prefix, 1)

        if "NA24385" in db.name:
            collect_dbs["NA24385"][hap] = remote_path
        elif "HXT" in db.name:
            continue
        else:
            sample = db.name.split("-")[0]
            collect_dbs[sample][hap] = remote_path
            if sample == "HG00733":
                # HG00733R shares the databases of HG00733.
                collect_dbs[f"{sample}R"][hap] = remote_path
    return collect_dbs

# Build the per-sample database lookup, then write the trio sample sheet.
kmdbs = load_kmer_databases()
create_sseq_phasing_sample_sheet(
    kmer_dbs=kmdbs,
    sample_sheet_file_input=sample_sheet_all_input,
    sample_sheet_file_output=sample_sheet_all_output,
)

