In [10]:
import pathlib as pl
import pandas as pd
import collections as col

MOUNT_PREFIX = pl.Path("/mounts/hilbert/project")

REMOTE_PREFIX = "/gpfs/project"

data_path = pl.Path(
    "projects/medbioinf/data/00_RESTRUCTURE/project-centric/ceph_pedigree/2024-01/assemblies/males"
)

sample_infos = col.defaultdict(dict)

def make_remote(file_path):
    
    file_path = str(file_path)
    file_path = file_path.replace(str(MOUNT_PREFIX), REMOTE_PREFIX)
    return file_path


for fasta_file in MOUNT_PREFIX.joinpath(data_path).glob("**/*.fasta"):
    filename = fasta_file.name
    sample_id = fasta_file.parents[1].name
    if not sample_id.startswith("NA"):
        sample_id = fasta_file.parents[2].name
    assert sample_id.startswith("NA") or sample_id.startswith("200"), sample_id
    sample_id = f"{sample_id}-CEPH"
    if "hap1" in filename or "haplotype1" in filename:
        asm_unit = "hap1"
    elif "hap2" in filename or "haplotype2" in filename:
        asm_unit = "hap2"
    elif "unassigned" in filename:
        asm_unit = "unassigned"
    else:
        print(f"skipping over {fasta_file}")
        continue
    sample_infos[sample_id][asm_unit] = make_remote(fasta_file)
    
sample_sheet = [(k, d["hap1"], d["hap2"], d.get("unassigned", "")) for k, d in sample_infos.items()]

sample_sheet = pd.DataFrame.from_records(
    sample_sheet,
    columns=["sample", "asm_hap1", "asm_hap2", "asm_unassigned"]
)
sample_sheet.sort_values("sample", inplace=True)
sample_sheet["family"] = 1463
sample_sheet["sex"] = "male"
out_tsv_vrk = pl.Path("../../samples/special_runs/cephped_vrk_all.tsv").resolve()
out_tsv_hsm = pl.Path("../../samples/special_runs/cephped_hsm_all.tsv").resolve()
print(out_tsv)

sub_vrk = sample_sheet.loc[sample_sheet["asm_unassigned"] != "", :].copy()
sub_hsm = sample_sheet.loc[sample_sheet["asm_unassigned"] == "", :].copy()
sub_hsm.drop("asm_unassigned", axis=1, inplace=True)

with open(out_tsv_vrk, "w") as dump:
    _ = dump.write("# 20240502T1417\n")
    sub_vrk.to_csv(dump, sep="\t", header=True, index=False)

with open(out_tsv_hsm, "w") as dump:
    _ = dump.write("# 20240502T1417\n")
    sub_hsm.to_csv(dump, sep="\t", header=True, index=False)


/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/samples/special_runs/cephped_all.tsv
