In [2]:
%run "../00_project_config.ipynb"

import pathlib as pl
import collections as col
import pandas as pd
import os

# Mount point used for all file discovery in this notebook; it must exist.
local_mount = pl.Path("/mounts/hilbert/project")
assert local_mount.is_dir()

# replace_path() substitutes the local mount by this prefix in reported paths.
remote_prefix = pl.Path("/gpfs/project")

# Folders (relative to the local mount) that are searched for *.fasta.gz files.
top_paths = [
    pl.Path(
        "projects/medbioinf/data/00_RESTRUCTURE/project-centric/hgsvc/processed/20240106_hifiasm_ont_batch1"
    ),
    pl.Path(
        "projects/medbioinf/data/00_RESTRUCTURE/project-centric/hgsvc/processed/20240318_hifiasm_ont_batch2"
    ),
]

# Folder that find_skip_seqs_file() checks for "<sample>.drop-dup-seqs.txt".
skip_seqs_path = (
    local_mount
    / "projects/medbioinf/data/00_RESTRUCTURE/shares/globus/outgoing"
    / "hgsvc/ebi_upload/20230926_assembly_annotations/hifiasm/20240523_drop-dup-seqs"
)

# Output sample sheet and the existing sample sheet read as input below;
# both live in the "samples" folder under PROJECT_BASE.
_samples_dir = PROJECT_BASE.joinpath("samples")
sample_sheet_out = _samples_dir.joinpath("hsm_eval_hgsvc3_samples.tsv")
unphased_sample_sheet = _samples_dir.joinpath("vrk_unps_all_samples.tsv")

# Tab-separated table with a header row; lines starting with "#" are skipped.
unphased_samples = pd.read_csv(
    unphased_sample_sheet,
    sep="\t",
    comment="#",
    header=0,
)

def replace_path(local_path):
    """Return ``local_path`` as a string with the local mount swapped out.

    Every occurrence of ``str(local_mount)`` in the string form of
    ``local_path`` is substituted by ``str(remote_prefix)``. A path that
    does not contain the local mount is returned unchanged (as a string).
    """
    return str(local_path).replace(str(local_mount), str(remote_prefix))

def is_empty(file_path):
    """Return True if the file at ``file_path`` has a size of zero bytes."""
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes == 0


def find_skip_seqs_file(sample_name):
    """Return the remote path of the sample's drop-dup-seqs file, if present.

    Looks for ``<sample_name>.drop-dup-seqs.txt`` inside ``skip_seqs_path``.
    If that file exists, its path is passed through ``replace_path`` and
    returned; otherwise an empty string is returned.
    """
    candidate = skip_seqs_path / f"{sample_name}.drop-dup-seqs.txt"
    if not candidate.is_file():
        return ""
    return replace_path(candidate)


# files_by_sample: one record per suffixed sample name holding the remote
# paths of both haplotype assemblies plus the sample metadata.
# file_count: number of FASTA files encountered per suffixed sample name.
files_by_sample = col.defaultdict(dict)
file_count = col.Counter()

for top_path in top_paths:
    search_path = local_mount.joinpath(top_path)
    assert search_path.is_dir()
    for fasta_file in search_path.glob("**/*.fasta.gz"):
        assert "hap1" in fasta_file.name or "hap2" in fasta_file.name
        # The sample ID is the name of the folder containing the FASTA
        # file, with "GM" rewritten to "NA".
        sample = fasta_file.parent.name
        sample = sample.replace("GM", "NA")
        # Only the part before the first "." is matched against HGSVC_SAMPLES.
        plain = sample.split(".")[0]
        try:
            sex = HGSVC_SAMPLES.loc[HGSVC_SAMPLES["sample"] == plain, "sex"].values[0]
        except IndexError as err:
            # no row in HGSVC_SAMPLES matches this sample
            raise ValueError(sample) from err
        if sample in ["NA19320"]:
            continue
        sample_name = sample + ".hsm-ps-sseq"
        # Select the sample's row(s) once and reuse the mask for all columns;
        # .values[0] raises IndexError if the sample is not listed.
        is_sample = unphased_samples["sample"] == sample
        sample_hifi = unphased_samples.loc[is_sample, "hifi"].values[0]
        sample_ont = unphased_samples.loc[is_sample, "ont"].values[0]
        batch_num = unphased_samples.loc[is_sample, "batch"].values[0]
        if "hap1" in fasta_file.name:
            asm_unit = "asm_hap1"
        elif "hap2" in fasta_file.name:
            asm_unit = "asm_hap2"
        else:
            # Only reachable if the assert above is disabled (python -O);
            # a bare `raise` here would fail with an unrelated RuntimeError.
            raise ValueError(
                f"Cannot determine haplotype of assembly file: {fasta_file}"
            )
        remote_full_path = replace_path(fasta_file)
        file_count[sample_name] += 1
        files_by_sample[sample_name][asm_unit] = remote_full_path
        files_by_sample[sample_name]["sex"] = sex
        files_by_sample[sample_name]["reads_hifi"] = sample_hifi
        files_by_sample[sample_name]["reads_ont"] = sample_ont
        files_by_sample[sample_name]["verkko_batch"] = batch_num
        files_by_sample[sample_name]["skip_seqs"] = find_skip_seqs_file(sample_name)

# Build one single-row table per sample, then stack them into the sample sheet.
sample_rows = []
for sample, num_files in file_count.most_common():
    # Exactly two FASTA files (hap1 and hap2) are expected per sample; report
    # the offending sample instead of failing with an anonymous assertion.
    if num_files != 2:
        raise ValueError(
            f"Expected 2 assembly files for sample {sample}, found {num_files}"
        )
    row = pd.DataFrame.from_records(
        files_by_sample[sample], index=[sample])
    sample_rows.append(row)

sample_sheet = pd.concat(sample_rows, axis=0, ignore_index=False)
sample_sheet.sort_index(inplace=True)

# Fix the column order of the output table.
sample_sheet = sample_sheet[
    [
        "verkko_batch", "sex",
        "asm_hap1", "asm_hap2",
        "reads_hifi", "reads_ont",
        "skip_seqs"
    ]
]

# The TSV starts with two "#" comment lines (TODAY and the number of samples),
# followed by the table with the sample name as first column.
with open(sample_sheet_out, "w") as dump:
    _ = dump.write(f"# {TODAY}\n")
    _ = dump.write(f"# {sample_sheet.shape[0]} samples\n")
    sample_sheet.to_csv(dump, sep="\t", header=True, index=True, index_label="sample")