In [6]:
%run "../00_project_config.ipynb"

import pathlib as pl
import collections as col
import pandas as pd
import os

# Local mount point under which the remote project file system is visible.
local_mount = pl.Path("/mounts/hilbert/project")
# Explicit check instead of a bare assert: an assert carries no message
# and is removed entirely when Python runs with -O.
if not local_mount.is_dir():
    raise NotADirectoryError(
        f"Local mount point is not an existing directory: {local_mount}"
    )

# Prefix that replaces the local mount point in all paths that are
# written to the sample sheets (see replace_path).
remote_prefix = pl.Path("/gpfs/project")

# Location of the assembly FASTA files, relative to the mount point.
top_path = pl.Path(
    "projects/medbioinf/projects/assemblies/hybrids/verkko/wd/results/assemblies/verkko"
)

# Output: sample sheet listing all collected samples.
sample_sheet_out_all = PROJECT_BASE.joinpath(
    "samples", "vrk_eval_all_samples.tsv"
)

# Output: sample sheet restricted to rows with sample_set == "hgsvc3".
sample_sheet_out_hgsvc3 = PROJECT_BASE.joinpath(
    "samples", "vrk_eval_hgsvc3_samples.tsv"
)


# this is just to look up the read fofn files
# (columns "sample", "hifi" and "ont" are read from this table below)
unphased_sample_sheet = PROJECT_BASE.joinpath(
    "samples", "vrk_unps_all_samples.tsv"
)
unphased_samples = pd.read_csv(unphased_sample_sheet, sep="\t", header=0, comment="#")


def replace_path(local_path):
    """Translate a path below the local mount into its remote counterpart.

    The string form of ``local_path`` is taken and every occurrence of
    the string form of the module-level ``local_mount`` is substituted
    with the string form of the module-level ``remote_prefix``.

    Returns the translated path as a plain string.
    """
    return str(local_path).replace(str(local_mount), str(remote_prefix))

def is_empty(file_path):
    """Return True if the file at ``file_path`` has a size of zero bytes."""
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes == 0


def _lookup_sample_value(table, sample_id, column):
    """Return ``column`` of the first row in ``table`` matching ``sample_id``.

    Rows are matched by comparing the "sample" column of ``table``
    against ``sample_id``.

    Raises:
        IndexError: if no row of ``table`` matches ``sample_id``.
    """
    return table.loc[table["sample"] == sample_id, column].values[0]


# Sample folders whose name contains any of these substrings are ignored.
_SKIP_SAMPLE_TAGS = ("ps-none", "red", "MTJ")

# Collect, per sample, the remote paths of all assembly FASTA files plus
# the sample metadata (sex, read file listings, sample set).
# files_by_sample: sample -> {assembly unit / metadata key -> value}
# file_count: sample -> number of FASTA files seen for that sample
files_by_sample = col.defaultdict(dict)
file_count = col.Counter()
for fasta_file in local_mount.joinpath(top_path).glob("**/*.fasta.gz"):
    if "exemplar" in fasta_file.name:
        continue

    # The name of the folder containing the FASTA file is the sample label.
    sample = fasta_file.parent.name
    if any(tag in sample for tag in _SKIP_SAMPLE_TAGS):
        continue

    # A marker file "<name>.fasta.gz.EMPTY" next to the FASTA file means
    # the path is recorded as "n/a" instead of the actual location
    # (presumably flags an empty assembly output - confirm with pipeline).
    set_file_na = fasta_file.with_suffix(".gz.EMPTY").is_file()

    # First attempt: everything before the first dot is the plain sample ID.
    plain = sample.split(".")[0]
    try:
        sex = _lookup_sample_value(HGSVC_SAMPLES, plain, "sex")
    except IndexError as lookup_err:
        # Only "ps-hic" and "HG00733R" labels get a second attempt,
        # using the first seven characters of the label as sample ID.
        if "ps-hic" not in sample and "HG00733R" not in sample:
            raise ValueError(sample) from lookup_err
        plain = sample[:7]
        try:
            sex = _lookup_sample_value(HGSVC_SAMPLES, plain, "sex")
        except IndexError as fallback_err:
            raise ValueError(f"{plain} / {sample}") from fallback_err

    sample = sample.replace("ps-", "vrk-ps-")
    sample_name = sample.split(".")[0]
    if sample_name.endswith("A"):
        # a trailing "A" is dropped before the read file lookup
        sample_name = sample_name[:-1]
    try:
        sample_hifi = _lookup_sample_value(unphased_samples, sample_name, "hifi")
        sample_ont = _lookup_sample_value(unphased_samples, sample_name, "ont")
    except IndexError as reads_err:
        # fail with the offending sample instead of a bare IndexError
        raise ValueError(
            f"no read files listed for sample: {sample_name} / {sample}"
        ) from reads_err

    # The third dot-separated field of the file name identifies the
    # assembly unit; it becomes a column name in the sample sheet.
    asm_unit = fasta_file.name.split(".")[2].replace("-", "_")
    remote_full_path = "n/a" if set_file_na else replace_path(fasta_file)

    file_count[sample] += 1
    sample_record = files_by_sample[sample]
    sample_record[asm_unit] = remote_full_path
    sample_record["sex"] = sex
    sample_record["reads_hifi"] = sample_hifi
    sample_record["reads_ont"] = sample_ont

    # Slicing (instead of indexing) keeps labels shorter than eight
    # characters from raising an IndexError; they simply do not match.
    is_special = (
        any(tag in sample for tag in ("hic", "trio"))
        or sample[7:8] in ("R", "A")
    )
    sample_record["sample_set"] = "special" if is_special else "hgsvc3"

def _write_sample_sheet(sheet, out_path):
    """Write ``sheet`` as a tab-separated table to ``out_path``.

    The table is preceded by two "#" comment lines holding the value of
    the module-level TODAY and the number of rows in ``sheet``. The index
    of ``sheet`` is written as the column "sample".
    """
    with open(out_path, "w") as dump:
        _ = dump.write(f"# {TODAY}\n")
        _ = dump.write(f"# {sheet.shape[0]} samples\n")
        sheet.to_csv(dump, sep="\t", header=True, index=True, index_label="sample")


# most_common() lists the samples by decreasing file count, hence keeping
# only the entries with at least five files selects exactly the samples
# that precede the first sample with fewer than five files.
sample_sheet = pd.concat(
    [
        pd.DataFrame.from_records(files_by_sample[sample_id], index=[sample_id])
        for sample_id, n_files in file_count.most_common()
        if n_files >= 5
    ],
    axis=0,
    ignore_index=False,
)
sample_sheet = sample_sheet.sort_index()

# Sheet no. 1: all samples, including the "sample_set" column.
_write_sample_sheet(sample_sheet, sample_sheet_out_all)

# Sheet no. 2: only the "hgsvc3" samples, without the "sample_set" column.
is_hgsvc3 = sample_sheet["sample_set"] == "hgsvc3"
sample_sheet = sample_sheet.loc[is_hgsvc3, :].drop("sample_set", axis=1)
_write_sample_sheet(sample_sheet, sample_sheet_out_hgsvc3)
