In [4]:
%run "../00_project_config.ipynb"

import pandas as pd
import pathlib as pl
import os
import io
import json

# Directory that is searched further down for the per-sample
# "*.ps-sseq.output.json" files.
json_path = pl.Path("/mounts/hilbert/project").joinpath(
    "projects/medbioinf/projects/assemblies/hybrids/verkko/wd/proc/assemblies/verkko"
)

# Destination of the generated sample sheet (TSV), located below PROJECT_BASE.
# NOTE(review): PROJECT_BASE is not defined in this cell - presumably a
# pathlib.Path provided by the %run of the project config notebook; confirm.
sample_sheet_out = PROJECT_BASE.joinpath("samples").joinpath("vrk_qcaln_samples.tsv")

# Collect one record per JSON file found in json_path. Each record is the
# tuple (sample, sex, <10 absolute file paths>); the order of the paths is
# fixed by the key order in the inner tuple below and must stay in sync with
# the column names used when the records are turned into a table.
out_records = []
for json_file in json_path.glob("*.ps-sseq.output.json"):
    # JSON is UTF-8 by specification, so do not depend on the locale default.
    # The file handle is only needed for parsing and is closed right after.
    with open(json_file, "r", encoding="utf-8") as dump:
        outfiles = json.load(dump)

    sample = outfiles["sample"]
    # A missing key raises KeyError here, i.e. before the sex lookup.
    asm_paths = [
        outfiles[key]["abs_path"]
        for key in (
            "hap1_fasta", "hap2_fasta", "unassigned_fasta", "disconnected_fasta",
            "rdna_fasta", "rdna_repr", "mito_fasta", "mito_repr",
            "ebv_fasta", "ebv_repr",
        )
    ]

    # NOTE(review): HGSVC_SAMPLES is not defined in this cell - presumably a
    # pandas DataFrame with "sample" and "sex" columns provided by the %run
    # of the project config notebook; confirm.
    sex_matches = HGSVC_SAMPLES.loc[HGSVC_SAMPLES["sample"] == sample, "sex"].values
    try:
        # First match wins; an empty selection raises IndexError.
        sex = sex_matches[0]
    except IndexError as err:
        # Same exception type as before, but now the message states what
        # is missing and which file triggered the lookup.
        raise ValueError(
            f"No sex annotation in HGSVC_SAMPLES for sample '{sample}' "
            f"(read from {json_file})"
        ) from err

    out_records.append((sample, sex, *asm_paths))
        
# Column names of the sample sheet; positions correspond to the fields of
# the tuples stored in out_records.
sheet_columns = [
    "sample", "sex", "asm_hap1", "asm_hap2", "asm_unassigned", "asm_disconnected",
    "asm_rdna", "asm_rdnaEX", "asm_mito", "asm_mitoEX", "asm_ebv", "asm_ebvEX"
]

# Build the table and order its rows by sample name.
df = pd.DataFrame.from_records(out_records, columns=sheet_columns).sort_values("sample")

# The file starts with a single "# <TODAY>" line, followed by the
# tab-separated table including its header row (row index is not written).
# NOTE(review): TODAY is not defined in this cell - presumably a date string
# provided by the %run of the project config notebook; confirm.
with open(sample_sheet_out, "w") as dump:
    stamp_line = f"# {TODAY}\n"
    _ = dump.write(stamp_line)
    df.to_csv(dump, index=False, header=True, sep="\t")