In [1]:
%run "../00_project_config.ipynb"

import pathlib as pl
import pandas as pd

# Local mount point of the cluster project file system.
local_mount = pl.Path("/mounts/hilbert/project")

# Folders holding Rukki phasing paths files. The list order matters:
# folders are scanned front to back and the first paths file found
# for a sample is kept; only batch 3 samples are switched to a file
# found in a later folder, so batch 1 samples are not moved to the
# newer phasing pipeline release.
gaf_paths_folders = [
    local_mount / "projects/medbioinf/projects/mihen108/test_wd/rukki_b62fe4",
    local_mount / "projects/medbioinf/projects/mihen108/test_wd/rukki_v0.3.1-alpha",
]

# Input/output sample sheets: complete sample set ...
sample_sheet_all_input = pl.Path("../../samples/vrk_unps_all_samples.tsv")
sample_sheet_all_output = pl.Path("../../samples/vrk_sseq_all_samples.tsv")

# ... and the HGSVC3 subset.
sample_sheet_hgsvc3_input = pl.Path("../../samples/vrk_unps_hgsvc3_samples.tsv")
sample_sheet_hgsvc3_output = pl.Path("../../samples/vrk_sseq_hgsvc3_samples.tsv")


def _collect_phasing_paths(ignore_samples):
    """Scan all folders in `gaf_paths_folders` for Rukki phasing paths files.

    Folders are processed in list order. The first file seen for a sample
    is recorded; a file seen later for the same sample only replaces the
    recorded one if the sample belongs to batch 3 according to the
    module-level table `HGSVC_SAMPLES` (not defined in this cell;
    presumably provided by the %run of the project config notebook).

    Parameters
    ----------
    ignore_samples : collection of str
        Sample names (prefix of the file name up to the first "_")
        that are skipped entirely.

    Returns
    -------
    tuple of (dict, int)
        Mapping sample name -> remote ("/gpfs/project"-based) file path,
        and the number of re-encountered samples whose path was NOT updated.

    Raises
    ------
    IndexError
        If a re-encountered sample has no entry in `HGSVC_SAMPLES`.
    """
    gaf_files = dict()
    not_updated_paths = 0
    for path_folder in gaf_paths_folders:
        # joinpath() leaves absolute entries untouched and resolves
        # relative entries against the local mount.
        # sorted() makes the processing order (and thus the log and the
        # first-seen tie-break within one folder) independent of the
        # order in which the file system happens to list the files.
        search_root = local_mount.joinpath(path_folder)
        for path_file in sorted(search_root.glob("**/*_rukki_paths.gaf")):
            sample = path_file.name.split("_")[0]
            if sample in ignore_samples:
                print(f"Skipping {sample}")
                continue
            # translate the locally mounted path into the path on the cluster
            remote_file_path = str(path_file).replace(str(local_mount), "/gpfs/project")
            # NOTE(review): substring match on the complete path - any
            # path component containing "old" (any case) triggers the skip
            if "old" in remote_file_path.lower():
                print(f"Skipping over OLD phasing paths file: {remote_file_path}")
                continue
            if sample not in gaf_files:
                gaf_files[sample] = remote_file_path
                continue
            # sample was already seen in a folder/file processed earlier;
            # only the first 7 characters are used as lookup key
            # (presumably to strip suffixes from the sample name - TODO confirm)
            lookup_sample = sample[:7]
            sample_batch = int(
                HGSVC_SAMPLES.loc[HGSVC_SAMPLES["sample"] == lookup_sample, "batch_num"].iloc[0]
            )
            if sample_batch == 3:
                print(f"Updating phasing paths for sample/batch 3: {sample}")
                print(f"Was: {gaf_files[sample]}")
                print(f"Is: {remote_file_path}")
                gaf_files[sample] = remote_file_path
            else:
                not_updated_paths += 1
                print(f"Not updating phasing paths for sample/batch {sample_batch}: {sample}")
    return gaf_files, not_updated_paths


def create_sseq_phasing_sample_sheet(
    sample_sheet_file_input, sample_sheet_file_output, expected_not_updated=38
):
    """Create a sample sheet for the "sseq" target with phasing paths added.

    Reads the tab-separated input sheet (lines starting with "#" are
    ignored), adds the columns "target" (constant "sseq") and
    "phasing_paths" (remote path of the Rukki paths file of the sample),
    drops all samples for which no phasing paths file was found and
    writes the columns sample / target / phasing_paths / hifi / ont to
    the output file, preceded by two "#" header lines (value of `TODAY`
    and number of samples).

    Parameters
    ----------
    sample_sheet_file_input : path-like
        Existing sample sheet; must contain the columns
        "sample", "hifi" and "ont".
    sample_sheet_file_output : path-like
        Output TSV file; overwritten if it exists.
    expected_not_updated : int, optional
        Expected number of samples that were found in more than one
        phasing folder but kept their first path (default: 38).

    Raises
    ------
    AssertionError
        If the number of not-updated samples differs from
        `expected_not_updated`.
    KeyError
        If one of the required columns is missing in the input sheet.
    """
    sample_sheet = pd.read_csv(
        sample_sheet_file_input, sep="\t",
        header=0, comment="#"
    )

    # DEBUG 2024-01-29
    # add all samples to sheet except for
    # last YRI sample NA19239 complete
    # sample/data mix-up still needs to be
    # sorted out and resequencing is underway
    # NOTE(review): NA19239 itself is not part of the list below;
    # confirm whether this comment is still current.
    ignore_samples = [
        "HG00733red1",
        "HG00733red2",
        "HG00733R1",
        "HG00733R2",
        "HG03683stage2",
        "HG03683stage4",
        "NA19983stage4",
    ]

    gaf_files, not_updated_paths = _collect_phasing_paths(ignore_samples)

    # UPDATE 2023-12-09
    # sample PUR mother / HG00732 was topped up with Revio data,
    # hence batch 1 has now size 38 instead of 39
    # Those 38 samples should not have been updated with new
    # phasing paths
    assert not_updated_paths == expected_not_updated, (
        f"Batch 1 size of {expected_not_updated} / mismatch: {not_updated_paths}"
    )

    sample_sheet["target"] = "sseq"
    # samples without a phasing paths file map to NaN ...
    sample_sheet["phasing_paths"] = sample_sheet["sample"].map(gaf_files)

    # ... and are removed here
    sample_sheet = sample_sheet.loc[sample_sheet["phasing_paths"].notna(), :].copy()

    sample_sheet = sample_sheet[["sample", "target", "phasing_paths", "hifi", "ont"]]

    with open(sample_sheet_file_output, "w") as tsv:
        tsv.write(f"# {TODAY}\n")
        tsv.write(f"# {sample_sheet.shape[0]} samples\n")
        sample_sheet.to_csv(
            tsv, sep="\t", header=True, index=False
        )
    return None


# Build both sheets: first the complete sample set, then the HGSVC3 subset.
for sheet_input, sheet_output in (
    (sample_sheet_all_input, sample_sheet_all_output),
    (sample_sheet_hgsvc3_input, sample_sheet_hgsvc3_output),
):
    create_sseq_phasing_sample_sheet(sheet_input, sheet_output)



Skipping HG00733red2
Skipping HG03683stage2
Skipping HG00733red1
Skipping HG03683stage4
Skipping NA19983stage4
Not updating phasing paths for sample/batch 1: HG03248
Skipping over OLD phasing paths file: /gpfs/project/projects/medbioinf/projects/mihen108/test_wd/rukki_v0.3.1-alpha/NA19238.OLD/NA19238_rukki_paths.gaf
Skipping HG00733red2
Not updating phasing paths for sample/batch 1: HG02587
Updating phasing paths for sample/batch 3: HG00732
Was: /gpfs/project/projects/medbioinf/projects/mihen108/test_wd/rukki_b62fe4/HG00732/HG00732_rukki_paths.gaf
Is: /gpfs/project/projects/medbioinf/projects/mihen108/test_wd/rukki_v0.3.1-alpha/HG00732/HG00732_rukki_paths.gaf
Skipping over OLD phasing paths file: /gpfs/project/projects/medbioinf/projects/mihen108/test_wd/rukki_v0.3.1-alpha/HG00096.OLD/HG00096_rukki_paths.gaf
Not updating phasing paths for sample/batch 1: HG03683
Not updating phasing paths for sample/batch 1: HG02818
Not updating phasing paths for sample/batch 1: HG04036
Skipping HG0368

Not updating phasing paths for sample/batch 1: HG01457
