In [4]:
%run "../00_project_config.ipynb"

import io
import pathlib as pl
import shlex

import pandas as pd

# Prefix under which the project file system is visible on this machine, and
# the prefix the same files have on the remote side; source paths found under
# `local_mount` are later rewritten to start with `remote_root`.
local_mount = "/mounts/hilbert/project"
remote_root = "/gpfs/project"

# Batch to prepare; must be a key of BATCH_DATES.
BATCH_NUM = 3

# Date string (YYYYMMDD) used for each supported batch. Adding a batch only
# requires a new entry here.
BATCH_DATES = {
    1: "20230818",
    2: "20230927",
    3: "20240201",
}

if BATCH_NUM not in BATCH_DATES:
    # A bare `raise` outside an `except` block only yields
    # "RuntimeError: No active exception to reraise"; report the real cause.
    raise ValueError(
        f"Unsupported BATCH_NUM {BATCH_NUM!r}; "
        f"expected one of {sorted(BATCH_DATES)}"
    )

batch_date = BATCH_DATES[BATCH_NUM]

# Comment line written at the top of the generated shell script.
datestamp = f"# {batch_date}\n\n"

# Remote folder that receives the files of this batch.
share_root_folder = pl.Path(remote_root).joinpath(
    "projects/medbioinf/data/00_RESTRUCTURE/shares/globus/outgoing/hgsvc/ebi_upload/"
    f"{batch_date}_verkko_batch{BATCH_NUM}"
)

# Output locations of the generated shell script and of the source/target
# table. NOTE(review): PROJECT_BASE is not defined in this cell - presumably
# it comes from the %run of the project config notebook; confirm.
data_mover_script = PROJECT_BASE.joinpath(
    "scripts/datamng",
    f"data_mover_batch{BATCH_NUM}.sh"
)
data_mover_pairs = PROJECT_BASE.joinpath(
    "scripts/datamng",
    f"data_mover_pairs_batch{BATCH_NUM}.tsv"
)

# Result roots (relative to the project mount) of the two workflows that
# produce the files to be shared.
_eval_results = "projects/medbioinf/projects/assemblies/hybrids/eval/wd/results"
_verkko_results = "projects/medbioinf/projects/assemblies/hybrids/verkko/wd/results"

# One entry per file group: [source folder, glob pattern, target subfolder].
# The target subfolder is a str.format template; the "alignments" template
# takes two fields, all others take one.
folder_specs = [
    [f"{_eval_results}/statistics/assemblies", "*.summary.tsv", "statistics/assemblies/{}"],
    [f"{_eval_results}/reports/ref_chrom_assign", "*.tsv", "reports/ref_chrom_assign/{}"],
    [f"{_eval_results}/reports/contamination", "*.norm.tsv", "reports/contamination/{}"],
    [f"{_eval_results}/assemblies", "**/*.fasta*", "assemblies/{}"],
    [f"{_eval_results}/alignments", "**/*.bam*", "alignments/contig_to_ref/{}/{}"],
    [f"{_verkko_results}/reports/seq_dedup", "*ps-sseq*.summary.tsv", "reports/seq_dedup/{}"],
    [f"{_verkko_results}/assemblies/verkko", "**/*ps-sseq.exemplar*", "assemblies/{}"],
]

# Collect (remote source, share target) pairs for every file of the selected
# batch, save them as a TSV table, and generate a bash script that creates
# the target folders and rsyncs each file into place.
data_mover_script.parent.mkdir(exist_ok=True, parents=True)

data_mover = io.StringIO()
data_mover.write("#!/usr/bin/bash\n\n")
data_mover.write(datestamp)

local_base = pl.Path(local_mount)
remote_base = pl.Path(remote_root)

# Build the sample -> batch number lookup once instead of scanning the table
# for every file. setdefault keeps the first row of a sample, matching a
# ".values[0]" lookup on the table.
# NOTE(review): HGSVC_SAMPLES is not defined in this cell - presumably a
# DataFrame provided by the %run of the project config notebook; confirm.
sample_batches = {}
for known_sample, batch_num in zip(HGSVC_SAMPLES["sample"], HGSVC_SAMPLES["batch_num"]):
    sample_batches.setdefault(known_sample, batch_num)

# File name fragments that exclude a file from the transfer.
skip_tags = ("ps-trio", "ps-hic", "karyo-est")

source_target_table = []
for source_folder, source_glob, target_sub in folder_specs:
    source_files = sorted(local_base.joinpath(source_folder).glob(source_glob))
    if not source_files:
        # An empty match means the folder or the pattern is wrong.
        raise ValueError(
            f"No files matching '{source_glob}' found in: {source_folder}"
        )

    for source_file in source_files:
        filename = source_file.name
        if any(tag in filename for tag in skip_tags):
            continue
        # The sample name is the part of the file name before the first dot.
        sample = filename.split(".")[0]
        if sample not in sample_batches:
            continue
        if int(sample_batches[sample]) != BATCH_NUM:
            continue
        if target_sub.startswith("alignments"):
            # Alignment targets are additionally grouped by reference genome.
            ref = "hg38" if "hg38" in filename else "t2tv2"
            subfolder = target_sub.format(ref, sample)
        else:
            subfolder = target_sub.format(sample)
        # Swap only the leading mount prefix; a plain string replace would
        # also rewrite any later occurrence of the mount string in the path.
        remote_source = str(remote_base.joinpath(source_file.relative_to(local_base)))
        target_path = share_root_folder.joinpath(subfolder, filename)
        source_target_table.append((remote_source, target_path))

pairs = pd.DataFrame.from_records(source_target_table, columns=["source_path", "target_path"])
pairs.to_csv(data_mover_pairs, header=True, index=False, sep="\t")

for row in pairs.itertuples():
    # Quote every path so that spaces or shell metacharacters in a file name
    # cannot break or alter the generated commands; paths made only of safe
    # characters are emitted unchanged.
    target_folder = shlex.quote(str(pl.Path(row.target_path).parent))
    source_arg = shlex.quote(str(row.source_path))
    target_arg = shlex.quote(str(row.target_path))
    data_mover.write(f"mkdir -p {target_folder}\n")
    data_mover.write(f"rsync --progress --checksum {source_arg} {target_arg}\n")

with open(data_mover_script, "w") as dump:
    dump.write(data_mover.getvalue())