In [22]:
import functools
import io
import pathlib
import pickle as pck
import shlex

# Switch (quietly, -q) into the project's notebook directory so that the
# relative path to the config notebook below can be resolved.
%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"
# strict=True makes resolve() raise if the config notebook does not exist.
_PROJECT_CONFIG_NB = str(pathlib.Path("00_project_config.ipynb").resolve(strict=True))

# Execute the shared project config notebook. Presumably this is what provides
# the names used below that are not defined in this cell (pl, get_nb_stamp,
# PROJECT_NB_CACHE, PROJECT_BASE, HGSVC_FEMALES, HGSVC_MALES) -- TODO confirm.
%run $_PROJECT_CONFIG_NB

# Stamp for this notebook; it is written as a comment into the header of the
# generated shell script.
_MYNAME="create-link-script"
_MYSTAMP=get_nb_stamp(_MYNAME)

# Local mount point of the project file system and the prefix that
# replaces it in all paths written to the generated script.
LOCAL_MOUNT = pl.Path("/mounts/hilbert/project")
REMOTE_PREFIX = "/gpfs/project"

# Root of the evaluation results that are scanned for files to link;
# must be reachable via the local mount when this cell runs.
EVAL_RESULTS_TOPLEVEL = (
    LOCAL_MOUNT
    / "projects/medbioinf/projects/assemblies/hybrids/eval/wd/results"
)
assert EVAL_RESULTS_TOPLEVEL.is_dir()

# 2024-12-19 old location for consortium-internal sharing
#GLOBUS_SHARE_TOPLEVEL = (
#    "projects/medbioinf/data/00_RESTRUCTURE/shares/globus/outgoing/hgsvc"
#    "/ebi_upload/20230926_assembly_annotations/hhu/assembly_evaluation"
#    "/results"
#)

# 2024-12-19 new target path for man pub
# (kept relative: it is joined onto REMOTE_PREFIX when targets are built)
GLOBUS_SHARE_TOPLEVEL = "/".join(
    (
        "projects/medbioinf/data/00_RESTRUCTURE/shares/globus/outgoing/hgsvc",
        "ebi_upload",
        "20241218_phase3-main-pub_data/hhu/assembly_data",
    )
)

# Pickle cache holding the list of selected result files
CACHE_LINK_NAMES = PROJECT_NB_CACHE / "link_eval_results.cache.pck"

# Destination of the generated shell script
SCRIPT_OUT = PROJECT_BASE / "scripts" / "datamng" / "data_mover_eval.pub.sh"

# Only files naming one of these samples AND one of these assemblies are linked
CHECK_SAMPLES = {*HGSVC_FEMALES, *HGSVC_MALES}
ASSEMBLIES = {"vrk-ps-sseq", "hsm-ps-sseq"}

# Set to True to ignore an existing cache and rescan the results tree
FORCE_RERUN = False

# 2024-12-19 IGSR rejects duplicate file names (names, not full paths!),
# so files are made unique by adapting their file extension.
# The entries below are the extensions that may be adapted; the lookup
# stops at the first entry a file name ends with, so keep the order.
KNOWN_EXT = [
    ".bam",
    ".bam.bai",
    ".tsv",
    ".tsv.gz",
    ".vcf.gz",
    ".paf.gz",
    ".tar.gz",
    ".bed.gz",
    ".bed.gz.tbi",
    ".fasta.gz",
    ".fasta.gz.fai",
    ".fasta.gz.gzi",
    ".sed",
    ".bed",
    ".vcf.gz.tbi",
]

def replace_ext(filename, known_ext=None):
    """Insert the ``.phase3-pub`` tag in front of the file's extension.

    Only the trailing extension is touched; an identical substring that
    occurs earlier in the name (e.g. ``a.bed.merged.bed``) is left alone.

    Args:
        filename: File name (string) to adapt.
        known_ext: Optional iterable of extensions (each with its leading
            dot) to recognize. Defaults to the module-level ``KNOWN_EXT``.
            The first entry that ``filename`` ends with is used.

    Returns:
        The file name with ``.phase3-pub`` inserted before the extension.

    Raises:
        ValueError: If ``filename`` ends with none of the extensions.
    """
    if known_ext is None:
        known_ext = KNOWN_EXT

    for ext in known_ext:
        if not filename.endswith(ext):
            continue
        # Cut the suffix off by length instead of using str.replace(),
        # which would rewrite every occurrence of the extension.
        stem = filename[: len(filename) - len(ext)]
        return f"{stem}.phase3-pub{ext}"

    raise ValueError(f"Unknown ext: {filename}")


def skip_file(filename):
    """Tell whether a result file should be left out of the link script.

    A file is kept (``False`` is returned) only if its name contains a
    sample from ``CHECK_SAMPLES`` immediately followed by a dot and also
    contains one of the assembly labels in ``ASSEMBLIES``. Every other
    file is skipped (``True``).
    """
    names_sample = any(f"{sample}." in filename for sample in CHECK_SAMPLES)
    if not names_sample:
        return True
    names_assembly = any(f"{assembly}" in filename for assembly in ASSEMBLIES)
    return not names_assembly


def replace_prefix(file_path):
    """Return ``file_path`` as a string that uses the remote path prefix.

    The string form of ``LOCAL_MOUNT`` is swapped for ``REMOTE_PREFIX``.
    Note that this is a plain ``str.replace``, i.e. every occurrence of
    the local mount string is substituted, and a path that does not
    contain it is returned unchanged.
    """
    local_prefix = str(LOCAL_MOUNT)
    return str(file_path).replace(local_prefix, REMOTE_PREFIX)


def infer_target_path(file_path):
    """Map a local result file to its location below the Globus share.

    The part of ``file_path`` below ``EVAL_RESULTS_TOPLEVEL`` is appended
    to ``REMOTE_PREFIX / GLOBUS_SHARE_TOPLEVEL``, so the directory layout
    of the results tree is mirrored at the target.

    Args:
        file_path: Path (or string) of a file in the results tree.

    Returns:
        ``pl.Path`` of the link target on the remote file system.
    """
    try:
        # Take the path relative to the actual results root. Splitting the
        # string at "results/" and keeping the last piece would drop the
        # leading components whenever a nested folder name ends in "results".
        suffix = pl.Path(file_path).relative_to(EVAL_RESULTS_TOPLEVEL)
    except ValueError:
        # Not located below the results root: keep the legacy heuristic
        # so that such inputs are handled exactly as before.
        suffix = str(file_path).split("results/")[-1]

    return pl.Path(REMOTE_PREFIX).joinpath(GLOBUS_SHARE_TOPLEVEL, suffix)


# Scanning the mounted results tree is slow, hence the list of selected
# files is cached as a pickle and only rebuilt on request or if missing.
if not FORCE_RERUN and CACHE_LINK_NAMES.is_file():
    with open(CACHE_LINK_NAMES, "rb") as cache_file:
        link_files = pck.load(cache_file)

else:
    # Keep every non-directory entry whose name passes the sample/assembly filter
    link_files = [
        candidate
        for candidate in EVAL_RESULTS_TOPLEVEL.rglob("**/*")
        if not candidate.is_dir() and not skip_file(candidate.name)
    ]

    with open(CACHE_LINK_NAMES, "wb") as cache_file:
        pck.dump(link_files, cache_file)

# Assemble the whole shell script in memory, then write it out in one go.
script_buffer = io.StringIO()
script_buffer.write("#!/usr/bin/bash\n\n")
script_buffer.write(f"# {_MYSTAMP}\n\n")
total_files = len(link_files)
# Target folders for which a "mkdir -p" line has already been emitted
created = set()
for fn, lf in enumerate(sorted(link_files), start=1):
    # Every path that ends up on a command line is passed through
    # shlex.quote so that blanks or shell metacharacters in a file name
    # cannot break (or alter) the generated commands; paths made of
    # ordinary characters are written unchanged.
    source = shlex.quote(replace_prefix(lf))
    target = infer_target_path(lf)

    script_buffer.write(f"# {fn}/{total_files}\n")
    parent_folder = target.parent
    if parent_folder not in created:
        script_buffer.write(f"mkdir -p {shlex.quote(str(parent_folder))}\n")
        created.add(parent_folder)

    # 2024-12-19 fix: empty files not allowed on IGSR
    # in case of inquiries, make this easier to trace
    # (the link command is kept in the script, but commented out)
    if ".EMPTY" in lf.name:
        script_buffer.write(f"# ln {source} {shlex.quote(str(target))}\n\n")
        continue

    try:
        new_target_name = replace_ext(target.name)
    except ValueError:
        # Show which source file has the unknown extension before failing
        print(lf)
        raise
    target = parent_folder.joinpath(new_target_name)
    script_buffer.write(f"ln {source} {shlex.quote(str(target))}\n\n")

with open(SCRIPT_OUT, "w") as dump:
    dump.write(script_buffer.getvalue())






    
        
    
    
    