In [2]:
import datetime as dt
import pathlib as pl
import hashlib as hl
import pandas as pd

# Today's date as a compact YYYYMMDD stamp; used in the output file name.
date = dt.date.today().strftime("%Y%m%d")

# Directory two levels above the resolved current working directory.
repo_path = pl.Path(".").resolve().parent.parent

# Switch between the locally mounted copy of the storage and the
# path layout used on the remote side.
local_run = True

remote_imbb_path = pl.Path("/gpfs/project/projects/medbioinf")
local_imbb_mount = pl.Path("/mounts/hilbert/project/projects/medbioinf")

if local_run:
    data_root_path = local_imbb_mount
else:
    data_root_path = remote_imbb_path

# Sub-trees (relative to the data root) that are searched for
# "sample-folder.lst" listings.
subfolders = ["data/00_RESTRUCTURE/project-centric/hgsvc/pacbio_hifi/"]

def merge_overlapping_paths(abs_path, rel_path):
    """Return the directory onto which ``rel_path`` can be joined.

    ``abs_path`` and ``rel_path`` are expected to overlap: the first
    component of ``rel_path`` also occurs as a component of ``abs_path``.
    The function walks upwards from ``abs_path`` until it reaches the
    component whose name equals that first component and returns the
    parent of that component, so that ``result.joinpath(rel_path)`` does
    not repeat the shared part.

    Args:
        abs_path: ``pathlib`` path at which the upward search starts.
        rel_path: "/"-separated path string; only its first component
            is inspected.

    Returns:
        The parent of the deepest component of ``abs_path`` whose name
        equals the first component of ``rel_path``.

    Raises:
        ValueError: if the top of ``abs_path`` is reached without
            finding a matching component.
    """
    connect_level = rel_path.split("/")[0]
    top_level = abs_path
    while top_level.name != connect_level:
        top_level = top_level.parent
        # A path that is its own parent is the top of the hierarchy:
        # "/" for absolute paths, "." for relative ones.  Testing for
        # "/" only would loop forever on a relative path without match.
        if top_level == top_level.parent:
            raise ValueError(f"cannot merge: {abs_path} / {rel_path}")
    return top_level.parent

# Stop early if the data root does not exist.  An explicit raise is used
# instead of ``assert`` because assertions are skipped under ``python -O``.
if not data_root_path.is_dir():
    raise NotADirectoryError(f"data root is not a directory: {data_root_path}")

# Prefixes used to rewrite locally mounted paths to their remote form;
# computed once instead of once per file.
local_prefix = str(local_imbb_mount)
remote_prefix = str(remote_imbb_path)

# One record per BAM file that has no gzipped FASTQ next to it:
# (path_hash, rel_path, input_path, output_path)
missing_fastqs = []
for subfolder in subfolders:
    search_path = data_root_path.joinpath(subfolder)
    for lst_file in search_path.glob("**/sample-folder.lst"):
        with open(lst_file, "r") as listing:
            for line in listing:
                # Strip once and use the cleaned entry everywhere; the raw
                # line still carries the newline (and possibly indentation),
                # which would prevent matching its first path component.
                sample_entry = line.strip()
                if not sample_entry:
                    continue
                top_level = merge_overlapping_paths(search_path, sample_entry)
                sample_folder = top_level.joinpath(sample_entry)
                if not sample_folder.is_dir():
                    raise NotADirectoryError(
                        f"sample folder listed in {lst_file} "
                        f"is not a directory: {sample_folder}"
                    )

                for bam_file in sample_folder.glob("**/*.bam"):
                    if "subreads" in bam_file.name:
                        continue
                    fastq = bam_file.with_suffix(".fastq.gz")
                    plain_fastq = bam_file.with_suffix(".fastq")
                    # DEBUG for new JAX data
                    # 2023-Aug
                    # NOTE(review): the check is case-insensitive while the
                    # replacement is case-sensitive, so a differently cased
                    # ".with_5mc" is detected but not removed - confirm
                    # that this is intended.
                    if "with_5mc" in str(fastq).lower():
                        fastq = pl.Path(str(fastq).replace(".with_5mC", ""))
                        plain_fastq = pl.Path(str(plain_fastq).replace(".with_5mC", ""))
                    if fastq.is_file():
                        # gzipped FASTQ already exists, nothing to do
                        continue

                    # Prefer an uncompressed FASTQ (someone uploaded it
                    # as-is) as input; otherwise fall back to the BAM.
                    source_file = plain_fastq if plain_fastq.is_file() else bam_file

                    rel_path = str(fastq.relative_to(sample_folder))
                    path_hash = hl.md5(rel_path.encode("utf-8")).hexdigest()

                    input_path = str(source_file)
                    fastq_path = str(fastq)
                    if local_run:
                        # Recorded paths must use the remote layout.
                        input_path = input_path.replace(local_prefix, remote_prefix)
                        fastq_path = fastq_path.replace(local_prefix, remote_prefix)
                    missing_fastqs.append(
                        (path_hash, rel_path, input_path, fastq_path)
                    )


# Only write a table if there is something to report.
if missing_fastqs:
    df = pd.DataFrame.from_records(
        missing_fastqs,
        columns=["path_hash", "rel_path", "input_path", "output_path"]
    )

    df.to_csv(
        repo_path.joinpath("samples", f"{date}_missing-fastq.tsv"),
        header=True, index=False, sep="\t"
    )