In [3]:
%run "50_match_sample_files.ipynb"

import pandas as pd
import pathlib as pl
import collections as col
import os
import stat
import hashlib as hl

# Run-time switches for this cell.
# DRY_RUN: when True, no "verified" file, fofn file or fofn table is written;
# the code only reports which files it would have updated (if VERBOSE is set).
DRY_RUN = False
# VERBOSE: print dry-run notices and fofn path collision messages.
VERBOSE = True
# DEBUG: additionally print a message whenever an existing file is already
# identical to the new content and is therefore left untouched.
DEBUG = False

# Lookup table yielding a (data_source, project) pair; it is consulted for
# file groups that have no entry in the MERGED table, and for groups whose
# data_source and project carry the same value that is also a key here.
MANUALLY_CURATED_SOURCES = {
    "UW_WH": ("UW", "hgsvc"),
    "JAX_PA": ("JAX", "hgsvc"),
    "epi2me": ("epi2me", "epi2me")
}

# Collects (sample name, read type, remote fofn path) tuples; it is turned
# into a DataFrame and dumped as a table at the end of the cell.
fofn_files = []
# Output path of that table.
# NOTE(review): PROJECT_BASE is not defined in this cell - presumably it is
# provided by the notebook executed via %run above; confirm.
fofn_files_out = PROJECT_BASE.joinpath("samples", "fofn_table.tsv")

def set_file_permissions(file_path):
    """Set the mode of `file_path` to rw-rw-r-- (0o664).

    Owner and group get read/write access, everybody else read-only.
    Returns None.
    """
    granted_bits = (
        stat.S_IRUSR,  # owner: read
        stat.S_IWUSR,  # owner: write
        stat.S_IRGRP,  # group: read
        stat.S_IWGRP,  # group: write
        stat.S_IROTH,  # others: read
    )
    mode = 0
    for bit in granted_bits:
        mode |= bit
    os.chmod(file_path, mode)


def check_is_identical(file_path, new_name, new_content):
    """Tell whether `file_path` already holds exactly `new_content`.

    Args:
        file_path: path object (needs `.is_file()` and `.name`) of the
            file that may already exist on disk.
        new_name: file name the new file would get.
        new_content: iterable of entries (one per line); each entry is
            compared via its `str()` representation.

    Returns:
        True if the file exists, its name equals `new_name` and its lines
        (whitespace-stripped, lines starting with "#" ignored) match
        `new_content` in the same order. False otherwise.
    """
    if file_path.is_file():
        with open(file_path, "r") as handle:
            # comment lines (e.g., a date header) are not part of the payload
            lines_on_disk = [
                row.strip() for row in handle if not row.startswith("#")
            ]
        expected_lines = [str(entry) for entry in new_content]

        same_content = lines_on_disk == expected_lines
        same_name = file_path.name == new_name
        return same_content and same_name

    return False


def compute_read_files_hash(read_files):
    """Return a SHA-256 hex digest identifying a collection of read files.

    Each entry is converted with `str()`, the strings are sorted and
    joined with "," before hashing (UTF-8 encoded), so the digest does
    not depend on the order of `read_files`.
    """
    path_strings = [str(path) for path in read_files]
    path_strings.sort()
    digest = hl.sha256()
    digest.update(",".join(path_strings).encode("utf-8"))
    return digest.hexdigest()


# fofn paths already handed out in this run; used to detect two file groups
# of the same sample that would otherwise map to the same fofn file name
fofn_collisions = set()

updated_sample_fofns = set()

# (read type to export, attribute gating the export of that read type,
#  attribute gating the listing of its fofn files in the fofn table).
# The attributes are looked up by name at the same points where the
# HiFi / ONT specific code used to read them, i.e. fofn files are only
# listed in the table if both HiFi and ONT are complete for the sample.
_READ_TYPE_EXPORTS = (
    ("hifi", "hifi_complete", "ont_complete"),
    ("ont", "ont_complete", "hifi_complete"),
)


def _select_source_date(source_dates):
    """Pick the date label used in the fofn file name of a file group.

    The first recorded date is used if it can be converted to an integer;
    otherwise (ValueError) the most common entry of `source_dates` is used.
    """
    try:
        return str(int(source_dates[0]))
    except ValueError:
        most_common_date, _ = col.Counter(source_dates).most_common(1)[0]
        return most_common_date


def _resolve_source_and_project(read_group):
    """Return the (data_source, project) pair for the file group `read_group`.

    The pair is taken from MERGED if the group is listed there; otherwise
    the group's curator label (ALL_FILES["curated_by"]) is translated via
    MANUALLY_CURATED_SOURCES.

    Raises:
        ValueError: the group is missing from MERGED and does not have
            exactly one curator label.
        KeyError: the curator label is not in MANUALLY_CURATED_SOURCES.
    """
    read_group_info = MERGED.loc[
        MERGED["file_group"] == read_group, ["data_source", "project"]
    ]
    if read_group_info.empty:
        curators = set(
            ALL_FILES.loc[ALL_FILES["file_group"] == read_group, "curated_by"].values
        )
        # A set is unhashable and hence cannot be used as dict key itself;
        # the lookup needs the single curator label contained in the set.
        if len(curators) != 1:
            raise ValueError(
                f"Expected exactly one curator for file group {read_group}, "
                f"found: {curators}"
            )
        data_source, project = MANUALLY_CURATED_SOURCES[curators.pop()]
    else:
        # NOTE(review): if a group carries several distinct values, an
        # arbitrary one is picked here - confirm that cannot happen.
        data_source = set(read_group_info["data_source"].values).pop()
        project = set(read_group_info["project"].values).pop()

    if data_source == project:
        if data_source in MANUALLY_CURATED_SOURCES:
            data_source, project = MANUALLY_CURATED_SOURCES[data_source]
        else:
            assert data_source.startswith("HGSVC") and project.startswith("HGSVC"), f"{data_source} / {project}"
            project = "hgsvc"
            # drop the "HGSVC" prefix, keep the remainder as data source
            data_source = data_source[5:].upper()
    return data_source, project


def _write_verified_file(sample_obj, read_files):
    """Write the '<sample>.<n>-cells.verified' file of one file group.

    The file lists the sorted file names of `read_files` below a date
    header and is placed in the directory derived from
    `sample_obj.get_file_group_lca_path(read_files)`. Nothing is written
    if the existing file is already identical or if DRY_RUN is set.
    """
    source_path = sample_obj.get_file_group_lca_path(read_files)
    verified_file_dir = PATH_PREFIX.local.joinpath(DATA_ROOT, source_path)

    verified_file_name = f"{sample_obj.name}.{len(read_files)}-cells.verified"
    verified_file_path = verified_file_dir.joinpath(verified_file_name)
    verified_read_files = sorted([fp.name for fp in read_files])

    if check_is_identical(verified_file_path, verified_file_name, verified_read_files):
        if DEBUG:
            print(f"=== ID - skipping update of {verified_file_path}")
    elif not DRY_RUN:
        assert verified_file_dir.is_dir()
        with open(verified_file_path, "w") as dump:
            dump.write(f"# {TODAY}\n")
            dump.write("\n".join(verified_read_files) + "\n")
        set_file_permissions(verified_file_path)
    else:
        if VERBOSE:
            print(f">>> DRY RUN - not updating: {verified_file_path}")


def _write_fofn_file(sample_obj, read_type, read_files, project, data_source, source_date):
    """Write the fofn file of one file group and return its (local) path.

    The path is registered in `fofn_collisions`. If the regular path was
    already handed out in this run, the first 8 characters of the read
    files hash are inserted before the '.fofn' suffix. Nothing is written
    if the existing file is already identical or if DRY_RUN is set.
    """
    fofn_name = f"{sample_obj.name}_{read_type}_fastq.{project}-{data_source}-{source_date}.fofn"
    fofn_path = LOCAL_SAMPLE_ROOT.joinpath(sample_obj.name, fofn_name)

    # this is sub-optimal because it is not sort-invariant
    # however, assuming that data production is finally complete,
    # consequences should be limited
    if fofn_path in fofn_collisions:
        if VERBOSE:
            print(f"FOFN path collision detected: {fofn_path.name} exists!")
        # the hash is only needed to disambiguate colliding names
        read_files_hash = compute_read_files_hash(read_files)
        fofn_path = fofn_path.with_suffix(f".{read_files_hash[:8]}.fofn")
        fofn_name = fofn_path.name

    fofn_collisions.add(fofn_path)

    sorted_read_files = sorted(read_files)
    if check_is_identical(fofn_path, fofn_name, sorted_read_files):
        if DEBUG:
            print(f"=== ID - skipping update of {fofn_path}")
    elif not DRY_RUN:
        fofn_lines = [str(rf) for rf in sorted_read_files]
        # validate before opening the file, so that an unexpected path
        # cannot leave a truncated fofn file behind
        assert all(line.startswith("project-centric") for line in fofn_lines)
        with open(fofn_path, "w") as dump:
            dump.write("".join(f"{line}\n" for line in fofn_lines))
        set_file_permissions(fofn_path)
    else:
        if VERBOSE:
            print(f">>> DRY RUN - not updating: {fofn_path}")
    return fofn_path


for sample_name, sample_obj in SAMPLES.items():
    # skip entries keyed by the alternative sample name; the files stored
    # under the alternative name are merged in below
    # (presumably the same object is also registered under its main name)
    if sample_name == sample_obj.alt:
        continue

    # step - collect the files recorded under the main and alternative name
    sample_files = []
    sample_files_main = FILES_EXIST_PER_SAMPLE[sample_name]
    sample_files_alt = FILES_EXIST_PER_SAMPLE[sample_obj.alt]

    for known_files in (sample_files_main, sample_files_alt):
        for known_file in known_files:
            if known_file in sample_files:
                continue
            sample_files.append(known_file)

    # the skipping of duplicates above must never have triggered
    assert len(sample_files) == len(sample_files_main) + len(sample_files_alt)

    # step - register paths and dates per (read type, file group)
    last_group = None
    for sample_file in sample_files:
        key = sample_file.read_type, sample_file.file_group
        sample_obj.sample_files[key].append(sample_file.data_rel_path)
        if sample_file.file_group != last_group:
            # first file of a run of files from the same group:
            # record the group date ahead of the file date
            sample_obj.source_dates[sample_file.file_group].append(
                sample_file.group_date
            )
            last_group = sample_file.file_group
        sample_obj.source_dates[sample_file.file_group].append(
            sample_file.file_date
        )

    # step - write verified and fofn files, all HiFi groups first, then ONT
    for wanted_read_type, export_flag, table_flag in _READ_TYPE_EXPORTS:
        if not getattr(sample_obj, export_flag):
            continue

        for (read_type, read_group), read_files in sample_obj.sample_files.items():
            if read_type != wanted_read_type:
                continue

            source_date = _select_source_date(sample_obj.source_dates[read_group])
            data_source, project = _resolve_source_and_project(read_group)

            _write_verified_file(sample_obj, read_files)
            fofn_path = _write_fofn_file(
                sample_obj, read_type, read_files,
                project, data_source, source_date
            )

            # add the fofn file to the fofn table only if the other
            # read type is also complete
            if getattr(sample_obj, table_flag):
                remote_fofn_path = str(fofn_path).replace(str(PATH_PREFIX.local), str(PATH_PREFIX.remote))
                fofn_files.append((sample_obj.name, read_type, remote_fofn_path))
            
# Turn the collected (sample, read type, remote fofn path) records into a
# table ordered by sample and read type.
fofn_files = pd.DataFrame.from_records(
    fofn_files,
    columns=["sample", "read_type", "fofn_path"],
)
fofn_files.sort_values(by=["sample", "read_type"], inplace=True)

# Dump the table as tab-separated text below a date header line,
# unless this is a dry run.
if not DRY_RUN:
    with open(fofn_files_out, mode="w") as dump:
        _ = dump.write(f"# {TODAY}\n")
        fofn_files.to_csv(
            dump,
            sep="\t",
            header=True,
            index=False,
        )



Skipping over 19320 - not enough data

Skipping over 19320 - not enough data


FOFN path collision detected: HG00733_hifi_fastq.hgsvc-UW-2023.fofn exists!
