In [2]:
%run "../00_project_config.ipynb"
%run "05_annotations.ipynb"
%run "10_data_objects.ipynb"
%run "20_process_sample_table.ipynb"
%run "30_process_data_sources.ipynb"

# Folder names that identify manually curated data. Each name maps to
# the curator label recorded for a file whose relative path contains
# that folder name. Entries are checked in the order listed and the
# first hit is used.
MANUALLY_CURATED_FOLDERS = {
    "20230501_HGSVC_UL_ONT-UW": "UW_WH",
    "20211013_ONT_Rebasecalled": "JAX_PA",
    "20230706_HGSVC_EEE_UL_ONT": "UW_WH",
    "20230703_HGSVC_EEE_HIFI": "UW_WH",
    "20220831_JAX_HiFi": "JAX_PA",
    "20230905_HGSVC_EEE_UL_ONT": "UW_WH",
    "20231126_UW_HiFi": "UW_WH",
    "20240117_UW_HiFi": "UW_WH",
}

# Cells that were sequenced again because the original cell was
# contaminated (NA18939). Maps the cell identifier to the curator label.
RESEQ_REPLACEMENTS = {"m64076_230717_214340-bc2082": "UW_WH"}

# Output locations of the auto-generated tables, all placed in
# <PROJECT_BASE>/annotations/autogen.
AUTO_TABLE_ALL_KNOWN, AUTO_TABLE_ALL_FILES, AUTO_TABLE_ERR_FILES = (
    PROJECT_BASE.joinpath("annotations", "autogen", table_name)
    for table_name in ("annotated_files.tsv", "existing_files.tsv", "error_files.tsv")
)
# make sure the target folder exists before any table is written
for _autogen_table in (AUTO_TABLE_ALL_KNOWN, AUTO_TABLE_ALL_FILES, AUTO_TABLE_ERR_FILES):
    _autogen_table.parent.mkdir(parents=True, exist_ok=True)

matched_records = []
# Walk over every file annotated in the metadata table (which
# largely applies to batch 1 only) and look it up among the files
# that exist for the same sample.
for row in KNOWN_FILES.itertuples(index=True):
    files_for_sample = FILES_EXIST_PER_SAMPLE[row.sample]
    sample_batch = SAMPLES[row.sample].batch_num

    # an existing file is a hit if its read type agrees with the
    # annotation and its name contains the annotated cell identifier
    matched_files = [
        candidate for candidate in files_for_sample
        if row.read_type == candidate.read_type and row.cell in candidate.file_name
    ]

    # one record per hit; the trailing 1 marks the row as matched
    matched_records.extend(
        (
            row.Index, sample_batch,
            hit.data_rel_path, hit.file_date,
            hit.file_group, hit.group_date, 1
        )
        for hit in matched_files
    )

    if len(matched_files) > 1:
        # an annotated file must not match several existing files:
        # show the candidates and abort
        for mf in matched_files:
            print(mf)
        raise ValueError(f"Multi-match: {row}")

    if matched_files:
        # exactly one hit: record the match on the data file object itself
        unique_hit = matched_files[0]
        unique_hit.set_matched_entry(row.Index)
        unique_hit.set_curator("metadata_table")
    
# Turn the collected matches into a table that is indexed by the row
# index of the metadata table, so both tables can be joined on their index.
matched_records = pd.DataFrame.from_records(
    matched_records,
    columns=["index", "processing_batch", "file_rel_path", "file_date", "file_group", "group_date", "matched"]
).set_index("index")

# The outer join keeps annotated files that have no existing counterpart;
# all columns coming from matched_records are missing for those rows.
MERGED = KNOWN_FILES.merge(matched_records, how="outer", left_index=True, right_index=True)

# Numeric defaults first (0 = not matched, -1 = no processing batch), then
# "n/a" for every remaining missing value. The per-column defaults are
# applied via a mapping and assigned back: calling fillna(inplace=True) on
# a column selection such as MERGED["matched"] is not guaranteed to modify
# MERGED itself (it never does under pandas copy-on-write), in which case
# the "n/a" fill would hit these columns and the integer cast would fail.
MERGED = MERGED.fillna({"matched": 0, "processing_batch": -1})
MERGED = MERGED.fillna("n/a")
MERGED = MERGED.astype({"matched": int, "processing_batch": int})

# plain file name of the matched file; stays "n/a" if there is no file
MERGED["file_name"] = MERGED["file_rel_path"].apply(lambda x: "n/a" if x == "n/a" else pl.Path(x).name)
MERGED = MERGED.sort_index()

curated_records = []
unknown_records = []
# Classify every existing file that was not matched against the metadata
# table: it is either covered by manual curation or reported as an error.
for sample, files_for_sample in FILES_EXIST_PER_SAMPLE.items():
    sample_batch = SAMPLES[sample].batch_num
    for sample_file in files_for_sample:
        if sample_file.matched_entry is not None:
            # already has a matched entry - nothing to do
            continue

        # first try the manually curated folders and, only if that did
        # not produce a match, the re-sequenced replacement cells
        for curation_lookup in (MANUALLY_CURATED_FOLDERS, RESEQ_REPLACEMENTS):
            if sample_file.matched_entry is not None:
                break
            rel_path = str(sample_file.data_rel_path)
            curator = next(
                (label for token, label in curation_lookup.items() if token in rel_path),
                None
            )
            if curator is None:
                continue
            # -1 is used as the entry for files without a metadata table row
            sample_file.set_matched_entry(-1)
            sample_file.set_curator(curator)
            sample_file.set_sample_batch(sample_batch)

        if sample_file.matched_entry is not None:
            curated_records.append(sample_file.get_table_row(batch_num=True))
        else:
            # neither annotated nor curated
            sample_file.set_curator("error")
            sample_file.set_sample_batch(sample_batch)
            unknown_records.append(sample_file.get_table_row(batch_num=True))

# NOTE: sample_file is the last data file object bound by the preceding
# per-sample loop; it is only used here to obtain the table header.
ERR_FILES = pd.DataFrame.from_records(
    unknown_records,
    columns=sample_file.get_table_header(batch_num=True)
)

curated_records = pd.DataFrame.from_records(
    curated_records,
    columns=sample_file.get_table_header(batch_num=True)
)
# Add the columns that the merged metadata table has, using
# placeholder values where no information is available for
# manually curated files, and drop the then-redundant file path.
curated_records = curated_records.assign(
    alt_id=curated_records["file_name"],
    file_rel_path=curated_records["file_path"],
    cell="cell-id-na",
    data_source=curated_records["curated_by"],
    file_date="unset",
    file_source="manual",
    project=curated_records["curated_by"],
).drop(columns="file_path")

# rows that are already in MERGED are labeled "metadata" before
# the manually curated files are appended
MERGED = MERGED.assign(curated_by="metadata")
MERGED = pd.concat([MERGED, curated_records], axis=0, ignore_index=False)

# entries without an existing file are moved to the error table
lacks_file = MERGED["file_name"] == "n/a"
if lacks_file.any():
    ERR_FILES = pd.concat([ERR_FILES, MERGED.loc[lacks_file, :].copy()], axis=0, ignore_index=False)
    MERGED = MERGED.loc[~lacks_file, :].copy()

MERGED = (
    MERGED
    .sort_values(["sample", "read_type", "file_rel_path"])
    .reset_index(drop=True)
)

# Table of all files that are annotated or located in a curated folder;
# the table is preceded by a comment line holding the value of TODAY.
with open(AUTO_TABLE_ALL_KNOWN, "w") as known_table:
    print(f"# {TODAY}", file=known_table)
    MERGED.to_csv(known_table, sep="\t", header=True, index=True, index_label="index")

    
# # files that exist in a file system location
# # with a sample listing file
# ALL_FILES = pd.DataFrame.from_records(
#     all_files_table,
#     columns=all_files_header
# )
# ALL_FILES.sort_values(["sample", "read_type", "file_group", "file_name"], inplace=True)
# with open(AUTO_TABLE_ALL_FILES, "w") as dump_table:
#     _ = dump_table.write(f"# {TODAY}\n")
#     ALL_FILES.to_csv(
#         dump_table,
#         sep="\t",
#         header=True,
#         index=False
#     )

# Table of all unknown files (written without the index column);
# the table is preceded by a comment line holding the value of TODAY.
with open(AUTO_TABLE_ERR_FILES, "w") as error_table:
    print(f"# {TODAY}", file=error_table)
    ERR_FILES.to_csv(error_table, sep="\t", header=True, index=False)


Skipping over 19320 - not enough data

Skipping over 19320 - not enough data
