In [1]:
%run "../00_project_config.ipynb"

import pandas as pd
import pathlib as pl

def load_complete_annotation(file_path, data_source, read_type, file_source):
    """Load a tab-separated annotation table that has SAMPLE and CELL columns.

    Only the SAMPLE and CELL columns of the input are kept (renamed to
    lower case); lines starting with "#" are treated as comments.

    Args:
        file_path: Location of the table. May be a path object or a plain
            string; anything exposing a ``name`` attribute is used as-is.
        data_source: Label stored in both the "data_source" and "project"
            columns.
        read_type: Label stored in the "read_type" column.
        file_source: Not read by this function; kept so existing callers
            keep working. The "file_source" column is always filled with
            the base name of ``file_path``.

    Returns:
        A DataFrame with the columns sample, cell, alt_id, read_type,
        data_source, file_source and project (in that order). "alt_id"
        always holds the cell value exactly as it appears in the file.
    """
    raw = pd.read_csv(file_path, sep="\t", header=0, comment="#")

    # Build a fresh two-column frame rather than adding columns to a slice
    # of the parsed table.
    df = pd.DataFrame({"sample": raw["SAMPLE"], "cell": raw["CELL"]})
    df["alt_id"] = df["cell"]

    # special bug fix because annotated file names
    # are not identical to actual data files ...
    # Cells without a match for the pattern end up as missing values.
    if read_type == "hifi" and data_source == "HGSVCjax":
        df["cell"] = df["cell"].str.extract("(m[0-9_]+)", expand=False)

    # pd.read_csv accepts plain string paths, which have no ``name``
    # attribute, so derive the base name via pathlib in that case.
    try:
        source_name = file_path.name
    except AttributeError:
        source_name = pl.Path(file_path).name

    df["read_type"] = read_type
    df["data_source"] = data_source
    df["file_source"] = source_name
    df["project"] = data_source

    return df


def load_incomplete_annotations(file_path, data_source, read_type, file_source, header):
    """Load a tab-separated annotation table whose columns are named by ``header``.

    The first line of the file is skipped and the columns are named after
    ``header`` instead; lines starting with "#" are treated as comments.
    Columns that ``header`` does not provide are filled in from the
    arguments.

    Args:
        file_path: Location of the table. May be a path object or a plain
            string; anything exposing a ``name`` attribute is used as-is.
        data_source: Value for the "data_source" and "project" columns when
            the table does not provide them.
        read_type: Value for the "read_type" column when the table does not
            provide it.
        file_source: Not read by this function; kept so existing callers
            keep working. The "file_source" column is always filled with
            the base name of ``file_path``.
        header: Column names to assign to the table; must include "sample"
            and "cell".

    Returns:
        A DataFrame holding the ``header`` columns followed by any of
        read_type, alt_id, data_source and project that were missing, and
        finally file_source. The cell, sample and read_type columns are
        stripped of surrounding whitespace.
    """
    df = pd.read_csv(
        file_path, sep="\t", header=None, names=header, comment="#", skiprows=1
    )

    # Fill in missing columns, in a fixed order so that the resulting
    # column layout is stable. "alt_id" copies the cell column *before* it
    # is stripped below, i.e. it keeps the value exactly as read.
    defaults = (
        ("read_type", read_type),
        ("alt_id", None),
        ("data_source", data_source),
        ("project", data_source),
    )
    for column, value in defaults:
        if column in df.columns:
            continue
        df[column] = df["cell"] if column == "alt_id" else value

    # pd.read_csv accepts plain string paths, which have no ``name``
    # attribute, so derive the base name via pathlib in that case.
    try:
        source_name = file_path.name
    except AttributeError:
        source_name = pl.Path(file_path).name
    df["file_source"] = source_name

    for column in ("cell", "sample", "read_type"):
        df[column] = df[column].str.strip()

    return df


# Tables read with load_complete_annotation:
# (data source, read type, file name)
complete_annotations = [
    ("HGSVCjax", "hifi", "20230306_hgsvc_hifi_cell-jax.tsv"),
    ("HGSVCjax", "ont", "20230306_hgsvc_ont_cell-jax.tsv"),
    ("HGSVCuw", "hifi", "20230627_hgsvc_hifi_cell-UW.tsv"),
    ("HGSVCuw", "ont", "20230627_hgsvc_ont_cell-UW.tsv"),
]

# Tables read with load_incomplete_annotations:
# (data source, read type, file name, column names to assign)
incomplete_annotations = [
    ("HGSVCjax", "hifi", "20230308_hgsvc_hifi-jax.old.tsv", ["sample", "cell"]),
    ("HGSVCumigs", "hifi", "20230308_hgsvc_hifi-umigs.old.tsv", ["sample", "cell", "alt_id"]),
    ("HGSVCumigs", "hifi", "20230314_hgsvc_hifi-cells.umigs.tsv", ["sample", "cell", "alt_id"]),
    ("ena", "hifi", "external_hifi.tsv", ["sample", "cell", "read_type", "data_source", "project"]),
    ("epi2me", "ont", "external_ont_epi2me.tsv", ["sample", "cell", "read_type", "data_source", "project"]),
    ("hprc", "ont", "external_ont_hprc.tsv", ["sample", "cell", "read_type", "data_source", "project"]),
]

# One table per listed file, stacked row-wise; the per-file row labels are
# kept here and only renumbered for the final table below.
all_complete = pd.concat(
    [
        load_complete_annotation(
            PROJECT_BASE.joinpath("annotations", "external", tsv_name),
            source, tech, tsv_name,
        )
        for source, tech, tsv_name in complete_annotations
    ]
)

all_incomplete = pd.concat(
    [
        load_incomplete_annotations(
            PROJECT_BASE.joinpath("annotations", "external", tsv_name),
            source, tech, tsv_name, column_names,
        )
        for source, tech, tsv_name, column_names in incomplete_annotations
    ]
)

# Combined table of all annotated files, sorted and with a fresh 0..n-1 index.
KNOWN_FILES = (
    pd.concat([all_complete, all_incomplete])
    .sort_values(["sample", "read_type", "data_source", "cell"])
    .reset_index(drop=True)
)
