# In [2]:
import pathlib
import pandas as pd
import collections as col
import difflib as diffl

# Work from the project's notebook folder so that the relative notebook path
# below resolves (-q suppresses the directory echo).
# NOTE: this is an absolute, machine-specific path.
%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"

# Absolute path of the shared project config notebook; strict=True makes
# resolve() raise if the notebook does not exist.
_PROJECT_CONFIG_NB = str(pathlib.Path("00_project_config.ipynb").resolve(strict=True))

# Execute the config notebook. Presumably this is what defines PROJECT_BASE and
# TODAY, which are used further down but never assigned in this cell - TODO confirm.
%run $_PROJECT_CONFIG_NB

def read_accession_table(file_path, load_columns, renamed=("file_accession", "project_accession", "file_name")):
    """Build a lookup from stripped read-file names to their accessions.

    Args:
        file_path: Path-like object with ``suffix`` and ``name`` attributes
            pointing to a ``.tsv`` or ``.csv`` accession table.
        load_columns: The three columns to load from the table, given in the
            order file accession, project accession, file name.
        renamed: Unified names assigned to ``load_columns`` (same order).

    Returns:
        dict mapping the plain file name (directory part removed, ``.fastq.gz``
        or ``.bam`` extension stripped) to ``(file_accession, project_accession)``.
        Illumina, "6U00" and CLR entries are skipped. If a plain name occurs
        more than once in the table, the last row wins.

    Raises:
        ValueError: for an unsupported file suffix, when pandas rejects the
            table (e.g. a requested column is missing; the table name is
            printed first), or when a file name fits none of the known
            naming patterns.
    """
    separators = {".tsv": "\t", ".csv": ","}
    try:
        sep = separators[file_path.suffix]
    except KeyError:
        raise ValueError(file_path.suffix) from None

    try:
        df = pd.read_csv(file_path, sep=sep, usecols=load_columns)
    except ValueError:
        # Name the offending table before propagating; helps when the
        # function is called in a loop over many tables.
        print(file_path.name)
        raise
    df = df.rename(columns=dict(zip(load_columns, renamed)))

    file_lut = {}
    # Access the columns by position so that a non-default `renamed` works too.
    records = df[list(renamed)].itertuples(index=False, name=None)
    for file_accession, project_accession, file_name in records:
        # Some tables list full (FTP) paths; only the file name is relevant.
        base_name = file_name.rsplit("/", 1)[-1]
        lower_name = base_name.lower()

        # NOTE: the order of the checks matters, e.g. the ".hifi_reads"
        # suffix is accepted before the Illumina / 6U00 exclusions apply.
        if base_name.endswith(".fastq.gz"):
            plain = base_name[: -len(".fastq.gz")]
        elif base_name.endswith(".bam"):
            plain = base_name[: -len(".bam")]
        elif base_name.endswith(".hifi_reads"):
            plain = base_name
        elif "illumina" in lower_name or "6U00" in base_name:
            continue
        elif "hifi" in lower_name:
            plain = base_name
        elif "-clr-" in lower_name:
            continue
        elif base_name.startswith("PL") and base_name.endswith("run"):
            # 2024-08-14 manual fix for EDEVI data
            plain = base_name
        else:
            # Unknown naming scheme: fail loudly instead of silently dropping
            # the file from the lookup.
            raise ValueError(file_path.name, file_name)

        file_lut[plain] = (file_accession, project_accession)

    return file_lut
        
# Folder holding the accession tables listed below. PROJECT_BASE is not
# assigned in this cell (presumably set by the project config notebook).
LOAD_FOLDER = PROJECT_BASE.joinpath("annotations", "external", "accessions")

# One entry per accession table:
#   (file name inside LOAD_FOLDER,
#    columns to load, in the order file accession / project accession / file name,
#    free-text note on the table's origin or None)
# The note is informational only; the loading loop discards it.
accession_tables = [
    (
        "20240522_HGSVC3-ENA_fastq_upload-summary_UW.YoungjunKwon.tsv",
        ("runId", "project_accession", "file_name"),
        None
    ),
    (
        "20240521_PacBioHiFi_runs_JAX.PilleHallast.tsv",
        ("id", "project_accession", "file_name"),
        None
    ),
    (
        "20240521_ONT-UL_runs_JAX.PilleHallast.tsv",
        ("id", "project_accession", "file_name"),
        None
    ),
    (
        "PRJEB58376_ENA_filereport.tsv",
        ("run_accession", "study_accession", "submitted_ftp"),
        None
    ),
    (
        "PRJNA339722_ENA_filereport.tsv",
        ("run_accession", "study_accession", "run_alias"),
        None
    ),
    (
        "PRJNA731524_ENA_filereport.tsv",
        ("run_accession", "study_accession", "run_alias"),
        None
    ),
    (
        "PRJNA988114_ENA_filereport.tsv",
        ("run_accession", "study_accession", "run_alias"),
        "ONT for 2 samples / chrY paper / UW"
    ),
    (
        "PRJEB36100_ENA_filereport.tsv",
        ("run_accession", "study_accession", "run_alias"),
        "See KM Email 2024-07-10/11 - UW data / HGSVC2"
    ),
    (
        "20240813_HGSVC3-ENA_FASTQ_Accessions_2024_08_13.YK.tsv",
        ("runId", "project_accession", "file_name"),
        None
    ),
    (
        "20240716_ena-upload_jax-hifi-singleton.csv",
        ("id", "studyId", "filename"),
        "Single file upload from JAX / NA19238 hifi"
    ),
    (
        "PRJNA698480_ENA_filereport.tsv",
        ("run_accession", "study_accession", "run_alias"),
        "PacBio data HGSVC2"
    ),
    (
        "PRJEB41778_filereport_ENA.tsv",
        ("run_accession", "study_accession", "submitted_ftp"),
        "PacBio data HGSVC2"
    )
]

# Merge the lookups of all accession tables:
#   plain file name -> set of (file accession, project accession) pairs.
# A name that is listed with different accessions (e.g. in several tables)
# ends up with more than one pair in its set.
all_files = col.defaultdict(set)
for table_file, load_columns, _ in accession_tables:
    table_path = LOAD_FOLDER.joinpath(table_file)
    table_lut = read_accession_table(table_path, load_columns)
    for plain_name, accession_pair in table_lut.items():
        all_files[plain_name].add(accession_pair)

# Draft data freeze: the table of files that need an accession assigned.
# '#' marks comments in that file.
freeze_table = PROJECT_BASE.joinpath(
    "annotations", "data_freezes", "hgsvc3_assembly_data_sources.draft.tsv"
)
data_freeze = pd.read_csv(freeze_table, sep="\t", comment="#")

def check_approx_match(query, targets):
    """Find the target name that best matches ``query`` approximately.

    A handful of known-problematic file names are first replaced by the name
    under which they were actually submitted ("manual-fix"). The (possibly
    replaced) query is then compared to every target; the similarity is the
    length of the longest common contiguous block divided by the length of
    the shorter of the two strings.

    Args:
        query: Plain file name to look up.
        targets: Iterable of known plain file names.

    Returns:
        ``(best_target, match_quality)`` if the best similarity exceeds 0.8,
        where ``match_quality`` is "fuzzy-match" or "manual-fix"; otherwise
        ``(None, None)``. A 2-tuple is always returned so that callers can
        unpack the result unconditionally. Ties are resolved in favour of the
        first target encountered.

    Raises:
        RuntimeError: only if ``query`` is listed in the local ``debug_files``
            hook (developer aid, empty by default).
    """
    # Developer hook: add plain file names here to print the match details
    # for them and stop. Checked against the query as passed in.
    debug_files = ()
    debug = query in debug_files

    match_quality = "fuzzy-match"

    # 2024-08-14 manual fix for EDEVI data: match on the library ID only
    if "EDEVI" in query:
        lib_ids = [part for part in query.split("_") if part.startswith("PL")]
        if len(lib_ids) == 1:
            query = lib_ids[0]
            match_quality = "manual-fix"

    manual_fixes = {
        # 2024-08-14 manual fix for problematic submission
        # HG02818 lib 1 has three submitted BAMs carrying different movie IDs
        # but that are part of the same run. The used file carries the movie ID
        # from one BAM, but the run alias is set to the name of another BAM,
        # hence we switch out the query here
        # This is in metadata for project PRJNA731524
        # See email thread with KM and Chad Tomlinson July 2023 (!)
        # The HG02818 data may have been submitted multiple times as well,
        # potentially also under PRJNA339722 but that project does not contain
        # sufficient metadata to match the files
        "HG02818_m64043_200209_061852.SRR14611219.lib1": "m64043_200206_173947.ccs",
        "m64043_200207_235213.ccs": "m64043_200206_173947.ccs",
        # 2024-08-14 manual fix for submission that is inconsistent in its
        # use of run vs movie identifiers
        # email exchange with KM 2024-07-10/11
        "m54329U_200719_061020.ccs": "HG00514-HiFi-r54329U_20200717_234302-B01",
        "m54329U_200717_235548.ccs": "HG00514-HiFi-r54329U_20200717_234302-A01",
        "m54329U_200715_194535.ccs": "HG00514-HiFi-r54329U_20200715_193257-A01",
    }
    if query in manual_fixes:
        query = manual_fixes[query]
        match_quality = "manual-fix"

    max_sim = 0
    selected_t = None
    if query:
        # SequenceMatcher caches its analysis of the second sequence, so the
        # query is set once and only the first sequence changes per target.
        matcher = diffl.SequenceMatcher()
        matcher.set_seq2(query)
        for target in targets:
            if not target:
                # an empty name cannot match and would divide by zero below
                continue
            matcher.set_seq1(target)
            # explicit bounds keep this working on Python < 3.9 as well
            longest = matcher.find_longest_match(0, len(target), 0, len(query))
            # same value as max(size / len(query), size / len(target))
            similarity = longest.size / min(len(query), len(target))
            if similarity > max_sim:
                max_sim = similarity
                selected_t = target

    if debug:
        print(query)
        print(max_sim)
        print(selected_t)
        raise RuntimeError(f"debug stop for query: {query}")

    if max_sim > 0.8:
        return selected_t, match_quality
    return None, None

# Assign accessions to every file of the data freeze.
#   found:   one record per (file, candidate accession); field order matches
#            the column list used when `found` is turned into a DataFrame
#   missing: files for which no accession could be determined
missing = []
found = []
known_files = list(all_files)

for row in data_freeze.itertuples():
    # aggregate entries and Strand-seq data are not handled here
    if row.filename == "all" or row.datatype == "strandseq":
        continue
    if not row.filename.endswith(".fastq.gz"):
        raise ValueError(row.filename)
    plain = row.filename.rsplit(".", 2)[0]

    if row.sample == "NA24385":
        # externally hosted data: keep the annotated accession, except for
        # ONT where a fixed AWS location is recorded as project accession
        if row.datatype == "ont":
            project_accession = "AWS:S3:human-pangenomics:NHGRI_UCSC_panel/HG002/nanopore/ultra-long"
        else:
            project_accession = row.accession
        found.append(
            (
                row.sample, row.datatype, plain,
                "exact-match", "exact-match",
                row.accession, "external", project_accession,
                0, "yes", row.Index
            )
        )
        continue

    # 2024-08-14 manual fixes after discussions with all involved parties
    if row.filename == "20230626_230620_23-lee-007_PCA100115_3F-run15_guppy-5.0.11-sup-prom_fastq_pass.fastq.gz":
        found.append(
            (
                row.sample, row.datatype, plain,
                "exact-match", "exact-match", "no-accession", "no-accession",
                "no-accession", 0, "no-contaminant-drop", row.Index
            )
        )
        continue

    if plain in all_files:
        matched_name = "exact-match"
        matchq = "exact-match"
        file_accessions = all_files[plain]
    else:
        # The matcher may signal "no match" with a bare None or with a tuple
        # whose first item is None; unpacking a bare None would raise a
        # TypeError and the file would never be recorded as missing.
        match_result = check_approx_match(plain, known_files)
        if match_result is None:
            match_result = (None, None)
        matched_name, matchq = match_result
        if matched_name is None:
            missing.append((row.sample, row.datatype, plain, row.remote_path, row.accession))
            continue
        file_accessions = all_files[matched_name]

    # flag files with several candidate accessions; resolved after the loop
    dup_acc = 1 if len(file_accessions) > 1 else 0
    for file_acc, proj_acc in file_accessions:
        found.append(
            (
                row.sample, row.datatype, plain,
                matched_name, matchq, row.accession,
                file_acc, proj_acc, dup_acc, "yes", row.Index
            )
        )

# Report files without accession in a TSV in the current working directory.
if missing:
    print("Missing files / dumping to file")
    with open("hgsvc3_missing_accessions.hifi-ont.ALL.tsv", "w") as dump:
        dump.write("\t".join(["sample", "datatype", "file_name", "remote_path", "accession"]) + "\n")
        for m in missing:
            # Fields come straight from a pandas row and may not be strings
            # (e.g. NaN for an empty cell); str.join would raise on those.
            dump.write("\t".join(str(field) for field in m) + "\n")


found = pd.DataFrame.from_records(
    found,
    columns=[
        "sample", "datatype", "filename", "matched_name", "match_quality",
        "annotated_project", "file_accession", "project_accession", "has_duplicated_accession",
        "reuse_data", "df_idx"
    ]
)
dups = found.loc[found["has_duplicated_accession"] > 0, :]

# Resolve files with several candidate accessions: keep the candidate(s)
# whose project accession equals the project annotated in the data freeze
# and drop ALL other candidates (not only the first one).
drop_dup_indices = []
for _dup_name, file_infos in dups.groupby("filename"):
    # if annotated == matched, select that
    select_by_project = file_infos["annotated_project"] == file_infos["project_accession"]
    if not select_by_project.any():
        # no candidate agrees with the annotation; needs a manual decision
        raise ValueError(file_infos)
    drop_dup_indices.extend(file_infos.loc[~select_by_project, :].index)

found = found.drop(drop_dup_indices, axis=0)
found = found.sort_values(["sample", "datatype", "filename"])
found = found.reset_index(drop=True)
found = found.drop(["has_duplicated_accession", "annotated_project"], axis=1)
# every data freeze row must be left with exactly one accession record
assert found["df_idx"].nunique() == found.shape[0], (
    "some data freeze rows still have more than one accession assigned"
)

# Write the final table next to the data freeze, preceded by a comment line
# that holds the value of TODAY (not assigned in this cell).
freeze_folder = PROJECT_BASE.joinpath("annotations", "data_freezes")
output_table = freeze_folder.joinpath("hgsvc3_assembly_data_sources.accessions.tsv")
header_comment = f"# {TODAY}\n"

with open(output_table, mode="w") as dump:
    # assigning the return value keeps the notebook from echoing it
    _ = dump.write(header_comment)
    found.to_csv(dump, index=False, header=True, sep="\t")