In [26]:
%run "../../00_project_config.ipynb"
%run "../00_path_config.ipynb"

import pathlib as pl
import pandas as pd
import re
import itertools as itt
import hashlib as hl
import collections as col
import pickle as pck

# Accession-level table of this data freeze; lines starting with "#"
# (the comment header) are skipped by the parser.
acc_table_file = PROJECT_BASE.joinpath(
    "annotations", "data_freezes"
).joinpath("out-120.hgsvc3_data_sources.prep.tsv")
acc_table = pd.read_csv(acc_table_file, sep="\t", comment="#", header=0)

# Pickled cache file, expected in a ".cache" folder next to the
# (resolved) current working directory.
sseq_remote_cache_file = (
    pl.Path(".").resolve().parent / ".cache" / "sseq_igsr_folders.pck"
)
assert sseq_remote_cache_file.is_file()

# NOTE(review): unpickling is only safe for trusted files - presumably
# this cache is generated locally by another notebook; confirm.
with sseq_remote_cache_file.open("rb") as cache_dump:
    sseq_raw_cache = pck.load(cache_dump)

def process_sseq_cache(sseq_cache):
    """Group cached remote folder entries by sample identifier (SIN).

    Args:
        sseq_cache: mapping of a remote directory path (str) to an iterable
            of entry names (str) found in that directory. Each path must
            contain "hgsvc2" or "hgsvc3" (checked case-insensitively).

    Returns:
        A ``defaultdict(list)`` mapping ``"SIN:<5 digits>"`` to a list of
        ``(search_path, sample_remote)`` tuples. ``search_path`` is the last
        component of the path below the project folder (or the second to
        last one if the path ends in "fastq"); ``sample_remote`` is
        ``"IGSR:<project>:/<sub path>/<entry>"``.

    Raises:
        ValueError: if a directory path names neither HGSVC2 nor HGSVC3,
            or if an entry name contains no run of five digits.
    """
    # the first run of five digits in an entry name is taken as the SIN number
    sin_pattern = re.compile(r"[0-9]{5}")

    sseq_folders = col.defaultdict(list)
    for wd, sub_entries in sseq_cache.items():
        wd_lower = wd.lower()
        if "hgsvc2" in wd_lower:
            project = "HGSVC2"
        elif "hgsvc3" in wd_lower:
            project = "HGSVC3"
        else:
            # previously a bare `raise`, which fails with an uninformative
            # "No active exception to reraise" RuntimeError
            raise ValueError(
                f"Cannot determine HGSVC project from cached path: {wd}"
            )
        # NOTE(review): the project is detected case-insensitively, but the
        # split below is case-sensitive (upper-case project name) - a path
        # spelling the project in lower case is kept in full; confirm that
        # this cannot happen for the cached paths.
        sub_path = wd.split(project)[-1].strip().strip("/")
        remote_path = f"IGSR:{project}:/" + sub_path

        path_parts = sub_path.split("/")
        if sub_path.endswith("fastq"):
            # skip the trailing "fastq" component
            search_path = path_parts[-2]
        else:
            search_path = path_parts[-1]

        for entry in sub_entries:
            entry_lower = entry.lower()
            if "readme" in entry_lower or "manifest" in entry_lower:
                continue
            # infer SIN
            mobj = sin_pattern.search(entry)
            if mobj is None:
                raise ValueError(wd, entry)
            # the pattern guarantees exactly five digits
            sin = f"SIN:{mobj.group(0)}"
            sample_remote = remote_path + f"/{entry}"
            sseq_folders[sin].append((search_path, sample_remote))
    return sseq_folders


# SIN -> list of (search_path, sample_remote) tuples, built from the pickled cache
sseq_remotes = process_sseq_cache(sseq_raw_cache)


def get_remote_prefix_header():
    """Return the '#' comment block that follows the first line of the accession table.

    The first line of ``acc_table_file`` is always skipped; every following
    line is collected (stripped, newline-terminated) until the first line
    that does not start with "#".

    Returns:
        The collected comment lines as a single string.

    Raises:
        AssertionError: if no comment line was collected.
    """
    comment_lines = []
    with open(acc_table_file, "r") as table:
        # the first line is discarded unconditionally
        table.readline()
        for raw_line in table:
            if not raw_line.startswith("#"):
                break
            comment_lines.append(raw_line.strip() + "\n")
    header = "".join(comment_lines)
    assert header
    return header

# File-level table of this data freeze; lines starting with "#" are skipped
file_table_file = PROJECT_BASE.joinpath(
    "annotations", "data_freezes"
).joinpath("out-130.hgsvc3_data_sources.files.tsv")
file_table = pd.read_csv(file_table_file, sep="\t", comment="#", header=0)

# Destination of the merged table written at the end of this script
output_table = PROJECT_BASE.joinpath(
    "annotations", "data_freezes"
).joinpath("out-140.hgsvc3_data_sources.unpolished.tsv")

# Per-sample annotation columns copied from the accession table
sample_info_columns = (
    "family",
    "member",
    "population",
    "supergroup",
    "hgsvc_sample_number",
    "phase",
    "sample",
    "sex",
    "is_child",
    "verkko_assembly_batch",
)

# Lookup keyed by SIN; if a SIN occurs in several accession rows,
# the last row wins.
sample_info_lut = {}
for acc_row in acc_table.itertuples():
    sample_info_lut[acc_row.SIN] = {
        column_name: getattr(acc_row, column_name)
        for column_name in sample_info_columns
    }

# One row per SIN, with the SIN moved from the index into a regular column
sample_info = pd.DataFrame.from_records(sample_info_lut).transpose()
sample_info = sample_info.reset_index(drop=False)
sample_info = sample_info.rename(columns={"index": "SIN"})


# Pass 1: link each accession row to its file rows via a shared join key.
# A file row qualifies if SIN and datatype agree and its remote path equals
# the accession's remote path, optionally extended by the sample name
# (verbatim, or with the first two characters replaced by "GM").

# loop-invariant: never link rows whose filename is the literal "all"
not_all = file_table["filename"] != "all"

join_keys = []
for row in acc_table.itertuples():
    select_sin = file_table["SIN"] == row.SIN
    select_datatype = file_table["datatype"] == row.datatype
    remote_exact = file_table["remote_path"] == row.remote_path
    remote_maybe = file_table["remote_path"] == f"{row.remote_path}/{row.sample}"
    remote_fuzzy = file_table["remote_path"] == f"{row.remote_path}/{'GM'+row.sample[2:]}"

    selector = not_all & select_sin & select_datatype & (remote_exact | remote_maybe | remote_fuzzy)

    if selector.any():
        file_rows = file_table.index[selector]
        # NB: `[row.Index] + file_rows` does not concatenate - adding a list
        # to a pandas Index is element-wise addition, so the key would be
        # derived from index sums and could collide between unrelated rows.
        # Build the key from explicitly tagged and delimited labels instead.
        key_parts = [f"acc:{row.Index}"]
        key_parts.extend(f"file:{label}" for label in sorted(file_rows))
        join_key = hl.md5("|".join(key_parts).encode("utf-8")).hexdigest()
        join_keys.append(
            (join_key, row.Index, file_rows)
        )

# "unknown" marks rows that have not been linked (yet)
acc_table["join_key"] = "unknown"
file_table["join_key"] = "unknown"

for jk, acc_idx, file_idx in join_keys:
    acc_table.loc[acc_idx, "join_key"] = jk
    file_table.loc[file_idx, "join_key"] = jk
    
# Split both tables into rows already linked by a join key and rows
# still marked "unknown"
acc_is_linked = acc_table["join_key"] != "unknown"
file_is_linked = file_table["join_key"] != "unknown"

# linked rows; the accession-side remote path is dropped so that only
# the file-side "remote_path" column takes part in the merge
subset_acc = acc_table.loc[acc_is_linked, :].drop(columns="remote_path")
subset_file = file_table.loc[file_is_linked, :].copy()

direct_match = pd.merge(
    subset_acc, subset_file,
    how="outer", on=["join_key", "SIN", "datatype"]
)

# unlinked rows - input to the second matching pass
subset_acc2 = acc_table.loc[~acc_is_linked, :].copy()
subset_file2 = file_table.loc[~file_is_linked, :].copy()

# Pass 2: same procedure for the rows still unlinked, this time matching
# on SIN and datatype only (the remote path is ignored).

# loop-invariant: never link rows whose filename is the literal "all"
not_all = subset_file2["filename"] != "all"

join_keys = []
for row in subset_acc2.itertuples():
    select_sin = subset_file2["SIN"] == row.SIN
    select_datatype = subset_file2["datatype"] == row.datatype

    selector = select_sin & select_datatype & not_all

    if selector.any():
        # NB: this requires that subset_file2
        # was not re-indexed!
        file_rows = subset_file2.index[selector]
        # NB: `[row.Index] + file_rows` does not concatenate - adding a list
        # to a pandas Index is element-wise addition, so the key would be
        # derived from index sums and could collide between unrelated rows.
        # Build the key from explicitly tagged and delimited labels instead.
        key_parts = [f"acc:{row.Index}"]
        key_parts.extend(f"file:{label}" for label in sorted(file_rows))
        join_key = hl.md5("|".join(key_parts).encode("utf-8")).hexdigest()
        join_keys.append(
            (join_key, row.Index, file_rows)
        )

# Write the new keys back into the full tables; the index labels of the
# subsets are those of the original tables.
for jk, acc_idx, file_idx in join_keys:
    # an accession row linked in the first pass must not be linked again
    assert acc_table.loc[acc_idx, "join_key"] == "unknown"
    acc_table.loc[acc_idx, "join_key"] = jk
    file_table.loc[file_idx, "join_key"] = jk

# The accession table's remote path is only approximately right for
# strand-seq, so keep a copy of those rows (with refined remote paths)
# before that column is dropped from the accession table.
sseq_acc = acc_table.loc[acc_table["datatype"] == "strandseq", :].copy()

seen_idx = set()
for sseq_sin, remotes in sseq_remotes.items():
    subset = sseq_acc.loc[sseq_acc["SIN"] == sseq_sin, :]
    if subset.empty:
        # SIN from the cache has no strand-seq accession row
        continue

    update_remotes = []
    for acc_row in subset.itertuples():
        # first cached remote whose folder tag occurs in the row's remote
        # path; a row with remote path "unknown" takes the first remote
        new_remote = next(
            (
                sample_remote
                for folder_tag, sample_remote in remotes
                if folder_tag in acc_row.remote_path
                or acc_row.remote_path == "unknown"
            ),
            None,
        )
        if new_remote is None:
            continue
        assert acc_row.Index not in seen_idx
        update_remotes.append((acc_row.Index, new_remote))
        seen_idx.add(acc_row.Index)

    # every accession row of this SIN must have found a remote
    matched = len(update_remotes)
    assert matched == subset.shape[0]

    for row_label, new_remote in update_remotes:
        sseq_acc.loc[row_label, "remote_path"] = new_remote

acc_table.drop(columns="remote_path", inplace=True)

# Rows of either table that received a join key in one of the two passes
select_acc = acc_table["join_key"] != "unknown"
select_files = file_table["join_key"] != "unknown"

# inner join (pandas default) of the linked rows
linked_acc = acc_table.loc[select_acc, :].copy()
linked_files = file_table.loc[select_files, :].copy()
merged = pd.merge(linked_acc, linked_files, on=["join_key", "SIN", "datatype"])

# Unlinked accession rows must all be strand-seq pointers; they are
# appended below via the 'sseq_acc' object set aside earlier, which
# holds the same rows.
remain_acc = ~select_acc
assert (acc_table.loc[remain_acc, "datatype"] == "strandseq").all()

# Unlinked file rows have no accession; they require some manual fixes
# later on. Sample annotation is attached via the SIN (inner join).
remain_files = file_table.loc[~select_files, :].copy()
remain_files["accession"] = "unknown"
remain_files = remain_files.merge(
    sample_info, left_on=["SIN"], right_on=["SIN"]
)

# NB: sseq_acc info added verbatim to keep remote_path intact
complete_merge = pd.concat([merged, sseq_acc, remain_files])

# Replace missing values introduced by concatenating tables with differing
# columns: 0. for columns with "cov" in the name, integer 0 for columns
# with "num" or "length" in the name.
# NB: the loop variable must not be called `col` - that would overwrite
# the `collections as col` module alias. The filled column is assigned
# back explicitly because an in-place fillna on `frame[column]` is a
# chained update that does not reach the frame under Copy-on-Write.
for column in complete_merge.columns:
    if "cov" in column:
        complete_merge[column] = complete_merge[column].fillna(0.)
    if "num" in column or "length" in column:
        complete_merge[column] = complete_merge[column].fillna(0).astype(int)

# set filename to unknown for strandseq
select_dtype = complete_merge["datatype"] == "strandseq"
complete_merge.loc[select_dtype, "filename"] = "unknown"

# strand-seq rows without a remote path: existence cannot be stated
select_no_remote = complete_merge["remote_path"] == "unknown"
complete_merge.loc[select_dtype & select_no_remote, "igsr_folder_exists"] = "unknown"
complete_merge.loc[select_dtype & select_no_remote, "igsr_file_exists"] = "unknown"

# strand-seq rows whose remote path contains "HGSVC": the folder is marked
# as existing, individual files remain unknown.
# na=False: a missing remote path counts as "no match".
select_igsr_remote = complete_merge["remote_path"].str.contains("HGSVC", na=False)
complete_merge.loc[select_dtype & select_igsr_remote, "igsr_folder_exists"] = "yes"
complete_merge.loc[select_dtype & select_igsr_remote, "igsr_file_exists"] = "unknown"


# Write the merged table: a date stamp line first, then the "#" comment
# header copied from the accession table, then the tab-separated data.
source_prefix_header = get_remote_prefix_header()
with open(output_table, "w") as dump:
    dump.writelines((f"# {TODAY}\n", source_prefix_header))
    complete_merge.to_csv(dump, sep="\t", header=True, index=False)

print(complete_merge.shape)

(921, 30)
