In [2]:
%run "../../00_project_config.ipynb"
%run "../00_path_config.ipynb"

import pathlib as pl
import pandas as pd
import re
import itertools as itt

# Input: the prepared chrY data-source sheet.
prep_table = PROJECT_BASE.joinpath(
    "annotations",
    "data_freezes",
    "out-110.chrY_sheet1_data_sources.prep.tsv",
)

# Output: the HGSVC3 data-source table written at the end of this cell.
out_table = PROJECT_BASE.joinpath(
    "annotations",
    "data_freezes",
    "out-120.hgsvc3_data_sources.prep.tsv",
)

df = pd.read_csv(prep_table, sep="\t", comment="#", header=0)

# Sorted sample names that appear in the sheet but not in HGSVC_SAMPLES.
unk_samples = sorted(
    df.loc[~df["sample"].isin(HGSVC_SAMPLES["sample"]), "sample"].values
)

# Keep only the rows of known samples and drop the two population columns
# from the sheet before merging it onto HGSVC_SAMPLES.
df = df[df["sample"].isin(HGSVC_SAMPLES["sample"])].copy()
df = df.drop(columns=["population", "super_population"])

full_table = HGSVC_SAMPLES.merge(df, left_on="sample", right_on="sample", how="outer")
full_table = full_table.rename(
    columns={
        "order_num": "hgsvc_sample_number",
        "batch_num": "verkko_assembly_batch",
    }
)

# A "-" placeholder, or a family named after the sample itself, is recorded
# as "single".
full_table["member"] = full_table["member"].replace("-", "single")
full_table["family"] = full_table["family"].replace("-", "single")
full_table.loc[full_table["family"] == full_table["sample"], "family"] = "single"

# Only rows with a positive assembly batch number are retained.
full_table = full_table[full_table["verkko_assembly_batch"] > 0].copy()

def simplify_remote(remote_location):
    """Normalize a free-text remote location into a bare location string.

    Args:
        remote_location: Cell value of a ``remote_*`` column; may be null.

    Returns:
        ``"unknown"`` if the value is null or contains ``"PRJ"``;
        otherwise the location with surrounding whitespace, a leading
        ``http(s)://`` scheme and leading/trailing slashes removed.

    Raises:
        IndexError: If the value starts with ``"rebasecalled"`` but contains
            no ``"-"`` separator.
    """
    if pd.isnull(remote_location):
        return "unknown"
    loc = remote_location.strip()
    if loc.startswith("rebasecalled"):
        # NOTE(review): only the text between the first and second "-" is
        # kept, so a location that itself contains "-" gets truncated here.
        # Left unchanged because the expected annotation format is not
        # visible in this file - confirm against the input sheet.
        loc = loc.split("-")[1].strip()
    # str.strip("https:/") would remove a *set of characters* from both
    # ends (eating e.g. the leading "s" of "s3-..." or a trailing "s" of
    # ".../reads"). Remove the scheme as an anchored prefix instead and
    # only trim slashes at the ends.
    loc = re.sub(r"^https?:/+", "", loc).strip("/")
    # Anything containing "PRJ" is not usable as a location here
    # (presumably an accession entered in the wrong column).
    if "PRJ" in loc:
        return "unknown"
    return loc


def simplify_accession(accession_num):
    """Reduce an accession cell to its leading project accession.

    Args:
        accession_num: Cell value of an ``accession_*`` column; may be null.

    Returns:
        The ``PRJ...`` accession matched at the start of the value, or
        ``"unknown"`` when the value is null or has no such accession but
        mentions ``"T2T"``.

    Raises:
        ValueError: If the value neither starts with a project accession
            nor contains ``"T2T"``.
    """
    if pd.isnull(accession_num):
        return "unknown"

    hit = re.match("PRJ[NAEB]+[0-9]+", accession_num)
    if hit is not None:
        # re.match anchors at the start, so this is the leading accession.
        return hit.group(0)

    if "T2T" not in accession_num:
        raise ValueError(accession_num)
    return "unknown"


# Short source key -> tag that is put in front of a shortened remote path.
prefixes = dict(
    hgsvc2="IGSR:HGSVC2:",
    hgsvc3="IGSR:HGSVC3:",
    awshprc="AWS:HPRC:",
)

# Tag -> location the tag stands for. The insertion order is the order in
# which these entries are written to the header of the output table.
prefix_paths = {
    prefixes["hgsvc2"]: "ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2",
    prefixes["hgsvc3"]: "ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC3",
    prefixes["awshprc"]: "s3-us-west-2.amazonaws.com/human-pangenomics/index.html?prefix=",
}


def prefix_remote(remote_url):
    """Replace the known base of a remote location with its short tag.

    Args:
        remote_url: A single, already simplified remote location string.

    Returns:
        ``<tag><remainder>`` for HGSVC3, HGSVC2 and HPRC (AWS) locations,
        where the tag comes from the module-level ``prefixes`` table, or
        ``"unknown"`` for the literal ``"unknown"`` and for locations
        containing ``"github.com"`` or ``"NA"``.

    Raises:
        ValueError: If the location matches none of the known patterns.
    """
    if remote_url == "unknown":
        return "unknown"

    # (marker identifying the source, key into `prefixes`, text after whose
    # last occurrence the retained path starts) - checked in this order.
    known_sources = (
        ("data_collections/HGSVC3", "hgsvc3", "HGSVC3"),
        ("data_collections/HGSVC2", "hgsvc2", "HGSVC2"),
        ("amazonaws.com/human-pangenomics", "awshprc", "prefix="),
    )
    for marker, source_key, separator in known_sources:
        if marker in remote_url:
            remainder = remote_url.split(separator)[-1]
            return f"{prefixes[source_key]}{remainder}"

    # Locations without a usable data path collapse to "unknown".
    if "github.com" in remote_url or "NA" in remote_url:
        return "unknown"

    raise ValueError(remote_url)


def prefix_split_remotes_accession(data_row, datatype):
    """Expand one table row into per-remote records for a data type.

    The comma-separated ``remote_<datatype>`` and ``accession_<datatype>``
    values of the row are split, each remote is shortened via
    ``prefix_remote``, and remotes and accessions are paired up
    positionally. If one list is shorter, its last entry is reused for the
    remaining items of the other.

    Args:
        data_row: Row object exposing ``name`` (used as the sample
            identifier) and the string attributes ``remote_<datatype>``
            and ``accession_<datatype>``.
        datatype: One of ``"hifi"``, ``"ont"`` or ``"strandseq"``.

    Returns:
        A tuple of ``(sample, datatype, remote, collection, accession)``
        tuples, where ``collection`` is the last path component of
        ``remote`` after skipping a trailing ``fastq``/``bam`` folder and
        a folder that looks sample-specific.
    """
    assert datatype in ["hifi", "ont", "strandseq"]
    sample = data_row.name

    remote_fields = [
        prefix_remote(remote.strip().strip("/"))
        for remote in getattr(data_row, f"remote_{datatype}").strip().split(",")
    ]
    acc_fields = [
        acc.strip()
        for acc in getattr(data_row, f"accession_{datatype}").strip().split(",")
    ]

    # zip_longest only pads the shorter list, so pick that list's last entry.
    fill_value = acc_fields[-1] if len(acc_fields) < len(remote_fields) else remote_fields[-1]

    fields = []
    for remote, acc in itt.zip_longest(remote_fields, acc_fields, fillvalue=fill_value):
        parts = remote.split("/")
        # Walk upwards from the last component. Tracking the level (instead
        # of re-reading a fixed [-2]) makes ".../<collection>/<sample>/fastq"
        # resolve to <collection> rather than to the sample folder, and the
        # `level > 0` guards avoid an IndexError on single-component paths.
        level = len(parts) - 1
        if level > 0 and parts[level] in ("fastq", "bam"):
            level -= 1
        # A folder counts as sample-specific when its name without the
        # first two characters occurs in the sample identifier. The suffix
        # must be non-empty: "" is contained in every string and would make
        # any name of two characters or fewer match.
        sample_suffix = parts[level][2:]
        if level > 0 and sample_suffix and sample_suffix in sample:
            level -= 1
        fields.append((sample, datatype, remote, parts[level], acc))
    return tuple(fields)


# Normalize the free-text remote and accession columns of every data type.
full_table = full_table.assign(
    **{
        f"remote_{readset}": full_table[f"remote_{readset}"].apply(simplify_remote)
        for readset in ("hifi", "ont", "strandseq")
    },
    **{
        f"accession_{readset}": full_table[f"accession_{readset}"].apply(simplify_accession)
        for readset in ("hifi", "ont", "strandseq")
    },
)

# The row index is "SIN:" followed by the sample name without its first
# two characters.
full_table["SIN"] = full_table["sample"].map(lambda name: f"SIN:{name[2:]}")
full_table = full_table.set_index("SIN")

# One flat list of (SIN, datatype, remote, collection, accession) records
# per data type; each row contributes a tuple of such records.
hifi_fields = list(
    itt.chain.from_iterable(
        full_table.apply(prefix_split_remotes_accession, axis=1, args=("hifi",)).values
    )
)
ont_fields = list(
    itt.chain.from_iterable(
        full_table.apply(prefix_split_remotes_accession, axis=1, args=("ont",)).values
    )
)
sseq_fields = list(
    itt.chain.from_iterable(
        full_table.apply(prefix_split_remotes_accession, axis=1, args=("strandseq",)).values
    )
)

# The wide per-datatype columns are replaced by the long-format records.
full_table = full_table.drop(
    columns=[
        "remote_hifi", "accession_hifi",
        "remote_ont", "accession_ont",
        "remote_strandseq", "accession_strandseq",
    ]
)

column_names = ["SIN", "datatype", "remote_path", "remote_collection", "accession"]

hifi = pd.DataFrame.from_records(hifi_fields, columns=column_names).set_index("SIN")
ont = pd.DataFrame.from_records(ont_fields, columns=column_names).set_index("SIN")
sseq = pd.DataFrame.from_records(sseq_fields, columns=column_names).set_index("SIN")

# Stack the three long tables (keeping the SIN index) and attach them to
# the sample table via that index.
merged = pd.concat([hifi, ont, sseq], axis=0, ignore_index=False)

full_table = full_table.merge(merged, left_index=True, right_index=True, how="outer")
full_table = full_table.sort_values(["hgsvc_sample_number", "datatype"])

# Write a commented header (date plus the tag -> location legend) followed
# by the tab-separated table itself.
with open(out_table, "w") as dump:
    dump.writelines(
        itt.chain(
            (f"# {TODAY}\n", "# SOURCE PREFIX PATHS\n"),
            (f"# {tag} '{location}'\n" for tag, location in prefix_paths.items()),
        )
    )
    full_table.to_csv(dump, sep="\t", header=True, index=True, index_label="SIN")
        