In [32]:
%run "../../00_project_config.ipynb"
%run "../00_path_config.ipynb"

import pathlib as pl
import pandas as pd
import re
import pickle as pck
import itertools as itt
import hashlib as hl
import numpy as np
import collections as col

# Source table read by this cell (both the '#' comment header and the data).
# PROJECT_BASE is not defined in this cell; presumably it is provided by the
# config notebooks executed via %run above - TODO confirm.
input_table = PROJECT_BASE.joinpath(
    "annotations", "data_freezes",
    "out-140.hgsvc3_data_sources.unpolished.tsv"
)

# Value assigned to "remote_path" for the NA24385 records further below.
HHU_GLOBUS = "TMP:HHU:GLOBUS:/temp_data_sharing"
# Written as an additional '#' comment line into the header of the output table.
HHU_GLOBUS_ID = "TMP:HHU:GLOBUS:UUID:5f441149-3339-4c4b-be1b-64c87d62a1e2"

# Destination of the curated table written at the end of this cell.
output_table = PROJECT_BASE.joinpath(
    "annotations", "data_freezes",
    "hgsvc3_assembly_data_sources.draft.tsv"
)

def get_remote_prefix_header(table_path=None):
    """Collect the leading '#' comment lines of a data source table.

    The very first line of the file is discarded without inspection.
    All directly following lines that start with '#' are collected;
    reading stops at the first line that does not start with '#'.

    Args:
        table_path: Path of the table to read. If omitted, the
            module-level ``input_table`` is used.

    Returns:
        The collected lines as one string; each line is stripped of
        surrounding whitespace and terminated by a single newline.

    Raises:
        AssertionError: If no '#' line follows the first line of the file.
    """
    if table_path is None:
        table_path = input_table

    header_lines = []
    with open(table_path, "r") as table:
        # The first line is skipped unconditionally (presumably the date
        # stamp of the previous processing step - TODO confirm).
        _ = table.readline()
        for line in table:
            if not line.startswith("#"):
                break
            header_lines.append(f"{line.strip()}\n")

    # Explicit raise instead of a bare `assert`, so the check is not
    # removed when Python runs with -O.
    if not header_lines:
        raise AssertionError(
            f"No '#' header lines found after first line of: {table_path}"
        )
    return "".join(header_lines)


def print_select(df, selector):
    """Print all rows of ``df`` picked by ``selector`` for visual inspection.

    Output starts with a '---' separator line. For each selected row,
    a running counter ("<n>/<total> [IDX:<index label>]"), the row as
    named tuple and another '---' separator line are printed.

    Args:
        df: DataFrame to print rows from.
        selector: Boolean mask used both for ``df.loc[...]`` and, via its
            sum, as the total number of selected records.

    Returns:
        None
    """
    print("---")
    num_selected = selector.sum()
    selected_rows = df.loc[selector].itertuples()
    for position, record in enumerate(selected_rows, start=1):
        print(f"{position}/{num_selected} [IDX:{record.Index}]")
        print(record)
        print('---')
    return None


# Load the unpolished source table: lines starting with '#' are ignored
# and the first remaining line provides the column names.
df = pd.read_csv(input_table, sep="\t", comment="#", header=0)
df.drop(["subfolder", "join_key", "remote_collection"], axis=1, inplace=True)

# All manual fixes below leave rows with the literal filename "all"
# untouched (presumably per-sample aggregate rows - TODO confirm).
not_all = df["filename"] != "all"

# set source for HG02818 HiFi
select_sample = df["sample"] == "HG02818"
select_dtype = df["datatype"] == "hifi"
# files with "lib" in their name get a different accession than the rest
select_files = (df["filename"].str.contains("lib")).values
selector = not_all & select_sample & select_dtype & select_files
df.loc[selector, "accession"] = "PRJNA731524"
df.loc[selector, "remote_path"] = "AWS:HPRC:working/HPRC_PLUS/HG02818/raw_data/PacBio_HiFi"
df.loc[selector, "igsr_folder_exists"] = "no"

selector = not_all & select_sample & select_dtype & np.logical_not(select_files)
df.loc[selector, "accession"] = "PRJNA339722"
df.loc[selector, "remote_path"] = "AWS:HPRC:working/HPRC_PLUS/HG02818/raw_data/PacBio_HiFi"
df.loc[selector, "igsr_folder_exists"] = "no"

# set accession/source for NA24385
select_sample = df["sample"] == "NA24385"
select_dtype = df["datatype"] == "hifi"
# one entry per file, formatted as <accession>/<filename>
filesets = """
PRJNA731524/NA24385_m54329U_201103_231616.Q20.fastq.gz
PRJNA731524/NA24385_m64076_201013_225902.Q20.fastq.gz
PRJNA731524/NA24385_m64076_201016_191536.Q20.fastq.gz
PRJNA731524/NA24385_m64076_210309_014547.hifi_reads.fastq.gz
PRJNA731524/NA24385_m64076_210310_104300.hifi_reads.fastq.gz
PRJNA586863/NA24385_m64011_190830_220126.Q20.fastq.gz
PRJNA586863/NA24385_m64011_190901_095311.Q20.fastq.gz
PRJNA586863/NA24385_m64012_190920_173625.Q20.fastq.gz
PRJNA586863/NA24385_m64012_190921_234837.Q20.fastq.gz
PRJNA813010/NA24385_m64004_210224_230828.hifi_reads.fastq.gz
PRJNA813010/NA24385_m64014_210227_165255.hifi_reads.fastq.gz
PRJNA813010/NA24385_m64015e_210223_010616.hifi_reads.fastq.gz
PRJNA813010/NA24385_m64015e_210224_100310.hifi_reads.fastq.gz
"""
filesets = filesets.strip().split()
# group the filenames by their accession
filecollect = col.defaultdict(set)
for entry in filesets:
    acc, fn = entry.split("/", 1)
    filecollect[acc].add(fn)

for acc, fileset in filecollect.items():
    select_files = df["filename"].isin(fileset)
    selector = not_all & select_sample & select_dtype & select_files
    df.loc[selector, "accession"] = acc
    df.loc[selector, "igsr_folder_exists"] = "no"
    df.loc[selector, "igsr_file_exists"] = "no"
    df.loc[selector, "remote_path"] = HHU_GLOBUS

# all NA24385 ONT records: no accession known, data located at HHU_GLOBUS
select_sample = df["sample"] == "NA24385"
select_dtype = df["datatype"] == "ont"
selector = not_all & select_sample & select_dtype
df.loc[selector, "accession"] = "unknown"
df.loc[selector, "igsr_folder_exists"] = "no"
df.loc[selector, "igsr_file_exists"] = "no"
df.loc[selector, "remote_path"] = HHU_GLOBUS


# check for unexpected N/A
selector = pd.isnull(df).any(axis=1)
# Print the offending rows BEFORE failing; otherwise the exception is
# raised first and the records that need fixing are never shown.
print_select(df, selector)
if selector.any():
    raise AssertionError(
        f"Unexpected N/A values in {selector.sum()} record(s) - see rows printed above"
    )

# Column layout of the output table, in the order the columns are written.
column_sort_order = [
    "SIN",
    "hgsvc_sample_number",
    "sample",
    "phase",
    "verkko_assembly_batch",
    "population",
    "supergroup",
    "family",
    "member",
    "sex",
    "is_child",
    "datatype",
    "filename",
    "remote_path",
    "accession",
    "igsr_folder_exists",
    "igsr_file_exists",
    "cov_xfold_grt_0bp_at_3Gbp",
    "total_length_grt_0bp",
    "total_num_grt_0bp",
    "length_N50_grt_0bp",
    "length_auN_grt_0bp",
    "cov_xfold_grt_100kbp_at_3Gbp",
    "total_num_grt_100kbp",
    "length_N50_grt_100kbp",
    "length_auN_grt_100kbp",
    "total_num_grt_1Mbp",
]

sort_columns = set(column_sort_order)
has_columns = set(df.columns)

# The layout above and the table must agree in both directions:
# columns of the table that are missing in the layout ...
miss_in_sort = has_columns.difference(sort_columns)
if len(miss_in_sort) > 0:
    raise AssertionError(miss_in_sort)
# ... and columns of the layout that are missing in the table.
not_in_df = sort_columns.difference(has_columns)
if len(not_in_df) > 0:
    raise KeyError(not_in_df)


# Reorder the columns, then sort the records: sample number and datatype
# ascending, filename descending.
df = df[column_sort_order].sort_values(
    by=["hgsvc_sample_number", "datatype", "filename"],
    ascending=[True, True, False],
)


def get_sample_count_header(df):
    """Build '#' comment lines summarizing sample counts per Verkko batch.

    Rows are first reduced to one per "SIN" value (first occurrence is
    kept); the remaining rows are then counted per value of the column
    "verkko_assembly_batch".

    Args:
        df: DataFrame with the columns "SIN" and "verkko_assembly_batch".
            It is not modified.

    Returns:
        A string with one line stating the total count followed by one
        line per batch, in the order produced by ``value_counts()``;
        every line ends with a newline.
    """
    one_row_per_sample = df.drop_duplicates("SIN")
    batch_sizes = one_row_per_sample["verkko_assembly_batch"].value_counts()

    header_lines = [f"# Total HGSVC3 samples: {batch_sizes.sum()}\n"]
    header_lines.extend(
        f"# Verkko batch {batch}: {num} samples\n"
        for batch, num in batch_sizes.items()
    )
    return "".join(header_lines)
    
source_header = get_remote_prefix_header()
verkko_header = get_sample_count_header(df)
with open(output_table, "w") as dump:
    # Comment preamble, in this order: date stamp, comment lines carried
    # over from the input table, Globus ID, sample counts per batch.
    dump.writelines(
        (
            f"# {TODAY}\n",
            source_header,
            f"# {HHU_GLOBUS_ID}\n",
            verkko_header,
        )
    )
    # The table itself follows the preamble in the same file.
    df.to_csv(dump, sep="\t", header=True, index=False)

---
