In [4]:
%run "../../00_project_config.ipynb"
%run "../00_path_config.ipynb"

import pathlib as pl
import pandas as pd
import re
import pickle as pck
import requests as req

# possible remotes: prefix label -> host/path that is probed (via https)
# for the existence of a data subfolder and its files
prefix_paths = {
    "IGSR:HGSVC2:": "ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/working",
    "IGSR:HGSVC3:": "ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC3/working",
}

output_table = PROJECT_BASE.joinpath(
    "annotations", "data_freezes", "out-130.hgsvc3_data_sources.files.tsv"
)

# both cache dumps are located in the ".cache" folder that sits next to
# the current working directory (i.e., inside its parent folder)
cache_dir = pl.Path(".").resolve().parent.joinpath(".cache")

# the local file cache is mandatory: without it there is nothing to process
file_cache_dump = cache_dir.joinpath("data-source-files.pck")
assert file_cache_dump.is_file(), file_cache_dump

# NOTE: pickle executes arbitrary code on load - only ever point this at
# cache files that were created locally by this project
with open(file_cache_dump, "rb") as dump:
    file_cache = pck.load(dump)

# the online cache is optional: if it does not exist yet, start with an
# empty one; it gets filled while the file cache is postprocessed.
# (An unconditional assert on the file's existence used to sit here and
# made the fallback below unreachable.)
online_cache_dump = cache_dir.joinpath("igsr-online-files.pck")

if online_cache_dump.is_file():
    with open(online_cache_dump, "rb") as dump:
        online_cache = pck.load(dump)
else:
    online_cache = dict()
    
    
def check_online_status(subfolder, filename, timeout=30):
    """Probe the known remotes for a data subfolder and a file inside it.

    The remotes listed in ``prefix_paths`` are tried in order. The first
    remote that answers an HTTP HEAD request for the subfolder with
    status 200 is selected, the file is probed in that subfolder, and no
    further remote is checked.

    Args:
        subfolder: folder path relative to the remote's working directory.
        filename: name of the file expected inside ``subfolder``.
        timeout: seconds to wait for each HTTP request before giving up.
            Without a timeout, a single stalled connection would block
            the whole run indefinitely.

    Returns:
        dict with the keys ``igsr_folder_exists`` and ``igsr_file_exists``
        (each ``"yes"`` or ``"no"``) and ``root_prefix`` (the key of the
        selected remote in ``prefix_paths``, or ``"unknown"``).

    Raises:
        Whatever the HTTP library raises on connection errors or when the
        timeout expires; nothing is caught here.
    """
    folder_ok = "no"
    file_ok = "no"
    root_prefix = "unknown"

    for prefix, path in prefix_paths.items():
        check_folder = f"https://{path}/{subfolder}"
        folder_resp = req.head(
            check_folder, allow_redirects=True, timeout=timeout
        )
        if folder_resp.status_code != 200:
            # folder not available under this remote, try the next one
            continue

        folder_ok = "yes"
        root_prefix = prefix
        check_file = f"{check_folder}/{filename}"
        file_resp = req.head(
            check_file, allow_redirects=True, timeout=timeout
        )
        if file_resp.status_code == 200:
            file_ok = "yes"
        # NOTE(review): the search stops at the first remote that has the
        # folder, even if the file itself is missing there - confirm that
        # the same subfolder can never exist under more than one remote
        break

    status = {
        "igsr_folder_exists": folder_ok,
        "igsr_file_exists": file_ok,
        "root_prefix": root_prefix
    }
    return status
           
    
# turn cache into file to folder map
def postprocess_file_cache(file_cache, online_cache):
    """Build a file name -> metadata lookup table from the file cache.

    Args:
        file_cache: mapping of folder (anything whose ``str()`` contains
            the path) to an iterable of file objects that expose a
            ``.name`` attribute (e.g. ``pathlib`` paths).
        online_cache: mapping of file name to the status dict produced by
            ``check_online_status``. Files missing from this cache are
            looked up online and added, i.e. the mapping is modified in
            place.

    Returns:
        Tuple ``(file_map, online_cache)``. ``file_map`` maps each file
        name to a dict with the keys ``datatype``, ``subfolder``,
        ``remote_path``, ``igsr_folder_exists`` and ``igsr_file_exists``.

    Raises:
        AssertionError: if a folder belongs to neither the "nanopore" nor
            the "pacbio_hifi" tree, or if a file name occurs twice.
    """
    file_map = dict()
    for folder, files in file_cache.items():
        folder_path = str(folder)

        # Reset per folder: otherwise a folder that matches neither data
        # type would silently inherit the values of the previous folder
        # (or fail with a TypeError on None if it happens to come first).
        datatype = None
        subfolder = None
        if "nanopore" in folder_path:
            datatype = "ont"
            subfolder = folder_path.split("nanopore")[-1].strip("/")
        if "pacbio_hifi" in folder_path:
            datatype = "hifi"
            subfolder = folder_path.split("pacbio_hifi")[-1].strip("/")
        # must hold before subfolder is inspected below
        assert datatype is not None, folder

        if "mother" in subfolder or "father" in subfolder:
            print("skipping ", subfolder)
            continue
        if "GM19320" in subfolder or "NA19320" in subfolder:
            print("skipping ", subfolder)
            continue

        for file in files:
            file_name = file.name
            # file names are the lookup key and hence must be unique
            assert file_name not in file_map
            if "GM19320" in file_name or "NA19320" in file_name:
                print("skipping ", file_name)
                continue

            # only query the remote for files not seen in earlier runs
            try:
                online_status = online_cache[file_name]
            except KeyError:
                online_status = check_online_status(subfolder, file_name)
                online_cache[file_name] = online_status

            if online_status["root_prefix"] != "unknown":
                remote_path = online_status["root_prefix"] + "/working/" + subfolder
            else:
                remote_path = "unknown"

            file_map[file_name] = {
                "datatype": datatype,
                "subfolder": subfolder,
                "remote_path": remote_path,
                "igsr_folder_exists": online_status["igsr_folder_exists"],
                "igsr_file_exists": online_status["igsr_file_exists"]
            }
    return file_map, online_cache
        
# derive the file lookup table; the online cache is handed back and may
# have been extended by files that were queried during this run
file_lut, online_cache = postprocess_file_cache(file_cache, online_cache)

# write the online cache back to disk so that known files are not
# queried again next time
with online_cache_dump.open("wb") as dump:
    pck.dump(online_cache, dump)

# one table of read statistics per TSV file in this folder
stats_dir = pl.Path(
    "/home/ebertp/work/projects/hgsvc/2023_batch_data_tables/read_stats"
)
stats_files = stats_dir.glob("*.tsv")


# statistics that are kept for every read statistics table
select_stats = [
    "total_length_grt_0bp", "total_num_grt_0bp",
    "cov_xfold_grt_0bp_at_3Gbp", "length_N50_grt_0bp",
    "length_auN_grt_0bp"
]

# statistics that are kept in addition if the table's datatype is "ont"
add_ont = [
    "total_num_grt_100kbp", "cov_xfold_grt_100kbp_at_3Gbp",
    "length_N50_grt_100kbp", "length_auN_grt_100kbp",
    "total_num_grt_1Mbp"
]


def read_stats_minimal_info(file_path, file_lut):
    """Load one read statistics table and reduce it to the selected stats.

    Sample and datatype are parsed from the file name: the sample is the
    part before the first underscore, the datatype is the part between
    the last underscore and the following dot.

    Args:
        file_path: path object (``.name`` is read) of a tab-separated
            table with header; the columns ``statistic`` and ``source``
            are accessed by name.
        file_lut: mapping of file name to a dict with the keys
            ``subfolder``, ``remote_path``, ``igsr_folder_exists`` and
            ``igsr_file_exists``. File names missing from the mapping are
            annotated with ``"unknown"`` / ``"no"``.

    Returns:
        DataFrame with one row per (SIN, filename, datatype), one column
        per retained statistic, plus the four annotation columns taken
        from ``file_lut``.
    """
    name_parts = file_path.name.split("_")
    sample = name_parts[0]
    datatype = name_parts[-1].split(".")[0]

    # Work on a copy: extending the module-level list itself would leak
    # the ONT-only statistics into every table processed afterwards and
    # grow the list with duplicates on each ONT file.
    keep_stats = list(select_stats)
    if datatype == "ont":
        keep_stats.extend(add_ont)

    df = pd.read_csv(file_path, sep="\t", header=0)
    df = df.loc[df["statistic"].isin(keep_stats), :].copy()
    df.rename({"source": "filename"}, axis=1, inplace=True)
    # the first two characters of the sample name are dropped for the SIN
    df["SIN"] = f"SIN:{sample[2:]}"
    df["datatype"] = datatype
    df.reset_index(drop=True, inplace=True)

    # long to wide: one column per statistic; all columns that are not
    # part of index/columns end up as values, and their names form the
    # outer column level that is dropped right after
    df = df.pivot(
        index=["SIN", "filename", "datatype"],
        columns=["statistic"]
    )
    df.columns = df.columns.droplevel(0)
    df = df.reset_index(drop=False, inplace=False)

    # annotate each file with the info from the lookup table, using a
    # fallback value for files that are not part of the table
    fallbacks = {
        "subfolder": "unknown",
        "remote_path": "unknown",
        "igsr_folder_exists": "no",
        "igsr_file_exists": "no",
    }
    for column, fallback in fallbacks.items():
        df[column] = df["filename"].apply(
            lambda name, column=column, fallback=fallback:
                file_lut.get(name, {column: fallback})[column]
        )

    return df

# collect the reduced statistics of all samples; sample names longer than
# seven characters and the two explicitly listed samples are left out
all_file_stats = []
for stats_file in stats_files:
    sample = stats_file.name.split("_")[0]
    if len(sample) <= 7 and sample not in ("GM19320", "NA19320"):
        all_file_stats.append(read_stats_minimal_info(stats_file, file_lut))
    else:
        print("skipping ", sample)

# one table for all samples; statistics a file does not have are set to 0
all_file_stats = (
    pd.concat(all_file_stats, axis=0, ignore_index=False)
    .fillna(0)
    .sort_values(["SIN", "filename"])
    .set_index("SIN")
)

#### following: explicit sanity checks
# no file that is on the file system (known) can be unused [explicit exceptions below]
# all files that were used must be known (be on the file system)

# "all" is not a file name and hence excluded from the comparison
all_used_files = set(all_file_stats["filename"].values).difference({"all"})

all_known_files = set(file_lut)

unused_files = all_known_files.difference(all_used_files)
missed_files = all_used_files.difference(all_known_files)
assert not missed_files, sorted(missed_files)

# known files that are deliberately not part of the statistics
revio_733 = [
    "m84039_230304_003049_s1.hifi_reads.bc2039.fastq.gz",
    "m84039_230307_013359_s4.hifi_reads.bc2039.fastq.gz",
    "m84039_230308_011313_s3.hifi_reads.bc2039.fastq.gz",
    "m84046_230324_222350_s1.hifi_reads.bc2039.fastq.gz"
]

epi_hg002 = [
    "NA24385_20190125_UL_ext-prom1-1-E5-H5-PAD29338_guppy-5.0.11-sup-prom_fastq_pass.fastq.gz",
    "NA24385_20190127_UL_ext-prom1-1-A9-D9-PAD28937_guppy-5.0.11-sup-prom_fastq_pass.fastq.gz",
    "NA24385_20190128_UL_ext-prom1-1-A9-D9-PAD28937_guppy-5.0.11-sup-prom_fastq_pass.fastq.gz",
    "NA24385_20190128_UL_ext-prom1-1-E9-H9-PAD28926_guppy-5.0.11-sup-prom_fastq_pass.fastq.gz",
    "NA24385_20190129_UL_ext-prom1-1-A9-D9-PAD28937_guppy-5.0.11-sup-prom_fastq_pass.fastq.gz",
    "NA24385_20190129_UL_ext-prom1-1-E9-H9-PAD28926_guppy-5.0.11-sup-prom_fastq_pass.fastq.gz"
]

# every unused file must be covered by one of the exceptions; anything
# else is printed and makes the final assertion fail
ignored = []
for unf in unused_files:
    unf_subfolder = file_lut[unf]["subfolder"]
    # parental data is already filtered when the lookup table is built,
    # the check is kept for safety
    is_parental = any(role in unf_subfolder for role in ("mother", "father"))
    if is_parental or unf in revio_733 or unf in epi_hg002:
        ignored.append(unf)
    else:
        print(unf)
save_to_ignore = len(ignored)

# last assertion of sanity checks
assert len(all_used_files) + save_to_ignore == len(all_known_files)

# norm column data types: length and count statistics are integers
int_columns = {
    c: int for c in all_file_stats.columns if "length" in c or "num" in c
}
all_file_stats = all_file_stats.astype(int_columns)

# TODAY is not defined in this cell; presumably it is set by the config
# notebooks executed via %run - TODO confirm
with open(output_table, "w") as dump:
    dump.write(f"# {TODAY}\n")
    all_file_stats.to_csv(dump, sep="\t", header=True, index=True, index_label="SIN")
        

skipping  NA24143_HG004_mother
skipping  NA24149_HG003_father
skipping  HG00733R
skipping  HG00733red1
skipping  HG00733red2
skipping  MTJCCL157
skipping  HG00733red1
skipping  MTJSMPMIX
skipping  MTJSMPMIX
skipping  HG00733red2
skipping  HG00733R
skipping  MTJCCL157
