In [2]:
%run "../00_project_config.ipynb"

import pathlib as pl
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt


# Folder holding the per-sample read statistics tables (one TSV per file).
data_folder = PROJECT_DATA_ROOT / "2023_batch_data_tables/read_stats"

# Output locations, one full table and one minimal subset per sample batch.
out_table_batch1 = PROJECT_DATA_ROOT / "2023_batch_data_tables/hgsvc3_batch1_read_stats.tsv"
out_subset_batch1 = PROJECT_DATA_ROOT / "2023_batch_data_tables/hgsvc3_batch1_min_subset.tsv"

out_table_batch2 = PROJECT_DATA_ROOT / "2023_batch_data_tables/hgsvc3_batch2_read_stats.tsv"
out_subset_batch2 = PROJECT_DATA_ROOT / "2023_batch_data_tables/hgsvc3_batch2_min_subset.tsv"

out_table_batch3 = PROJECT_DATA_ROOT / "2023_batch_data_tables/hgsvc3_batch3_read_stats.tsv"
out_subset_batch3 = PROJECT_DATA_ROOT / "2023_batch_data_tables/hgsvc3_batch3_min_subset.tsv"

# Names of the rows (values of the "statistic" column) that are kept from
# each input table; everything else is dropped during merging.
select_stats = [
    # statistics carrying the "grt_0bp" suffix
    "total_length_grt_0bp",
    "total_num_grt_0bp",
    "cov_xfold_grt_0bp_at_3Gbp",
    "length_N50_grt_0bp",
    # statistics carrying the "grt_15kbp" suffix
    "total_length_grt_15kbp",
    "total_num_grt_15kbp",
    "cov_xfold_grt_15kbp_at_3Gbp",
    # statistics carrying the "grt_100kbp" suffix
    "total_length_grt_100kbp",
    "total_num_grt_100kbp",
    "length_N50_grt_100kbp",
    "cov_xfold_grt_100kbp_at_3Gbp",
]

# Merge the selected statistics of all per-sample tables into one long table
# with the columns: sample, sample_batch, read_type, source, statistic, value.

# Samples whose tables are deliberately left out of the merged output.
excluded_samples = {"HG00733red1", "HG00733red2", "GM19320", "NA19320"}
known_read_types = ("hifi", "ont")

per_sample_stats = []
# sorted() makes the processing order independent of the file system
for table_file in sorted(data_folder.glob("*.tsv")):
    # The part of the file name before the first "." is "<sample>_<read_type>".
    # Split on the LAST underscore so that a sample name containing "_" does
    # not break the unpacking.
    sample, _, read_type = table_file.name.split(".")[0].rpartition("_")
    if sample in excluded_samples:
        continue
    if read_type not in known_read_types:
        raise ValueError(
            f"Unexpected read type '{read_type}' derived from file name: {table_file.name}"
        )

    # Look up the production batch of the sample in the status table.
    select_sample = DATA_PRODUCTION_STATUS["sample"] == sample
    if not select_sample.any():
        # Retry with the first two characters of the sample name replaced by
        # "GM" (presumably NA*/GM* aliases of the same sample - TODO confirm).
        select_sample = DATA_PRODUCTION_STATUS["sample"] == "GM" + sample[2:]
    num_matches = int(select_sample.sum())
    if num_matches != 1:
        raise ValueError(
            f"Expected exactly one entry for sample '{sample}' in "
            f"DATA_PRODUCTION_STATUS, found {num_matches}"
        )
    sample_batch = int(DATA_PRODUCTION_STATUS.loc[select_sample, "sample_batch"].values[0])

    df = pd.read_csv(table_file, header=0, sep="\t")
    # keep only the statistics of interest before annotating the rows
    df = df.loc[df["statistic"].isin(select_stats), :].copy()
    df["sample"] = sample
    df["read_type"] = read_type
    df["sample_batch"] = sample_batch
    per_sample_stats.append(df)

if not per_sample_stats:
    # pd.concat would fail with a much less helpful message on an empty list
    raise ValueError(f"No read statistics tables (*.tsv) selected from folder: {data_folder}")

merged_read_stats = pd.concat(per_sample_stats, axis=0, ignore_index=False)
merged_read_stats.sort_values(["sample", "read_type", "source", "statistic"], inplace=True)

# fix the column order of the output tables
merged_read_stats = merged_read_stats[["sample", "sample_batch", "read_type", "source", "statistic", "value"]]

# Split the merged table by sample batch and write one TSV file per batch.
batch_column = merged_read_stats["sample_batch"]

batch1_data = merged_read_stats[batch_column == 1]
batch2_data = merged_read_stats[batch_column == 2]
batch3_data = merged_read_stats[batch_column == 3]

for batch_table, batch_path in (
    (batch1_data, out_table_batch1),
    (batch2_data, out_table_batch2),
    (batch3_data, out_table_batch3),
):
    # tab-separated, with header row, without the DataFrame index
    batch_table.to_csv(batch_path, sep="\t", index=False, header=True)

# dump minimal subset for easier manual checking of dataset completeness

# Only these statistics, and only rows whose "source" is listed below,
# end up in the minimal subset tables.
min_select_stats = ["cov_xfold_grt_0bp_at_3Gbp", "total_num_grt_0bp", "total_length_grt_0bp"]
min_select_source = ["all"]

# (per-batch table, output path of its minimal subset)
subset = [
    (batch1_data, out_subset_batch1),
    (batch2_data, out_subset_batch2),
    (batch3_data, out_subset_batch3),
]

for batch_data, out_file in subset:
    # NOTE: the row masks must not be named `select_stats`; that name holds
    # the list of statistics used for merging and would be overwritten by a
    # boolean Series here.
    source_mask = batch_data["source"].isin(min_select_source)
    stats_mask = batch_data["statistic"].isin(min_select_stats)

    # sort_values returns a new DataFrame, so the per-batch table is untouched
    min_subset = batch_data.loc[source_mask & stats_mask, :].sort_values(
        ["sample", "read_type", "statistic"]
    )
    min_subset.to_csv(out_file, sep="\t", header=True, index=False)