In [5]:

import pandas as pd
import pathlib as pl

# Sample sheet: tab-separated with a header row; "#" starts a comment.
sample_info = pl.Path(
    "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/annotations/projectmng"
).joinpath("hgsvc_samples.tsv")

samples = pd.read_csv(sample_info, sep="\t", header=0, comment="#")

# Lookup table: sample ID (with "GM" rewritten to "NA") -> lower-cased sex.
sample_sex = {
    entry.sample.replace("GM", "NA"): entry.sex.lower()
    for entry in samples.itertuples()
}

# Directory that holds the per-batch read statistics tables.
read_info = pl.Path(
    "/home/ebertp/work/projects/hgsvc/2023_batch_data_tables",
)

# Per-sample value of the "cov_xfold_grt_0bp_at_3Gbp" statistic,
# kept in one lookup table per read type.
hifi_cov = dict()
ont_cov = dict()
# Sorted so that, should a sample be listed in more than one table, the entry
# that wins does not depend on the order the file system returns the files in.
for batch_table in sorted(read_info.glob("*_min_subset.tsv")):
    cov_data = pd.read_csv(batch_table, sep="\t", header=0)
    cov_data = cov_data.loc[cov_data["statistic"] == "cov_xfold_grt_0bp_at_3Gbp", :].copy()
    for row in cov_data.itertuples():
        if row.read_type == "hifi":
            hifi_cov[row.sample] = row.value
        elif row.read_type == "ont":
            ont_cov[row.sample] = row.value
        else:
            # A bare `raise` outside of an `except` block only produces
            # "RuntimeError: No active exception to reraise"; report the
            # offending record instead.
            raise ValueError(
                f"Unexpected read type '{row.read_type}' "
                f"for sample '{row.sample}' in table {batch_table}"
            )
        
# Folder with the raw per-assembly listings (one "*.txt" file each).
input_folder = pl.Path("/home/ebertp/work/projects/hgsvc/2023_assm_gene/asmgene_raw")

# One record per parsed line: (sample, asm_unit, label, ref_count, asm_count).
rows = []
for txt_file in input_folder.glob("*.txt"):
    # The file name is split on "."; the first component is used as the sample
    # and the third as the assembly unit.
    # NOTE(review): assumes names of the form <sample>.<x>.<asm_unit>[...].txt - confirm.
    parts = txt_file.name.split(".")
    sample = parts[0]
    asm_unit = parts[2]
    with open(txt_file, "r") as listing:
        for line in listing:
            # Only lines starting with "X" carry counts; everything else
            # (including any line starting with "H") is skipped here.
            if not line.startswith("X"):
                continue
            # Exactly four whitespace-separated fields are expected;
            # the leading "X" marker itself is discarded.
            _, label, ref_count, asm_count = line.strip().split()
            # Replace a trailing "+" / "-" with a "grt_" / "lst_" prefix
            # (presumably "greater than" / "less than" - confirm).
            if label.endswith("+"):
                label = f"grt_{label[:-1]}"
            elif label.endswith("-"):
                label = f"lst_{label[:-1]}"
            record = (sample, asm_unit, label, int(ref_count), int(asm_count))
            rows.append(record)
            
# Assemble the parsed records into a long-format table.
df = pd.DataFrame(
    rows, columns=["sample", "asm_unit", "stats_label", "ref_count", "asm_count"]
)

# Express every assembly count as a percentage of the single largest
# reference count found anywhere in the table, rounded to 3 decimals.
ref_count_max = df["ref_count"].max()
df["asm_count_pct"] = (df["asm_count"] / ref_count_max * 100.).round(3)

# Annotate each row via the per-sample lookup tables. `Series.map` yields NaN
# for a sample that is missing from a lookup table, whereas
# `Series.replace` would silently keep the sample ID as the "value".
df["sex"] = df["sample"].map(sample_sex)
df["hifi_cov"] = df["sample"].map(hifi_cov)
df["ont_cov"] = df["sample"].map(ont_cov)
df.sort_values(["sample", "asm_unit"], inplace=True)

# The summary is written next to (not inside) the input folder;
# NaN values end up as empty fields in the TSV output.
summary_table = input_folder.parent.joinpath("asmgene_raw.summary.tsv")

df.to_csv(summary_table, sep="\t", header=True, index=False)