In [1]:
import pathlib as pl
import pandas as pd
import statistics
import collections as col

# Opt in to pandas' "future.no_silent_downcasting" behaviour for this session.
pd.set_option('future.no_silent_downcasting', True)

# IPython magic: change into the notebooks directory (-q: quiet) so that the
# relative path to the project config notebook below can be resolved.
%cd -q "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/notebooks"
# resolve(strict=True) raises if the config notebook does not exist.
_PROJECT_CONFIG_NB = str(pl.Path("00_project_config.ipynb").resolve(strict=True))

# IPython magic: run the project config notebook.
# NOTE(review): PROJECT_DATA_ROOT used below is not defined in this cell;
# presumably it is provided by the config notebook -- confirm.
%run $_PROJECT_CONFIG_NB

# All files matching "*ctg-summary*tsv" directly inside the "2024_gap_eval"
# folder. NOTE(review): if PROJECT_DATA_ROOT is a pathlib.Path, this is a
# one-shot generator that can only be iterated once -- confirm.
summary_files = PROJECT_DATA_ROOT.joinpath(
    "2024_gap_eval"
).glob("*ctg-summary*tsv")


def split_columns(columns, sex_lookup):
    """Partition column names into male and female groups.

    The lookup key for a column is everything before its last "."; a column
    is put into the male group if ``sex_lookup[key] == "male"`` and into the
    female group for any other value. Input order is preserved in each group.

    Args:
        columns: iterable of column names (strings).
        sex_lookup: mapping from the column prefix to a sex label.

    Returns:
        Tuple ``(males, females)`` of two lists of column names.

    Raises:
        KeyError: if a column prefix is missing from ``sex_lookup``.
        AssertionError: if no column was assigned to the male group.
    """
    males = []
    females = []
    for column in columns:
        prefix = column.rsplit(".", 1)[0]
        target = males if sex_lookup[prefix] == "male" else females
        target.append(column)
    # At least one male column is expected by the calling code.
    assert males
    return males, females


# Load every summary table and join them column-wise on "gap_id".
#
# concat            -- list of frames to join: gap coordinates first, then the
#                      per-haplotype "closed_" columns of every file.
# hap_sex_lookup    -- column prefix (text before the last ".") -> sex label,
#                      taken from the "*sex" columns of the first file.
# all_stats_columns -- (error threshold or "all", "any"/"male"/"female")
#                      -> list of per-haplotype column names.
concat = []
hap_sex_lookup = None
all_stats_columns = col.defaultdict(list)
# The glob yields files in arbitrary (file-system dependent) order. Sort them
# so that the file providing coordinates / sex labels and the order of the
# derived columns are reproducible between runs and machines.
for summary_file in sorted(summary_files):
    # Error threshold label: last "-"-separated token of the second-to-last
    # "."-separated part of the file name.
    err_t = summary_file.name.split(".")[-2].split("-")[-1]
    file_summary = pd.read_csv(summary_file, sep="\t", header=0)
    if not concat:
        # First file only: gap coordinates and the sex lookup are taken from
        # here. NOTE(review): assumes they are identical in all files.
        sub = file_summary[["chrom", "start", "end", "gap_id", "gap_length"]].copy()
        sub.set_index("gap_id", inplace=True)
        concat.append(sub)
        sex_columns = [c for c in file_summary.columns if c.endswith("sex")]
        # The sex label is read from the first row of each "*sex" column.
        hap_sex_lookup = dict(
            (c.rsplit(".", 1)[0], file_summary.at[0, c]) for c in sex_columns
        )

    file_summary.set_index("gap_id", inplace=True)

    stats_columns = [c for c in file_summary.columns if "closed_" in c]
    # Sanity check: exactly 130 per-haplotype columns are expected per file.
    assert len(stats_columns) == 130
    sub = file_summary[stats_columns].copy()
    all_stats_columns[("all", "any")].extend(stats_columns)
    all_stats_columns[(err_t, "any")].extend(stats_columns)

    male_columns, female_columns = split_columns(stats_columns, hap_sex_lookup)
    all_stats_columns[("all", "male")].extend(male_columns)
    all_stats_columns[(err_t, "male")].extend(male_columns)
    all_stats_columns[("all", "female")].extend(female_columns)
    all_stats_columns[(err_t, "female")].extend(female_columns)

    concat.append(sub)

if not concat:
    # Fail with a clear message instead of the generic pandas error raised
    # by pd.concat() on an empty list.
    raise ValueError("no gap summary files found: 'summary_files' is empty")

summary = pd.concat(concat, axis=1, ignore_index=False)
# Replace -1 by 0 in all per-haplotype columns so that row sums below only
# count positive entries. NOTE(review): -1 is presumably a "not applicable"
# marker -- confirm.
every_stats_column = all_stats_columns[("all", "any")]
summary.loc[:, every_stats_column] = summary.loc[:, every_stats_column].replace(
    {-1: 0}, inplace=False
)

# Denominator for the percentage of closing haplotypes per gap. 130 matches
# the number of per-haplotype columns asserted above; chrY / chrX gaps use a
# smaller denominator. NOTE(review): 30 / 100 are presumably the numbers of
# male / female haplotypes -- confirm against hap_sex_lookup.
summary["max_closed_haps"] = 130
summary.loc[summary["chrom"] == "chrY", "max_closed_haps"] = 30
summary.loc[summary["chrom"] == "chrX", "max_closed_haps"] = 100

# Per-gap aggregation: number ("closed_haps_n_<err_t>") and percentage
# ("closed_haps_pct_<err_t>") of haplotypes with a closed gap, for every
# error threshold.
#
# Every threshold is registered three times in all_stats_columns (sample sets
# "any", "male" and "female"). Enumerate thresholds via their "any" key only,
# so that each one is processed exactly once instead of three times.
error_thresholds = [
    err_t for (err_t, sample_set) in all_stats_columns
    if err_t != "all" and sample_set == "any"
]

# The set of haplotype columns that is summed depends on the chromosome of
# the gap: chrY -> "male" columns, chrX -> "female" columns, all other rows
# -> all ("any") columns.
is_chry = summary["chrom"] == "chrY"
is_chrx = summary["chrom"] == "chrX"
rows_by_sample_set = (
    ("male", is_chry),
    ("female", is_chrx),
    ("any", ~(is_chry | is_chrx)),
)

for err_t in error_thresholds:
    n_column = f"closed_haps_n_{err_t}"
    pct_column = f"closed_haps_pct_{err_t}"
    summary[n_column] = 0.
    summary[pct_column] = 0.

    for sample_set, select_rows in rows_by_sample_set:
        if not select_rows.any():
            continue
        get_columns = all_stats_columns[(err_t, sample_set)]
        # Row-wise sum over the selected haplotype columns; assign by
        # position (to_numpy) so that no index alignment is involved.
        summary.loc[select_rows, n_column] = (
            summary.loc[select_rows, get_columns].sum(axis=1).to_numpy()
        )

    # Percentage relative to the chromosome-specific maximum, one decimal.
    summary[pct_column] = (
        summary[n_column] / summary["max_closed_haps"] * 100
    ).round(1)


# Console report: distribution of the closed-haplotype percentage at the
# "1pct" threshold, for all gaps and for gaps outside chrX / chrY.
pct_1pct = summary["closed_haps_pct_1pct"]
print(pct_1pct.describe())
select_auto = ~summary["chrom"].isin(["chrX", "chrY"])
print("===")
print("summary autosomes at 1pct:")
print(pct_1pct.loc[select_auto].describe())

# The gap_id index is not part of the output table.
summary = summary.reset_index(drop=True)

# Output column order: gap coordinates, then count / percentage per error
# threshold (each threshold once, via its "any" key), then the sorted
# per-haplotype columns.
reorder_columns = ["chrom", "start", "end", "gap_length", "max_closed_haps"]
for (err_t, sample_set) in list(all_stats_columns.keys()):
    if err_t != "all" and sample_set == "any":
        reorder_columns.extend(
            [f"closed_haps_n_{err_t}", f"closed_haps_pct_{err_t}"]
        )

sample_columns = sorted(c for c in summary.columns if ".hap" in c)
reorder_columns += sample_columns
summary = summary[reorder_columns]

# Write the table as tab-separated text with header and without index.
out_file = PROJECT_DATA_ROOT.joinpath(
    "2024_gap_eval", "sample_summary.all-t.pre-supp-table.tsv"
)
summary.to_csv(out_file, sep="\t", header=True, index=False)



count    592.000000
mean      86.943243
std       21.561182
min        0.000000
25%       88.000000
50%       96.100000
75%       98.500000
max      100.000000
Name: closed_haps_pct_1pct, dtype: float64
===
summary autosomes at 1pct:
count    520.000000
mean      87.049423
std       22.530452
min        0.000000
25%       89.200000
50%       96.200000
75%       98.500000
max      100.000000
Name: closed_haps_pct_1pct, dtype: float64
