In [41]:
import pathlib as pl
import pandas as pd
import collections as col

# Quick inspection of the aggregated table; the same file name is written by
# the aggregation code further down in this file.
stats_table_file = pl.Path("agg_miso_stats.tsv")

agg_stats = pd.read_csv(stats_table_file, sep="\t", header=0)

# Show only the rows whose "miso_size" exceeds 1e6.
print(agg_stats.loc[agg_stats["miso_size"] > 1e6, :])

# Stop execution here so the aggregation code below is not re-run.
# NOTE(review): the original used a bare `raise`, which outside of an `except`
# block only halts by triggering "RuntimeError: No active exception to
# reraise". Raising the same exception type explicitly keeps the behavior but
# states the intent.
raise RuntimeError(
    "Intentional stop after inspecting agg_miso_stats.tsv; "
    "the code below this point was not executed."
)



# Load the region table and keep a private copy of the rows in state "ww".
df = pd.read_csv("ssq_eval_allRegions.tsv", sep="\t", header=0)
sub = df.loc[df["states"] == "ww", :].copy()

# The ID column is "_"-separated: the second field is the sample name (with a
# "GM" prefix rewritten to "NA"), the last field is the haplotype tag.
hap_labels = {"H1": "hap1", "H2": "hap2"}
sub["sample"] = (
    sub["ID"]
    .map(lambda ident: ident.split("_", 2)[1])
    .str.replace("GM", "NA")
)
sub["seqtag"] = sub["ID"].map(lambda ident: hap_labels[ident.rsplit("_", 1)[-1]])

sub = sub.sort_values("sample")

# Half-width of the coverage windows handed to extract_windowed_coverage().
window_size = 1000

# Index the coverage stores as cov_files[sample][read_type] -> path. Sample and
# read type are taken from the 1st and 3rd "."-separated file name fields.
cov_files_folder = pl.Path("/home/ebertp/work/projects/hgsvc/2023_readassm_align/primcov")
cov_files = col.defaultdict(dict)
for hdf in cov_files_folder.glob("*vrk*.h5"):
    name_parts = hdf.name.split(".")
    sample, read_type = name_parts[0], name_parts[2]
    cov_files[sample][read_type] = hdf
    
def extract_windowed_coverage(table_row, win_size, read_type, hdf_file):
    """Summarize coverage around the start, midpoint and end of one region.

    The coverage track of the region's contig is read from ``hdf_file`` and
    described inside three windows of ``2 * win_size`` positions, centered on
    ``table_row.start``, on the region midpoint and on ``table_row.end``.

    Args:
        table_row: Record exposing ``seqnames``, ``seqtag``, ``start``,
            ``width`` and ``end`` attributes (e.g. a row produced by
            ``DataFrame.itertuples()``).
        win_size: Number of positions taken on each side of a window center.
        read_type: Label used as prefix for all keys of the returned dict.
        hdf_file: Path of an HDF5 store holding a ``/statistics`` table with
            a ``store_key`` column, indexed by contig name, plus one coverage
            track per store key.

    Returns:
        A dict with ``{read_type}_median`` (median over the whole track) and,
        for each of the labels ``begin``, ``mid`` and ``end``, the keys
        ``{read_type}_{label}_{min,q1,q2,q3,max}``. All values are truncated
        to ``int``. An empty dict is returned if the contig is listed in
        ``/statistics`` neither under its plain name nor under
        ``"<seqnames>.<seqtag>"``.

    Raises:
        ValueError: If a window statistic cannot be converted to ``int``
            (e.g. the window selects no data); diagnostics are printed first.
    """
    with pd.HDFStore(hdf_file, "r") as hdf:
        stats = hdf["/statistics"]
        seq_name = table_row.seqnames
        tagged_name = f"{seq_name}.{table_row.seqtag}"
        # Try the plain contig name first, then the haplotype-tagged variant.
        for candidate in (seq_name, tagged_name):
            try:
                store_key = stats.loc[candidate, "store_key"]
            except KeyError:
                continue
            break
        else:
            return {}

        cov_data = hdf[f"/{store_key}"]
    cov_median = cov_data.median()

    pctiles = [0.25, 0.5, 0.75]

    midpoint = int(table_row.start + table_row.width / 2)
    window_centers = {
        "begin": table_row.start,
        "mid": midpoint,
        "end": table_row.end,
    }
    # Maps the key suffix used in the output to the label used by describe().
    summary_fields = (
        ("min", "min"),
        ("q1", "25%"),
        ("q2", "50%"),
        ("q3", "75%"),
        ("max", "max"),
    )

    all_stats = {f"{read_type}_median": int(cov_median)}
    for label, center in window_centers.items():
        # Clamp the lower bound for every window: a negative slice start
        # would be interpreted as counting from the end of the track, which
        # silently selects the wrong (usually empty) range for regions that
        # lie within win_size of the contig start.
        s = max(0, center - win_size)
        e = center + win_size
        # .iloc makes the positional slicing explicit.
        cov_stats = cov_data.iloc[s:e].describe(percentiles=pctiles)
        try:
            labeled_stats = {
                f"{read_type}_{label}_{suffix}": int(cov_stats[field])
                for suffix, field in summary_fields
            }
        except ValueError:
            # Dump the context that led to a non-convertible statistic
            # before propagating the error.
            print(table_row)
            print(label, s, e)
            print(cov_data.shape)
            print(cov_median)
            print(cov_stats)
            raise
        all_stats.update(labeled_stats)
    return all_stats
        
# Build one record per selected region: the region's own columns plus the
# windowed coverage summaries for the "hifi" and "ont" coverage files.
records = []
total_rows = sub.shape[0]
for n, row in enumerate(sub.itertuples(), start=1):
    # Progress indicator: current row / number of rows.
    print(f"{n}/{total_rows}")

    # Resolve both coverage files up front, before any extraction runs.
    sample_files = cov_files[row.sample]
    hifi_file, ont_file = sample_files["hifi"], sample_files["ont"]

    row_stats = dict(
        sample=row.sample,
        asm_unit=row.seqtag,
        contig=row.seqnames,
        start=row.start,
        end=row.end,
        watson=row.Ws,
        crick=row.Cs,
        miso_size=row.width,
        # `_9` is the positional name itertuples() assigned to this column.
        contig_length=row._9,
    )
    for tech, cov_file in (("hifi", hifi_file), ("ont", ont_file)):
        row_stats.update(
            extract_windowed_coverage(row, window_size, tech, cov_file)
        )

    records.append(row_stats)

# Keys missing from a record (extract_windowed_coverage() returned an empty
# dict) show up as NaN in the frame and are replaced by 0.
stats_table = pd.DataFrame.from_records(records).fillna(0)

stats_table.to_csv("agg_miso_stats.tsv", sep="\t", header=True, index=False)

      sample asm_unit              contig     start       end  watson  crick   
52   HG01505     hap1  haplotype1-0000008  79756892  81105445     958     52  \
62   HG01596     hap1  haplotype1-0000016  23882435  27208650     103      4   
131  HG03009     hap1  haplotype1-0000033  24803388  27257819      27      1   
155  HG03452     hap1  haplotype1-0000030  14933496  18143940     163     15   
181  HG03807     hap2  haplotype2-0000083         1   3302852   11375   3277   
223  NA19331     hap2  haplotype2-0000179         1   1687702    2038    615   
296  NA21487     hap2  haplotype2-0000145   3500070   8517941    9733    761   
297  NA21487     hap2  haplotype2-0000145   8662426  10841423    5328    234   

     miso_size  contig_length  hifi_median  ...  ont_mid_min  ont_mid_q1   
52     1348554       81105445         18.0  ...         18.0        20.0  \
62     3326216       27208650         19.0  ...         26.0        26.0   
131    2454432       34232606         20.0  ...    

RuntimeError: No active exception to reraise