In [16]:
%run "../00_project_config.ipynb"

import pandas as pd
import hashlib as hl

# Input tables to merge. Path.glob() yields matches in arbitrary
# (filesystem-dependent) order; since duplicated regions are later resolved
# by keeping the first occurrence, sort the paths so the result is
# reproducible across machines and runs.
table_files = sorted(
    PROJECT_BASE.joinpath("annotations/external").glob("porubsky*.tsv")
)

# Destination of the merged table.
out_table = PROJECT_BASE.joinpath(
    "annotations/roi", "porubsky2023_hprc_common-gaps.tsv"
)

def norm_boolean_value(value):
    """Normalize a boolean-like table value to a Python ``bool``.

    Accepted inputs are booleans, the numbers 0 and 1 (int or float), and
    the strings "true"/"false" (case-insensitive) or "1"/"0"; whitespace
    around a string is ignored. Scalar wrappers exposing ``.item()``
    (e.g. NumPy scalars) are unwrapped before the checks.

    Args:
        value: Raw cell value.

    Returns:
        ``True`` or ``False``; ``None`` if the value is missing
        (``None`` or a float NaN).

    Raises:
        ValueError: If the value cannot be interpreted as a boolean.
    """
    import math  # function-scope import keeps the block self-contained

    # NumPy scalars such as numpy.bool_ / numpy.int64 are not instances of
    # the builtin bool / int, so convert them to plain Python objects first.
    if not isinstance(value, (bool, int, float, str)) and callable(
        getattr(value, "item", None)
    ):
        value = value.item()

    # bool must be checked before int: bool is a subclass of int.
    if isinstance(value, bool):
        return value
    if value is None:
        return None
    if isinstance(value, str):
        token = value.strip().lower()
        if token in ("false", "0"):
            return False
        if token in ("true", "1"):
            return True
        raise ValueError(value)
    if isinstance(value, float) and math.isnan(value):
        # Missing value in a numeric column; keep it as "unknown".
        return None
    if isinstance(value, (int, float)):
        if value == 0:
            return False
        if value == 1:
            return True
    # Previously unsupported inputs fell through and silently became None.
    raise ValueError(value)
    

def make_region_name(row):
    """Return the MD5 hex digest of the row's "chrom:start-end" string.

    Args:
        row: Object exposing ``chrom``, ``start`` and ``end`` attributes.

    Returns:
        Hexadecimal MD5 digest of the UTF-8 encoded coordinate string.
    """
    coordinates = f"{row.chrom}:{row.start}-{row.end}"
    return hl.md5(coordinates.encode("utf-8")).hexdigest()
    

# Columns with a dedicated new name; every other column only has "."
# replaced by "_". NOTE: "width" is renamed to "name" merely to reuse that
# column slot -- its values are overwritten with the region hash below.
fixed_names = {"seqnames": "chrom", "score": "hap_broken", "width": "name"}

# Load each input table, harmonize its columns and collect the results.
merged = []
for table_file in table_files:
    df = pd.read_csv(table_file, sep="\t", header=0)
    rename_cols = {
        col: fixed_names.get(col, col.replace(".", "_")) for col in df.columns
    }
    df = df.rename(columns=rename_cols)
    for flag_column in ("SD_associated", "CHM13_known_coverage_issue"):
        df[flag_column] = df[flag_column].apply(norm_boolean_value)
    df["name"] = df.apply(make_region_name, axis=1)
    print(df.shape)
    merged.append(df)

# Fail with a clear message instead of pandas' generic
# "No objects to concatenate" when no input table was found.
if not merged:
    raise ValueError("no input tables found: 'table_files' is empty")

# ignore_index=True gives the merged table a unique index; the per-file
# indices all start at 0 and would otherwise collide. The index is not
# written to the output file.
merged = pd.concat(merged, axis=0, ignore_index=True)
# The same region (identical chrom/start/end hash) may appear in several
# tables; keep only its first occurrence.
merged = merged.drop_duplicates("name", keep="first")
merged = merged.sort_values(["chrom", "start"])

win_size = int(1e4)

# Expand each region to enclosing window boundaries: start is rounded down
# to a multiple of win_size, end is rounded down and one window is added.
merged["win_start"] = merged["start"] // win_size * win_size
merged["win_end"] = merged["end"] // win_size * win_size + win_size
assert (merged["win_start"] < merged["win_end"]).all(), (
    "win_start must be smaller than win_end for every region"
)

merged.to_csv(out_table, sep="\t", header=True, index=False)

(592, 7)
(44, 7)
