In [42]:
import pandas as pd
import re

# Column names of the nine-column censat BED file, in file order.
censat_header = [
    "chrom",
    "start",
    "end",
    "label",
    "score",
    "strand",
    "thickstart",
    "thickend",
    "rgb",
]

# Centromere/satellite annotation; only the first four columns are loaded.
t2t_censat = pd.read_csv(
    "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/annotation/chm13v2.0_censat_v2.1.bed",
    sep="\t",
    header=None,
    names=censat_header,
    usecols=censat_header[:4],
)

# PAR annotation, read as a headerless three-column table.
t2t_par = pd.read_csv(
    "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0_PAR.bed",
    sep="\t",
    header=None,
    names=["chrom", "start", "end"],
)

# Keep only annotations whose label matches one of the listed patterns
# (case-insensitive), then reduce the table to plain coordinates.
label_matches = t2t_censat["label"].str.contains(
    "hor|hsat[123]|rdna", regex=True, flags=re.IGNORECASE
)
t2t_censat = t2t_censat.loc[label_matches, ["chrom", "start", "end"]].copy()

# One spanning interval per chromosome: smallest start to largest end.
mask_regions = t2t_censat.groupby("chrom").agg(
    start=("start", "min"),
    end=("end", "max"),
)

# Index the PAR intervals by chromosome, ordered by start coordinate, so the
# lookups below do not depend on the row order of the PAR BED file.
par_by_chrom = {
    chrom: sorted(zip(rows["start"], rows["end"]))
    for chrom, rows in t2t_par.groupby("chrom")
}

# A spanning interval that starts closer than this to the chromosome start
# is extended down to position 0.
near_chrom_start = 1_000_000

# Collect (chrom, start, end) tuples: one spanning interval per chromosome
# plus the PAR intervals on chrX / chrY.
final_regions = []
for row in mask_regions.itertuples():
    chrom = row.Index
    if row.start < near_chrom_start:
        final_regions.append((chrom, 0, row.end))
    elif chrom == "chrX":
        # Exactly two PAR intervals are expected per sex chromosome; the
        # unpacking raises ValueError otherwise (KeyError if none exist).
        # NOTE(review): "par1" is taken to be the interval with the smaller
        # start coordinate - confirm against the PAR file.
        (par1_start, par1_end), (par2_start, par2_end) = par_by_chrom["chrX"]
        # for X, sufficient distance to PARs
        final_regions.append((chrom, par1_start, par1_end))
        final_regions.append((chrom, par2_start, par2_end))
        final_regions.append((chrom, row.start, row.end))
    elif chrom == "chrY":
        (par1_start, par1_end), (_, par2_end) = par_by_chrom["chrY"]
        final_regions.append((chrom, par1_start, par1_end))
        # distance is only ~50 kbp, so merge everything
        final_regions.append((chrom, row.start, par2_end))
    else:
        final_regions.append((chrom, row.start, row.end))

def _norm_chrom_sort(region):
    chrom = region[0]
    try:
        n = int(chrom.strip("chr"))
    except ValueError:
        n = {"chrX": 23, "chrY": 24}[chrom]
    return n

# Order regions by chromosome rank first, then by coordinates. Sorting on the
# chromosome rank alone keeps regions of the same chromosome in insertion
# order, which leaves the chrX entries (appended as PAR, PAR, satellite span)
# unsorted by position in the output.
final_regions = sorted(
    final_regions,
    key=lambda region: (_norm_chrom_sort(region), region[1], region[2]),
)
out_table = "/home/ebertp/work/code/cubi/project-run-hgsvc-hybrid-assemblies/annotations/roi/suppl/"
out_table += "t2tv2_censat-v21_par_exclude.bed"

# Write a headerless, tab-separated three-column table: chrom, start, end.
with open(out_table, "w") as dump:
    dump.writelines(
        f"{chrom}\t{start}\t{end}\n" for (chrom, start, end) in final_regions
    )
