In [22]:
import pandas as pd

In [23]:
def att2dict(s, sep) -> dict:
    """Parse a ';'-delimited attribute string into a key -> value dict.

    The string is stripped and split on ';'. Each field is stripped and then
    split on the FIRST occurrence of ``sep`` only, so a value that itself
    contains ``sep`` (e.g. ``Note=a=b`` with ``sep='='``) is kept whole
    rather than being cut off at the second separator.

    Args:
        s: Attribute string, e.g. ``"ID=gene1;Name=abc"``.
        sep: Separator between key and value inside a single field.

    Returns:
        Dict mapping each key to its value (both str). Fields that do not
        contain ``sep`` (including the empty field left by a trailing ';')
        are skipped. When a key appears more than once, the last value wins.
    """
    att = {}
    for field in s.strip().split(';'):
        # maxsplit=1: everything after the first separator belongs to the value.
        parts = field.strip().split(sep, 1)
        if len(parts) < 2:
            continue
        key, value = parts
        att[key] = value
    return att

def load_fix_report(fn) -> dict:
    """Read a tab-separated fix report and map each gene ID to its new span.

    Args:
        fn: Input handed straight to ``pd.read_csv`` (tab-separated, first
            line is the header). The columns ``GeneID``, ``NewStart`` and
            ``NewEnd`` are read.

    Returns:
        Dict of ``GeneID`` -> ``(NewStart, NewEnd)``, both converted with
        ``int()``.

    Raises:
        AssertionError: if the same ``GeneID`` appears on more than one row.
    """
    report = pd.read_csv(fn, sep='\t')
    spans = {}
    for _, rec in report.iterrows():
        gene_id = rec['GeneID']
        # Every gene is expected to be listed exactly once in the report.
        assert gene_id not in spans
        new_start = int(rec['NewStart'])
        new_end = int(rec['NewEnd'])
        spans[gene_id] = (new_start, new_end)
    return spans

def load_gene_spans(fn) -> dict:
    """Collect the (start, end) span of every ``gene`` feature in a GFF file.

    The file is read as a headerless, tab-separated table and its columns are
    labelled with the nine names in ``hdr``. Only rows whose ``type`` column
    equals ``'gene'`` are used; the gene ID is the ``ID`` entry of the
    attributes column as parsed by ``att2dict(..., sep='=')``.

    Args:
        fn: Input handed straight to ``pd.read_csv``.

    Returns:
        Dict of gene ID -> ``(start, end)``, both converted with ``int()``.

    Raises:
        AssertionError: if two gene rows carry the same ID.
        KeyError: if a gene row's attributes have no ``ID`` entry.
    """
    hdr = ['ctg', 'src', 'type', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
    # NOTE: per the pandas docs, comment='#' drops lines that start with '#'
    # but also stops parsing a data line at any '#' found inside it, so an
    # attribute value containing a literal '#' would be cut short.
    df = pd.read_csv(fn, sep='\t', header=None, comment='#')
    df.columns = hdr

    # Select gene rows up front so the Python-level loop only visits the
    # features that are needed, instead of building a Series for every row.
    genes = df.loc[df['type'] == 'gene', ['start', 'end', 'attributes']]

    gene_spans = dict()
    for start, end, attributes in zip(genes['start'], genes['end'], genes['attributes']):
        att_tbl = att2dict(attributes, sep='=')
        gid = att_tbl['ID']
        assert gid not in gene_spans, f'duplicate gene ID in GFF: {gid}'
        gene_spans[gid] = (int(start), int(end))
    return gene_spans

In [39]:
# Spans from the gene fix report: GeneID -> (NewStart, NewEnd).
fixed_fn = "/ccb/salz2/kh.chao/LiftOn_HG002/results/HG002_MAT/miniprot_new_copies/gene_fix_report.txt"
fixed_gene_spans = load_fix_report(fixed_fn)

In [40]:
# Spans of the 'gene' features in the current GFF annotation: gene ID -> (start, end).
curr_fn = "/ccb/salz4-3/hji20/hg002-q100-annotation/results/5_tidy/mat_loff/post_swap/final.sorted.fmted.gff"
curr_gene_spans = load_gene_spans(curr_fn)

In [41]:
# Keep every gene from the fix report whose span differs from the current
# annotation: gene ID -> (fixed start, fixed end, current start, current end).
# A gene ID missing from curr_gene_spans raises KeyError here.
diff_tbl = {}
for gid, (gstart, gend) in fixed_gene_spans.items():
    curr_span = curr_gene_spans[gid]
    if (gstart, gend) != (curr_span[0], curr_span[1]):
        diff_tbl[gid] = (gstart, gend, curr_span[0], curr_span[1])
print(len(diff_tbl))

140


In [42]:
# Dump the differing spans as CSV, one row per gene in diff_tbl order.
# Given how diff_tbl is filled, start_old/end_old hold the fix-report span
# and start_new/end_new hold the current-annotation span.
out_fn = "/ccb/salz4-3/hji20/hg002-q100-annotation/results/5_tidy/mat_loff/post_swap/kh_diff.csv"
header = 'gene_id,start_old,end_old,start_new,end_new\n'
with open(out_fn, 'w') as fh:
    fh.write(header)
    fh.writelines(
        f'{gid},{res[0]},{res[1]},{res[2]},{res[3]}\n'
        for gid, res in diff_tbl.items()
    )