In [1]:
import pandas as pd

In [74]:
# Input annotation files compared in this notebook (absolute paths).
# loff_fn is read by load_gan_loff; refseq_fn is read by load_gan_refseq.
loff_fn = "/ccb/salz4-3/hji20/hg002-q100-annotation/results/5_tidy/pat_rslippage.gff"
refseq_fn = "/ccb/salz4-3/hji20/hg002-q100-annotation/results/refseq/GCA_018852605.3_hg002v1.1.pat_genomic.gff"

In [70]:
# Transcript IDs whose exon/CDS coordinates are looked up in both annotation
# files and compared pairwise further down.
qry_tids = [
    "NM_004152.3",
    "NM_001301371.1",
    "NM_001301020.1",
    "NM_002537.3",
    "NM_015068.3",
    "NM_001184961.1",
    "NM_001301302.1",
    "NM_001134939.1",
    "NM_016178.2",
    "NM_001172437.2"
]

In [71]:
def att2dict(s, sep) -> dict:
    """Parse a ';'-delimited attribute string into a dict.

    Args:
        s: Attribute string such as ``"ID=x;Parent=y"``.
        sep: Separator between key and value inside one field
            (``'='`` everywhere in this notebook).

    Returns:
        dict mapping each key to its value. Fields that do not contain
        ``sep`` (including empty fields) are skipped; when a key repeats,
        the last occurrence wins.
    """
    att = dict()
    for field in s.strip().split(';'):
        # Split on the FIRST separator only, so a value that itself contains
        # `sep` (e.g. "Note=a=b") is kept whole instead of being truncated.
        key, found, value = field.strip().partition(sep)
        if not found:
            continue
        att[key] = value
    return att

def load_gan_refseq(qry_tids, fn):
    """Collect exon and CDS intervals of the query transcripts from a GFF file.

    A row is assigned to a transcript through its ``Parent`` attribute, which
    is expected to look like ``<prefix>-<transcript id>``; the transcript ID
    is taken as the second ``'-'``-separated piece.

    Args:
        qry_tids: Iterable of transcript IDs to keep.
        fn: Path to a tab-separated, header-less, 9-column GFF file.

    Returns:
        dict mapping transcript ID to ``[exons, cdss]``: two lists of
        ``(start, end, strand)`` tuples in file order. A transcript with no
        matching exon/CDS row is absent from the dict.
    """
    hdr = ['ctg', 'src', 'type', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
    df = pd.read_csv(fn, sep='\t', header=None)
    df.columns = hdr
    # Position of each feature type inside the per-transcript [exons, cdss] pair.
    slot_of = {'exon': 0, 'CDS': 1}
    # Hash lookup instead of scanning the query list once per row.
    wanted = set(qry_tids)
    # Drop every other feature type up front so the Python-level loop only
    # touches rows that can contribute an interval.
    feats = df[df['type'].isin(list(slot_of))]
    gan_tbl = dict()
    for ftype, start, end, strand, attributes in zip(
        feats['type'], feats['start'], feats['end'], feats['strand'], feats['attributes']
    ):
        parent = att2dict(attributes, sep='=').get('Parent')
        if parent is None:
            # A row without a Parent cannot belong to any query transcript.
            continue
        pieces = parent.split('-')
        if len(pieces) < 2:
            # Parent without the "<prefix>-" part: no transcript ID to extract.
            continue
        tid = pieces[1]
        if tid not in wanted:
            continue
        gan_tbl.setdefault(tid, [[], []])[slot_of[ftype]].append((start, end, strand))
    return gan_tbl

def load_gan_loff(qry_tids, fn):
    """Collect exon and CDS intervals of the query transcripts from a GFF file
    whose transcripts carry an ``origin_ID`` attribute.

    The file is read in order: a ``transcript`` row whose ``origin_ID`` is in
    ``qry_tids`` starts a new entry, and every following ``exon`` / ``CDS``
    row is added to it until the next ``transcript`` row. Only rows typed
    exactly ``transcript`` switch collection on or off.

    Args:
        qry_tids: Iterable of transcript IDs (matched against ``origin_ID``).
        fn: Path to a tab-separated, header-less, 9-column GFF file.

    Returns:
        dict mapping transcript ID to ``[exons, cdss]``: two lists of
        ``(start, end, strand)`` tuples in file order. If an ``origin_ID``
        occurs on several transcript rows, the last one replaces the earlier
        entries.
    """
    hdr = ['ctg', 'src', 'type', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
    df = pd.read_csv(fn, sep='\t', header=None)
    df.columns = hdr
    # Position of each feature type inside the per-transcript [exons, cdss] pair.
    slot_of = {'exon': 0, 'CDS': 1}
    # Hash lookup instead of scanning the query list once per transcript row.
    wanted = set(qry_tids)
    # Other feature types never change the result, so filter them out before
    # the Python-level loop; the boolean mask keeps the original row order.
    feats = df[df['type'].isin(['transcript', *slot_of])]
    gan_tbl = dict()
    # [exons, cdss] pair of the transcript being collected; None while skipping.
    current = None
    for ftype, start, end, strand, attributes in zip(
        feats['type'], feats['start'], feats['end'], feats['strand'], feats['attributes']
    ):
        if ftype == 'transcript':
            # .get: a transcript row without origin_ID is simply not a query hit.
            tid = att2dict(attributes, sep='=').get('origin_ID')
            if tid in wanted:
                current = gan_tbl[tid] = [[], []]
            else:
                current = None
        elif current is not None:
            current[slot_of[ftype]].append((start, end, strand))
    return gan_tbl

In [76]:
# Exon/CDS intervals of the query transcripts, keyed by transcript ID, read from refseq_fn.
refseq_gan_tbl = load_gan_refseq(qry_tids, refseq_fn)

In [72]:
# Exon/CDS intervals of the same query transcripts, keyed by transcript ID, read from loff_fn.
loff_gan_tbl = load_gan_loff(qry_tids, loff_fn)

In [64]:
def is_tx_eq(tx_1, tx_2) -> bool:
    """Tell whether two transcripts have identical exon and CDS coordinates.

    Args:
        tx_1, tx_2: ``[exons, cdss]`` pairs; each element is a sequence of
            intervals whose first two items are ``start`` and ``end``.
            Anything after those two items (e.g. strand) is ignored.

    Returns:
        True when both transcripts have the same set of exon ``(start, end)``
        pairs and the same set of CDS ``(start, end)`` pairs, regardless of
        the order in which the intervals are listed; False otherwise.
    """
    def _chain(intervals):
        # Sort on (start, end) rather than start alone, so the comparison
        # stays order-independent even when two intervals share a start.
        return sorted((iv[0], iv[1]) for iv in intervals)

    # Exons first; the CDS chains are only looked at when the exons agree.
    return _chain(tx_1[0]) == _chain(tx_2[0]) and _chain(tx_1[1]) == _chain(tx_2[1])

In [77]:
# One output line per query transcript: "<transcript id>\t<True|False>".
for tid in qry_tids:
    # Both tables must contain the transcript before it can be compared.
    assert tid in refseq_gan_tbl
    assert tid in loff_gan_tbl
    res = is_tx_eq(refseq_gan_tbl[tid], loff_gan_tbl[tid])
    print(f'{tid}\t{res}')

NM_004152.3	True
NM_001301371.1	True
NM_001301020.1	True
NM_002537.3	True
NM_015068.3	True
NM_001184961.1	True
NM_001301302.1	True
NM_001134939.1	True
NM_016178.2	True
NM_001172437.2	True
