In [10]:
import pysam
import pandas as pd
from tqdm import tqdm

In [3]:
fn = "/ccb/salz4-3/hji20/off-target-probe-checker/data/gencode_pc_g2t_mapping.csv"
df = pd.read_csv(fn)
t2g = dict()
for i, row in df.iterrows():
    gid = row['gene_id']
    tid = row['transcript_id']
    t2g[tid] = gid 

In [4]:
def convert_md2bit(s):
    running = ""
    bit_s = ""
    for c in s:
        if c.isdigit():
            running += c
        else:
            if len(running) > 0:
                bit_s += '1' * int(running)
            bit_s += '0'
            running = ""
    if len(running) > 0:
        bit_s += '1' * int(running)
    return bit_s

# in case of indels
def convert_cigar2bit(tup):
    bit_s = ""
    for x in tup:
        op, l = x
        if op == 0: # match
            bit_s += '1' * l
        elif op == 1 or op == 4: # soft clip or ins
            bit_s += '0' * l
    return bit_s

def convert_cigar2bit_del(tup, n):
    bit_s_lst = []
    bit_s = ""
    for x in tup:
        op, l = x
        if op == 0: # match
            bit_s += '1' * l
        elif op == 1 or op == 4: # soft clip or ins
            bit_s += '0' * l
        elif op == 2:
            temp = '0' * len(bit_s)
            bit_s += '0' * (n - len(bit_s))
            bit_s_lst.append(bit_s)
            bit_s = temp
    bit_s_lst.append(bit_s) # TODO: check if this correct
    return bit_s_lst

In [5]:
fn = "/ccb/salz4-3/hji20/off-target-probe-checker/results/bt2/xenium_human_breast_gepps.pc_only.bam"
ainfos = dict()
crit_bit = "0" * 10 + "1" * 20 + "0" * 10
crit_deci = int(crit_bit, 2)
with pysam.AlignmentFile(fn, 'rb') as fh:
    for brec in fh:
        if brec.is_unmapped or brec.is_supplementary:
            continue
        qname = brec.query_name
        tname = brec.reference_name
        if qname not in ainfos:
            ainfos[qname] = set()
        cigar = brec.cigarstring
        if cigar == '40M':
            nm = int(brec.get_tag('NM'))
            if nm == 0:
                ainfos[qname].add(t2g[tname].split('.')[0])
            else: # examine MD tag
                md = brec.get_tag('MD')
                md_bit = convert_md2bit(md)
                if crit_deci & int(md_bit, 2) == crit_deci:
                    ainfos[qname].add(t2g[tname].split('.')[0])
        else:
            if 'D' in cigar:
                md_bits = convert_cigar2bit_del(brec.cigartuples, 40)
                hit = False
                for bit in md_bits:
                    if crit_deci & int(bit, 2) == crit_deci:
                        hit = True
                if hit:
                    ainfos[qname].add(t2g[tname].split('.')[0])
            else: # includes 'I', 'S', and 'M' (but not 'D')
                md_bit = convert_cigar2bit(brec.cigartuples)
                if crit_deci & int(md_bit, 2) == crit_deci:
                    ainfos[qname].add(t2g[tname].split('.')[0])

In [33]:
# considering only perfect matches
ctr = 0
for x in ainfos:
    if len(ainfos[x]) > 1:
        ctr += 1
ctr

93

In [35]:
# mismatches in the 10bp windows at the beginning and end of probe
ctr = 0
for x in ainfos:
    if len(ainfos[x]) > 1:
        ctr += 1
ctr

112

In [6]:
# indels, mismatches, and soft clips considered
over_ctr = 0
under_ctr = 0
for x in ainfos:
    if len(ainfos[x]) > 1:
        over_ctr += 1
    elif len(ainfos[x]) == 0:
        under_ctr += 1
print(f'{over_ctr}\t{under_ctr}')

183	1


In [8]:
out_fn = "/ccb/salz4-3/hji20/off-target-probe-checker/hayden/probe2targets.tsv"
with open(out_fn, 'w') as fh:
    for x in ainfos:
        temp = ','.join(ainfos[x])
        fh.write(f'{x}\t{len(ainfos[x])}\t[{temp}]\n')

for convenience, let's also convert the GENCODE `gene_id` to more canonical `gene_name`

In [9]:
fn = "/ccb/salz4-3/hji20/off-target-probe-checker/data/gencode.v47.basic.annotation.fmted.gff"
df = pd.read_csv(fn, sep='\t', header=None)
df.columns = ['chr', 'src', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
df.head()

Unnamed: 0,chr,src,feature,start,end,score,strand,frame,attribute
0,chr1,HAVANA,gene,11121.0,24894.0,.,+,.,ID=ENSG00000290825.2;gene_id=ENSG00000290825.2...
1,chr1,HAVANA,transcript,11426.0,14409.0,.,+,.,ID=ENST00000832828.1;Parent=ENSG00000290825.2;...
2,chr1,HAVANA,exon,11426.0,11671.0,.,+,.,ID=exon:ENST00000832828.1:1;Parent=ENST0000083...
3,chr1,HAVANA,exon,12010.0,12227.0,.,+,.,ID=exon:ENST00000832828.1:2;Parent=ENST0000083...
4,chr1,HAVANA,exon,12613.0,12721.0,.,+,.,ID=exon:ENST00000832828.1:3;Parent=ENST0000083...


In [11]:
def get_ginfo(s):
    temp = s.split(';')
    gid = None
    gname = None
    for x in temp:
        kv = x.split('=')
        if len(kv) != 2: continue
        if kv[0] == 'ID':
            gid = kv[1]
        elif kv[0] == 'gene_name':
            gname = kv[1]
            break
    return gid, gname

ginfos = dict() # <k,v> = <gid, gname>
ctr = 0
for i, row in tqdm(df.iterrows()):
    if row['feature'] == 'gene':
        gid, gname = get_ginfo(row['attribute'])
        ctr += 1
        if not gid or not gname:
            print("error gene attributes")
            break
        ginfos[gid] = gname
print(f'{ctr} genes loaded')

2223391it [01:15, 29468.96it/s]

78724 genes loaded





In [13]:
ginfos_fmted = dict()
for gid in ginfos:
    gid_fmted = gid.split('.')[0]
    ginfos_fmted[gid_fmted] = ginfos[gid]

In [14]:
out_fn = "/ccb/salz4-3/hji20/off-target-probe-checker/hayden/probe2targets.named.tsv"
with open(out_fn, 'w') as fh:
    for x in ainfos:
        temp = ','.join([ginfos_fmted[g] for g in ainfos[x]])
        fh.write(f'{x}\t{len(ainfos[x])}\t[{temp}]\n')