In [1]:
import pyfastx
import os
from tqdm import tqdm
import pandas as pd

In [2]:
def load_mums(fn) -> dict:
    """Parse a MUMmer-style ``.mums`` file into a dict keyed by query name.

    The file is expected to consist of header lines such as
    ``> <query name>  Len = 40`` followed by zero or more whitespace-separated
    record lines ``<target name> <target start> <query start> <match length>``.

    Args:
        fn: Path to the ``.mums`` file.

    Returns:
        ``{query name: [(target name, target start, query start, match length), ...]}``.
        The three numeric fields are converted to ``int``. Queries whose
        header is not followed by any record get no entry.

    Raises:
        ValueError: If a header carries no query name, or a record line
            appears before the first header.
    """
    mums = dict()
    qname = None
    with open(fn, 'r') as fh:
        for lineno, ln in enumerate(fh, start=1):
            clean_ln = ln.strip()
            if not clean_ln:
                # Blank lines (e.g. a trailing newline) carry no information.
                continue
            if clean_ln[0] == '>':
                # Accept both "> name ..." and ">name ...": drop the marker,
                # then take the first whitespace-delimited token.
                fields = clean_ln[1:].split()
                if not fields:
                    raise ValueError(f"{fn}:{lineno}: header line has no query name")
                qname = fields[0]
            else:
                if qname is None:
                    raise ValueError(f"{fn}:{lineno}: record line found before any header")
                temp = clean_ln.split()
                rec = (temp[0], int(temp[1]), int(temp[2]), int(temp[3]))
                mums.setdefault(qname, []).append(rec)
    return mums

def _flank_mvec(tflank, qflank):
    """Compare two equal-length flanks base by base.

    Returns ``(mvec, nm)`` where ``mvec`` has '1' for a match and '0' for a
    mismatch at each position, and ``nm`` is the number of mismatches.
    """
    mvec = ''.join('1' if t == q else '0' for t, q in zip(tflank, qflank))
    return mvec, mvec.count('0')


def check_lft_and_rgt(mrec, qname, qry_fa, tgt_fa, max_nm=1):
    """Extend an exact match to the full query and count flank mismatches.

    The exact match described by ``mrec`` is kept as-is; the query bases to
    its left and right are compared, without gaps, against the target bases
    at the corresponding offsets.

    Args:
        mrec: ``(target name, target start, query start, match length)`` with
            1-based start coordinates.
        qname: Key of the query record in ``qry_fa``.
        qry_fa: Mapping-like object; ``qry_fa[qname].seq`` is the query string.
        tgt_fa: Mapping-like object; ``tgt_fa[tname].seq`` is the target string.
        max_nm: Maximum number of flank mismatches for the hit to pass.

    Returns:
        ``(passed, mvec, nm)``. ``mvec`` is a '1'/'0' string of query length
        (1 = match) and ``nm`` the mismatch count. If the target ends before
        the query does on either side, returns ``(False, None, -1)``.
    """
    # Fetch the query once; it is needed both for its length and its flanks.
    qseq = qry_fa[qname].seq
    qlen = len(qseq)
    tname, tst, qst, mlen = mrec
    if mlen == qlen:
        # The exact match already spans the whole query: nothing to extend.
        # (Compared against the actual query length, not a fixed 40.)
        return (True, '1' * mlen, 0)
    qst -= 1  # shift to 0-based coords
    tst -= 1
    qen = qst + mlen
    ten = tst + mlen
    lft_qos = qst          # number of query bases left of the match
    rgt_qos = qlen - qen   # number of query bases right of the match
    tseq = tgt_fa[tname].seq
    # Guard the left slice explicitly: a negative start would wrap around to
    # the end of the target instead of signalling "out of bounds".
    lft_tseq = tseq[tst - lft_qos:tst] if tst >= lft_qos else ''
    rgt_tseq = tseq[ten:ten + rgt_qos]
    if len(lft_tseq) != lft_qos:  # tseq runs out at 5'
        return (False, None, -1)
    if len(rgt_tseq) != rgt_qos:  # tseq runs out at 3'
        return (False, None, -1)
    lft_mvec, lft_nm = _flank_mvec(lft_tseq, qseq[:qst])
    rgt_mvec, rgt_nm = _flank_mvec(rgt_tseq, qseq[qen:qen + rgt_qos])
    mvec = lft_mvec + ('1' * mlen) + rgt_mvec
    assert len(mvec) == qlen  # sanity check
    nm = lft_nm + rgt_nm
    return (nm <= max_nm, mvec, nm)

def load_tinfos(fn) -> dict:
    """Load a transcript-to-gene table from CSV.

    Args:
        fn: Path to a CSV file with at least the columns ``transcript_id``,
            ``gene_id``, ``gene_name`` and ``transcript_type``.

    Returns:
        ``{transcript_id: (gene_id, gene_name, transcript_type)}``. If a
        transcript id occurs more than once, the last row wins.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # pandas reads the file itself; no separate file handle is needed.
    df = pd.read_csv(fn)
    return {
        tid: (gid, gname, ttype)  # Caleb: add transcript type
        for tid, gid, gname, ttype in zip(
            df['transcript_id'], df['gene_id'], df['gene_name'], df['transcript_type']
        )
    }

In [3]:
# Open the two FASTA inputs used throughout the notebook:
#   qry_fa - probe (query) sequences, looked up by probe name
#   tgt_fa - transcript (target) sequences, looked up by transcript id
# NOTE(review): data_dir is an absolute, machine-specific path; adjust it when running elsewhere.
data_dir = "/ccb/salz4-3/hji20/off-target-probe-checker/data/dev"
qry_fa = pyfastx.Fasta(os.path.join(data_dir, 'xenium_human_breast_gene_expression_panel_probe_sequences.fwd.fasta'))
tgt_fa = pyfastx.Fasta(os.path.join(data_dir, 'gencode_basic/gencode.v47.basic.annotation.fmted.fa'))

In [4]:
# Parse the match file into {query name: [(target name, target start, query start, match length), ...]}.
# NOTE(review): absolute, machine-specific path.
fn = "/ccb/salz4-3/hji20/off-target-probe-checker/otpc/test_results/dev_mummer/test.mums"
mums = load_mums(fn)

In [5]:
# Inspect the parsed hits. NOTE(review): this displays the entire dict; a small slice would keep the output short.
mums

{'ENSG00000134440|NARS|b886a02': [('ENST00000256854.10', 1302, 1, 40)],
 'ENSG00000134440|NARS|de75df5': [('ENST00000256854.10', 552, 1, 40)],
 'ENSG00000134440|NARS|e87b554': [('ENST00000256854.10', 171, 1, 40)],
 'ENSG00000134440|NARS|0036e0f': [('ENST00000256854.10', 977, 1, 40)],
 'ENSG00000134440|NARS|15af113': [('ENST00000256854.10', 900, 1, 40)],
 'ENSG00000134440|NARS|61200ab': [('ENST00000256854.10', 317, 1, 40)],
 'ENSG00000134440|NARS|2bd068a': [('ENST00000256854.10', 1134, 1, 40)],
 'ENSG00000134440|NARS|ce7ac90': [('ENST00000256854.10', 710, 1, 40)],
 'ENSG00000134440|NARS|1e8dc26': [('ENST00000256854.10', 825, 1, 40)],
 'ENSG00000134440|NARS|1d159d8': [('ENST00000256854.10', 1982, 1, 40)],
 'ENSG00000261371|PECAM1|1de4898': [('ENST00000563924.6', 1314, 1, 40)],
 'ENSG00000261371|PECAM1|fae7f98': [('ENST00000563924.6', 1856, 1, 40)],
 'ENSG00000261371|PECAM1|457a640': [('ENST00000563924.6', 1442, 1, 40)],
 'ENSG00000261371|PECAM1|938b0ab': [('ENST00000563924.6', 835, 1, 40

In [7]:
# Worked example: the records listed for one probe in the .mums file.
# Columns: target name, target start, query start, match length.
# > ENSG00000145247|OCIAD2|69d056a        Len = 40
#   ENST00000620187.4        380         1        40
#   ENST00000273860.8        317         1        40
#   ENST00000508632.6        303         1        40
#   ENST00000517657.1        141        19        22
#   ENST00000727145.1        718        19        22

# Pick the probe and one of its partial (22-base) hits to step through by hand.
qname = 'ENSG00000145247|OCIAD2|69d056a'
tname = 'ENST00000517657.1'
qseq = qry_fa[qname].seq
qlen = len(qseq)
print(f'{qname}\t{qseq}\t{qlen}')

ENSG00000145247|OCIAD2|69d056a	ATTATGCGAGAATGTCAGGAAGAAAGTTTCTGGAAGAGAG	40


In [18]:
# Convert the 1-based start coordinates of the chosen record to 0-based
# half-open [start, end) intervals and print both matched substrings so they
# can be compared by eye.
qst = 19
qst -= 1 # shift to 0-based coords
tst = 141
tst -= 1 # shift to 0-based coords

mlen = 22
qen = qst + mlen
ten = tst + mlen
print(f'{qst}\t{qen}\t{tst}\t{ten}\t{mlen}')
print(qry_fa[qname].seq[qst:qen])
print(tgt_fa[tname].seq[tst:ten])

18	40	140	162	22
GAAGAAAGTTTCTGGAAGAGAG
GAAGAAAGTTTCTGGAAGAGAG


In [19]:
# Show the full probe sequence for reference.
qry_fa[qname].seq

'ATTATGCGAGAATGTCAGGAAGAAAGTTTCTGGAAGAGAG'

In [None]:
# Number of query bases to the left / right of the exact match.
lft_qos = qst
rgt_qos = qlen - qen
print(lft_qos)
print(rgt_qos)
qseq = qry_fa[qname].seq
tseq = tgt_fa[tname].seq
# Slice the same number of bases from the target on each side of the match,
# and the corresponding flanks from the query.
lft_tseq = tseq[tst - lft_qos:tst]
rgt_tseq = tseq[ten:ten + rgt_qos]
lft_qseq = qseq[qst - lft_qos:qst]
rgt_qseq = qseq[qen:qen + rgt_qos]
print(f'tgt left: {lft_tseq}')
print(f'tgt right: {rgt_tseq}')
print(f'qry left: {lft_qseq}')
print(f'qry right: {rgt_qseq}')

18


1

In [52]:
# Left-flank match vector: '1' where target and query agree, '0' where they differ.
lft_mvec = "".join(
    "1" if lft_tseq[pos] == lft_qseq[pos] else "0"
    for pos in range(lft_qos)
)
# Every '0' in the vector is one mismatch.
lft_nm = lft_mvec.count("0")

print(lft_mvec)
print(lft_nm)

111110011101111110
4


In [56]:
# Run the full left/right check on one partial hit of the example probe
# (default max_nm=1).
mrec = ('ENST00000727145.1', 718, 19, 22)
qname = 'ENSG00000145247|OCIAD2|69d056a'
check_lft_and_rgt(mrec, qname, qry_fa, tgt_fa)

(False, '1111100111011111101111111111111111111111', 4)

In [65]:
# Evaluate every hit of every probe.
#   preds - {query name: [(mrec, (passed, mvec, nm)), ...]}
#   ctr   - number of hits that pass with exactly one mismatch
preds = dict()
ctr = 0
for qname in tqdm(mums, total=len(mums)):
    preds[qname] = []
    for mrec in mums[qname]:
        res = check_lft_and_rgt(mrec, qname, qry_fa, tgt_fa)
        if res[0] and res[2] == 1:
            ctr += 1
        # TODO: introduce a python struct to store these info
        # Reuse the result computed above instead of running the check
        # (and its FASTA lookups) a second time.
        preds[qname].append((mrec, res))
ctr

  0%|                                                                                                                                                              | 0/4775 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4775/4775 [00:15<00:00, 313.47it/s]


303

In [68]:
# Transcript annotation lookup: {transcript_id: (gene_id, gene_name, transcript_type)}.
# NOTE(review): absolute, machine-specific path.
tinfos = load_tinfos("/ccb/salz4-3/hji20/off-target-probe-checker/results/breast_gencode_basic/t2g.csv")

In [71]:
# Inspect the annotation table. NOTE(review): this displays the entire dict; a small slice would keep the output short.
tinfos

{'ENST00000832828.1': ('ENSG00000290825', 'DDX11L16', 'lncRNA'),
 'ENST00000450305.2': ('ENSG00000223972',
  'DDX11L1',
  'transcribed_unprocessed_pseudogene'),
 'ENST00000831158.1': ('ENSG00000310526', 'WASH7P', 'lncRNA'),
 'ENST00000831210.1': ('ENSG00000310526', 'WASH7P', 'lncRNA'),
 'ENST00000831361.1': ('ENSG00000310526', 'WASH7P', 'lncRNA'),
 'ENST00000831289.1': ('ENSG00000310526', 'WASH7P', 'lncRNA'),
 'ENST00000831499.1': ('ENSG00000310526', 'WASH7P', 'lncRNA'),
 'ENST00000831463.1': ('ENSG00000310526', 'WASH7P', 'lncRNA'),
 'ENST00000831292.1': ('ENSG00000310526', 'WASH7P', 'lncRNA'),
 'ENST00000831355.1': ('ENSG00000310526', 'WASH7P', 'lncRNA'),
 'ENST00000831487.1': ('ENSG00000310526', 'WASH7P', 'lncRNA'),
 'ENST00000831582.1': ('ENSG00000310526', 'WASH7P', 'lncRNA'),
 'ENST00000488147.2': ('ENSG00000227232',
  'WASH7P',
  'transcribed_unprocessed_pseudogene'),
 'ENST00000619216.1': ('ENSG00000278267', 'MIR6859-1', 'miRNA'),
 'ENST00000834619.1': ('ENSG00000243485', 'MIR130

In [72]:
# Collapse per-hit results to per-probe gene-level summaries:
#   preds_2[qname] = (number of distinct gene ids, gene ids, gene names, transcript types)
# NOTE(review): every hit is counted, whether or not its check passed — confirm this is intended.
preds_2 = {}
for qname, hits in preds.items():
    # Annotation tuple (gene_id, gene_name, transcript_type) for each hit's transcript.
    annots = [tinfos[mrec[0]] for mrec, _ in hits]
    target_ids = {annot[0] for annot in annots}
    target_names = {annot[1] for annot in annots}
    tx_types = {annot[2] for annot in annots}
    preds_2[qname] = (len(target_ids), target_ids, target_names, tx_types)

In [74]:
def write_preds(preds, fn):
    """Write per-probe target summaries to a tab-separated file.

    Args:
        preds: ``{probe id: (n_targets, target_ids, target_names, tx_types)}``
            where the last three items are iterables of strings (sets in this
            notebook).
        fn: Output path; the file is overwritten.

    The ``cigars`` column is always written as ``NA``. The three collections
    are sorted before joining so the file is identical from run to run; set
    iteration order for strings otherwise varies between interpreter sessions.
    Ids, names and types are independent collections, so sorting each one does
    not break any positional pairing.
    """
    with open(fn, 'w') as fh:
        fh.write('probe_id\tn_targets\ttarget_ids\ttarget_names\tcigars\ttranscript_types\n')
        for qname in preds:
            n, target_ids, target_names, tx_types = preds[qname]
            id_s = ','.join(sorted(target_ids))
            name_s = ','.join(sorted(target_names))
            type_s = ','.join(sorted(tx_types))
            fh.write(f'{qname}\t{n}\t[{id_s}]\t[{name_s}]\tNA\t[{type_s}]\n')

In [75]:
# Write the per-probe summaries to disk.
# NOTE(review): absolute, machine-specific path; this also rebinds `fn`, which earlier held the .mums path.
fn = "/ccb/salz4-3/hji20/off-target-probe-checker/otpc/test_results/dev_mummer/probe2targets.tsv"
write_preds(preds_2, fn)

In [1]:
def char2sym(char):
    """Map a match-vector character to its symbol: '0' -> 'X', anything else -> '='."""
    if char == '0':
        return 'X'
    return '='

def compress_bvec(bvec):
    """Run-length encode a match vector as ``<symbol><run length>`` pairs.

    Each maximal run of identical characters in ``bvec`` becomes the symbol
    from ``char2sym`` followed by the run length, e.g. ``"11100"`` ->
    ``"=3X2"``. Note the symbol precedes the count.

    Args:
        bvec: String of '1' (match) and '0' (mismatch) characters.

    Returns:
        The encoded string; an empty ``bvec`` yields ``''``.
    """
    # Local import keeps this cell self-contained.
    from itertools import groupby

    # groupby yields one (character, run) pair per maximal run and nothing
    # at all for empty input, so no special-casing of bvec[0] is needed.
    return ''.join(
        f"{char2sym(char)}{sum(1 for _ in run)}"
        for char, run in groupby(bvec)
    )

In [2]:
# Try the encoder on the match vector returned by check_lft_and_rgt for the example hit above.
s = "1111100111011111101111111111111111111111"
compress_bvec(s)

'=5X2=3X1=6X1=22'